SIISelLowering.cpp
1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
19#include "AMDGPUTargetMachine.h"
20#include "GCNSubtarget.h"
23#include "SIRegisterInfo.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/Statistic.h"
42#include "llvm/IR/IRBuilder.h"
44#include "llvm/IR/IntrinsicsAMDGPU.h"
45#include "llvm/IR/IntrinsicsR600.h"
46#include "llvm/IR/MDBuilder.h"
49#include "llvm/Support/ModRef.h"
51#include <optional>
52
53using namespace llvm;
54using namespace llvm::SDPatternMatch;
55
56#define DEBUG_TYPE "si-lower"
57
58STATISTIC(NumTailCalls, "Number of tail calls");
59
60static cl::opt<bool>
61 DisableLoopAlignment("amdgpu-disable-loop-alignment",
62 cl::desc("Do not align and prefetch loops"),
63 cl::init(false));
64
66 "amdgpu-use-divergent-register-indexing", cl::Hidden,
67 cl::desc("Use indirect register addressing for divergent indexes"),
68 cl::init(false));
69
70static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
71 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
72 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
73}
74
75static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
76 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
77 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
78}
79
80static unsigned findFirstFreeSGPR(CCState &CCInfo) {
81 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
82 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
83 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
84 return AMDGPU::SGPR0 + Reg;
85 }
86 }
87 llvm_unreachable("Cannot allocate sgpr");
88}
89
90SITargetLowering::SITargetLowering(const TargetMachine &TM,
91 const GCNSubtarget &STI)
92 : AMDGPUTargetLowering(TM, STI, STI), Subtarget(&STI) {
93 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
94 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
95
96 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
97
98 const SIRegisterInfo *TRI = STI.getRegisterInfo();
99 const TargetRegisterClass *V32RegClass =
100 TRI->getDefaultVectorSuperClassForBitWidth(32);
101 addRegisterClass(MVT::f32, V32RegClass);
102
103 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
104
105 const TargetRegisterClass *V64RegClass =
106 TRI->getDefaultVectorSuperClassForBitWidth(64);
107
108 addRegisterClass(MVT::f64, V64RegClass);
109 addRegisterClass(MVT::v2f32, V64RegClass);
110 addRegisterClass(MVT::Untyped, V64RegClass);
111
112 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
113 addRegisterClass(MVT::v3f32, TRI->getDefaultVectorSuperClassForBitWidth(96));
114
115 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
116 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
117
118 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
119 addRegisterClass(MVT::v4f32, TRI->getDefaultVectorSuperClassForBitWidth(128));
120
121 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
122 addRegisterClass(MVT::v5f32, TRI->getDefaultVectorSuperClassForBitWidth(160));
123
124 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
125 addRegisterClass(MVT::v6f32, TRI->getDefaultVectorSuperClassForBitWidth(192));
126
127 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
128 addRegisterClass(MVT::v3f64, TRI->getDefaultVectorSuperClassForBitWidth(192));
129
130 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
131 addRegisterClass(MVT::v7f32, TRI->getDefaultVectorSuperClassForBitWidth(224));
132
133 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
134 addRegisterClass(MVT::v8f32, TRI->getDefaultVectorSuperClassForBitWidth(256));
135
136 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
137 addRegisterClass(MVT::v4f64, TRI->getDefaultVectorSuperClassForBitWidth(256));
138
139 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
140 addRegisterClass(MVT::v9f32, TRI->getDefaultVectorSuperClassForBitWidth(288));
141
142 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
143 addRegisterClass(MVT::v10f32,
144 TRI->getDefaultVectorSuperClassForBitWidth(320));
145
146 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
147 addRegisterClass(MVT::v11f32,
148 TRI->getDefaultVectorSuperClassForBitWidth(352));
149
150 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
151 addRegisterClass(MVT::v12f32,
152 TRI->getDefaultVectorSuperClassForBitWidth(384));
153
154 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
155 addRegisterClass(MVT::v16f32,
156 TRI->getDefaultVectorSuperClassForBitWidth(512));
157
158 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
159 addRegisterClass(MVT::v8f64, TRI->getDefaultVectorSuperClassForBitWidth(512));
160
161 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
162 addRegisterClass(MVT::v16f64,
163 TRI->getDefaultVectorSuperClassForBitWidth(1024));
164
165 if (Subtarget->has16BitInsts()) {
166 if (Subtarget->useRealTrue16Insts()) {
167 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
168 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
169 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
170 } else {
171 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
172 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
173 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
174 }
175
176 // Unless there are also VOP3P operations, no operations are really legal.
177 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
178 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
179 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
180 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
181 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
182 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
183 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
184 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
185 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
186 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
187 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
188 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
189 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
190 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
191 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
192 }
193
194 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
195 addRegisterClass(MVT::v32f32,
196 TRI->getDefaultVectorSuperClassForBitWidth(1024));
197
198 computeRegisterProperties(Subtarget->getRegisterInfo());
199
200 // The boolean content concept here is too inflexible. Compares only ever
201 // really produce a 1-bit result. Any copy/extend from these will turn into a
202 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
203 // it's what most targets use.
206
207 // We need to custom lower vector stores from local memory
209 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
210 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
211 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
212 MVT::i1, MVT::v32i32},
213 Custom);
214
216 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
217 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
218 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
219 MVT::i1, MVT::v32i32},
220 Custom);
221
222 if (isTypeLegal(MVT::bf16)) {
223 for (unsigned Opc :
232 ISD::SETCC}) {
233 setOperationAction(Opc, MVT::bf16, Promote);
234 }
235
237
239 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
240
244
245 // We only need to custom lower because we can't specify an action for bf16
246 // sources.
249 }
250
251 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
252 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
253 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
254 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
255 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
256 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
257 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
258 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
259 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
260 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
261 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
262 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
263 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
264 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
265 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
266 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
267
268 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
269 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
270 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
271 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
272 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
273 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
274 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
275
276 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
277 setOperationAction(ISD::ExternalSymbol, {MVT::i32, MVT::i64}, Custom);
278
282 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
283
284 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
285
287 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
288
290 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
291 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
292
294 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
295 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
296 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
297 Expand);
299 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
300 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
301 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
302 Expand);
303
305 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
306 MVT::v3i16, MVT::v4i16, MVT::Other},
307 Custom);
308
311 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
312
314
316
318 Expand);
319
320#if 0
322#endif
323
324 // We only support LOAD/STORE and vector manipulation ops for vectors
325 // with > 4 elements.
326 for (MVT VT :
327 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
328 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
329 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
330 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
331 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
332 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
333 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
334 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
335 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
336 switch (Op) {
337 case ISD::LOAD:
338 case ISD::STORE:
340 case ISD::BITCAST:
341 case ISD::UNDEF:
345 case ISD::IS_FPCLASS:
346 break;
351 break;
352 default:
354 break;
355 }
356 }
357 }
358
360
361 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
362 // is expanded to avoid having two separate loops in case the index is a VGPR.
363
364 // Most operations are naturally 32-bit vector operations. We only support
365 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
366 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
368 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
369
371 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
372
374 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
375
377 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
378 }
379
380 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
382 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
383
385 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
386
388 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
389
391 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
392 }
393
394 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
396 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
397
399 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
400
402 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
403
405 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
406 }
407
408 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
410 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
411
413 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
414
416 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
417
419 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
420 }
421
422 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
424 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
425
427 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
428
430 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
431
433 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
434 }
435
437 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
438 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
439 Custom);
440
441 if (Subtarget->hasPkMovB32()) {
442 // TODO: 16-bit element vectors should be legal with even aligned elements.
443 // TODO: Can be legal with wider source types than the result with
444 // subregister extracts.
445 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
446 }
447
449 // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
450 // instead lower to cndmask in SITargetLowering::LowerSELECT().
452 // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
453 // alignbit.
454 setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
455
456 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
457 Custom);
458
459 // Avoid stack access for these.
460 // TODO: Generalize to more vector types.
462 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
463 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
464 Custom);
465
466 // Deal with vec3 vector operations when widened to vec4.
468 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
469
470 // Deal with vec5/6/7 vector operations when widened to vec8.
472 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
473 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
474 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
475 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
476 Custom);
477
478 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
479 // and output demarshalling
480 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
481
482 // We can't return success/failure, only the old value;
483 // let LLVM add the comparison.
485 Expand);
486
487 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
488
489 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
490
491 // FIXME: This should be narrowed to i32, but that only happens if i64 is
492 // illegal.
493 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
494 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
495
496 // This is s_memtime on SI and s_memrealtime on VI.
498
499 if (Subtarget->hasSMemRealTime() ||
500 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
503
504 if (Subtarget->has16BitInsts()) {
507 } else {
509 }
510
511 if (Subtarget->hasMadMacF32Insts())
513
516
517 // We only really have 32-bit BFE instructions (and 16-bit on VI).
518 //
519 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
520 // effort to match them now. We want this to be false for i64 cases when the
521 // extraction isn't restricted to the upper or lower half. Ideally we would
522 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
523 // span the midpoint are probably relatively rare, so don't worry about them
524 // for now.
526
527 // Clamp modifier on add/sub
528 if (Subtarget->hasIntClamp())
530
531 if (Subtarget->hasAddNoCarry())
532 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
533 Legal);
534
537 {MVT::f32, MVT::f64}, Custom);
538
539 // These are really only legal for ieee_mode functions. We should be avoiding
540 // them for functions that don't have ieee_mode enabled, so just say they are
541 // legal.
543 {MVT::f32, MVT::f64}, Legal);
544
545 if (Subtarget->haveRoundOpsF64())
547 Legal);
548 else
550 MVT::f64, Custom);
551
553 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
554 Legal);
555 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
556
559
560 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
561 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
562
563 // Custom lower these because we can't specify a rule based on an illegal
564 // source bf16.
567
568 if (Subtarget->has16BitInsts()) {
571 MVT::i16, Legal);
572
573 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
574
576 MVT::i16, Expand);
577
581 ISD::CTPOP},
582 MVT::i16, Promote);
583
585
586 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
587
589 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
591 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
592
596
598
599 // F16 - Constant Actions.
602
603 // F16 - Load/Store Actions.
605 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
607 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
608
609 // BF16 - Load/Store Actions.
611 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
613 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
614
615 // F16 - VOP1 Actions.
618 MVT::f16, Custom);
619
620 // BF16 - VOP1 Actions.
621 if (Subtarget->hasBF16TransInsts())
623
626
627 // F16 - VOP2 Actions.
628 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
629 Expand);
633
634 // F16 - VOP3 Actions.
636 if (STI.hasMadF16())
638
639 for (MVT VT :
640 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
641 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
642 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
643 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
644 switch (Op) {
645 case ISD::LOAD:
646 case ISD::STORE:
648 case ISD::BITCAST:
649 case ISD::UNDEF:
654 case ISD::IS_FPCLASS:
655 break;
658 case ISD::FSIN:
659 case ISD::FCOS:
661 break;
662 default:
664 break;
665 }
666 }
667 }
668
669 // v_perm_b32 can handle either of these.
670 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
672
673 // XXX - Do these do anything? Vector constants turn into build_vector.
674 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
675
676 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
677 Legal);
678
680 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
682 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
683
685 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
687 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
688
689 setOperationAction(ISD::AND, MVT::v2i16, Promote);
690 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
691 setOperationAction(ISD::OR, MVT::v2i16, Promote);
692 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
693 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
694 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
695
697 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
699 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
700 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
701 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
702
704 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
706 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
708 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
709
711 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
713 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
714 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
715 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
716
718 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
720 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
721
723 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
725 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
727 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
728
729 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
730 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
731 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
732 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
733 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
734 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
735
737 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
739 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
740 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
741 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
742
743 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
744 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
745 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
746 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
747 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
748 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
749
751 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
753 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
754 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
755 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
756
758 MVT::v2i32, Expand);
760
762 MVT::v4i32, Expand);
763
765 MVT::v8i32, Expand);
766
767 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
768 Subtarget->hasVOP3PInsts() ? Legal : Custom);
769
770 setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
771 // This isn't really legal, but this avoids the legalizer unrolling it (and
772 // allows matching fneg (fabs x) patterns)
773 setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);
774
775 // Can do this in one BFI plus a constant materialize.
777 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
778 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
779 MVT::v32f16, MVT::v32bf16},
780 Custom);
781
784 MVT::f16, Custom);
786
789 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
790 Custom);
791
793 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
794 Expand);
795
796 for (MVT Vec16 :
797 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
798 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
801 Vec16, Custom);
803 }
804 }
805
806 if (Subtarget->hasVOP3PInsts()) {
810 MVT::v2i16, Legal);
811
814 MVT::v2f16, Legal);
815
817 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
818
820 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
821 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
822 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
823 Custom);
824
825 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
826 // Split vector operations.
831 VT, Custom);
832
833 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
834 // Split vector operations.
836 VT, Custom);
837
840 {MVT::v2f16, MVT::v4f16}, Custom);
841
842 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
843 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
844 Custom);
845
846 if (Subtarget->hasBF16PackedInsts()) {
847 for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
848 // Split vector operations.
850 VT, Custom);
851 }
852
853 if (Subtarget->hasPackedFP32Ops()) {
855 MVT::v2f32, Legal);
857 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
858 Custom);
859 }
860 }
861
863
864 if (Subtarget->has16BitInsts()) {
866 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
868 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
869 } else {
870 // Legalization hack.
871 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
872
874 }
875
877 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
878 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
879 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
880 MVT::v32f16, MVT::v32bf16},
881 Custom);
882
884
885 if (Subtarget->hasVectorMulU64())
887 else if (Subtarget->hasScalarSMulU64())
889
890 if (Subtarget->hasMad64_32())
892
893 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
895
896 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
898 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
899 } else {
900 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
901 if (Subtarget->hasMinimum3Maximum3F32())
903
904 if (Subtarget->hasMinimum3Maximum3PKF16()) {
906
907 // If only the vector form is available, we need to widen to a vector.
908 if (!Subtarget->hasMinimum3Maximum3F16())
910 }
911 }
912
913 if (Subtarget->hasVOP3PInsts()) {
914 // We want to break these into v2f16 pieces, not scalarize.
916 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
917 Custom);
918 }
919
920 if (Subtarget->hasIntMinMax64())
922 Legal);
923
925 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
926 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
927 MVT::i8},
928 Custom);
929
931 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
932 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
933 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
934 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
935 Custom);
936
938 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
939 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
940 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
941 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
942 Custom);
943
949
950 // TODO: Could move this to custom lowering, could benefit from combines on
951 // extract of relevant bits.
953
955
956 if (Subtarget->hasBF16ConversionInsts()) {
957 setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
959 }
960
961 if (Subtarget->hasBF16PackedInsts()) {
964 MVT::v2bf16, Legal);
965 }
966
967 if (Subtarget->hasBF16TransInsts()) {
969 }
970
971 if (Subtarget->hasCvtPkF16F32Inst()) {
973 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
974 Custom);
975 }
976
980 ISD::SUB,
982 ISD::MUL,
983 ISD::FADD,
984 ISD::FSUB,
985 ISD::FDIV,
986 ISD::FMUL,
995 ISD::FMA,
996 ISD::SMIN,
997 ISD::SMAX,
998 ISD::UMIN,
999 ISD::UMAX,
1000 ISD::SETCC,
1002 ISD::SMIN,
1003 ISD::SMAX,
1004 ISD::UMIN,
1005 ISD::UMAX,
1006 ISD::AND,
1007 ISD::OR,
1008 ISD::XOR,
1009 ISD::SHL,
1010 ISD::SRL,
1011 ISD::SRA,
1012 ISD::FSHR,
1022
1023 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1025
1026 // All memory operations. Some folding on the pointer operand is done to help
1027 // matching the constant offsets in the addressing modes.
1029 ISD::STORE,
1054
1055 // FIXME: In other contexts we pretend this is a per-function property.
1057
1059}
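// Usage sketch (illustrative, not part of the upstream file; TLI/Op/DAG are
// placeholder names): the actions registered in the constructor above are what
// SelectionDAG legalization queries later, roughly
//   if (TLI.getOperationAction(ISD::FSQRT, MVT::f64) == TargetLowering::Custom)
//     SDValue Lowered = TLI.LowerOperation(Op, DAG); // dispatches into this file
// so each setOperationAction call above is effectively a routing decision.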
1060
1061const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1062
1064 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1065 return RCRegs;
1066}
1067
1068//===----------------------------------------------------------------------===//
1069// TargetLowering queries
1070//===----------------------------------------------------------------------===//
1071
1072// v_mad_mix* support a conversion from f16 to f32.
1073//
1074// There is only one special case where this is OK to use when denormals are
1075// enabled, which we don't currently handle.
1076bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1077 EVT DestVT, EVT SrcVT) const {
1078 return DestVT.getScalarType() == MVT::f32 &&
1079 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1080 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1081 SrcVT.getScalarType() == MVT::f16) ||
1082 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1083 SrcVT.getScalarType() == MVT::bf16)) &&
1084 // TODO: This probably only requires no input flushing?
1085 denormalModeIsFlushAllF32(DAG.getMachineFunction());
1086}
1087
1088bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
1089 LLT DestTy, LLT SrcTy) const {
1090 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1091 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1092 DestTy.getScalarSizeInBits() == 32 &&
1093 SrcTy.getScalarSizeInBits() == 16 &&
1094 // TODO: This probably only requires no input flushing?
1095 denormalModeIsFlushAllF32(*MI.getMF());
1096}
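// Illustrative sketch (not part of the upstream file): the two overloads above
// are what let an f16-to-f32 extension stay glued to an FMA so the mixed
// precision instructions can be matched, e.g. IR of the form
//   %x.ext = fpext half %x to float
//   %r = call float @llvm.fma.f32(float %x.ext, float %y, float %z)
// may select to a single v_fma_mix_f32 on subtargets with hasFmaMixInsts(),
// and only when f32 denormals are flushed, per the checks above.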
1097
1099 // SI has some legal vector types, but no legal vector operations. Say no
1100 // shuffles are legal in order to prefer scalarizing some vector operations.
1101 return false;
1102}
1103
1104MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
1105 CallingConv::ID CC,
1106 EVT VT) const {
1108 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1109
1110 if (VT.isVector()) {
1111 EVT ScalarVT = VT.getScalarType();
1112 unsigned Size = ScalarVT.getSizeInBits();
1113 if (Size == 16) {
1114 if (Subtarget->has16BitInsts())
1115 return MVT::getVectorVT(ScalarVT.getSimpleVT(), 2);
1116 return VT.isInteger() ? MVT::i32 : MVT::f32;
1117 }
1118
1119 if (Size < 16)
1120 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1121 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1122 }
1123
1124 if (VT.getSizeInBits() > 32)
1125 return MVT::i32;
1126
1127 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1128}
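// Worked example (illustrative, not part of the upstream file): for a
// non-kernel call with VT = v5f16 on a subtarget with 16-bit instructions,
// the scalar size is 16 and the routine above returns v2f16; without 16-bit
// instructions it returns f32. A vector of i8 (scalar size < 16) yields i16
// or i32, and any scalar wider than 32 bits is passed in i32 pieces.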
1129
1130unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
1131 CallingConv::ID CC,
1132 EVT VT) const {
1134 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1135
1136 if (VT.isVector()) {
1137 unsigned NumElts = VT.getVectorNumElements();
1138 EVT ScalarVT = VT.getScalarType();
1139 unsigned Size = ScalarVT.getSizeInBits();
1140
1141 // FIXME: Should probably promote 8-bit vectors to i16.
1142 if (Size == 16 && Subtarget->has16BitInsts())
1143 return (NumElts + 1) / 2;
1144
1145 if (Size <= 32)
1146 return NumElts;
1147
1148 if (Size > 32)
1149 return NumElts * ((Size + 31) / 32);
1150 } else if (VT.getSizeInBits() > 32)
1151 return (VT.getSizeInBits() + 31) / 32;
1152
1153 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1154}
1155
1156unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
1157 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1158 unsigned &NumIntermediates, MVT &RegisterVT) const {
1159 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1160 unsigned NumElts = VT.getVectorNumElements();
1161 EVT ScalarVT = VT.getScalarType();
1162 unsigned Size = ScalarVT.getSizeInBits();
1163 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1164 // support, but unless we can properly handle 3-vectors, it will still be
1165 // inconsistent.
1166 if (Size == 16 && Subtarget->has16BitInsts()) {
1167 RegisterVT = MVT::getVectorVT(ScalarVT.getSimpleVT(), 2);
1168 IntermediateVT = RegisterVT;
1169 NumIntermediates = (NumElts + 1) / 2;
1170 return NumIntermediates;
1171 }
1172
1173 if (Size == 32) {
1174 RegisterVT = ScalarVT.getSimpleVT();
1175 IntermediateVT = RegisterVT;
1176 NumIntermediates = NumElts;
1177 return NumIntermediates;
1178 }
1179
1180 if (Size < 16 && Subtarget->has16BitInsts()) {
1181 // FIXME: Should probably form v2i16 pieces
1182 RegisterVT = MVT::i16;
1183 IntermediateVT = ScalarVT;
1184 NumIntermediates = NumElts;
1185 return NumIntermediates;
1186 }
1187
1188 if (Size != 16 && Size <= 32) {
1189 RegisterVT = MVT::i32;
1190 IntermediateVT = ScalarVT;
1191 NumIntermediates = NumElts;
1192 return NumIntermediates;
1193 }
1194
1195 if (Size > 32) {
1196 RegisterVT = MVT::i32;
1197 IntermediateVT = RegisterVT;
1198 NumIntermediates = NumElts * ((Size + 31) / 32);
1199 return NumIntermediates;
1200 }
1201 }
1202
1204 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1205}
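// Worked example (illustrative, not part of the upstream file): breaking down
// v5f16 for a non-kernel call with 16-bit instructions takes the Size == 16
// path: RegisterVT = IntermediateVT = v2f16 and NumIntermediates =
// (5 + 1) / 2 = 3. A v3i64 argument takes the Size > 32 path instead:
// RegisterVT = i32 and NumIntermediates = 3 * ((64 + 31) / 32) = 6.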
1206
1207static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
1208 const DataLayout &DL, Type *Ty,
1209 unsigned MaxNumLanes) {
1210 assert(MaxNumLanes != 0);
1211
1212 LLVMContext &Ctx = Ty->getContext();
1213 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1214 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1215 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1216 NumElts);
1217 }
1218
1219 return TLI.getValueType(DL, Ty);
1220}
1221
1222// Peek through TFE struct returns to only use the data size.
1223static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
1224 const DataLayout &DL, Type *Ty,
1225 unsigned MaxNumLanes) {
1226 auto *ST = dyn_cast<StructType>(Ty);
1227 if (!ST)
1228 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1229
1230 // TFE intrinsics return an aggregate type.
1231 assert(ST->getNumContainedTypes() == 2 &&
1232 ST->getContainedType(1)->isIntegerTy(32));
1233 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1234}
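// Illustrative sketch (not part of the upstream file; TLI, DL and RetTy are
// assumed names): a TFE image load whose IR return type is { <4 x float>, i32 }
// peels off the trailing i32 status word here, so with a dmask selecting two
// components the reported memory type becomes v2f32:
//   EVT MemVT = memVTFromLoadIntrReturn(TLI, DL, RetTy, /*MaxNumLanes=*/2);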
1235
1236/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1237/// in-memory representation. This return value is a custom type because there
1238/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1239/// could cause issues during codegen, these address space 7 pointers will be
1240/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1241/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1242/// for cost modeling, to work. (This also sets us up decently for doing the
1243/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
1245 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1246 return MVT::amdgpuBufferFatPointer;
1248 DL.getPointerSizeInBits(AS) == 192)
1249 return MVT::amdgpuBufferStridedPointer;
1251}
1252/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1253/// v8i32 when padding is added.
1254/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1255/// also v8i32 with padding.
1257 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1258 DL.getPointerSizeInBits(AS) == 160) ||
1260 DL.getPointerSizeInBits(AS) == 192))
1261 return MVT::v8i32;
1263}
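// Illustrative contrast (not part of the upstream file): for an address space
// 7 buffer fat pointer the two queries above intentionally disagree: the SSA
// value type is the custom MVT::amdgpuBufferFatPointer, while the type used
// when the pointer itself is loaded or stored is the padded v8i32.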
1264
1265static unsigned getIntrMemWidth(unsigned IntrID) {
1266 switch (IntrID) {
1267 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1268 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1269 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1270 return 8;
1271 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1272 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1273 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1274 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1275 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1276 return 32;
1277 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1278 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1279 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1280 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1281 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1282 return 64;
1283 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1284 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1285 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1286 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1287 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1288 return 128;
1289 default:
1290 llvm_unreachable("Unknown width");
1291 }
1292}
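// Usage sketch (illustrative, not part of the upstream file; Ctx is an assumed
// name): the callers below turn this width into an integer memory VT for the
// MachineMemOperand, e.g.
//   unsigned Bits = getIntrMemWidth(Intrinsic::amdgcn_global_load_async_to_lds_b64);
//   EVT MemVT = EVT::getIntegerVT(Ctx, Bits); // i64 for the *_b64 variant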
1293
1294static void getCoopAtomicOperandsInfo(const CallBase &CI, bool IsLoad,
1296 Value *OrderingArg = CI.getArgOperand(IsLoad ? 1 : 2);
1297 unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
1298 switch (AtomicOrderingCABI(Ord)) {
1301 break;
1304 break;
1307 break;
1308 default:
1310 break;
1311 }
1312
1313 Info.flags =
1315 Info.flags |= MOCooperative;
1316
1317 MDNode *ScopeMD = cast<MDNode>(
1318 cast<MetadataAsValue>(CI.getArgOperand(IsLoad ? 2 : 3))->getMetadata());
1319 StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
1320 Info.ssid = CI.getContext().getOrInsertSyncScopeID(Scope);
1321}
1322
1323bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
1324 const CallBase &CI,
1325 MachineFunction &MF,
1326 unsigned IntrID) const {
1327 Info.flags = MachineMemOperand::MONone;
1328 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1329 Info.flags |= MachineMemOperand::MOInvariant;
1330 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1332 Info.flags |= getTargetMMOFlags(CI);
1333
1334 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1336 AttributeSet Attr =
1338 MemoryEffects ME = Attr.getMemoryEffects();
1339 if (ME.doesNotAccessMemory())
1340 return false;
1341
1342 // TODO: Should images get their own address space?
1343 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1344
1345 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1346 if (RsrcIntr->IsImage) {
1347 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1349 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1350 Info.align.reset();
1351 }
1352
1353 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1354 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1355 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1356 // We conservatively set the memory operand of a buffer intrinsic to the
1357 // base resource pointer, so that we can access alias information about
1358 // those pointers. Cases like "this points at the same value
1359 // but with a different offset" are handled in
1360 // areMemAccessesTriviallyDisjoint.
1361 Info.ptrVal = RsrcArg;
1362 }
1363
1364 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1365 if (!IsSPrefetch) {
1366 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1367 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1368 Info.flags |= MachineMemOperand::MOVolatile;
1369 }
1370
1372 if (ME.onlyReadsMemory()) {
1373 if (RsrcIntr->IsImage) {
1374 unsigned MaxNumLanes = 4;
1375
1376 if (!BaseOpcode->Gather4) {
1377 // If this isn't a gather, we may have excess loaded elements in the
1378 // IR type. Check the dmask for the real number of elements loaded.
1379 unsigned DMask =
1380 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1381 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1382 }
1383
1384 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1385 CI.getType(), MaxNumLanes);
1386 } else {
1387 Info.memVT =
1389 std::numeric_limits<unsigned>::max());
1390 }
1391
1392 // FIXME: What does alignment mean for an image?
1393 Info.opc = ISD::INTRINSIC_W_CHAIN;
1394 Info.flags |= MachineMemOperand::MOLoad;
1395 } else if (ME.onlyWritesMemory()) {
1396 Info.opc = ISD::INTRINSIC_VOID;
1397
1398 Type *DataTy = CI.getArgOperand(0)->getType();
1399 if (RsrcIntr->IsImage) {
1400 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1401 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1402 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1403 DMaskLanes);
1404 } else
1405 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1406
1407 Info.flags |= MachineMemOperand::MOStore;
1408 } else {
1409 // Atomic, NoReturn Sampler or prefetch
1410 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1412 Info.flags |=
1414
1415 if (!IsSPrefetch)
1416 Info.flags |= MachineMemOperand::MOStore;
1417
1418 switch (IntrID) {
1419 default:
1420 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1421 // Fake memory access type for no return sampler intrinsics
1422 Info.memVT = MVT::i32;
1423 } else {
1424 // XXX - Should this be volatile without known ordering?
1425 Info.flags |= MachineMemOperand::MOVolatile;
1426 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1427 }
1428 break;
1429 case Intrinsic::amdgcn_raw_buffer_load_lds:
1430 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1431 case Intrinsic::amdgcn_struct_buffer_load_lds:
1432 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1433 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1434 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1435 Info.ptrVal = CI.getArgOperand(1);
1436 return true;
1437 }
1438 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1439 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1440 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1441 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1442 Info.memVT =
1444 std::numeric_limits<unsigned>::max());
1445 Info.flags &= ~MachineMemOperand::MOStore;
1446 return true;
1447 }
1448 }
1449 }
1450 return true;
1451 }
1452
1453 switch (IntrID) {
1454 case Intrinsic::amdgcn_ds_ordered_add:
1455 case Intrinsic::amdgcn_ds_ordered_swap: {
1456 Info.opc = ISD::INTRINSIC_W_CHAIN;
1457 Info.memVT = MVT::getVT(CI.getType());
1458 Info.ptrVal = CI.getOperand(0);
1459 Info.align.reset();
1461
1462 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1463 if (!Vol->isZero())
1464 Info.flags |= MachineMemOperand::MOVolatile;
1465
1466 return true;
1467 }
1468 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1469 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1470 Info.opc = ISD::INTRINSIC_W_CHAIN;
1471 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1472 Info.ptrVal = nullptr;
1473 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1475 return true;
1476 }
1477 case Intrinsic::amdgcn_ds_append:
1478 case Intrinsic::amdgcn_ds_consume: {
1479 Info.opc = ISD::INTRINSIC_W_CHAIN;
1480 Info.memVT = MVT::getVT(CI.getType());
1481 Info.ptrVal = CI.getOperand(0);
1482 Info.align.reset();
1484
1485 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1486 if (!Vol->isZero())
1487 Info.flags |= MachineMemOperand::MOVolatile;
1488
1489 return true;
1490 }
1491 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1492 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1493 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1496 Info.memVT = MVT::getVT(CI.getType());
1497 Info.ptrVal = CI.getOperand(0);
1498 Info.memVT = MVT::i64;
1499 Info.size = 8;
1500 Info.align.reset();
1502 return true;
1503 }
1504 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1505 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1506 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1507 Info.opc = ISD::INTRINSIC_W_CHAIN;
1508 Info.memVT =
1509 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1510 ? CI.getType()
1512 ->getElementType(0)); // XXX: what is correct VT?
1513
1514 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1515 Info.align.reset();
1516 Info.flags |=
1518 return true;
1519 }
1520 case Intrinsic::amdgcn_global_atomic_fmin_num:
1521 case Intrinsic::amdgcn_global_atomic_fmax_num:
1522 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1523 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1524 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
1525 Info.opc = ISD::INTRINSIC_W_CHAIN;
1526 Info.memVT = MVT::getVT(CI.getType());
1527 Info.ptrVal = CI.getOperand(0);
1528 Info.align.reset();
1532 return true;
1533 }
1534 case Intrinsic::amdgcn_flat_load_monitor_b32:
1535 case Intrinsic::amdgcn_flat_load_monitor_b64:
1536 case Intrinsic::amdgcn_flat_load_monitor_b128:
1537 case Intrinsic::amdgcn_global_load_monitor_b32:
1538 case Intrinsic::amdgcn_global_load_monitor_b64:
1539 case Intrinsic::amdgcn_global_load_monitor_b128:
1540 case Intrinsic::amdgcn_cluster_load_b32:
1541 case Intrinsic::amdgcn_cluster_load_b64:
1542 case Intrinsic::amdgcn_cluster_load_b128:
1543 case Intrinsic::amdgcn_ds_load_tr6_b96:
1544 case Intrinsic::amdgcn_ds_load_tr4_b64:
1545 case Intrinsic::amdgcn_ds_load_tr8_b64:
1546 case Intrinsic::amdgcn_ds_load_tr16_b128:
1547 case Intrinsic::amdgcn_global_load_tr6_b96:
1548 case Intrinsic::amdgcn_global_load_tr4_b64:
1549 case Intrinsic::amdgcn_global_load_tr_b64:
1550 case Intrinsic::amdgcn_global_load_tr_b128:
1551 case Intrinsic::amdgcn_ds_read_tr4_b64:
1552 case Intrinsic::amdgcn_ds_read_tr6_b96:
1553 case Intrinsic::amdgcn_ds_read_tr8_b64:
1554 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1555 Info.opc = ISD::INTRINSIC_W_CHAIN;
1556 Info.memVT = MVT::getVT(CI.getType());
1557 Info.ptrVal = CI.getOperand(0);
1558 Info.align.reset();
1559 Info.flags |= MachineMemOperand::MOLoad;
1560 return true;
1561 }
1562 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1563 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1564 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1565 Info.opc = ISD::INTRINSIC_W_CHAIN;
1566 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1567 Info.ptrVal = CI.getOperand(0);
1568 Info.align.reset();
1569 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/true, Info);
1570 return true;
1571 }
1572 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1573 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1574 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1575 Info.opc = ISD::INTRINSIC_VOID;
1576 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1577 Info.ptrVal = CI.getArgOperand(0);
1578 Info.align.reset();
1579 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/false, Info);
1580 return true;
1581 }
1582 case Intrinsic::amdgcn_ds_gws_init:
1583 case Intrinsic::amdgcn_ds_gws_barrier:
1584 case Intrinsic::amdgcn_ds_gws_sema_v:
1585 case Intrinsic::amdgcn_ds_gws_sema_br:
1586 case Intrinsic::amdgcn_ds_gws_sema_p:
1587 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1588 Info.opc = ISD::INTRINSIC_VOID;
1589
1590 const GCNTargetMachine &TM =
1591 static_cast<const GCNTargetMachine &>(getTargetMachine());
1592
1594 Info.ptrVal = MFI->getGWSPSV(TM);
1595
1596 // This is an abstract access, but we need to specify a type and size.
1597 Info.memVT = MVT::i32;
1598 Info.size = 4;
1599 Info.align = Align(4);
1600
1601 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1602 Info.flags |= MachineMemOperand::MOLoad;
1603 else
1604 Info.flags |= MachineMemOperand::MOStore;
1605 return true;
1606 }
1607 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1608 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1609 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1610 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1611 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1612 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1613 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1614 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1615 Info.opc = ISD::INTRINSIC_VOID;
1616 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1617 Info.ptrVal = CI.getArgOperand(1);
1619 return true;
1620 }
1621 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1622 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1623 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1624 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1625 Info.opc = ISD::INTRINSIC_VOID;
1626 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1627 Info.ptrVal = CI.getArgOperand(0);
1629 return true;
1630 }
1631 case Intrinsic::amdgcn_load_to_lds:
1632 case Intrinsic::amdgcn_global_load_lds: {
1633 Info.opc = ISD::INTRINSIC_VOID;
1634 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1635 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1636 Info.ptrVal = CI.getArgOperand(1);
1638 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1639 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1640 Info.flags |= MachineMemOperand::MOVolatile;
1641 return true;
1642 }
1643 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1644 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1645 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1646 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1647 Info.opc = ISD::INTRINSIC_W_CHAIN;
1648
1649 const GCNTargetMachine &TM =
1650 static_cast<const GCNTargetMachine &>(getTargetMachine());
1651
1653 Info.ptrVal = MFI->getGWSPSV(TM);
1654
1655 // This is an abstract access, but we need to specify a type and size.
1656 Info.memVT = MVT::i32;
1657 Info.size = 4;
1658 Info.align = Align(4);
1659
1661 return true;
1662 }
1663 case Intrinsic::amdgcn_s_prefetch_data:
1664 case Intrinsic::amdgcn_flat_prefetch:
1665 case Intrinsic::amdgcn_global_prefetch: {
1666 Info.opc = ISD::INTRINSIC_VOID;
1667 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1668 Info.ptrVal = CI.getArgOperand(0);
1669 Info.flags |= MachineMemOperand::MOLoad;
1670 return true;
1671 }
1672 default:
1673 return false;
1674 }
1675}
1676
1678 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1680 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1681 // The DAG's ValueType loses the addrspaces.
1682 // Add them as 2 extra Constant operands "from" and "to".
1683 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1684 unsigned DstAS = I.getType()->getPointerAddressSpace();
1685 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1686 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1687 break;
1688 }
1689 default:
1690 break;
1691 }
1692}
1693
1696 Type *&AccessTy) const {
1697 Value *Ptr = nullptr;
1698 switch (II->getIntrinsicID()) {
1699 case Intrinsic::amdgcn_cluster_load_b128:
1700 case Intrinsic::amdgcn_cluster_load_b64:
1701 case Intrinsic::amdgcn_cluster_load_b32:
1702 case Intrinsic::amdgcn_ds_append:
1703 case Intrinsic::amdgcn_ds_consume:
1704 case Intrinsic::amdgcn_ds_load_tr8_b64:
1705 case Intrinsic::amdgcn_ds_load_tr16_b128:
1706 case Intrinsic::amdgcn_ds_load_tr4_b64:
1707 case Intrinsic::amdgcn_ds_load_tr6_b96:
1708 case Intrinsic::amdgcn_ds_read_tr4_b64:
1709 case Intrinsic::amdgcn_ds_read_tr6_b96:
1710 case Intrinsic::amdgcn_ds_read_tr8_b64:
1711 case Intrinsic::amdgcn_ds_read_tr16_b64:
1712 case Intrinsic::amdgcn_ds_ordered_add:
1713 case Intrinsic::amdgcn_ds_ordered_swap:
1714 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1715 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1716 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1717 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1718 case Intrinsic::amdgcn_flat_load_monitor_b128:
1719 case Intrinsic::amdgcn_flat_load_monitor_b32:
1720 case Intrinsic::amdgcn_flat_load_monitor_b64:
1721 case Intrinsic::amdgcn_global_atomic_fmax_num:
1722 case Intrinsic::amdgcn_global_atomic_fmin_num:
1723 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1724 case Intrinsic::amdgcn_global_load_monitor_b128:
1725 case Intrinsic::amdgcn_global_load_monitor_b32:
1726 case Intrinsic::amdgcn_global_load_monitor_b64:
1727 case Intrinsic::amdgcn_global_load_tr_b64:
1728 case Intrinsic::amdgcn_global_load_tr_b128:
1729 case Intrinsic::amdgcn_global_load_tr4_b64:
1730 case Intrinsic::amdgcn_global_load_tr6_b96:
1731 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1732 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1733 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1734 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1735 Ptr = II->getArgOperand(0);
1736 break;
1737 case Intrinsic::amdgcn_load_to_lds:
1738 case Intrinsic::amdgcn_global_load_lds:
1739 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1740 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1741 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1742 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1743 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1744 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1745 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1746 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1747 Ptr = II->getArgOperand(1);
1748 break;
1749 default:
1750 return false;
1751 }
1752 AccessTy = II->getType();
1753 Ops.push_back(Ptr);
1754 return true;
1755}
1756
1757bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
1758 unsigned AddrSpace) const {
1759 if (!Subtarget->hasFlatInstOffsets()) {
1760 // Flat instructions do not have offsets, and only have the register
1761 // address.
1762 return AM.BaseOffs == 0 && AM.Scale == 0;
1763 }
1764
1765 decltype(SIInstrFlags::FLAT) FlatVariant =
1769
1770 return AM.Scale == 0 &&
1771 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1772 AM.BaseOffs, AddrSpace, FlatVariant));
1773}
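// Worked example (illustrative, not part of the upstream file): on a subtarget
// without flat instruction offsets, only a bare register address is legal, so
// an AddrMode with HasBaseReg = true, BaseOffs = 16 and Scale = 0 (reg + 16)
// is rejected. With flat offsets the same mode is accepted, provided 16 is a
// legal FLAT/global/scratch immediate offset for the given address space.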
1774
1775bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
1776 if (Subtarget->hasFlatGlobalInsts())
1778
1779 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1780 // Assume that we will use FLAT for all global memory accesses
1781 // on VI.
1782 // FIXME: This assumption is currently wrong. On VI we still use
1783 // MUBUF instructions for the r + i addressing mode. As currently
1784 // implemented, the MUBUF instructions only work on buffer < 4GB.
1785 // It may be possible to support > 4GB buffers with MUBUF instructions,
1786 // by setting the stride value in the resource descriptor which would
1787 // increase the size limit to (stride * 4GB). However, this is risky,
1788 // because it has never been validated.
1790 }
1791
1792 return isLegalMUBUFAddressingMode(AM);
1793}
1794
1795bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1796 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1797 // additionally can do r + r + i with addr64. 32-bit has more addressing
1798 // mode options. Depending on the resource constant, it can also do
1799 // (i64 r0) + (i32 r1) * (i14 i).
1800 //
1801 // Private arrays end up using a scratch buffer most of the time, so also
1802 // assume those use MUBUF instructions. Scratch loads / stores are currently
1803 // implemented as mubuf instructions with offen bit set, so slightly
1804 // different than the normal addr64.
1805 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1806 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1807 return false;
1808
1809 // FIXME: Since we can split immediate into soffset and immediate offset,
1810 // would it make sense to allow any immediate?
1811
1812 switch (AM.Scale) {
1813 case 0: // r + i or just i, depending on HasBaseReg.
1814 return true;
1815 case 1:
1816 return true; // We have r + r or r + i.
1817 case 2:
1818 if (AM.HasBaseReg) {
1819 // Reject 2 * r + r.
1820 return false;
1821 }
1822
1823 // Allow 2 * r as r + r
1824 // Or 2 * r + i is allowed as r + r + i.
1825 return true;
1826 default: // Don't allow n * r
1827 return false;
1828 }
1829}
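// Worked example (illustrative, not part of the upstream file): with a base
// register, Scale == 1 and an offset that fits the MUBUF immediate field
// (r + r + i), the mode above is accepted. Scale == 2 combined with a base
// register (2 * r + r) is rejected, as is any larger scale (n * r), while a
// bare 2 * r or 2 * r + i is treated as r + r (+ i) and allowed.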
1830
1831bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1832 const AddrMode &AM, Type *Ty,
1833 unsigned AS,
1834 Instruction *I) const {
1835 // No global is ever allowed as a base.
1836 if (AM.BaseGV)
1837 return false;
1838
1839 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1840 return isLegalGlobalAddressingMode(AM);
1841
1842 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1846 // If the offset isn't a multiple of 4, it probably isn't going to be
1847 // correctly aligned.
1848 // FIXME: Can we get the real alignment here?
1849 if (AM.BaseOffs % 4 != 0)
1850 return isLegalMUBUFAddressingMode(AM);
1851
1852 if (!Subtarget->hasScalarSubwordLoads()) {
1853 // There are no SMRD extloads, so if we have to do a small type access we
1854 // will use a MUBUF load.
1855 // FIXME?: We also need to do this if unaligned, but we don't know the
1856 // alignment here.
1857 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1858 return isLegalGlobalAddressingMode(AM);
1859 }
1860
1861 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1862 // SMRD instructions have an 8-bit, dword offset on SI.
1863 if (!isUInt<8>(AM.BaseOffs / 4))
1864 return false;
1865 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1866 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1867 // in 8-bits, it can use a smaller encoding.
1868 if (!isUInt<32>(AM.BaseOffs / 4))
1869 return false;
1870 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1871 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1872 if (!isUInt<20>(AM.BaseOffs))
1873 return false;
1874 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1875 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1876 // for S_BUFFER_* instructions).
1877 if (!isInt<21>(AM.BaseOffs))
1878 return false;
1879 } else {
1880 // On GFX12, all offsets are signed 24-bit in bytes.
1881 if (!isInt<24>(AM.BaseOffs))
1882 return false;
1883 }
1884
1885 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1887 AM.BaseOffs < 0) {
1888 // Scalar (non-buffer) loads can only use a negative offset if
1889 // soffset+offset is non-negative. Since the compiler can only prove that
1890 // in a few special cases, it is safer to claim that negative offsets are
1891 // not supported.
1892 return false;
1893 }
1894
1895 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1896 return true;
1897
1898 if (AM.Scale == 1 && AM.HasBaseReg)
1899 return true;
1900
1901 return false;
1902 }
1903
1904 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1905 return Subtarget->enableFlatScratch()
1907 : isLegalMUBUFAddressingMode(AM);
1908
1909 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1910 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1911 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1912 // field.
1913 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1914 // an 8-bit dword offset but we don't know the alignment here.
1915 if (!isUInt<16>(AM.BaseOffs))
1916 return false;
1917
1918 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1919 return true;
1920
1921 if (AM.Scale == 1 && AM.HasBaseReg)
1922 return true;
1923
1924 return false;
1925 }
1926
1928 // For an unknown address space, this usually means that this is for some
1929 // reason being used for pure arithmetic, and not based on some addressing
1930 // computation. We don't have instructions that compute pointers with any
1931 // addressing modes, so treat them as having no offset like flat
1932 // instructions.
1934 }
1935
1936 // Assume a user alias of global for unknown address spaces.
1937 return isLegalGlobalAddressingMode(AM);
1938}
1939
1941 const MachineFunction &MF) const {
1943 return (MemVT.getSizeInBits() <= 4 * 32);
1944 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1945 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1946 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1947 }
1949 return (MemVT.getSizeInBits() <= 2 * 32);
1950 return true;
1951}
1952
1954 unsigned Size, unsigned AddrSpace, Align Alignment,
1955 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1956 if (IsFast)
1957 *IsFast = 0;
1958
1959 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1960 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1961 // Check if alignment requirements for ds_read/write instructions are
1962 // disabled.
1963 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1964 return false;
1965
1966 Align RequiredAlignment(
1967 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
1968 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1969 Alignment < RequiredAlignment)
1970 return false;
1971
1972 // Either the alignment requirements are "enabled", or there is an
1973 // unaligned LDS access related hardware bug even though the alignment
1974 // requirements are "disabled". In either case, we need to check for proper
1975 // alignment requirements.
1976 //
1977 switch (Size) {
1978 case 64:
1979 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1980 // address is negative, then the instruction is incorrectly treated as
1981 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1982 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1983 // load later in the SILoadStoreOptimizer.
1984 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1985 return false;
1986
1987 // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
1988 // can do a 4-byte aligned, 8-byte access in a single operation using
1989 // ds_read2/write2_b32 with adjacent offsets.
1990 RequiredAlignment = Align(4);
1991
1992 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1993 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1994 // ds_write2_b32 depending on the alignment. In either case with either
1995 // alignment there is no faster way of doing this.
1996
1997 // The numbers returned here and below are not additive; they form a 'speed
1998 // rank'. They are only meant to be compared to decide whether a certain way
1999 // of lowering an operation is faster than another. For that purpose a
2000 // naturally aligned operation gets its bitsize to indicate that "it
2001 // operates with a speed comparable to an N-bit wide load". With full
2002 // alignment ds128 is slower than ds96, for example. If underaligned it
2003 // is comparable to the speed of a single dword access, which would then
2004 // mean 32 < 128 and it is faster to issue a wide load regardless.
2005 // 1 is simply "slow, don't do it": comparing an aligned load to a wider
2006 // load which will no longer be aligned, the latter is slower.
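// For instance, in this Size == 64 case RequiredAlignment has been relaxed to
// 4 above, so an alignment of at least 4 reports a speed rank of 64
// (ds_read_b64 and ds_read2_b32 are comparably fast here), while anything
// below 4 reports 32.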
2007 if (IsFast)
2008 *IsFast = (Alignment >= RequiredAlignment) ? 64
2009 : (Alignment < Align(4)) ? 32
2010 : 1;
2011 return true;
2012 }
2013
2014 break;
2015 case 96:
2016 if (!Subtarget->hasDS96AndDS128())
2017 return false;
2018
2019 // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
2020 // gfx8 and older.
2021
2022 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2023 // Naturally aligned access is fastest. However, also report it as Fast
2024 // if memory is aligned to less than a DWORD. A narrow load or store will
2025 // be as slow as a single ds_read_b96/ds_write_b96, but there will
2026 // be more of them, so overall we pay less penalty issuing a single
2027 // instruction.
2028
2029 // See comment on the values above.
2030 if (IsFast)
2031 *IsFast = (Alignment >= RequiredAlignment) ? 96
2032 : (Alignment < Align(4)) ? 32
2033 : 1;
2034 return true;
2035 }
2036
2037 break;
2038 case 128:
2039 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2040 return false;
2041
2042 // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
2043 // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
2044 // single operation using ds_read2/write2_b64.
2045 RequiredAlignment = Align(8);
2046
2047 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2048 // Naturally aligned access is fastest. However, also report it as Fast
2049 // if memory is aligned to less than a DWORD. A narrow load or store will
2050 // be as slow as a single ds_read_b128/ds_write_b128, but there
2051 // will be more of them, so overall we pay less penalty issuing a
2052 // single instruction.
2053
2054 // See comment on the values above.
2055 if (IsFast)
2056 *IsFast = (Alignment >= RequiredAlignment) ? 128
2057 : (Alignment < Align(4)) ? 32
2058 : 1;
2059 return true;
2060 }
2061
2062 break;
2063 default:
2064 if (Size > 32)
2065 return false;
2066
2067 break;
2068 }
2069
2070 // See comment on the values above.
2071 // Note that we have a single-dword or sub-dword access here, so if
2072 // underaligned it is the slowest possible access, hence the returned value is 0.
2073 if (IsFast)
2074 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2075
2076 return Alignment >= RequiredAlignment ||
2077 Subtarget->hasUnalignedDSAccessEnabled();
2078 }
2079
2080 // FIXME: We have to be conservative here and assume that flat operations
2081 // will access scratch. If we had access to the IR function, then we
2082 // could determine if any private memory was used in the function.
2083 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
2084 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2085 bool AlignedBy4 = Alignment >= Align(4);
2086 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2087 if (IsFast)
2088 *IsFast = AlignedBy4 ? Size : 1;
2089 return true;
2090 }
2091
2092 if (IsFast)
2093 *IsFast = AlignedBy4;
2094
2095 return AlignedBy4;
2096 }
2097
2098 // So long as they are correct, wide global memory operations perform better
2099 // than multiple smaller memory ops -- even when misaligned.
2100 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
2101 if (IsFast)
2102 *IsFast = Size;
2103
2104 return Alignment >= Align(4) ||
2105 Subtarget->hasUnalignedBufferAccessEnabled();
2106 }
2107
2108 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
2109 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
2110 // out-of-bounds behavior, but in the edge case where an access starts
2111 // out-of-bounds and then enters in-bounds, the entire access would be treated
2112 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
2113 // natural alignment of buffer accesses.
2114 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
2115 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
2116 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2117 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2118 Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
2119 return false;
2120 }
2121
2122 // Smaller-than-dword values must be aligned.
2123 if (Size < 32)
2124 return false;
2125
2126 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2127 // byte-address are ignored, thus forcing Dword alignment.
2128 // This applies to private, global, and constant memory.
2129 if (IsFast)
2130 *IsFast = 1;
2131
2132 return Size >= 32 && Alignment >= Align(4);
2133}
2134
2136 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2137 unsigned *IsFast) const {
2139 Alignment, Flags, IsFast);
2140}
2141
2143 LLVMContext &Context, const MemOp &Op,
2144 const AttributeList &FuncAttributes) const {
2145 // FIXME: Should account for address space here.
2146
2147 // The default fallback uses the private pointer size as a guess for a type to
2148 // use. Make sure we switch these to 64-bit accesses.
2149
2150 if (Op.size() >= 16 &&
2151 Op.isDstAligned(Align(4))) // XXX: Should only do for global
2152 return MVT::v4i32;
2153
2154 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
2155 return MVT::v2i32;
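// In other words, a 16-byte or larger operation with a destination aligned to
// at least 4 bytes prefers v4i32 (dwordx4-sized) accesses, and an 8..15 byte
// one prefers v2i32.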
2156
2157 // Use the default.
2158 return MVT::Other;
2159}
2160
2162 const MemSDNode *MemNode = cast<MemSDNode>(N);
2163 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2164}
2165
2170
2172 unsigned DestAS) const {
2173 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2174 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2175 Subtarget->hasGloballyAddressableScratch()) {
2176 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2177 return false;
2178 }
2179
2180 // Flat -> private/local is a simple truncate.
2181 // Flat -> global is a no-op.
2182 return true;
2183 }
2184
2185 const GCNTargetMachine &TM =
2186 static_cast<const GCNTargetMachine &>(getTargetMachine());
2187 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2188}
2189
2197
2199 Type *Ty) const {
2200 // FIXME: Could be smarter if called for vector constants.
2201 return true;
2202}
2203
2205 unsigned Index) const {
2207 return false;
2208
2209 // TODO: Add more cases that are cheap.
2210 return Index == 0;
2211}
2212
2213bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2214 // TODO: This should be more aggressive, particularly for 16-bit element
2215 // vectors. However there are some mixed improvements and regressions.
2216 EVT EltTy = VT.getVectorElementType();
2217 unsigned MinAlign = Subtarget->useRealTrue16Insts() ? 16 : 32;
2218 return EltTy.getSizeInBits() % MinAlign == 0;
2219}
2220
2222 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2223 switch (Op) {
2224 case ISD::LOAD:
2225 case ISD::STORE:
2226 return true;
2227 default:
2228 return false;
2229 }
2230 }
2231
2232 // SimplifySetCC uses this function to determine whether or not it should
2233 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2234 if (VT == MVT::i1 && Op == ISD::SETCC)
2235 return false;
2236
2238}
2239
2242 // This isn't really a constant pool but close enough.
2245 return PtrInfo;
2246}
2247
2248SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2249 const SDLoc &SL,
2250 SDValue Chain,
2251 uint64_t Offset) const {
2252 const DataLayout &DL = DAG.getDataLayout();
2256
2257 auto [InputPtrReg, RC, ArgTy] =
2258 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2259
2260 // We may not have the kernarg segment argument if we have no kernel
2261 // arguments.
2262 if (!InputPtrReg)
2263 return DAG.getConstant(Offset, SL, PtrVT);
2264
2266 SDValue BasePtr = DAG.getCopyFromReg(
2267 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2268
2269 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2270}
2271
2272SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2273 const SDLoc &SL) const {
2276 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2277}
2278
2279SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2280 const SDLoc &SL) const {
2281
2283 std::optional<uint32_t> KnownSize =
2285 if (KnownSize.has_value())
2286 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2287 return SDValue();
2288}
2289
2290SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2291 const SDLoc &SL, SDValue Val,
2292 bool Signed,
2293 const ISD::InputArg *Arg) const {
2294 // First, if it is a widened vector, narrow it.
2295 if (VT.isVector() &&
2297 EVT NarrowedVT =
2300 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2301 DAG.getConstant(0, SL, MVT::i32));
2302 }
2303
2304 // Then convert the vector elements or scalar value.
2305 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2306 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2307 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2308 }
2309
2310 if (MemVT.isFloatingPoint()) {
2311 if (VT.isFloatingPoint()) {
2312 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2313 } else {
2314 assert(!MemVT.isVector());
2315 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
2316 SDValue Cast = DAG.getBitcast(IntVT, Val);
2317 Val = DAG.getAnyExtOrTrunc(Cast, SL, VT);
2318 }
2319 } else if (Signed)
2320 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2321 else
2322 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2323
2324 return Val;
2325}
2326
2327SDValue SITargetLowering::lowerKernargMemParameter(
2328 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2329 uint64_t Offset, Align Alignment, bool Signed,
2330 const ISD::InputArg *Arg) const {
2331
2332 MachinePointerInfo PtrInfo =
2334
2335 // Try to avoid using an extload by loading earlier than the argument address,
2336 // and extracting the relevant bits. The load should hopefully be merged with
2337 // the previous argument.
2338 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2339 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
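// For example, an i16 argument at byte offset 6 is handled by loading the
// dword at offset 4 and shifting it right by OffsetDiff * 8 == 16 bits before
// truncating to the argument type.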
2340 int64_t AlignDownOffset = alignDown(Offset, 4);
2341 int64_t OffsetDiff = Offset - AlignDownOffset;
2342
2343 EVT IntVT = MemVT.changeTypeToInteger();
2344
2345 // TODO: If we passed in the base kernel offset we could have a better
2346 // alignment than 4, but we don't really need it.
2347 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2348 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr,
2349 PtrInfo.getWithOffset(AlignDownOffset), Align(4),
2352
2353 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2354 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2355
2356 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2357 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2358 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2359
2360 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2361 }
2362
2363 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2364 SDValue Load = DAG.getLoad(
2365 MemVT, SL, Chain, Ptr, PtrInfo.getWithOffset(Offset), Alignment,
2367
2368 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2369 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2370}
2371
2372/// Coerce an argument which was passed in a different ABI type to the original
2373/// expected value type.
2374SDValue SITargetLowering::convertABITypeToValueType(SelectionDAG &DAG,
2375 SDValue Val,
2376 CCValAssign &VA,
2377 const SDLoc &SL) const {
2378 EVT ValVT = VA.getValVT();
2379
2380 // If this is an 8 or 16-bit value, it is really passed promoted
2381 // to 32 bits. Insert an assert[sz]ext to capture this, then
2382 // truncate to the right size.
2383 switch (VA.getLocInfo()) {
2384 case CCValAssign::Full:
2385 return Val;
2386 case CCValAssign::BCvt:
2387 return DAG.getNode(ISD::BITCAST, SL, ValVT, Val);
2388 case CCValAssign::SExt:
2389 Val = DAG.getNode(ISD::AssertSext, SL, VA.getLocVT(), Val,
2390 DAG.getValueType(ValVT));
2391 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2392 case CCValAssign::ZExt:
2393 Val = DAG.getNode(ISD::AssertZext, SL, VA.getLocVT(), Val,
2394 DAG.getValueType(ValVT));
2395 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2396 case CCValAssign::AExt:
2397 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2398 default:
2399 llvm_unreachable("Unknown loc info!");
2400 }
2401}
2402
2403SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2404 CCValAssign &VA, const SDLoc &SL,
2405 SDValue Chain,
2406 const ISD::InputArg &Arg) const {
2407 MachineFunction &MF = DAG.getMachineFunction();
2408 MachineFrameInfo &MFI = MF.getFrameInfo();
2409
2410 if (Arg.Flags.isByVal()) {
2411 unsigned Size = Arg.Flags.getByValSize();
2412 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2413 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2414 }
2415
2416 unsigned ArgOffset = VA.getLocMemOffset();
2417 unsigned ArgSize = VA.getValVT().getStoreSize();
2418
2419 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2420
2421 // Create load nodes to retrieve arguments from the stack.
2422 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2423
2424 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2426 MVT MemVT = VA.getValVT();
2427
2428 switch (VA.getLocInfo()) {
2429 default:
2430 break;
2431 case CCValAssign::BCvt:
2432 MemVT = VA.getLocVT();
2433 break;
2434 case CCValAssign::SExt:
2435 ExtType = ISD::SEXTLOAD;
2436 break;
2437 case CCValAssign::ZExt:
2438 ExtType = ISD::ZEXTLOAD;
2439 break;
2440 case CCValAssign::AExt:
2441 ExtType = ISD::EXTLOAD;
2442 break;
2443 }
2444
2445 SDValue ArgValue = DAG.getExtLoad(
2446 ExtType, SL, VA.getLocVT(), Chain, FIN,
2448
2449 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2450 if (ConvertedVal == ArgValue)
2451 return ConvertedVal;
2452
2453 return DAG.getMergeValues({ConvertedVal, ArgValue.getValue(1)}, SL);
2454}
2455
2456SDValue SITargetLowering::lowerWorkGroupId(
2457 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2460 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
2461 if (!Subtarget->hasClusters())
2462 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2463
2464 // Clusters are supported. Return the global position in the grid. If clusters
2465 // are enabled, WorkGroupIdPV returns the cluster ID, not the workgroup ID.
2466
2467 // WorkGroupIdXYZ = ClusterId == 0 ?
2468 // ClusterIdXYZ :
2469 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
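// For example, with four workgroups per cluster along X (ClusterMaxIdX == 3),
// cluster id 2 and cluster-local workgroup id 1 yield a global workgroup id of
// 2 * (3 + 1) + 1 == 9.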
2470 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2471 SDLoc SL(ClusterIdXYZ);
2472 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2473 SDValue One = DAG.getConstant(1, SL, VT);
2474 SDValue ClusterSizeXYZ = DAG.getNode(ISD::ADD, SL, VT, ClusterMaxIdXYZ, One);
2475 SDValue ClusterWorkGroupIdXYZ =
2476 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2477 SDValue GlobalIdXYZ =
2478 DAG.getNode(ISD::ADD, SL, VT, ClusterWorkGroupIdXYZ,
2479 DAG.getNode(ISD::MUL, SL, VT, ClusterIdXYZ, ClusterSizeXYZ));
2480
2481 switch (MFI.getClusterDims().getKind()) {
2484 return GlobalIdXYZ;
2486 return ClusterIdXYZ;
2488 using namespace AMDGPU::Hwreg;
2489 SDValue ClusterIdField =
2490 DAG.getTargetConstant(HwregEncoding::encode(ID_IB_STS2, 6, 4), SL, VT);
2491 SDNode *GetReg =
2492 DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2493 SDValue ClusterId(GetReg, 0);
2494 SDValue Zero = DAG.getConstant(0, SL, VT);
2495 return DAG.getNode(ISD::SELECT_CC, SL, VT, ClusterId, Zero, ClusterIdXYZ,
2496 GlobalIdXYZ, DAG.getCondCode(ISD::SETEQ));
2497 }
2498 }
2499
2500 llvm_unreachable("nothing should reach here");
2501}
2502
2503SDValue SITargetLowering::getPreloadedValue(
2504 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2506 const ArgDescriptor *Reg = nullptr;
2507 const TargetRegisterClass *RC;
2508 LLT Ty;
2509
2511 const ArgDescriptor WorkGroupIDX =
2512 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2513 // If GridZ is not programmed in an entry function then the hardware will set
2514 // it to all zeros, so there is no need to mask the GridY value in the low
2515 // order bits.
2516 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2517 AMDGPU::TTMP7,
2518 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2519 const ArgDescriptor WorkGroupIDZ =
2520 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2521 const ArgDescriptor ClusterWorkGroupIDX =
2522 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
2523 const ArgDescriptor ClusterWorkGroupIDY =
2524 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
2525 const ArgDescriptor ClusterWorkGroupIDZ =
2526 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
2527 const ArgDescriptor ClusterWorkGroupMaxIDX =
2528 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
2529 const ArgDescriptor ClusterWorkGroupMaxIDY =
2530 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
2531 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2532 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
2533 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2534 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
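// Per the masks above, TTMP7 packs workgroup id Y in its low 16 bits and Z in
// its high 16 bits, while TTMP6 carries the cluster workgroup ids and max ids
// as consecutive 4-bit fields.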
2535
2536 auto LoadConstant = [&](unsigned N) {
2537 return DAG.getConstant(N, SDLoc(), VT);
2538 };
2539
2540 if (Subtarget->hasArchitectedSGPRs() &&
2542 AMDGPU::ClusterDimsAttr ClusterDims = MFI.getClusterDims();
2543 bool HasFixedDims = ClusterDims.isFixedDims();
2544
2545 switch (PVID) {
2547 Reg = &WorkGroupIDX;
2548 RC = &AMDGPU::SReg_32RegClass;
2549 Ty = LLT::scalar(32);
2550 break;
2552 Reg = &WorkGroupIDY;
2553 RC = &AMDGPU::SReg_32RegClass;
2554 Ty = LLT::scalar(32);
2555 break;
2557 Reg = &WorkGroupIDZ;
2558 RC = &AMDGPU::SReg_32RegClass;
2559 Ty = LLT::scalar(32);
2560 break;
2562 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
2563 return LoadConstant(0);
2564 Reg = &ClusterWorkGroupIDX;
2565 RC = &AMDGPU::SReg_32RegClass;
2566 Ty = LLT::scalar(32);
2567 break;
2569 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
2570 return LoadConstant(0);
2571 Reg = &ClusterWorkGroupIDY;
2572 RC = &AMDGPU::SReg_32RegClass;
2573 Ty = LLT::scalar(32);
2574 break;
2576 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
2577 return LoadConstant(0);
2578 Reg = &ClusterWorkGroupIDZ;
2579 RC = &AMDGPU::SReg_32RegClass;
2580 Ty = LLT::scalar(32);
2581 break;
2583 if (HasFixedDims)
2584 return LoadConstant(ClusterDims.getDims()[0] - 1);
2585 Reg = &ClusterWorkGroupMaxIDX;
2586 RC = &AMDGPU::SReg_32RegClass;
2587 Ty = LLT::scalar(32);
2588 break;
2590 if (HasFixedDims)
2591 return LoadConstant(ClusterDims.getDims()[1] - 1);
2592 Reg = &ClusterWorkGroupMaxIDY;
2593 RC = &AMDGPU::SReg_32RegClass;
2594 Ty = LLT::scalar(32);
2595 break;
2597 if (HasFixedDims)
2598 return LoadConstant(ClusterDims.getDims()[2] - 1);
2599 Reg = &ClusterWorkGroupMaxIDZ;
2600 RC = &AMDGPU::SReg_32RegClass;
2601 Ty = LLT::scalar(32);
2602 break;
2604 Reg = &ClusterWorkGroupMaxFlatID;
2605 RC = &AMDGPU::SReg_32RegClass;
2606 Ty = LLT::scalar(32);
2607 break;
2608 default:
2609 break;
2610 }
2611 }
2612
2613 if (!Reg)
2614 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2615 if (!Reg) {
2617 // It's possible for a kernarg intrinsic call to appear in a kernel with
2618 // no allocated segment, in which case we do not add the user sgpr
2619 // argument, so just return null.
2620 return DAG.getConstant(0, SDLoc(), VT);
2621 }
2622
2623 // It's undefined behavior if a function marked with the amdgpu-no-*
2624 // attributes uses the corresponding intrinsic.
2625 return DAG.getPOISON(VT);
2626 }
2627
2628 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2629}
2630
2632 CallingConv::ID CallConv,
2633 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2634 FunctionType *FType,
2636 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2637 const ISD::InputArg *Arg = &Ins[I];
2638
2639 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2640 "vector type argument should have been split");
2641
2642 // First check if it's a PS input addr.
2643 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2644 PSInputNum <= 15) {
2645 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2646
2647 // Inconveniently only the first part of the split is marked as isSplit,
2648 // so skip to the end. We only want to increment PSInputNum once for the
2649 // entire split argument.
2650 if (Arg->Flags.isSplit()) {
2651 while (!Arg->Flags.isSplitEnd()) {
2652 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2653 "unexpected vector split in ps argument type");
2654 if (!SkipArg)
2655 Splits.push_back(*Arg);
2656 Arg = &Ins[++I];
2657 }
2658 }
2659
2660 if (SkipArg) {
2661 // We can safely skip PS inputs.
2662 Skipped.set(Arg->getOrigArgIndex());
2663 ++PSInputNum;
2664 continue;
2665 }
2666
2667 Info->markPSInputAllocated(PSInputNum);
2668 if (Arg->Used)
2669 Info->markPSInputEnabled(PSInputNum);
2670
2671 ++PSInputNum;
2672 }
2673
2674 Splits.push_back(*Arg);
2675 }
2676}
2677
2678// Allocate special inputs passed in VGPRs.
2680 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2681 SIMachineFunctionInfo &Info) const {
2682 const LLT S32 = LLT::scalar(32);
2684
2685 if (Info.hasWorkItemIDX()) {
2686 Register Reg = AMDGPU::VGPR0;
2687 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2688
2689 CCInfo.AllocateReg(Reg);
2690 unsigned Mask =
2691 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2692 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2693 }
2694
2695 if (Info.hasWorkItemIDY()) {
2696 assert(Info.hasWorkItemIDX());
2697 if (Subtarget->hasPackedTID()) {
2698 Info.setWorkItemIDY(
2699 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2700 } else {
2701 unsigned Reg = AMDGPU::VGPR1;
2702 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2703
2704 CCInfo.AllocateReg(Reg);
2705 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2706 }
2707 }
2708
2709 if (Info.hasWorkItemIDZ()) {
2710 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2711 if (Subtarget->hasPackedTID()) {
2712 Info.setWorkItemIDZ(
2713 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2714 } else {
2715 unsigned Reg = AMDGPU::VGPR2;
2716 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2717
2718 CCInfo.AllocateReg(Reg);
2719 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2720 }
2721 }
2722}
2723
2724// Try to allocate a VGPR at the end of the argument list, or if no argument
2725 // VGPRs are left, allocating a stack slot.
2726 // If \p Mask is given it indicates the bitfield position in the register.
2727 // If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2728static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2729 ArgDescriptor Arg = ArgDescriptor()) {
2730 if (Arg.isSet())
2731 return ArgDescriptor::createArg(Arg, Mask);
2732
2733 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2734 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2735 if (RegIdx == ArgVGPRs.size()) {
2736 // Spill to stack required.
2737 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2738
2739 return ArgDescriptor::createStack(Offset, Mask);
2740 }
2741
2742 unsigned Reg = ArgVGPRs[RegIdx];
2743 Reg = CCInfo.AllocateReg(Reg);
2744 assert(Reg != AMDGPU::NoRegister);
2745
2746 MachineFunction &MF = CCInfo.getMachineFunction();
2747 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2748 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2749 return ArgDescriptor::createRegister(Reg, Mask);
2750}
2751
2753 const TargetRegisterClass *RC,
2754 unsigned NumArgRegs) {
2755 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2756 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2757 if (RegIdx == ArgSGPRs.size())
2758 report_fatal_error("ran out of SGPRs for arguments");
2759
2760 unsigned Reg = ArgSGPRs[RegIdx];
2761 Reg = CCInfo.AllocateReg(Reg);
2762 assert(Reg != AMDGPU::NoRegister);
2763
2764 MachineFunction &MF = CCInfo.getMachineFunction();
2765 MF.addLiveIn(Reg, RC);
2767}
2768
2769// If this has a fixed position, we still should allocate the register in the
2770// CCInfo state. Technically we could get away with this for values passed
2771// outside of the normal argument range.
2773 const TargetRegisterClass *RC,
2774 MCRegister Reg) {
2775 Reg = CCInfo.AllocateReg(Reg);
2776 assert(Reg != AMDGPU::NoRegister);
2777 MachineFunction &MF = CCInfo.getMachineFunction();
2778 MF.addLiveIn(Reg, RC);
2779}
2780
2781static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2782 if (Arg) {
2783 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2784 Arg.getRegister());
2785 } else
2786 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2787}
2788
2789static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2790 if (Arg) {
2791 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2792 Arg.getRegister());
2793 } else
2794 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2795}
2796
2797/// Allocate implicit function VGPR arguments at the end of allocated user
2798/// arguments.
2800 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2801 SIMachineFunctionInfo &Info) const {
2802 const unsigned Mask = 0x3ff;
2803 ArgDescriptor Arg;
2804
2805 if (Info.hasWorkItemIDX()) {
2806 Arg = allocateVGPR32Input(CCInfo, Mask);
2807 Info.setWorkItemIDX(Arg);
2808 }
2809
2810 if (Info.hasWorkItemIDY()) {
2811 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2812 Info.setWorkItemIDY(Arg);
2813 }
2814
2815 if (Info.hasWorkItemIDZ())
2816 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2817}
2818
2819/// Allocate implicit function VGPR arguments in fixed registers.
2821 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2822 SIMachineFunctionInfo &Info) const {
2823 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2824 if (!Reg)
2825 report_fatal_error("failed to allocate VGPR for implicit arguments");
2826
2827 const unsigned Mask = 0x3ff;
2828 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2829 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2830 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
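// With the masks above the three workitem ids share VGPR31: X in bits [9:0],
// Y in bits [19:10], and Z in bits [29:20].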
2831}
2832
2834 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2835 SIMachineFunctionInfo &Info) const {
2836 auto &ArgInfo = Info.getArgInfo();
2837 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2838
2839 // TODO: Unify handling with private memory pointers.
2840 if (UserSGPRInfo.hasDispatchPtr())
2841 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2842
2843 if (UserSGPRInfo.hasQueuePtr())
2844 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2845
2846 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2847 // constant offset from the kernarg segment.
2848 if (Info.hasImplicitArgPtr())
2849 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2850
2851 if (UserSGPRInfo.hasDispatchID())
2852 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2853
2854 // flat_scratch_init is not applicable for non-kernel functions.
2855
2856 if (Info.hasWorkGroupIDX())
2857 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2858
2859 if (Info.hasWorkGroupIDY())
2860 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2861
2862 if (Info.hasWorkGroupIDZ())
2863 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2864
2865 if (Info.hasLDSKernelId())
2866 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2867}
2868
2869// Allocate special inputs passed in user SGPRs.
2871 MachineFunction &MF,
2872 const SIRegisterInfo &TRI,
2873 SIMachineFunctionInfo &Info) const {
2874 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2875 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2876 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2877 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2878 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2879 }
2880
2881 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2882 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2883 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2884 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2885 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2886 }
2887
2888 if (UserSGPRInfo.hasDispatchPtr()) {
2889 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2890 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2891 CCInfo.AllocateReg(DispatchPtrReg);
2892 }
2893
2894 if (UserSGPRInfo.hasQueuePtr()) {
2895 Register QueuePtrReg = Info.addQueuePtr(TRI);
2896 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2897 CCInfo.AllocateReg(QueuePtrReg);
2898 }
2899
2900 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2902 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2903 CCInfo.AllocateReg(InputPtrReg);
2904
2905 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2906 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2907 }
2908
2909 if (UserSGPRInfo.hasDispatchID()) {
2910 Register DispatchIDReg = Info.addDispatchID(TRI);
2911 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2912 CCInfo.AllocateReg(DispatchIDReg);
2913 }
2914
2915 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2916 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2917 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2918 CCInfo.AllocateReg(FlatScratchInitReg);
2919 }
2920
2921 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2922 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2923 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2924 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2925 }
2926
2927 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2928 // these from the dispatch pointer.
2929}
2930
2931 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2932 // sequential starting from the first argument.
2934 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2936 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2937 Function &F = MF.getFunction();
2938 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2939 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2940 bool InPreloadSequence = true;
2941 unsigned InIdx = 0;
2942 bool AlignedForImplictArgs = false;
2943 unsigned ImplicitArgOffset = 0;
2944 for (auto &Arg : F.args()) {
2945 if (!InPreloadSequence || !Arg.hasInRegAttr())
2946 break;
2947
2948 unsigned ArgIdx = Arg.getArgNo();
2949 // Don't preload non-original args or parts not in the current preload
2950 // sequence.
2951 if (InIdx < Ins.size() &&
2952 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2953 break;
2954
2955 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2956 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2957 InIdx++) {
2958 assert(ArgLocs[ArgIdx].isMemLoc());
2959 auto &ArgLoc = ArgLocs[InIdx];
2960 const Align KernelArgBaseAlign = Align(16);
2961 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2962 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2963 unsigned NumAllocSGPRs =
2964 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2965
2966 // Fix alignment for hidden arguments.
2967 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2968 if (!AlignedForImplictArgs) {
2969 ImplicitArgOffset =
2970 alignTo(LastExplicitArgOffset,
2971 Subtarget->getAlignmentForImplicitArgPtr()) -
2972 LastExplicitArgOffset;
2973 AlignedForImplictArgs = true;
2974 }
2975 ArgOffset += ImplicitArgOffset;
2976 }
2977
2978 // Arg is preloaded into the previous SGPR.
2979 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2980 assert(InIdx >= 1 && "No previous SGPR");
2981 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2982 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2983 continue;
2984 }
2985
2986 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2987 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
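// For example, if the previous argument ended at byte offset 4 and this one
// starts at byte offset 12, Padding is 8 bytes, i.e. two user SGPRs of
// padding are accounted for before the preloaded value.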
2988 // Check for free user SGPRs for preloading.
2989 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2990 InPreloadSequence = false;
2991 break;
2992 }
2993
2994 // Preload this argument.
2995 const TargetRegisterClass *RC =
2996 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2997 SmallVectorImpl<MCRegister> *PreloadRegs =
2998 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2999
3000 if (PreloadRegs->size() > 1)
3001 RC = &AMDGPU::SGPR_32RegClass;
3002 for (auto &Reg : *PreloadRegs) {
3003 assert(Reg);
3004 MF.addLiveIn(Reg, RC);
3005 CCInfo.AllocateReg(Reg);
3006 }
3007
3008 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3009 }
3010 }
3011}
3012
3014 const SIRegisterInfo &TRI,
3015 SIMachineFunctionInfo &Info) const {
3016 // Always allocate this last since it is a synthetic preload.
3017 if (Info.hasLDSKernelId()) {
3018 Register Reg = Info.addLDSKernelId();
3019 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3020 CCInfo.AllocateReg(Reg);
3021 }
3022}
3023
3024// Allocate special input registers that are initialized per-wave.
3027 CallingConv::ID CallConv,
3028 bool IsShader) const {
3029 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3030 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
3031 // Note: user SGPRs are handled by the front-end for graphics shaders
3032 // Pad up the used user SGPRs with dead inputs.
3033
3034 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
3035 // before enabling architected SGPRs for workgroup IDs.
3036 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
3037
3038 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3039 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
3040 // rely on it to reach 16 since if we end up having no stack usage, it will
3041 // not really be added.
3042 unsigned NumRequiredSystemSGPRs =
3043 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3044 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3045 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3046 Register Reg = Info.addReservedUserSGPR();
3047 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3048 CCInfo.AllocateReg(Reg);
3049 }
3050 }
3051
3052 if (!HasArchitectedSGPRs) {
3053 if (Info.hasWorkGroupIDX()) {
3054 Register Reg = Info.addWorkGroupIDX();
3055 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3056 CCInfo.AllocateReg(Reg);
3057 }
3058
3059 if (Info.hasWorkGroupIDY()) {
3060 Register Reg = Info.addWorkGroupIDY();
3061 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3062 CCInfo.AllocateReg(Reg);
3063 }
3064
3065 if (Info.hasWorkGroupIDZ()) {
3066 Register Reg = Info.addWorkGroupIDZ();
3067 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3068 CCInfo.AllocateReg(Reg);
3069 }
3070 }
3071
3072 if (Info.hasWorkGroupInfo()) {
3073 Register Reg = Info.addWorkGroupInfo();
3074 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3075 CCInfo.AllocateReg(Reg);
3076 }
3077
3078 if (Info.hasPrivateSegmentWaveByteOffset()) {
3079 // Scratch wave offset passed in system SGPR.
3080 unsigned PrivateSegmentWaveByteOffsetReg;
3081
3082 if (IsShader) {
3083 PrivateSegmentWaveByteOffsetReg =
3084 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3085
3086 // This is true if the scratch wave byte offset doesn't have a fixed
3087 // location.
3088 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3089 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
3090 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3091 }
3092 } else
3093 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3094
3095 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3096 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
3097 }
3098
3099 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
3100 Info.getNumPreloadedSGPRs() >= 16);
3101}
3102
3104 MachineFunction &MF,
3105 const SIRegisterInfo &TRI,
3107 // Now that we've figured out where the scratch register inputs are, see if
3108 // we should reserve the arguments and use them directly.
3109 MachineFrameInfo &MFI = MF.getFrameInfo();
3110 bool HasStackObjects = MFI.hasStackObjects();
3111 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3112
3113 // Record that we know we have non-spill stack objects so we don't need to
3114 // check all stack objects later.
3115 if (HasStackObjects)
3116 Info.setHasNonSpillStackObjects(true);
3117
3118 // Everything live out of a block is spilled with fast regalloc, so it's
3119 // almost certain that spilling will be required.
3121 HasStackObjects = true;
3122
3123 // For now assume stack access is needed in any callee functions, so we need
3124 // the scratch registers to pass in.
3125 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
3126
3127 if (!ST.enableFlatScratch()) {
3128 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
3129 // If we have stack objects, we unquestionably need the private buffer
3130 // resource. For the Code Object V2 ABI, this will be the first 4 user
3131 // SGPR inputs. We can reserve those and use them directly.
3132
3133 Register PrivateSegmentBufferReg =
3135 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3136 } else {
3137 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
3138 // We tentatively reserve the last registers (skipping those which may
3139 // contain VCC, FLAT_SCR, and XNACK). After register allocation,
3140 // we'll replace these with the registers immediately after those which were
3141 // really allocated. In the prologue, copies will be inserted from the
3142 // argument to these reserved registers.
3143
3144 // Without HSA, relocations are used for the scratch pointer and the
3145 // buffer resource setup is always inserted in the prologue. Scratch wave
3146 // offset is still in an input SGPR.
3147 Info.setScratchRSrcReg(ReservedBufferReg);
3148 }
3149 }
3150
3152
3153 // For entry functions we have to set up the stack pointer if we use it,
3154 // whereas non-entry functions get this "for free". This means there is no
3155 // intrinsic advantage to using S32 over S34 in cases where we do not have
3156 // calls but do need a frame pointer (i.e. if we are requested to have one
3157 // because frame pointer elimination is disabled). To keep things simple we
3158 // only ever use S32 as the call ABI stack pointer, and so using it does not
3159 // imply we need a separate frame pointer.
3160 //
3161 // Try to use s32 as the SP, but move it if it would interfere with input
3162 // arguments. This won't work with calls though.
3163 //
3164 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
3165 // registers.
3166 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
3167 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3168 } else {
3170
3171 if (MFI.hasCalls())
3172 report_fatal_error("call in graphics shader with too many input SGPRs");
3173
3174 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3175 if (!MRI.isLiveIn(Reg)) {
3176 Info.setStackPtrOffsetReg(Reg);
3177 break;
3178 }
3179 }
3180
3181 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3182 report_fatal_error("failed to find register for SP");
3183 }
3184
3185 // hasFP should be accurate for entry functions even before the frame is
3186 // finalized, because it does not rely on the known stack size, only
3187 // properties like whether variable sized objects are present.
3188 if (ST.getFrameLowering()->hasFP(MF)) {
3189 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3190 }
3191}
3192
3195 return !Info->isEntryFunction();
3196}
3197
3199
3201 MachineBasicBlock *Entry,
3202 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
3204
3205 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3206 if (!IStart)
3207 return;
3208
3209 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3210 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
3211 MachineBasicBlock::iterator MBBI = Entry->begin();
3212 for (const MCPhysReg *I = IStart; *I; ++I) {
3213 const TargetRegisterClass *RC = nullptr;
3214 if (AMDGPU::SReg_64RegClass.contains(*I))
3215 RC = &AMDGPU::SGPR_64RegClass;
3216 else if (AMDGPU::SReg_32RegClass.contains(*I))
3217 RC = &AMDGPU::SGPR_32RegClass;
3218 else
3219 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3220
3221 Register NewVR = MRI->createVirtualRegister(RC);
3222 // Create copy from CSR to a virtual register.
3223 Entry->addLiveIn(*I);
3224 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
3225 .addReg(*I);
3226
3227 // Insert the copy-back instructions right before the terminator.
3228 for (auto *Exit : Exits)
3229 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
3230 TII->get(TargetOpcode::COPY), *I)
3231 .addReg(NewVR);
3232 }
3233}
3234
3236 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3237 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3238 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3240
3242 const Function &Fn = MF.getFunction();
3245 bool IsError = false;
3246
3247 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
3249 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3250 IsError = true;
3251 }
3252
3255 BitVector Skipped(Ins.size());
3256 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3257 *DAG.getContext());
3258
3259 bool IsGraphics = AMDGPU::isGraphics(CallConv);
3260 bool IsKernel = AMDGPU::isKernel(CallConv);
3261 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
3262
3263 if (IsGraphics) {
3264 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3265 assert(!UserSGPRInfo.hasDispatchPtr() &&
3266 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3267 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3268 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3269 (void)UserSGPRInfo;
3270 if (!Subtarget->enableFlatScratch())
3271 assert(!UserSGPRInfo.hasFlatScratchInit());
3272 if ((CallConv != CallingConv::AMDGPU_CS &&
3273 CallConv != CallingConv::AMDGPU_Gfx &&
3274 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
3275 !Subtarget->hasArchitectedSGPRs())
3276 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3277 !Info->hasWorkGroupIDZ());
3278 }
3279
3280 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3281
3282 if (CallConv == CallingConv::AMDGPU_PS) {
3283 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3284
3285 // At least one interpolation mode must be enabled or else the GPU will
3286 // hang.
3287 //
3288 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3289 // set PSInputAddr, the user wants to enable some bits after the compilation
3290 // based on run-time states. Since we can't know what the final PSInputEna
3291 // will look like, we shouldn't do anything here and the user should take
3292 // responsibility for the correct programming.
3293 //
3294 // Otherwise, the following restrictions apply:
3295 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
3296 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
3297 // enabled too.
3298 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3299 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3300 CCInfo.AllocateReg(AMDGPU::VGPR0);
3301 CCInfo.AllocateReg(AMDGPU::VGPR1);
3302 Info->markPSInputAllocated(0);
3303 Info->markPSInputEnabled(0);
3304 }
3305 if (Subtarget->isAmdPalOS()) {
3306 // For isAmdPalOS, the user does not enable some bits after compilation
3307 // based on run-time states; the register values being generated here are
3308 // the final ones set in hardware. Therefore we need to apply the
3309 // workaround to PSInputAddr and PSInputEnable together. (The case where
3310 // a bit is set in PSInputAddr but not PSInputEnable is where the
3311 // frontend set up an input arg for a particular interpolation mode, but
3312 // nothing uses that input arg. Really we should have an earlier pass
3313 // that removes such an arg.)
3314 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3315 if ((PsInputBits & 0x7F) == 0 ||
3316 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3317 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
3318 }
3319 } else if (IsKernel) {
3320 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3321 } else {
3322 Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3323 Ins.end());
3324 }
3325
3326 if (IsKernel)
3327 analyzeFormalArgumentsCompute(CCInfo, Ins);
3328
3329 if (IsEntryFunc) {
3330 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
3331 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
3332 if (IsKernel && Subtarget->hasKernargPreload())
3333 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
3334
3335 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
3336 } else if (!IsGraphics) {
3337 // For the fixed ABI, pass workitem IDs in the last argument register.
3338 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
3339
3340 // FIXME: Sink this into allocateSpecialInputSGPRs
3341 if (!Subtarget->enableFlatScratch())
3342 CCInfo.AllocateReg(Info->getScratchRSrcReg());
3343
3344 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3345 }
3346
3347 if (!IsKernel) {
3348 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
3349 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3350
3351 // This assumes the registers are allocated by CCInfo in ascending order
3352 // with no gaps.
3353 Info->setNumWaveDispatchSGPRs(
3354 CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
3355 Info->setNumWaveDispatchVGPRs(
3356 CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
3357 } else if (Info->getNumKernargPreloadedSGPRs()) {
3358 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3359 }
3360
3362
3363 if (IsWholeWaveFunc) {
3364 SDValue Setup = DAG.getNode(AMDGPUISD::WHOLE_WAVE_SETUP, DL,
3365 {MVT::i1, MVT::Other}, Chain);
3366 InVals.push_back(Setup.getValue(0));
3367 Chains.push_back(Setup.getValue(1));
3368 }
3369
3370 // FIXME: This is the minimum kernel argument alignment. We should improve
3371 // this to the maximum alignment of the arguments.
3372 //
3373 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3374 // kern arg offset.
3375 const Align KernelArgBaseAlign = Align(16);
3376
3377 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3378 ++i) {
3379 const ISD::InputArg &Arg = Ins[i];
3380 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3381 InVals.push_back(DAG.getPOISON(Arg.VT));
3382 continue;
3383 }
3384
3385 CCValAssign &VA = ArgLocs[ArgIdx++];
3386 MVT VT = VA.getLocVT();
3387
3388 if (IsEntryFunc && VA.isMemLoc()) {
3389 VT = Ins[i].VT;
3390 EVT MemVT = VA.getLocVT();
3391
3392 const uint64_t Offset = VA.getLocMemOffset();
3393 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
3394
3395 if (Arg.Flags.isByRef()) {
3396 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
3397
3398 const GCNTargetMachine &TM =
3399 static_cast<const GCNTargetMachine &>(getTargetMachine());
3400 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
3401 Arg.Flags.getPointerAddrSpace())) {
3404 }
3405
3406 InVals.push_back(Ptr);
3407 continue;
3408 }
3409
3410 SDValue NewArg;
3411 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3412 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3413 // In this case the argument is packed into the previous preload SGPR.
3414 int64_t AlignDownOffset = alignDown(Offset, 4);
3415 int64_t OffsetDiff = Offset - AlignDownOffset;
3416 EVT IntVT = MemVT.changeTypeToInteger();
3417
3418 const SIMachineFunctionInfo *Info =
3421 Register Reg =
3422 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3423
3424 assert(Reg);
3425 Register VReg = MRI.getLiveInVirtReg(Reg);
3426 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3427
3428 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
3429 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
3430
3431 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
3432 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3433 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3434 Ins[i].Flags.isSExt(), &Ins[i]);
3435
3436 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
3437 } else {
3438 const SIMachineFunctionInfo *Info =
3441 const SmallVectorImpl<MCRegister> &PreloadRegs =
3442 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3443
3444 SDValue Copy;
3445 if (PreloadRegs.size() == 1) {
3446 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3447 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3448 NewArg = DAG.getCopyFromReg(
3449 Chain, DL, VReg,
3451 TRI->getRegSizeInBits(*RC)));
3452
3453 } else {
3454 // If the kernarg alignment does not match the alignment of the SGPR
3455 // tuple RC that can accommodate this argument, it will be built up
3456 // via copies from the individual SGPRs that the argument was
3457 // preloaded to.
3459 for (auto Reg : PreloadRegs) {
3460 Register VReg = MRI.getLiveInVirtReg(Reg);
3461 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3462 Elts.push_back(Copy);
3463 }
3464 NewArg =
3465 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3466 PreloadRegs.size()),
3467 DL, Elts);
3468 }
3469
3470 // If the argument was preloaded to multiple consecutive 32-bit
3471 // registers because of misalignment between addressable SGPR tuples
3472 // and the argument size, we can still assume, because of kernarg
3473 // segment alignment restrictions, that NewArg's size is the same as
3474 // MemVT and just do a bitcast. If MemVT is less than 32 bits we add a
3475 // truncate since we cannot preload to less than a single SGPR and the
3476 // MemVT may be smaller.
3477 EVT MemVTInt =
3479 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3480 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3481
3482 NewArg = DAG.getBitcast(MemVT, NewArg);
3483 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3484 Ins[i].Flags.isSExt(), &Ins[i]);
3485 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3486 }
3487 } else {
3488 // Hidden arguments that are in the kernel signature must be preloaded
3489 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3490 // the argument list and is not preloaded.
3491 if (Arg.isOrigArg()) {
3492 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3493 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3495 *OrigArg->getParent(),
3496 "hidden argument in kernel signature was not preloaded",
3497 DL.getDebugLoc()));
3498 }
3499 }
3500
3501 NewArg =
3502 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3503 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3504 }
3505 Chains.push_back(NewArg.getValue(1));
3506
3507 auto *ParamTy =
3508 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3509 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3510 ParamTy &&
3511 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3512 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3513 // On SI local pointers are just offsets into LDS, so they are always
3514 // less than 16-bits. On CI and newer they could potentially be
3515 // real pointers, so we can't guarantee their size.
3516 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3517 DAG.getValueType(MVT::i16));
3518 }
3519
3520 InVals.push_back(NewArg);
3521 continue;
3522 }
3523 if (!IsEntryFunc && VA.isMemLoc()) {
3524 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3525 InVals.push_back(Val);
3526 if (!Arg.Flags.isByVal())
3527 Chains.push_back(Val.getValue(1));
3528 continue;
3529 }
3530
3531 assert(VA.isRegLoc() && "Parameter must be in a register!");
3532
3533 Register Reg = VA.getLocReg();
3534 const TargetRegisterClass *RC = nullptr;
3535 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3536 RC = &AMDGPU::VGPR_32RegClass;
3537 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3538 RC = &AMDGPU::SGPR_32RegClass;
3539 else
3540 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3541
3542 Reg = MF.addLiveIn(Reg, RC);
3543 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3544
3545 if (Arg.Flags.isSRet()) {
3546 // The return object should be reasonably addressable.
3547
3548 // FIXME: This helps when the return is a real sret. If it is an
3549 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3550 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3551 unsigned NumBits =
3553 Val = DAG.getNode(
3554 ISD::AssertZext, DL, VT, Val,
3555 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3556 }
3557
3558 Val = convertABITypeToValueType(DAG, Val, VA, DL);
3559 InVals.push_back(Val);
3560 }
3561
3562 // Start adding system SGPRs.
3563 if (IsEntryFunc)
3564 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3565
3566 if (DAG.getPass()) {
3567 auto &ArgUsageInfo =
3569 ArgUsageInfo.getArgUsageInfo().setFuncArgInfo(Fn, Info->getArgInfo());
3570 } else if (auto *MFAM = DAG.getMFAM()) {
3571 Module &M = *MF.getFunction().getParent();
3572 auto *ArgUsageInfo =
3574 .getCachedResult<AMDGPUArgumentUsageAnalysis>(M);
3575 if (ArgUsageInfo)
3576 ArgUsageInfo->setFuncArgInfo(Fn, Info->getArgInfo());
3577 }
3578
3579 unsigned StackArgSize = CCInfo.getStackSize();
3580 Info->setBytesInStackArgArea(StackArgSize);
3581
3582 return Chains.empty() ? Chain
3583 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3584}
3585
3586// TODO: If return values can't fit in registers, we should return as many as
3587 // possible in registers before passing on stack.
3588 bool SITargetLowering::CanLowerReturn(
3589 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3590 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3591 const Type *RetTy) const {
3592 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3593 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3594 // for shaders. Vector types should be explicitly handled by CC.
3595 if (AMDGPU::isEntryFunctionCC(CallConv))
3596 return true;
3597
3599 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3600 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3601 return false;
3602
3603 // We must use the stack if return would require unavailable registers.
3604 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3605 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3606 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3607 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3608 return false;
3609
3610 return true;
3611}
3612
3613 SDValue
3614 SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3615 bool isVarArg,
3616 const SmallVectorImpl<ISD::OutputArg> &Outs,
3617 const SmallVectorImpl<SDValue> &OutVals,
3618 const SDLoc &DL, SelectionDAG &DAG) const {
3622
3623 if (AMDGPU::isKernel(CallConv)) {
3624 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3625 OutVals, DL, DAG);
3626 }
3627
3628 bool IsShader = AMDGPU::isShader(CallConv);
3629
3630 Info->setIfReturnsVoid(Outs.empty());
3631 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3632
3633 // CCValAssign - represents the assignment of the return value to a location.
3635
3636 // CCState - Info about the registers and stack slots.
3637 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3638 *DAG.getContext());
3639
3640 // Analyze outgoing return values.
3641 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3642
3643 SDValue Glue;
3645 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3646
3647 SDValue ReadFirstLane =
3648 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3649 // Copy the result values into the output registers.
3650 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3651 ++I, ++RealRVLocIdx) {
3652 CCValAssign &VA = RVLocs[I];
3653 assert(VA.isRegLoc() && "Can only return in registers!");
3654 // TODO: Partially return in registers if return values don't fit.
3655 SDValue Arg = OutVals[RealRVLocIdx];
3656
3657 // Copied from other backends.
3658 switch (VA.getLocInfo()) {
3659 case CCValAssign::Full:
3660 break;
3661 case CCValAssign::BCvt:
3662 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3663 break;
3664 case CCValAssign::SExt:
3665 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3666 break;
3667 case CCValAssign::ZExt:
3668 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3669 break;
3670 case CCValAssign::AExt:
3671 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3672 break;
3673 default:
3674 llvm_unreachable("Unknown loc info!");
3675 }
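// Return values assigned to SGPRs must be wave-uniform; read the value from
// the first active lane in case it was produced in a VGPR.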
3676 if (TRI->isSGPRPhysReg(VA.getLocReg()))
3677 Arg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Arg.getValueType(),
3678 ReadFirstLane, Arg);
3679 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3680 Glue = Chain.getValue(1);
3681 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3682 }
3683
3684 // FIXME: Does sret work properly?
3685 if (!Info->isEntryFunction()) {
3686 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3687 const MCPhysReg *I =
3688 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3689 if (I) {
3690 for (; *I; ++I) {
3691 if (AMDGPU::SReg_64RegClass.contains(*I))
3692 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3693 else if (AMDGPU::SReg_32RegClass.contains(*I))
3694 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3695 else
3696 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3697 }
3698 }
3699 }
3700
3701 // Update chain and glue.
3702 RetOps[0] = Chain;
3703 if (Glue.getNode())
3704 RetOps.push_back(Glue);
3705
3706 unsigned Opc = AMDGPUISD::ENDPGM;
3707 if (!IsWaveEnd)
3708 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3709 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3710 : AMDGPUISD::RET_GLUE;
3711 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3712}
3713
3715 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3716 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3717 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3718 SDValue ThisVal) const {
3719 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3720
3721 // Assign locations to each value returned by this call.
3723 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3724 *DAG.getContext());
3725 CCInfo.AnalyzeCallResult(Ins, RetCC);
3726
3727 // Copy all of the result registers out of their specified physreg.
3728 for (CCValAssign VA : RVLocs) {
3729 SDValue Val;
3730
3731 if (VA.isRegLoc()) {
3732 Val =
3733 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3734 Chain = Val.getValue(1);
3735 InGlue = Val.getValue(2);
3736 } else if (VA.isMemLoc()) {
3737 report_fatal_error("TODO: return values in memory");
3738 } else
3739 llvm_unreachable("unknown argument location type");
3740
3741 switch (VA.getLocInfo()) {
3742 case CCValAssign::Full:
3743 break;
3744 case CCValAssign::BCvt:
3745 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3746 break;
3747 case CCValAssign::ZExt:
3748 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3749 DAG.getValueType(VA.getValVT()));
3750 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3751 break;
3752 case CCValAssign::SExt:
3753 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3754 DAG.getValueType(VA.getValVT()));
3755 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3756 break;
3757 case CCValAssign::AExt:
3758 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3759 break;
3760 default:
3761 llvm_unreachable("Unknown loc info!");
3762 }
3763
3764 InVals.push_back(Val);
3765 }
3766
3767 return Chain;
3768}
3769
3770 // Add code to pass special inputs required depending on used features,
3771 // separate from the explicit user arguments present in the IR.
3773 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3774 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3775 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3776 // If we don't have a call site, this was a call inserted by
3777 // legalization. These can never use special inputs.
3778 if (!CLI.CB)
3779 return;
3780
3781 SelectionDAG &DAG = CLI.DAG;
3782 const SDLoc &DL = CLI.DL;
3783 const Function &F = DAG.getMachineFunction().getFunction();
3784
3785 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3786 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3787
3788 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3790 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3791 if (DAG.getPass()) {
3792 auto &ArgUsageInfo =
3794 CalleeArgInfo =
3795 &ArgUsageInfo.getArgUsageInfo().lookupFuncArgInfo(*CalleeFunc);
3796 } else if (auto *MFAM = DAG.getMFAM()) {
3798 auto *ArgUsageInfo =
3800 DAG.getMachineFunction())
3801 .getCachedResult<AMDGPUArgumentUsageAnalysis>(M);
3802 if (ArgUsageInfo)
3803 CalleeArgInfo = &ArgUsageInfo->lookupFuncArgInfo(*CalleeFunc);
3804 }
3805 }
3806
3807 // TODO: Unify with private memory register handling. This is complicated by
3808 // the fact that at least in kernels, the input argument is not necessarily
3809 // in the same location as the input.
3810 // clang-format off
3811 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3812 std::array<StringLiteral, 2>> ImplicitAttrs[] = {
3813 {AMDGPUFunctionArgInfo::DISPATCH_PTR, {"amdgpu-no-dispatch-ptr", ""}},
3814 {AMDGPUFunctionArgInfo::QUEUE_PTR, {"amdgpu-no-queue-ptr", ""}},
3815 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, {"amdgpu-no-implicitarg-ptr", ""}},
3816 {AMDGPUFunctionArgInfo::DISPATCH_ID, {"amdgpu-no-dispatch-id", ""}},
3817 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, {"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"}},
3818 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, {"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"}},
3819 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, {"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"}},
3820 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, {"amdgpu-no-lds-kernel-id", ""}},
3821 };
3822 // clang-format on
3823
3824 for (auto [InputID, Attrs] : ImplicitAttrs) {
3825 // If the callee does not use the attribute value, skip copying the value.
3826 if (all_of(Attrs, [&](StringRef Attr) {
3827 return Attr.empty() || CLI.CB->hasFnAttr(Attr);
3828 }))
3829 continue;
3830
3831 const auto [OutgoingArg, ArgRC, ArgTy] =
3832 CalleeArgInfo->getPreloadedValue(InputID);
3833 if (!OutgoingArg)
3834 continue;
3835
3836 const auto [IncomingArg, IncomingArgRC, Ty] =
3837 CallerArgInfo.getPreloadedValue(InputID);
3838 assert(IncomingArgRC == ArgRC);
3839
3840 // All special arguments are ints for now.
3841 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3842 SDValue InputReg;
3843
3844 if (IncomingArg) {
3845 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3846 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3847 // The implicit arg ptr is special because it doesn't have a corresponding
3848 // input for kernels, and is computed from the kernarg segment pointer.
3849 InputReg = getImplicitArgPtr(DAG, DL);
3850 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3851 std::optional<uint32_t> Id =
3853 if (Id.has_value()) {
3854 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3855 } else {
3856 InputReg = DAG.getPOISON(ArgVT);
3857 }
3858 } else {
3859 // We may have proven the input wasn't needed, although the ABI
3860 // requires it. We just need to allocate the register appropriately.
3861 InputReg = DAG.getPOISON(ArgVT);
3862 }
3863
3864 if (OutgoingArg->isRegister()) {
3865 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3866 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3867 report_fatal_error("failed to allocate implicit input argument");
3868 } else {
3869 unsigned SpecialArgOffset =
3870 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3871 SDValue ArgStore =
3872 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3873 MemOpChains.push_back(ArgStore);
3874 }
3875 }
3876
3877 // Pack workitem IDs into a single register, or pass them as-is if already
3878 // packed.
3879
3880 auto [OutgoingArg, ArgRC, Ty] =
3882 if (!OutgoingArg)
3883 std::tie(OutgoingArg, ArgRC, Ty) =
3885 if (!OutgoingArg)
3886 std::tie(OutgoingArg, ArgRC, Ty) =
3888 if (!OutgoingArg)
3889 return;
3890
3891 const ArgDescriptor *IncomingArgX = std::get<0>(
3893 const ArgDescriptor *IncomingArgY = std::get<0>(
3895 const ArgDescriptor *IncomingArgZ = std::get<0>(
3897
3898 SDValue InputReg;
3899 SDLoc SL;
3900
3901 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3902 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3903 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3904
3905 // If incoming IDs are not packed, we need to pack them.
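// The packed layout puts X in bits [9:0], Y in bits [19:10] and Z in bits
// [29:20], which is what the 10- and 20-bit shifts below reconstruct.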
3906 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3907 NeedWorkItemIDX) {
3908 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3909 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3910 } else {
3911 InputReg = DAG.getConstant(0, DL, MVT::i32);
3912 }
3913 }
3914
3915 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3916 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3917 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3918 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3919 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3920 InputReg = InputReg.getNode()
3921 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3922 : Y;
3923 }
3924
3925 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3926 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3927 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3928 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3929 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3930 InputReg = InputReg.getNode()
3931 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3932 : Z;
3933 }
3934
3935 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3936 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3937 // We're in a situation where the outgoing function requires the workitem
3938 // ID, but the calling function does not have it (e.g. a graphics function
3939 // calling a C calling convention function). This is illegal, but we need
3940 // to produce something.
3941 InputReg = DAG.getPOISON(MVT::i32);
3942 } else {
3943 // Workitem IDs are already packed; any of the present incoming arguments
3944 // will carry all required fields.
3945 ArgDescriptor IncomingArg =
3946 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3947 : IncomingArgY ? *IncomingArgY
3948 : *IncomingArgZ,
3949 ~0u);
3950 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3951 }
3952 }
3953
3954 if (OutgoingArg->isRegister()) {
3955 if (InputReg)
3956 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3957
3958 CCInfo.AllocateReg(OutgoingArg->getRegister());
3959 } else {
3960 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3961 if (InputReg) {
3962 SDValue ArgStore =
3963 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3964 MemOpChains.push_back(ArgStore);
3965 }
3966 }
3967}
3968
3970 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3972 const SmallVectorImpl<SDValue> &OutVals,
3973 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3974 if (AMDGPU::isChainCC(CalleeCC))
3975 return true;
3976
3977 if (!AMDGPU::mayTailCallThisCC(CalleeCC))
3978 return false;
3979
3980 // For a divergent call target, we need to do a waterfall loop over the
3981 // possible callees which precludes us from using a simple jump.
3982 if (Callee->isDivergent())
3983 return false;
3984
3986 const Function &CallerF = MF.getFunction();
3987 CallingConv::ID CallerCC = CallerF.getCallingConv();
3989 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3990
3991 // Kernels aren't callable, and don't have a live-in return address, so it
3992 // doesn't make sense to do a tail call with entry functions.
3993 if (!CallerPreserved)
3994 return false;
3995
3996 bool CCMatch = CallerCC == CalleeCC;
3997
3999 if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch)
4000 return true;
4001 return false;
4002 }
4003
4004 // TODO: Can we handle var args?
4005 if (IsVarArg)
4006 return false;
4007
4008 for (const Argument &Arg : CallerF.args()) {
4009 if (Arg.hasByValAttr())
4010 return false;
4011 }
4012
4013 LLVMContext &Ctx = *DAG.getContext();
4014
4015 // Check that the call results are passed in the same way.
4016 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
4017 CCAssignFnForCall(CalleeCC, IsVarArg),
4018 CCAssignFnForCall(CallerCC, IsVarArg)))
4019 return false;
4020
4021 // The callee has to preserve all registers the caller needs to preserve.
4022 if (!CCMatch) {
4023 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4024 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4025 return false;
4026 }
4027
4028 // Nothing more to check if the callee is taking no arguments.
4029 if (Outs.empty())
4030 return true;
4031
4033 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4034
4035 // FIXME: We are not allocating special input registers, so we will be
4036 // deciding based on incorrect register assignments.
4037 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
4038
4039 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4040 // If the stack arguments for this call do not fit into our own save area then
4041 // the call cannot be made tail.
4042 // TODO: Is this really necessary?
4043 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
4044 return false;
4045
4046 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
4047 // FIXME: What about inreg arguments that end up passed in memory?
4048 if (!CCVA.isRegLoc())
4049 continue;
4050
4051 // If we are passing an argument in an SGPR, and the value is divergent,
4052 // this call requires a waterfall loop.
4053 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4054 LLVM_DEBUG(
4055 dbgs() << "Cannot tail call due to divergent outgoing argument in "
4056 << printReg(CCVA.getLocReg(), TRI) << '\n');
4057 return false;
4058 }
4059 }
4060
4061 const MachineRegisterInfo &MRI = MF.getRegInfo();
4062 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
4063}
4064
4066 if (!CI->isTailCall())
4067 return false;
4068
4069 const Function *ParentFn = CI->getFunction();
4071 return false;
4072 return true;
4073}
4074
4075namespace {
4076 // Chain calls have special arguments that we need to handle. These tag
4077 // along at the end of the arguments list(s), after the SGPR and VGPR
4078 // arguments (index 0 and 1 respectively).
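// Roughly, llvm.amdgcn.cs.chain is called as
//   llvm.amdgcn.cs.chain(callee, exec, sgpr_args, vgpr_args, flags, ...)
// The callee becomes CLI.Callee, so sgpr_args and vgpr_args end up at CLI.Args
// indices 0 and 1 and the special arguments enumerated below start at index 2.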
4079enum ChainCallArgIdx {
4080 Exec = 2,
4081 Flags,
4082 NumVGPRs,
4083 FallbackExec,
4084 FallbackCallee
4085};
4086} // anonymous namespace
4087
4088// The wave scratch offset register is used as the global base pointer.
4090 SmallVectorImpl<SDValue> &InVals) const {
4091 CallingConv::ID CallConv = CLI.CallConv;
4092 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
4093
4094 SelectionDAG &DAG = CLI.DAG;
4095
4096 const SDLoc &DL = CLI.DL;
4097 SDValue Chain = CLI.Chain;
4098 SDValue Callee = CLI.Callee;
4099
4100 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
4101 bool UsesDynamicVGPRs = false;
4102 if (IsChainCallConv) {
4103 // The last arguments should be the value that we need to put in EXEC,
4104 // followed by the flags and any other arguments with special meanings.
4105 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
4106 // we don't treat them like the "real" arguments.
4107 auto RequestedExecIt =
4108 llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) {
4109 return Arg.OrigArgIndex == 2;
4110 });
4111 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
4112
4113 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
4114 CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
4115 CLI.OutVals.end());
4116 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
4117
4118 assert(CLI.Outs.back().OrigArgIndex < 2 &&
4119 "Haven't popped all the special args");
4120
4121 TargetLowering::ArgListEntry RequestedExecArg =
4122 CLI.Args[ChainCallArgIdx::Exec];
4123 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
4124 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
4125
4126 // Convert constants into TargetConstants, so they become immediate operands
4127 // instead of being selected into S_MOV.
4128 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
4129 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
4130 ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
4131 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
4132 } else
4133 ChainCallSpecialArgs.push_back(Arg.Node);
4134 };
4135
4136 PushNodeOrTargetConstant(RequestedExecArg);
4137
4138 // Process any other special arguments depending on the value of the flags.
4139 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
4140
4141 const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
4142 if (FlagsValue.isZero()) {
4143 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
4144 return lowerUnhandledCall(CLI, InVals,
4145 "no additional args allowed if flags == 0");
4146 } else if (FlagsValue.isOneBitSet(0)) {
4147 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4148 return lowerUnhandledCall(CLI, InVals, "expected 3 additional args");
4149 }
4150
4151 if (!Subtarget->isWave32()) {
4152 return lowerUnhandledCall(
4153 CLI, InVals, "dynamic VGPR mode is only supported for wave32");
4154 }
4155
4156 UsesDynamicVGPRs = true;
4157 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
4158 CLI.Args.end(), PushNodeOrTargetConstant);
4159 }
4160 }
4161
4163 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
4165 bool &IsTailCall = CLI.IsTailCall;
4166 bool IsVarArg = CLI.IsVarArg;
4167 bool IsSibCall = false;
4169
4170 if (Callee.isUndef() || isNullConstant(Callee)) {
4171 if (!CLI.IsTailCall) {
4172 for (ISD::InputArg &Arg : CLI.Ins)
4173 InVals.push_back(DAG.getPOISON(Arg.VT));
4174 }
4175
4176 return Chain;
4177 }
4178
4179 if (IsVarArg) {
4180 return lowerUnhandledCall(CLI, InVals,
4181 "unsupported call to variadic function ");
4182 }
4183
4184 if (!CLI.CB)
4185 return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");
4186
4187 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
4188 return lowerUnhandledCall(CLI, InVals,
4189 "unsupported required tail call to function ");
4190 }
4191
4192 if (IsTailCall) {
4193 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
4194 Outs, OutVals, Ins, DAG);
4195 if (!IsTailCall &&
4196 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
4197 report_fatal_error("failed to perform tail call elimination on a call "
4198 "site marked musttail or on llvm.amdgcn.cs.chain");
4199 }
4200
4201 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4202
4203 // A sibling call is one where we're under the usual C ABI and not planning
4204 // to change that but can still do a tail call:
4205 if (!TailCallOpt && IsTailCall)
4206 IsSibCall = true;
4207
4208 if (IsTailCall)
4209 ++NumTailCalls;
4210 }
4211
4214 SmallVector<SDValue, 8> MemOpChains;
4215
4216 // Analyze operands of the call, assigning locations to each operand.
4218 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4219 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
4220
4221 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
4223 // With a fixed ABI, allocate fixed registers before user arguments.
4224 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
4225 }
4226
4227 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
4228
4229 // Get a count of how many bytes are to be pushed on the stack.
4230 unsigned NumBytes = CCInfo.getStackSize();
4231
4232 if (IsSibCall) {
4233 // Since we're not changing the ABI to make this a tail call, the memory
4234 // operands are already available in the caller's incoming argument space.
4235 NumBytes = 0;
4236 }
4237
4238 // FPDiff is the byte offset of the call's argument area from the callee's.
4239 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4240 // by this amount for a tail call. In a sibling call it must be 0 because the
4241 // caller will deallocate the entire stack and the callee still expects its
4242 // arguments to begin at SP+0. Completely unused for non-tail calls.
4243 int32_t FPDiff = 0;
4244 MachineFrameInfo &MFI = MF.getFrameInfo();
4245 auto *TRI = Subtarget->getRegisterInfo();
4246
4247 // Adjust the stack pointer for the new arguments...
4248 // These operations are automatically eliminated by the prolog/epilog pass
4249 if (!IsSibCall)
4250 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
4251
4252 if (!IsSibCall || IsChainCallConv) {
4253 if (!Subtarget->enableFlatScratch()) {
4254 SmallVector<SDValue, 4> CopyFromChains;
4255
4256 // In the HSA case, this should be an identity copy.
4257 SDValue ScratchRSrcReg =
4258 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
4259 RegsToPass.emplace_back(IsChainCallConv
4260 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4261 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4262 ScratchRSrcReg);
4263 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
4264 Chain = DAG.getTokenFactor(DL, CopyFromChains);
4265 }
4266 }
4267
4268 const unsigned NumSpecialInputs = RegsToPass.size();
4269
4270 MVT PtrVT = MVT::i32;
4271
4272 // Walk the register/memloc assignments, inserting copies/loads.
4273 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4274 CCValAssign &VA = ArgLocs[i];
4275 SDValue Arg = OutVals[i];
4276
4277 // Promote the value if needed.
4278 switch (VA.getLocInfo()) {
4279 case CCValAssign::Full:
4280 break;
4281 case CCValAssign::BCvt:
4282 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
4283 break;
4284 case CCValAssign::ZExt:
4285 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4286 break;
4287 case CCValAssign::SExt:
4288 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
4289 break;
4290 case CCValAssign::AExt:
4291 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
4292 break;
4293 case CCValAssign::FPExt:
4294 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
4295 break;
4296 default:
4297 llvm_unreachable("Unknown loc info!");
4298 }
4299
4300 if (VA.isRegLoc()) {
4301 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
4302 } else {
4303 assert(VA.isMemLoc());
4304
4305 SDValue DstAddr;
4306 MachinePointerInfo DstInfo;
4307
4308 unsigned LocMemOffset = VA.getLocMemOffset();
4309 int32_t Offset = LocMemOffset;
4310
4311 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
4312 MaybeAlign Alignment;
4313
4314 if (IsTailCall) {
4315 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4316 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4317 : VA.getValVT().getStoreSize();
4318
4319 // FIXME: We can have better than the minimum byval required alignment.
4320 Alignment =
4321 Flags.isByVal()
4322 ? Flags.getNonZeroByValAlign()
4323 : commonAlignment(Subtarget->getStackAlignment(), Offset);
4324
4325 Offset = Offset + FPDiff;
4326 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
4327
4328 DstAddr = DAG.getFrameIndex(FI, PtrVT);
4329 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4330
4331 // Make sure any stack arguments overlapping with where we're storing
4332 // are loaded before this eventual operation. Otherwise they'll be
4333 // clobbered.
4334
4335 // FIXME: Why is this really necessary? This seems to just result in a
4336 // lot of code to copy the stack and write them back to the same
4337 // locations, which are supposed to be immutable?
4338 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
4339 } else {
4340 // Stores to the argument stack area are relative to the stack pointer.
4341 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
4342 MVT::i32);
4343 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
4344 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
4345 Alignment =
4346 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
4347 }
4348
4349 if (Outs[i].Flags.isByVal()) {
4350 SDValue SizeNode =
4351 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4352 SDValue Cpy =
4353 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
4354 Outs[i].Flags.getNonZeroByValAlign(),
4355 /*isVol = */ false, /*AlwaysInline = */ true,
4356 /*CI=*/nullptr, std::nullopt, DstInfo,
4358
4359 MemOpChains.push_back(Cpy);
4360 } else {
4361 SDValue Store =
4362 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4363 MemOpChains.push_back(Store);
4364 }
4365 }
4366 }
4367
4368 if (!MemOpChains.empty())
4369 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4370
4371 SDValue ReadFirstLaneID =
4372 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4373
4374 SDValue TokenGlue;
4375 if (CLI.ConvergenceControlToken) {
4376 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
4378 }
4379
4380 // Build a sequence of copy-to-reg nodes chained together with token chain
4381 // and flag operands which copy the outgoing args into the appropriate regs.
4382 SDValue InGlue;
4383
4384 unsigned ArgIdx = 0;
4385 for (auto [Reg, Val] : RegsToPass) {
4386 if (ArgIdx++ >= NumSpecialInputs &&
4387 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4388 // For chain calls, the inreg arguments are required to be
4389 // uniform. Speculatively insert a readfirstlane in case we cannot prove
4390 // they are uniform.
4391 //
4392 // For other calls, if an inreg argument is known to be uniform,
4393 // speculatively insert a readfirstlane in case it is in a VGPR.
4394 //
4395 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4396 // value, so let that continue to produce invalid code.
4397
4398 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4399 if (TokenGlue)
4400 ReadfirstlaneArgs.push_back(TokenGlue);
4401 Val = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Val.getValueType(),
4402 ReadfirstlaneArgs);
4403 }
4404
4405 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
4406 InGlue = Chain.getValue(1);
4407 }
4408
4409 // We don't usually want to end the call-sequence here because we would tidy
4410 // the frame up *after* the call; however, in the ABI-changing tail-call case
4411 // we've carefully laid out the parameters so that when sp is reset they'll be
4412 // in the correct location.
4413 if (IsTailCall && !IsSibCall) {
4414 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
4415 InGlue = Chain.getValue(1);
4416 }
4417
4418 std::vector<SDValue> Ops({Chain});
4419
4420 // Add a redundant copy of the callee global which will not be legalized, as
4421 // we need direct access to the callee later.
4423 const GlobalValue *GV = GSD->getGlobal();
4424 Ops.push_back(Callee);
4425 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
4426 } else {
4427 if (IsTailCall) {
4428 // isEligibleForTailCallOptimization considered whether the call target is
4429 // divergent, but we may still end up with a uniform value in a VGPR.
4430 // Insert a readfirstlane just in case.
4431 SDValue ReadFirstLaneID =
4432 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4433
4434 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4435 if (TokenGlue)
4436 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
4437 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
4438 ReadfirstlaneArgs);
4439 }
4440
4441 Ops.push_back(Callee);
4442 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
4443 }
4444
4445 if (IsTailCall) {
4446 // Each tail call may have to adjust the stack by a different amount, so
4447 // this information must travel along with the operation for eventual
4448 // consumption by emitEpilogue.
4449 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4450 }
4451
4452 if (IsChainCallConv)
4453 llvm::append_range(Ops, ChainCallSpecialArgs);
4454
4455 // Add argument registers to the end of the list so that they are known live
4456 // into the call.
4457 for (auto &[Reg, Val] : RegsToPass)
4458 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
4459
4460 // Add a register mask operand representing the call-preserved registers.
4461 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4462 assert(Mask && "Missing call preserved mask for calling convention");
4463 Ops.push_back(DAG.getRegisterMask(Mask));
4464
4465 if (SDValue Token = CLI.ConvergenceControlToken) {
4467 GlueOps.push_back(Token);
4468 if (InGlue)
4469 GlueOps.push_back(InGlue);
4470
4471 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
4472 MVT::Glue, GlueOps),
4473 0);
4474 }
4475
4476 if (InGlue)
4477 Ops.push_back(InGlue);
4478
4479 // If we're doing a tail call, use a TC_RETURN here rather than an
4480 // actual call instruction.
4481 if (IsTailCall) {
4482 MFI.setHasTailCall();
4483 unsigned OPC = AMDGPUISD::TC_RETURN;
4484 switch (CallConv) {
4486 OPC = AMDGPUISD::TC_RETURN_GFX;
4487 break;
4490 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4491 : AMDGPUISD::TC_RETURN_CHAIN;
4492 break;
4493 }
4494
4495 // If the caller is a whole wave function, we need to use a special opcode
4496 // so we can patch up EXEC.
4497 if (Info->isWholeWaveFunction())
4498 OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;
4499
4500 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4501 }
4502
4503 // Returns a chain and a flag for retval copy to use.
4504 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4505 Chain = Call.getValue(0);
4506 InGlue = Call.getValue(1);
4507
4508 uint64_t CalleePopBytes = NumBytes;
4509 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4510 if (!Ins.empty())
4511 InGlue = Chain.getValue(1);
4512
4513 // Handle result values, copying them out of physregs into vregs that we
4514 // return.
4515 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4516 InVals, /*IsThisReturn=*/false, SDValue());
4517}
4518
4519// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4520// except for:
4521 // 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4522 // 2. Scale size, where scale = wave-reduction(alloca-size) * wave-size
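// For example, on a wave64 subtarget a uniform 16-byte alloca bumps the stack
// pointer by 16 << 6 = 1024 bytes of scratch, since each lane's private stack
// is interleaved across the wave; a divergent size is first reduced to the
// wave-wide maximum.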
4524 SelectionDAG &DAG) const {
4525 const MachineFunction &MF = DAG.getMachineFunction();
4527
4528 SDLoc dl(Op);
4529 EVT VT = Op.getValueType();
4530 SDValue Chain = Op.getOperand(0);
4531 Register SPReg = Info->getStackPtrOffsetReg();
4532
4533 // Chain the dynamic stack allocation so that it doesn't modify the stack
4534 // pointer when other instructions are using the stack.
4535 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4536
4537 SDValue Size = Op.getOperand(1);
4538 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4539 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4540
4541 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4542 assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
4543 "Stack grows upwards for AMDGPU");
4544
4545 Chain = BaseAddr.getValue(1);
4546 Align StackAlign = TFL->getStackAlign();
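// For over-aligned allocas, round the base address up to the requested
// alignment scaled by the wave size (addr = (addr + mask) & ~mask), since
// per-lane stack slots are interleaved across the wave.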
4547 if (Alignment > StackAlign) {
4548 uint64_t ScaledAlignment = Alignment.value()
4549 << Subtarget->getWavefrontSizeLog2();
4550 uint64_t StackAlignMask = ScaledAlignment - 1;
4551 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4552 DAG.getConstant(StackAlignMask, dl, VT));
4553 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4554 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4555 }
4556
4557 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4558 SDValue NewSP;
4559 if (isa<ConstantSDNode>(Size)) {
4560 // For constant sized alloca, scale alloca size by wave-size
4561 SDValue ScaledSize = DAG.getNode(
4562 ISD::SHL, dl, VT, Size,
4563 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4564 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4565 } else {
4566 // For dynamic sized alloca, perform wave-wide reduction to get max of
4567 // alloca size(divergent) and then scale it by wave-size
4568 SDValue WaveReduction =
4569 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4570 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4571 Size, DAG.getConstant(0, dl, MVT::i32));
4572 SDValue ScaledSize = DAG.getNode(
4573 ISD::SHL, dl, VT, Size,
4574 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4575 NewSP =
4576 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4577 SDValue ReadFirstLaneID =
4578 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4579 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4580 NewSP);
4581 }
4582
4583 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4584 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4585
4586 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4587}
4588
4590 if (Op.getValueType() != MVT::i32)
4591 return Op; // Defer to cannot select error.
4592
4594 SDLoc SL(Op);
4595
4596 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4597
4598 // Convert from wave uniform to swizzled vector address. This should protect
4599 // from any edge cases where the stacksave result isn't directly used with
4600 // stackrestore.
4601 SDValue VectorAddress =
4602 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4603 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4604}
4605
4607 SelectionDAG &DAG) const {
4608 SDLoc SL(Op);
4609 assert(Op.getValueType() == MVT::i32);
4610
4611 uint32_t BothRoundHwReg =
4613 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4614
4615 SDValue IntrinID =
4616 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4617 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4618 Op.getOperand(0), IntrinID, GetRoundBothImm);
4619
4620 // There are two rounding modes, one for f32 and one for f64/f16. We only
4621 // report in the standard value range if both are the same.
4622 //
4623 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4624 // ties away from zero is not supported, and the other values are rotated by
4625 // 1.
4626 //
4627 // If the two rounding modes are not the same, report a target defined value.
4628
4629 // Mode register rounding mode fields:
4630 //
4631 // [1:0] Single-precision round mode.
4632 // [3:2] Double/Half-precision round mode.
4633 //
4634 // 0 = nearest even, 1 = +infinity, 2 = -infinity, 3 = toward zero.
4635 //
4636 //                 Hardware   Spec
4637 // Toward-0            3        0
4638 // Nearest Even        0        1
4639 // +Inf                1        2
4640 // -Inf                2        3
4641 // NearestAway0       N/A       4
4642 //
4643 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4644 // table we can index by the raw hardware mode.
4645 //
4646 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
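// For example, if both fields are round-to-nearest-even the raw mode is
// 0b0000, selecting table entry 0, which holds the standard FLT_ROUNDS
// value 1 (round to nearest).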
4647
4648 SDValue BitTable =
4649 DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64);
4650
4651 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4652 SDValue RoundModeTimesNumBits =
4653 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4654
4655 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4656 // knew only one mode was demanded.
4657 SDValue TableValue =
4658 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4659 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4660
4661 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4662 SDValue TableEntry =
4663 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4664
4665 // There's a gap in the 4-bit encoded table and actual enum values, so offset
4666 // if it's an extended value.
4667 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4668 SDValue IsStandardValue =
4669 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4670 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4671 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4672 TableEntry, EnumOffset);
4673
4674 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4675}
4676
4678 SelectionDAG &DAG) const {
4679 SDLoc SL(Op);
4680
4681 SDValue NewMode = Op.getOperand(1);
4682 assert(NewMode.getValueType() == MVT::i32);
4683
4684 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4685 // hardware MODE.fp_round values.
4686 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4687 uint32_t ClampedVal = std::min(
4688 static_cast<uint32_t>(ConstMode->getZExtValue()),
4690 NewMode = DAG.getConstant(
4691 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4692 } else {
4693 // If we know the input can only be one of the supported standard modes in
4694 // the range 0-3, we can use a simplified mapping to hardware values.
4695 KnownBits KB = DAG.computeKnownBits(NewMode);
4696 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4697 // The supported standard values are 0-3. The extended values start at 8. We
4698 // need to offset by 4 if the value is in the extended range.
4699
4700 if (UseReducedTable) {
4701 // Truncate to the low 32-bits.
4702 SDValue BitTable = DAG.getConstant(
4703 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4704
4705 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4706 SDValue RoundModeTimesNumBits =
4707 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4708
4709 NewMode =
4710 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4711
4712 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4713 // the table extracted bits into inline immediates.
4714 } else {
4715 // table_index = umin(value, value - 4)
4716 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
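// e.g. a standard value 2 yields umin(2, 0xFFFFFFFE) = 2, while an extended
// value 8 yields umin(8, 4) = 4, so the extended values map to table entries
// starting at index 4.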
4717 SDValue BitTable =
4718 DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
4719
4720 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4721 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4722 SDValue IndexVal =
4723 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4724
4725 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4726 SDValue RoundModeTimesNumBits =
4727 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4728
4729 SDValue TableValue =
4730 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4731 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4732
4733 // No need to mask out the high bits since the setreg will ignore them
4734 // anyway.
4735 NewMode = TruncTable;
4736 }
4737
4738 // Insert a readfirstlane in case the value is in a VGPR. We could do this
4739 // earlier and keep more operations scalar, but that interferes with
4740 // combining the source.
4741 SDValue ReadFirstLaneID =
4742 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4743 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4744 ReadFirstLaneID, NewMode);
4745 }
4746
4747 // N.B. The setreg will be later folded into s_round_mode on supported
4748 // targets.
4749 SDValue IntrinID =
4750 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4751 uint32_t BothRoundHwReg =
4753 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4754
4755 SDValue SetReg =
4756 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4757 IntrinID, RoundBothImm, NewMode);
4758
4759 return SetReg;
4760}
4761
4763 if (Op->isDivergent() &&
4764 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4765 // Cannot do I$ prefetch with divergent pointer.
4766 return SDValue();
4767
4768 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4772 break;
4774 if (Subtarget->hasSafeSmemPrefetch())
4775 break;
4776 [[fallthrough]];
4777 default:
4778 return SDValue();
4779 }
4780
4781 // I$ prefetch
4782 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4783 return SDValue();
4784
4785 return Op;
4786}
4787
4788// Work around DAG legality rules only based on the result type.
4790 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4791 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4792 EVT SrcVT = Src.getValueType();
4793
4794 if (SrcVT.getScalarType() != MVT::bf16)
4795 return Op;
4796
4797 SDLoc SL(Op);
4798 SDValue BitCast =
4799 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4800
4801 EVT DstVT = Op.getValueType();
4802 if (IsStrict)
4803 llvm_unreachable("Need STRICT_BF16_TO_FP");
4804
4805 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4806}
4807
4809 SDLoc SL(Op);
4810 if (Op.getValueType() != MVT::i64)
4811 return Op;
4812
4813 uint32_t ModeHwReg =
4815 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4816 uint32_t TrapHwReg =
4818 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4819
4820 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4821 SDValue IntrinID =
4822 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4823 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4824 Op.getOperand(0), IntrinID, ModeHwRegImm);
4825 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4826 Op.getOperand(0), IntrinID, TrapHwRegImm);
4827 SDValue TokenReg =
4828 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4829 GetTrapReg.getValue(1));
4830
4831 SDValue CvtPtr =
4832 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4833 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4834
4835 return DAG.getMergeValues({Result, TokenReg}, SL);
4836}
4837
4839 SDLoc SL(Op);
4840 if (Op.getOperand(1).getValueType() != MVT::i64)
4841 return Op;
4842
4843 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4844 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4845 DAG.getConstant(0, SL, MVT::i32));
4846 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4847 DAG.getConstant(1, SL, MVT::i32));
4848
4849 SDValue ReadFirstLaneID =
4850 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4851 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4852 ReadFirstLaneID, NewModeReg);
4853 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4854 ReadFirstLaneID, NewTrapReg);
4855
4856 unsigned ModeHwReg =
4858 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4859 unsigned TrapHwReg =
4861 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4862
4863 SDValue IntrinID =
4864 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4865 SDValue SetModeReg =
4866 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4867 IntrinID, ModeHwRegImm, NewModeReg);
4868 SDValue SetTrapReg =
4869 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4870 IntrinID, TrapHwRegImm, NewTrapReg);
4871 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4872}
4873
4875 const MachineFunction &MF) const {
4876 const Function &Fn = MF.getFunction();
4877
4879 .Case("m0", AMDGPU::M0)
4880 .Case("exec", AMDGPU::EXEC)
4881 .Case("exec_lo", AMDGPU::EXEC_LO)
4882 .Case("exec_hi", AMDGPU::EXEC_HI)
4883 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4884 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4885 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4886 .Default(Register());
4887 if (!Reg)
4888 return Reg;
4889
4890 if (!Subtarget->hasFlatScrRegister() &&
4891 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4892 Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
4893 "\" for subtarget."));
4894 }
4895
4896 switch (Reg) {
4897 case AMDGPU::M0:
4898 case AMDGPU::EXEC_LO:
4899 case AMDGPU::EXEC_HI:
4900 case AMDGPU::FLAT_SCR_LO:
4901 case AMDGPU::FLAT_SCR_HI:
4902 if (VT.getSizeInBits() == 32)
4903 return Reg;
4904 break;
4905 case AMDGPU::EXEC:
4906 case AMDGPU::FLAT_SCR:
4907 if (VT.getSizeInBits() == 64)
4908 return Reg;
4909 break;
4910 default:
4911 llvm_unreachable("missing register type checking");
4912 }
4913
4915 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4916}
4917
4918// If kill is not the last instruction, split the block so kill is always a
4919// proper terminator.
4922 MachineBasicBlock *BB) const {
4923 MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
4925 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4926 return SplitBB;
4927}
4928
4929 // Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4930// \p MI will be the only instruction in the loop body block. Otherwise, it will
4931// be the first instruction in the remainder block.
4932//
4933/// \returns { LoopBody, Remainder }
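/// The resulting control flow is roughly:
///
///   MBB --> LoopBB --> RemainderBB
///            ^    |
///            +----+        (LoopBB branches back to itself)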
4934static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4936 MachineFunction *MF = MBB.getParent();
4938
4939 // To insert the loop we need to split the block. Move everything after this
4940 // point to a new block, and insert a new empty block between the two.
4942 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4944 ++MBBI;
4945
4946 MF->insert(MBBI, LoopBB);
4947 MF->insert(MBBI, RemainderBB);
4948
4949 LoopBB->addSuccessor(LoopBB);
4950 LoopBB->addSuccessor(RemainderBB);
4951
4952 // Move the rest of the block into a new block.
4953 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4954
4955 if (InstInLoop) {
4956 auto Next = std::next(I);
4957
4958 // Move instruction to loop body.
4959 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4960
4961 // Move the rest of the block.
4962 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4963 } else {
4964 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4965 }
4966
4967 MBB.addSuccessor(LoopBB);
4968
4969 return std::pair(LoopBB, RemainderBB);
4970}
4971
4972/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4974 MachineBasicBlock *MBB = MI.getParent();
4976 auto I = MI.getIterator();
4977 auto E = std::next(I);
4978
4979 // clang-format off
4980 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4981 .addImm(0);
4982 // clang-format on
4983
4984 MIBundleBuilder Bundler(*MBB, I, E);
4985 finalizeBundle(*MBB, Bundler.begin());
4986}
4987
4990 MachineBasicBlock *BB) const {
4991 const DebugLoc &DL = MI.getDebugLoc();
4992
4994
4996
4997 // Apparently kill flags are only valid if the def is in the same block?
4998 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4999 Src->setIsKill(false);
5000
5001 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
5002
5003 MachineBasicBlock::iterator I = LoopBB->end();
5004
5005 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
5007
5008 // Clear TRAP_STS.MEM_VIOL
5009 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
5010 .addImm(0)
5011 .addImm(EncodedReg);
5012
5014
5015 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5016
5017 // Load and check TRAP_STS.MEM_VIOL
5018 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
5019 .addImm(EncodedReg);
5020
5021 // FIXME: Do we need to use an isel pseudo that may clobber scc?
5022 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5023 .addReg(Reg, RegState::Kill)
5024 .addImm(0);
5025 // clang-format off
5026 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5027 .addMBB(LoopBB);
5028 // clang-format on
5029
5030 return RemainderBB;
5031}
5032
5033// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
5034// wavefront. If the value is uniform and just happens to be in a VGPR, this
5035// will only do one iteration. In the worst case, this will loop 64 times.
5036//
5037// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
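//
// The emitted loop is roughly:
//   loop:
//     CurrentIdx = v_readfirstlane_b32 Idx
//     Cond       = v_cmp_eq_u32 CurrentIdx, Idx
//     Saved      = exec; exec &= Cond   ; s_and_saveexec: lanes that share
//                                       ; this index stay active
//     m0 / SGPRIdx = CurrentIdx (+ Offset)
//     <indexed access is inserted here by the caller>
//     exec ^= Saved                     ; switch to the not-yet-handled lanes
//     s_cbranch_execnz loop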
5040 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
5041 const DebugLoc &DL, const MachineOperand &Idx,
5042 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
5043 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
5044 Register &SGPRIdxReg) {
5045
5046 MachineFunction *MF = OrigBB.getParent();
5047 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5048 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5051
5052 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5053 Register PhiExec = MRI.createVirtualRegister(BoolRC);
5054 Register NewExec = MRI.createVirtualRegister(BoolRC);
5055 Register CurrentIdxReg =
5056 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5057 Register CondReg = MRI.createVirtualRegister(BoolRC);
5058
5059 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
5060 .addReg(InitReg)
5061 .addMBB(&OrigBB)
5062 .addReg(ResultReg)
5063 .addMBB(&LoopBB);
5064
5065 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
5066 .addReg(InitSaveExecReg)
5067 .addMBB(&OrigBB)
5068 .addReg(NewExec)
5069 .addMBB(&LoopBB);
5070
5071 // Read the next variant <- also loop target.
5072 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5073 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
5074
5075 // Compare the just read M0 value to all possible Idx values.
5076 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5077 .addReg(CurrentIdxReg)
5078 .addReg(Idx.getReg(), 0, Idx.getSubReg());
5079
5080 // Update EXEC, save the original EXEC value to VCC.
5081 BuildMI(LoopBB, I, DL, TII->get(LMC.AndSaveExecOpc), NewExec)
5082 .addReg(CondReg, RegState::Kill);
5083
5084 MRI.setSimpleHint(NewExec, CondReg);
5085
5086 if (UseGPRIdxMode) {
5087 if (Offset == 0) {
5088 SGPRIdxReg = CurrentIdxReg;
5089 } else {
5090 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5091 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5092 .addReg(CurrentIdxReg, RegState::Kill)
5093 .addImm(Offset);
5094 }
5095 } else {
5096 // Move index from VCC into M0
5097 if (Offset == 0) {
5098 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5099 .addReg(CurrentIdxReg, RegState::Kill);
5100 } else {
5101 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5102 .addReg(CurrentIdxReg, RegState::Kill)
5103 .addImm(Offset);
5104 }
5105 }
5106
5107 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
5108 MachineInstr *InsertPt =
5109 BuildMI(LoopBB, I, DL, TII->get(LMC.XorTermOpc), LMC.ExecReg)
5110 .addReg(LMC.ExecReg)
5111 .addReg(NewExec);
5112
5113 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
5114 // s_cbranch_scc0?
5115
5116 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
5117 // clang-format off
5118 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5119 .addMBB(&LoopBB);
5120 // clang-format on
5121
5122 return InsertPt->getIterator();
5123}
5124
5125 // This has slightly sub-optimal regalloc when the source vector is killed by
5126 // the read. The register allocator does not understand that the kill is
5127 // per-workitem, so the source is kept alive for the whole loop; as a result we
5128 // do not re-use a subregister from it and use one more VGPR than necessary.
5129 // That VGPR was saved when this was expanded after register allocation.
5132 unsigned InitResultReg, unsigned PhiReg, int Offset,
5133 bool UseGPRIdxMode, Register &SGPRIdxReg) {
5134 MachineFunction *MF = MBB.getParent();
5135 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5136 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5138 const DebugLoc &DL = MI.getDebugLoc();
5140
5141 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
5142 Register DstReg = MI.getOperand(0).getReg();
5143 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
5144 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
5146
5147 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
5148
5149 // Save the EXEC mask
5150 // clang-format off
5151 BuildMI(MBB, I, DL, TII->get(LMC.MovOpc), SaveExec)
5152 .addReg(LMC.ExecReg);
5153 // clang-format on
5154
5155 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
5156
5157 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5158
5159 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
5160 InitResultReg, DstReg, PhiReg, TmpExec,
5161 Offset, UseGPRIdxMode, SGPRIdxReg);
5162
5163 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
5165 ++MBBI;
5166 MF->insert(MBBI, LandingPad);
5167 LoopBB->removeSuccessor(RemainderBB);
5168 LandingPad->addSuccessor(RemainderBB);
5169 LoopBB->addSuccessor(LandingPad);
5170 MachineBasicBlock::iterator First = LandingPad->begin();
5171 // clang-format off
5172 BuildMI(*LandingPad, First, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
5173 .addReg(SaveExec);
5174 // clang-format on
5175
5176 return InsPt;
5177}
5178
5179// Returns subreg index, offset
5180static std::pair<unsigned, int>
5181computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
5182 const TargetRegisterClass *SuperRC, unsigned VecReg,
5183 int Offset) {
5184 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
5185
5186 // Skip out of bounds offsets, or else we would end up using an undefined
5187 // register.
5188 if (Offset >= NumElts || Offset < 0)
5189 return std::pair(AMDGPU::sub0, Offset);
5190
5191 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
5192}
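// Worked example (illustrative only): for a 128-bit super-register class
// (4 x 32-bit channels), Offset == 2 yields (sub2, 0), folding the access into
// a static subregister, while Offset == 7 is out of bounds and yields
// (sub0, 7), leaving the offset to be applied dynamically to the index.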
5193
5194static void setM0ToIndexFromSGPR(const SIInstrInfo *TII,
5195                                 MachineRegisterInfo &MRI, MachineInstr &MI,
5196                                 int Offset) {
5197 MachineBasicBlock *MBB = MI.getParent();
5198 const DebugLoc &DL = MI.getDebugLoc();
5199  MachineBasicBlock::iterator I(&MI);
5200
5201 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5202
5203 assert(Idx->getReg() != AMDGPU::NoRegister);
5204
5205 if (Offset == 0) {
5206 // clang-format off
5207 BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5208 .add(*Idx);
5209 // clang-format on
5210 } else {
5211 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5212 .add(*Idx)
5213 .addImm(Offset);
5214 }
5215}
5216
5217static Register getIndirectSGPRIdx(const SIInstrInfo *TII,
5218                                   MachineRegisterInfo &MRI, MachineInstr &MI,
5219                                   int Offset) {
5220 MachineBasicBlock *MBB = MI.getParent();
5221 const DebugLoc &DL = MI.getDebugLoc();
5222  MachineBasicBlock::iterator I(&MI);
5223
5224 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5225
5226 if (Offset == 0)
5227 return Idx->getReg();
5228
5229 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5230 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
5231 .add(*Idx)
5232 .addImm(Offset);
5233 return Tmp;
5234}
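// Usage sketch (assumed register names): with a non-zero offset this costs one
// scalar add, e.g. for Offset == 3:
//   %tmp:sreg_32_xm0 = S_ADD_I32 %idx, 3
// and the returned %tmp is what callers then use as the vector index.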
5235
5236static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
5237                                          MachineBasicBlock &MBB,
5238                                          const GCNSubtarget &ST) {
5239 const SIInstrInfo *TII = ST.getInstrInfo();
5240 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5241 MachineFunction *MF = MBB.getParent();
5242  MachineRegisterInfo &MRI = MF->getRegInfo();
5243
5244 Register Dst = MI.getOperand(0).getReg();
5245 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5246 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
5247 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5248
5249 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
5250 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5251
5252 unsigned SubReg;
5253 std::tie(SubReg, Offset) =
5254 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
5255
5256 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5257
5258 // Check for a SGPR index.
5259 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5260    MachineBasicBlock::iterator I(&MI);
5261    const DebugLoc &DL = MI.getDebugLoc();
5262
5263 if (UseGPRIdxMode) {
5264 // TODO: Look at the uses to avoid the copy. This may require rescheduling
5265 // to avoid interfering with other uses, so probably requires a new
5266 // optimization pass.
5267      Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
5268
5269 const MCInstrDesc &GPRIDXDesc =
5270 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5271 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5272 .addReg(SrcReg)
5273 .addReg(Idx)
5274 .addImm(SubReg);
5275 } else {
5276      setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
5277
5278 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5279 .addReg(SrcReg, 0, SubReg)
5280 .addReg(SrcReg, RegState::Implicit);
5281 }
5282
5283 MI.eraseFromParent();
5284
5285 return &MBB;
5286 }
5287
5288 // Control flow needs to be inserted if indexing with a VGPR.
5289 const DebugLoc &DL = MI.getDebugLoc();
5290  MachineBasicBlock::iterator I(&MI);
5291
5292 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5293 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5294
5295 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
5296
5297 Register SGPRIdxReg;
5298 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
5299 UseGPRIdxMode, SGPRIdxReg);
5300
5301 MachineBasicBlock *LoopBB = InsPt->getParent();
5302
5303 if (UseGPRIdxMode) {
5304 const MCInstrDesc &GPRIDXDesc =
5305 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5306
5307 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5308 .addReg(SrcReg)
5309 .addReg(SGPRIdxReg)
5310 .addImm(SubReg);
5311 } else {
5312 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5313 .addReg(SrcReg, 0, SubReg)
5314 .addReg(SrcReg, RegState::Implicit);
5315 }
5316
5317 MI.eraseFromParent();
5318
5319 return LoopBB;
5320}
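// Illustrative sketch (not from the original source): a divergent extract such
// as
//   %elt = extractelement <16 x float> %vec, i32 %lane_idx
// takes the second path above: loadM0FromVGPR builds a waterfall loop over the
// distinct values of %lane_idx, and each trip emits either V_MOVRELS_B32 (M0
// relative addressing) or the GPR-index pseudo for the lanes that matched.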
5321
5322static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
5323                                          MachineBasicBlock &MBB,
5324                                          const GCNSubtarget &ST) {
5325 const SIInstrInfo *TII = ST.getInstrInfo();
5326 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5327 MachineFunction *MF = MBB.getParent();
5328  MachineRegisterInfo &MRI = MF->getRegInfo();
5329
5330 Register Dst = MI.getOperand(0).getReg();
5331 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
5332 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5333 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
5334 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5335 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
5336 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5337
5338 // This can be an immediate, but will be folded later.
5339 assert(Val->getReg());
5340
5341 unsigned SubReg;
5342 std::tie(SubReg, Offset) =
5343 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
5344 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5345
5346 if (Idx->getReg() == AMDGPU::NoRegister) {
5347    MachineBasicBlock::iterator I(&MI);
5348    const DebugLoc &DL = MI.getDebugLoc();
5349
5350 assert(Offset == 0);
5351
5352 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
5353 .add(*SrcVec)
5354 .add(*Val)
5355 .addImm(SubReg);
5356
5357 MI.eraseFromParent();
5358 return &MBB;
5359 }
5360
5361 // Check for a SGPR index.
5362 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5363    MachineBasicBlock::iterator I(&MI);
5364    const DebugLoc &DL = MI.getDebugLoc();
5365
5366 if (UseGPRIdxMode) {
5367      Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
5368
5369 const MCInstrDesc &GPRIDXDesc =
5370 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5371 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5372 .addReg(SrcVec->getReg())
5373 .add(*Val)
5374 .addReg(Idx)
5375 .addImm(SubReg);
5376 } else {
5377      setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
5378
5379 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5380 TRI.getRegSizeInBits(*VecRC), 32, false);
5381 BuildMI(MBB, I, DL, MovRelDesc, Dst)
5382 .addReg(SrcVec->getReg())
5383 .add(*Val)
5384 .addImm(SubReg);
5385 }
5386 MI.eraseFromParent();
5387 return &MBB;
5388 }
5389
5390 // Control flow needs to be inserted if indexing with a VGPR.
5391 if (Val->isReg())
5392 MRI.clearKillFlags(Val->getReg());
5393
5394 const DebugLoc &DL = MI.getDebugLoc();
5395
5396 Register PhiReg = MRI.createVirtualRegister(VecRC);
5397
5398 Register SGPRIdxReg;
5399 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
5400 UseGPRIdxMode, SGPRIdxReg);
5401 MachineBasicBlock *LoopBB = InsPt->getParent();
5402
5403 if (UseGPRIdxMode) {
5404 const MCInstrDesc &GPRIDXDesc =
5405 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5406
5407 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5408 .addReg(PhiReg)
5409 .add(*Val)
5410 .addReg(SGPRIdxReg)
5411 .addImm(SubReg);
5412 } else {
5413 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5414 TRI.getRegSizeInBits(*VecRC), 32, false);
5415 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5416 .addReg(PhiReg)
5417 .add(*Val)
5418 .addImm(SubReg);
5419 }
5420
5421 MI.eraseFromParent();
5422 return LoopBB;
5423}
5424
5425static MachineBasicBlock *Expand64BitScalarArithmetic(MachineInstr &MI,
5426                                                      MachineBasicBlock *BB) {
5427 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5428 // For GFX12, we emit s_add_u64 and s_sub_u64.
5429 MachineFunction *MF = BB->getParent();
5430 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5431 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5432  MachineRegisterInfo &MRI = MF->getRegInfo();
5433  const DebugLoc &DL = MI.getDebugLoc();
5434 MachineOperand &Dest = MI.getOperand(0);
5435 MachineOperand &Src0 = MI.getOperand(1);
5436 MachineOperand &Src1 = MI.getOperand(2);
5437 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5438 if (ST.hasScalarAddSub64()) {
5439 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5440 // clang-format off
5441 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5442 .add(Src0)
5443 .add(Src1);
5444 // clang-format on
5445 } else {
5446 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5447 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5448
5449 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5450 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5451
5452 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5453 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5454 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5455 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5456
5457 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5458 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5459 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5460 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5461
5462 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5463 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5464 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
5465 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
5466 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5467 .addReg(DestSub0)
5468 .addImm(AMDGPU::sub0)
5469 .addReg(DestSub1)
5470 .addImm(AMDGPU::sub1);
5471 }
5472 MI.eraseFromParent();
5473 return BB;
5474}
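// Rough expansion sketch (assumed operand names) for subtargets without
// s_add_u64/s_sub_u64, matching the add case generated above:
//   %lo  = S_ADD_U32  %a.sub0, %b.sub0   ; sets SCC on carry-out
//   %hi  = S_ADDC_U32 %a.sub1, %b.sub1   ; consumes SCC as carry-in
//   %dst = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1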
5475
5476static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
5477  switch (Opc) {
5478 case AMDGPU::S_MIN_U32:
5479 return std::numeric_limits<uint32_t>::max();
5480 case AMDGPU::S_MIN_I32:
5481 return std::numeric_limits<int32_t>::max();
5482 case AMDGPU::S_MAX_U32:
5483 return std::numeric_limits<uint32_t>::min();
5484 case AMDGPU::S_MAX_I32:
5485 return std::numeric_limits<int32_t>::min();
5486 case AMDGPU::V_ADD_F32_e64: // -0.0
5487 return 0x80000000;
5488 case AMDGPU::V_SUB_F32_e64: // +0.0
5489 return 0x0;
5490 case AMDGPU::S_ADD_I32:
5491 case AMDGPU::S_SUB_I32:
5492 case AMDGPU::S_OR_B32:
5493 case AMDGPU::S_XOR_B32:
5494 return std::numeric_limits<uint32_t>::min();
5495 case AMDGPU::S_AND_B32:
5496 return std::numeric_limits<uint32_t>::max();
5497 case AMDGPU::V_MIN_F32_e64:
5498 case AMDGPU::V_MAX_F32_e64:
5499 return 0x7fc00000; // qNAN
5500 default:
5501    llvm_unreachable(
5502        "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5503 }
5504}
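// For example, per the table above a wave-wide unsigned-min reduction seeds
// its accumulator with UINT32_MAX so the first active lane's value always
// replaces it, and an f32 add reduction seeds with -0.0 (0x80000000), the
// floating-point additive identity.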
5505
5506static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc) {
5507  switch (Opc) {
5508 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5509 return std::numeric_limits<uint64_t>::max();
5510 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5511 return std::numeric_limits<int64_t>::max();
5512 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5513 return std::numeric_limits<uint64_t>::min();
5514 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5515 return std::numeric_limits<int64_t>::min();
5516 case AMDGPU::S_ADD_U64_PSEUDO:
5517 case AMDGPU::S_SUB_U64_PSEUDO:
5518 case AMDGPU::S_OR_B64:
5519 case AMDGPU::S_XOR_B64:
5520 return std::numeric_limits<uint64_t>::min();
5521 case AMDGPU::S_AND_B64:
5522 return std::numeric_limits<uint64_t>::max();
5523 default:
5524    llvm_unreachable(
5525        "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5526 }
5527}
5528
5529static bool is32bitWaveReduceOperation(unsigned Opc) {
5530 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5531 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5532 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5533 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5534 Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
5535 Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
5536 Opc == AMDGPU::V_SUB_F32_e64;
5537}
5538
5539static bool isFloatingPointWaveReduceOperation(unsigned Opc) {
5540  return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
5541 Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64;
5542}
5543
5544static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5545                                          MachineBasicBlock &BB,
5546                                          const GCNSubtarget &ST,
5547                                          unsigned Opc) {
5548  MachineRegisterInfo &MRI = BB.getParent()->getRegInfo();
5549  const SIRegisterInfo *TRI = ST.getRegisterInfo();
5550 const DebugLoc &DL = MI.getDebugLoc();
5551 const SIInstrInfo *TII = ST.getInstrInfo();
5552
5553 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5554 Register SrcReg = MI.getOperand(1).getReg();
5555 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5556 Register DstReg = MI.getOperand(0).getReg();
5557 MachineBasicBlock *RetBB = nullptr;
5558 if (isSGPR) {
5559 switch (Opc) {
5560 case AMDGPU::S_MIN_U32:
5561 case AMDGPU::S_MIN_I32:
5562 case AMDGPU::V_MIN_F32_e64:
5563 case AMDGPU::S_MAX_U32:
5564 case AMDGPU::S_MAX_I32:
5565 case AMDGPU::V_MAX_F32_e64:
5566 case AMDGPU::S_AND_B32:
5567 case AMDGPU::S_OR_B32: {
5568 // Idempotent operations.
5569 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5570 RetBB = &BB;
5571 break;
5572 }
5573 case AMDGPU::V_CMP_LT_U64_e64: // umin
5574 case AMDGPU::V_CMP_LT_I64_e64: // min
5575 case AMDGPU::V_CMP_GT_U64_e64: // umax
5576 case AMDGPU::V_CMP_GT_I64_e64: // max
5577 case AMDGPU::S_AND_B64:
5578 case AMDGPU::S_OR_B64: {
5579 // Idempotent operations.
5580 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
5581 RetBB = &BB;
5582 break;
5583 }
5584 case AMDGPU::S_XOR_B32:
5585 case AMDGPU::S_XOR_B64:
5586 case AMDGPU::S_ADD_I32:
5587 case AMDGPU::S_ADD_U64_PSEUDO:
5588 case AMDGPU::V_ADD_F32_e64:
5589 case AMDGPU::S_SUB_I32:
5590 case AMDGPU::S_SUB_U64_PSEUDO:
5591 case AMDGPU::V_SUB_F32_e64: {
5592 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5593 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5594 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5595 Register NumActiveLanes =
5596 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5597
5598 bool IsWave32 = ST.isWave32();
5599 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5600 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5601 unsigned BitCountOpc =
5602 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5603
5604 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5605
5606 auto NewAccumulator =
5607 BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
5608 .addReg(ExecMask);
5609
5610 switch (Opc) {
5611 case AMDGPU::S_XOR_B32:
5612 case AMDGPU::S_XOR_B64: {
5613 // Performing an XOR operation on a uniform value
5614 // depends on the parity of the number of active lanes.
5615        // For even parity the result will be 0; for odd
5616 // parity the result will be the same as the input value.
5617 Register ParityRegister =
5618 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5619
5620 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5621 .addReg(NewAccumulator->getOperand(0).getReg())
5622 .addImm(1)
5623 .setOperandDead(3); // Dead scc
5624 if (Opc == AMDGPU::S_XOR_B32) {
5625 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5626 .addReg(SrcReg)
5627 .addReg(ParityRegister);
5628 } else {
5629 Register DestSub0 =
5630 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5631 Register DestSub1 =
5632 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5633
5634 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5635 const TargetRegisterClass *SrcSubRC =
5636 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5637
5638 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5639 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5640 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5641 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5642
5643 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5644 .add(Op1L)
5645 .addReg(ParityRegister);
5646
5647 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5648 .add(Op1H)
5649 .addReg(ParityRegister);
5650
5651 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5652 .addReg(DestSub0)
5653 .addImm(AMDGPU::sub0)
5654 .addReg(DestSub1)
5655 .addImm(AMDGPU::sub1);
5656 }
5657 break;
5658 }
5659 case AMDGPU::S_SUB_I32: {
5660 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5661
5662 // Take the negation of the source operand.
5663 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5664 .addImm(0)
5665 .addReg(SrcReg);
5666 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5667 .addReg(NegatedVal)
5668 .addReg(NewAccumulator->getOperand(0).getReg());
5669 break;
5670 }
5671 case AMDGPU::S_ADD_I32: {
5672 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5673 .addReg(SrcReg)
5674 .addReg(NewAccumulator->getOperand(0).getReg());
5675 break;
5676 }
5677 case AMDGPU::S_ADD_U64_PSEUDO:
5678 case AMDGPU::S_SUB_U64_PSEUDO: {
5679 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5680 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5681 Register Op1H_Op0L_Reg =
5682 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5683 Register Op1L_Op0H_Reg =
5684 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5685 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5686 Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5687 Register NegatedValLo =
5688 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5689 Register NegatedValHi =
5690 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5691
5692 const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
5693 const TargetRegisterClass *Src1SubRC =
5694 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5695
5696 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5697 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5698 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5699 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5700
5701 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5702 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5703 .addImm(0)
5704 .addReg(NewAccumulator->getOperand(0).getReg())
5705 .setOperandDead(3); // Dead scc
5706 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5707 .addReg(NegatedValLo)
5708 .addImm(31)
5709 .setOperandDead(3); // Dead scc
5710 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5711 .add(Op1L)
5712 .addReg(NegatedValHi);
5713 }
5714 Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5715 ? NegatedValLo
5716 : NewAccumulator->getOperand(0).getReg();
5717 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5718 .add(Op1L)
5719 .addReg(LowOpcode);
5720 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5721 .add(Op1L)
5722 .addReg(LowOpcode);
5723 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5724 .add(Op1H)
5725 .addReg(LowOpcode);
5726
5727 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5728 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5729 .addReg(CarryReg)
5730 .addReg(Op1H_Op0L_Reg)
5731 .setOperandDead(3); // Dead scc
5732
5733 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5734 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5735 .addReg(HiVal)
5736 .addReg(Op1L_Op0H_Reg)
5737 .setOperandDead(3); // Dead scc
5738 }
5739 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5740 .addReg(DestSub0)
5741 .addImm(AMDGPU::sub0)
5742 .addReg(DestSub1)
5743 .addImm(AMDGPU::sub1);
5744 break;
5745 }
5746 case AMDGPU::V_ADD_F32_e64:
5747 case AMDGPU::V_SUB_F32_e64: {
5748 Register ActiveLanesVreg =
5749 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5750 Register DstVreg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5751 // Get number of active lanes as a float val.
5752 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_CVT_F32_I32_e64),
5753 ActiveLanesVreg)
5754 .addReg(NewAccumulator->getOperand(0).getReg())
5755 .addImm(0) // clamp
5756 .addImm(0); // output-modifier
5757
5758 // Take negation of input for SUB reduction
5759 unsigned srcMod = Opc == AMDGPU::V_SUB_F32_e64 ? 1 : 0;
5760 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MUL_F32_e64), DstVreg)
5761 .addImm(srcMod) // src0 modifier
5762 .addReg(SrcReg)
5763 .addImm(0) // src1 modifier
5764 .addReg(ActiveLanesVreg)
5765 .addImm(0) // clamp
5766 .addImm(0); // output-mod
5767 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5768 .addReg(DstVreg);
5769 }
5770 }
5771 RetBB = &BB;
5772 }
5773 }
5774 } else {
5775    // TODO: Implement the DPP strategy and switch based on the immediate
5776    // strategy operand. For now, for all the cases (default, Iterative and
5777    // DPP) we use the iterative approach by default.
5778
5779    // To reduce the VGPR using the iterative approach, we need to iterate
5780    // over all the active lanes. Lowering consists of a ComputeLoop block,
5781    // which iterates over only the active lanes. We use a copy of the EXEC
5782    // register as the induction variable, and every active lane clears its
5783    // bit with bitset0 so that the next iteration picks the next active lane.
5784    MachineBasicBlock::iterator I = BB.end();
5785 Register SrcReg = MI.getOperand(1).getReg();
5786 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5787    bool isFPOp = isFloatingPointWaveReduceOperation(Opc);
5788
5789    // Create control flow for the loop:
5790    // split MI's machine basic block to form the loop.
5791 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
5792
5793 // Create virtual registers required for lowering.
5794 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5795 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5796 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5797 Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
5798 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5799 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5800 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5801 Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5802 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
5803
5804 bool IsWave32 = ST.isWave32();
5805 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5806 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5807
5808    // Create the initial values of the induction variable (from EXEC) and the
5809    // accumulator, and insert a branch to the newly created ComputeLoop block.
5810 BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
5811 if (is32BitOpc) {
5812      uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc);
5813      BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5814 .addImm(IdentityValue);
5815 } else {
5816      uint64_t IdentityValue = getIdentityValueFor64BitWaveReduction(Opc);
5817      BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
5818 .addImm(IdentityValue);
5819 }
5820 // clang-format off
5821 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5822 .addMBB(ComputeLoop);
5823 // clang-format on
5824
5825 // Start constructing ComputeLoop
5826 I = ComputeLoop->begin();
5827 auto Accumulator =
5828 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5829 .addReg(IdentityValReg)
5830 .addMBB(&BB);
5831 auto ActiveBits =
5832 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5833 .addReg(LoopIterator)
5834 .addMBB(&BB);
5835
5836 I = ComputeLoop->end();
5837 MachineInstr *NewAccumulator;
5838 // Perform the computations
5839 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5840 BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5841 .addReg(ActiveBitsReg);
5842 if (is32BitOpc) {
5843 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5844 LaneValueReg)
5845 .addReg(SrcReg)
5846 .addReg(FF1Reg);
5847 if (isFPOp) {
5848 Register LaneValVreg =
5849 MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
5850 Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
5851 // Get the Lane Value in VGPR to avoid the Constant Bus Restriction
5852 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32),
5853 LaneValVreg)
5854 .addReg(LaneValueReg);
5855 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
5856 .addImm(0) // src0 modifier
5857 .addReg(Accumulator->getOperand(0).getReg())
5858 .addImm(0) // src1 modifier
5859 .addReg(LaneValVreg)
5860 .addImm(0) // clamp
5861 .addImm(0); // omod
5862 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5863 TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5864 .addReg(DstVreg);
5865 } else {
5866 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5867 .addReg(Accumulator->getOperand(0).getReg())
5868 .addReg(LaneValueReg);
5869 }
5870 } else {
5871 Register LaneValueLoReg =
5872 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5873 Register LaneValueHiReg =
5874 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5875 Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5876 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5877 const TargetRegisterClass *SrcSubRC =
5878 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5879 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5880 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5881 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5882 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5883      // The lane value input should be in an SGPR.
5884 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5885 LaneValueLoReg)
5886 .add(Op1L)
5887 .addReg(FF1Reg);
5888 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5889 LaneValueHiReg)
5890 .add(Op1H)
5891 .addReg(FF1Reg);
5892 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5893 TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
5894 .addReg(LaneValueLoReg)
5895 .addImm(AMDGPU::sub0)
5896 .addReg(LaneValueHiReg)
5897 .addImm(AMDGPU::sub1);
5898 switch (Opc) {
5899 case AMDGPU::S_OR_B64:
5900 case AMDGPU::S_AND_B64:
5901 case AMDGPU::S_XOR_B64: {
5902 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5903 .addReg(Accumulator->getOperand(0).getReg())
5904 .addReg(LaneValue->getOperand(0).getReg())
5905 .setOperandDead(3); // Dead scc
5906 break;
5907 }
5908 case AMDGPU::V_CMP_GT_I64_e64:
5909 case AMDGPU::V_CMP_GT_U64_e64:
5910 case AMDGPU::V_CMP_LT_I64_e64:
5911 case AMDGPU::V_CMP_LT_U64_e64: {
5912 Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
5913 Register ComparisonResultReg =
5914 MRI.createVirtualRegister(WaveMaskRegClass);
5915 const TargetRegisterClass *VregClass = TRI->getVGPR64Class();
5916 const TargetRegisterClass *VSubRegClass =
5917 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
5918 Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
5919 MachineOperand SrcReg0Sub0 =
5920 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5921 VregClass, AMDGPU::sub0, VSubRegClass);
5922 MachineOperand SrcReg0Sub1 =
5923 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5924 VregClass, AMDGPU::sub1, VSubRegClass);
5925 BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
5926 AccumulatorVReg)
5927 .add(SrcReg0Sub0)
5928 .addImm(AMDGPU::sub0)
5929 .add(SrcReg0Sub1)
5930 .addImm(AMDGPU::sub1);
5931 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
5932 .addReg(LaneValue->getOperand(0).getReg())
5933 .addReg(AccumulatorVReg);
5934
5935 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
5936 BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
5937 .addReg(LaneMaskReg)
5938 .addReg(ActiveBitsReg);
5939
5940 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5941 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
5942 .addReg(LaneValue->getOperand(0).getReg())
5943 .addReg(Accumulator->getOperand(0).getReg());
5944 break;
5945 }
5946 case AMDGPU::S_ADD_U64_PSEUDO:
5947 case AMDGPU::S_SUB_U64_PSEUDO: {
5948 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5949 .addReg(Accumulator->getOperand(0).getReg())
5950 .addReg(LaneValue->getOperand(0).getReg());
5951 ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
5952 break;
5953 }
5954 }
5955 }
5956 // Manipulate the iterator to get the next active lane
5957 unsigned BITSETOpc =
5958 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5959 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5960 .addReg(FF1Reg)
5961 .addReg(ActiveBitsReg);
5962
5963 // Add phi nodes
5964 Accumulator.addReg(DstReg).addMBB(ComputeLoop);
5965 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
5966
5967    // Create the loop back-branch.
5968 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5969 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5970 .addReg(NewActiveBitsReg)
5971 .addImm(0);
5972 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5973 .addMBB(ComputeLoop);
5974
5975 RetBB = ComputeEnd;
5976 }
5977 MI.eraseFromParent();
5978 return RetBB;
5979}
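// Illustrative sketch (not part of the original source) of the divergent-input
// path above for a 32-bit scalar opcode, assuming wave32:
//
//   preheader:
//     %iter = S_MOV_B32 exec_lo
//     %id   = S_MOV_B32 <identity value for Opc>
//   ComputeLoop:
//     %acc    = PHI [%id, preheader], [%newacc, ComputeLoop]
//     %active = PHI [%iter, preheader], [%next, ComputeLoop]
//     %lane   = S_FF1_I32_B32 %active        ; lowest remaining active lane
//     %val    = V_READLANE_B32 %src, %lane
//     %newacc = <Opc> %acc, %val
//     %next   = S_BITSET0_B32 %lane, %active ; retire that lane
//     S_CMP_LG_U32 %next, 0
//     S_CBRANCH_SCC1 ComputeLoop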
5980
5981MachineBasicBlock *
5982SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
5983                                              MachineBasicBlock *BB) const {
5984  MachineFunction *MF = BB->getParent();
5985  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
5986  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5987  const SIInstrInfo *TII = Subtarget->getInstrInfo();
5988  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
5989  MachineRegisterInfo &MRI = MF->getRegInfo();
5990  const DebugLoc &DL = MI.getDebugLoc();
5991
5992 switch (MI.getOpcode()) {
5993 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5994 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5995 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
5996 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
5997 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5998 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
5999 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
6000 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
6001 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
6002 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MIN_F32_e64);
6003 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
6004 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
6005 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
6006 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
6007 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
6008 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
6009 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
6010 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
6011 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
6012 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MAX_F32_e64);
6013 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
6014 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
6015 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
6016 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
6017 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
6018 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64);
6019 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
6020 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
6021 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
6022 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
6023 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
6024 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64);
6025 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
6026 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
6027 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
6028 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
6029 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
6030 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
6031 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
6032 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
6033 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
6034 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
6035 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
6036 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
6037 case AMDGPU::S_UADDO_PSEUDO:
6038 case AMDGPU::S_USUBO_PSEUDO: {
6039 MachineOperand &Dest0 = MI.getOperand(0);
6040 MachineOperand &Dest1 = MI.getOperand(1);
6041 MachineOperand &Src0 = MI.getOperand(2);
6042 MachineOperand &Src1 = MI.getOperand(3);
6043
6044 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
6045 ? AMDGPU::S_ADD_U32
6046 : AMDGPU::S_SUB_U32;
6047 // clang-format off
6048 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
6049 .add(Src0)
6050 .add(Src1);
6051 // clang-format on
6052
6053 unsigned SelOpc =
6054 Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6055 BuildMI(*BB, MI, DL, TII->get(SelOpc), Dest1.getReg()).addImm(-1).addImm(0);
6056
6057 MI.eraseFromParent();
6058 return BB;
6059 }
6060 case AMDGPU::S_ADD_U64_PSEUDO:
6061 case AMDGPU::S_SUB_U64_PSEUDO: {
6062 return Expand64BitScalarArithmetic(MI, BB);
6063 }
6064 case AMDGPU::V_ADD_U64_PSEUDO:
6065 case AMDGPU::V_SUB_U64_PSEUDO: {
6066 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
6067
6068 MachineOperand &Dest = MI.getOperand(0);
6069 MachineOperand &Src0 = MI.getOperand(1);
6070 MachineOperand &Src1 = MI.getOperand(2);
6071
6072 if (ST.hasAddSubU64Insts()) {
6073 auto I = BuildMI(*BB, MI, DL,
6074 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
6075 : AMDGPU::V_SUB_U64_e64),
6076 Dest.getReg())
6077 .add(Src0)
6078 .add(Src1)
6079 .addImm(0); // clamp
6080 TII->legalizeOperands(*I);
6081 MI.eraseFromParent();
6082 return BB;
6083 }
6084
6085 if (IsAdd && ST.hasLshlAddU64Inst()) {
6086 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
6087 Dest.getReg())
6088 .add(Src0)
6089 .addImm(0)
6090 .add(Src1);
6091 TII->legalizeOperands(*Add);
6092 MI.eraseFromParent();
6093 return BB;
6094 }
6095
6096 const auto *CarryRC = TRI->getWaveMaskRegClass();
6097
6098 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6099 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6100
6101 Register CarryReg = MRI.createVirtualRegister(CarryRC);
6102 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6103
6104 const TargetRegisterClass *Src0RC = Src0.isReg()
6105 ? MRI.getRegClass(Src0.getReg())
6106 : &AMDGPU::VReg_64RegClass;
6107 const TargetRegisterClass *Src1RC = Src1.isReg()
6108 ? MRI.getRegClass(Src1.getReg())
6109 : &AMDGPU::VReg_64RegClass;
6110
6111 const TargetRegisterClass *Src0SubRC =
6112 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6113 const TargetRegisterClass *Src1SubRC =
6114 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6115
6116 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
6117 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6118 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
6119 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6120
6121 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
6122 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6123 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
6124 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6125
6126 unsigned LoOpc =
6127 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6128 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
6129 .addReg(CarryReg, RegState::Define)
6130 .add(SrcReg0Sub0)
6131 .add(SrcReg1Sub0)
6132 .addImm(0); // clamp bit
6133
6134 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6135 MachineInstr *HiHalf =
6136 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
6137 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
6138 .add(SrcReg0Sub1)
6139 .add(SrcReg1Sub1)
6140 .addReg(CarryReg, RegState::Kill)
6141 .addImm(0); // clamp bit
6142
6143 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
6144 .addReg(DestSub0)
6145 .addImm(AMDGPU::sub0)
6146 .addReg(DestSub1)
6147 .addImm(AMDGPU::sub1);
6148 TII->legalizeOperands(*LoHalf);
6149 TII->legalizeOperands(*HiHalf);
6150 MI.eraseFromParent();
6151 return BB;
6152 }
6153 case AMDGPU::S_ADD_CO_PSEUDO:
6154 case AMDGPU::S_SUB_CO_PSEUDO: {
6155    // This pseudo has a chance to be selected
6156    // only from a uniform add/subcarry node. All the VGPR operands
6157    // are therefore assumed to be splat vectors.
6158    MachineBasicBlock::iterator MII = MI;
6159    MachineOperand &Dest = MI.getOperand(0);
6160 MachineOperand &CarryDest = MI.getOperand(1);
6161 MachineOperand &Src0 = MI.getOperand(2);
6162 MachineOperand &Src1 = MI.getOperand(3);
6163 MachineOperand &Src2 = MI.getOperand(4);
6164 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
6165 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6166 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6167 .addReg(Src0.getReg());
6168 Src0.setReg(RegOp0);
6169 }
6170 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
6171 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6172 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6173 .addReg(Src1.getReg());
6174 Src1.setReg(RegOp1);
6175 }
6176 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6177 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
6178 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6179 .addReg(Src2.getReg());
6180 Src2.setReg(RegOp2);
6181 }
6182
6183 if (ST.isWave64()) {
6184 if (ST.hasScalarCompareEq64()) {
6185 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
6186 .addReg(Src2.getReg())
6187 .addImm(0);
6188 } else {
6189 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
6190 const TargetRegisterClass *SubRC =
6191 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6192 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
6193 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6194 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
6195 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6196 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6197
6198 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
6199 .add(Src2Sub0)
6200 .add(Src2Sub1);
6201
6202 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6203 .addReg(Src2_32, RegState::Kill)
6204 .addImm(0);
6205 }
6206 } else {
6207 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6208 .addReg(Src2.getReg())
6209 .addImm(0);
6210 }
6211
6212 unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
6213 ? AMDGPU::S_ADDC_U32
6214 : AMDGPU::S_SUBB_U32;
6215
6216 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
6217
6218 unsigned SelOpc =
6219 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6220
6221 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
6222 .addImm(-1)
6223 .addImm(0);
6224
6225 MI.eraseFromParent();
6226 return BB;
6227 }
6228 case AMDGPU::SI_INIT_M0: {
6229 MachineOperand &M0Init = MI.getOperand(0);
6230 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6231 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6232 AMDGPU::M0)
6233 .add(M0Init);
6234 MI.eraseFromParent();
6235 return BB;
6236 }
6237 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6238 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6239 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6240 TII->get(AMDGPU::S_CMP_EQ_U32))
6241 .addImm(0)
6242 .addImm(0);
6243 return BB;
6244 }
6245 case AMDGPU::GET_GROUPSTATICSIZE: {
6246 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
6247 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
6248 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
6249 .add(MI.getOperand(0))
6250 .addImm(MFI->getLDSSize());
6251 MI.eraseFromParent();
6252 return BB;
6253 }
6254 case AMDGPU::GET_SHADERCYCLESHILO: {
6256 // The algorithm is:
6257 //
6258 // hi1 = getreg(SHADER_CYCLES_HI)
6259 // lo1 = getreg(SHADER_CYCLES_LO)
6260 // hi2 = getreg(SHADER_CYCLES_HI)
6261 //
6262 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
6263 // Otherwise there was overflow and the result is hi2:0. In both cases the
6264 // result should represent the actual time at some point during the sequence
6265 // of three getregs.
6266 using namespace AMDGPU::Hwreg;
6267 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6268 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
6269 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6270 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6271 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
6272 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6273 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6274 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
6275 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6276 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
6277 .addReg(RegHi1)
6278 .addReg(RegHi2);
6279 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6280 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
6281 .addReg(RegLo1)
6282 .addImm(0);
6283 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
6284 .add(MI.getOperand(0))
6285 .addReg(RegLo)
6286 .addImm(AMDGPU::sub0)
6287 .addReg(RegHi2)
6288 .addImm(AMDGPU::sub1);
6289 MI.eraseFromParent();
6290 return BB;
6291 }
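// Worked example with made-up values: if the low counter wraps between the two
// reads, the first pair might be hi1 = 5, lo1 = 0xffffffff while the re-read
// gives hi2 = 6; the compare then fails and the selected result is 6:0, which
// still falls within the window spanned by the three getreg reads.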
6292 case AMDGPU::SI_INDIRECT_SRC_V1:
6293 case AMDGPU::SI_INDIRECT_SRC_V2:
6294 case AMDGPU::SI_INDIRECT_SRC_V3:
6295 case AMDGPU::SI_INDIRECT_SRC_V4:
6296 case AMDGPU::SI_INDIRECT_SRC_V5:
6297 case AMDGPU::SI_INDIRECT_SRC_V6:
6298 case AMDGPU::SI_INDIRECT_SRC_V7:
6299 case AMDGPU::SI_INDIRECT_SRC_V8:
6300 case AMDGPU::SI_INDIRECT_SRC_V9:
6301 case AMDGPU::SI_INDIRECT_SRC_V10:
6302 case AMDGPU::SI_INDIRECT_SRC_V11:
6303 case AMDGPU::SI_INDIRECT_SRC_V12:
6304 case AMDGPU::SI_INDIRECT_SRC_V16:
6305 case AMDGPU::SI_INDIRECT_SRC_V32:
6306 return emitIndirectSrc(MI, *BB, *getSubtarget());
6307 case AMDGPU::SI_INDIRECT_DST_V1:
6308 case AMDGPU::SI_INDIRECT_DST_V2:
6309 case AMDGPU::SI_INDIRECT_DST_V3:
6310 case AMDGPU::SI_INDIRECT_DST_V4:
6311 case AMDGPU::SI_INDIRECT_DST_V5:
6312 case AMDGPU::SI_INDIRECT_DST_V6:
6313 case AMDGPU::SI_INDIRECT_DST_V7:
6314 case AMDGPU::SI_INDIRECT_DST_V8:
6315 case AMDGPU::SI_INDIRECT_DST_V9:
6316 case AMDGPU::SI_INDIRECT_DST_V10:
6317 case AMDGPU::SI_INDIRECT_DST_V11:
6318 case AMDGPU::SI_INDIRECT_DST_V12:
6319 case AMDGPU::SI_INDIRECT_DST_V16:
6320 case AMDGPU::SI_INDIRECT_DST_V32:
6321 return emitIndirectDst(MI, *BB, *getSubtarget());
6322 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6323 case AMDGPU::SI_KILL_I1_PSEUDO:
6324 return splitKillBlock(MI, BB);
6325 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6326 Register Dst = MI.getOperand(0).getReg();
6327 const MachineOperand &Src0 = MI.getOperand(1);
6328 const MachineOperand &Src1 = MI.getOperand(2);
6329 Register SrcCond = MI.getOperand(3).getReg();
6330
6331 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6332 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6333 const auto *CondRC = TRI->getWaveMaskRegClass();
6334 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
6335
6336 const TargetRegisterClass *Src0RC = Src0.isReg()
6337 ? MRI.getRegClass(Src0.getReg())
6338 : &AMDGPU::VReg_64RegClass;
6339 const TargetRegisterClass *Src1RC = Src1.isReg()
6340 ? MRI.getRegClass(Src1.getReg())
6341 : &AMDGPU::VReg_64RegClass;
6342
6343 const TargetRegisterClass *Src0SubRC =
6344 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6345 const TargetRegisterClass *Src1SubRC =
6346 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6347
6348 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
6349 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6350 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
6351 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6352
6353 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
6354 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6355 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
6356 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6357
6358 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
6359 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
6360 .addImm(0)
6361 .add(Src0Sub0)
6362 .addImm(0)
6363 .add(Src1Sub0)
6364 .addReg(SrcCondCopy);
6365 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
6366 .addImm(0)
6367 .add(Src0Sub1)
6368 .addImm(0)
6369 .add(Src1Sub1)
6370 .addReg(SrcCondCopy);
6371
6372 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
6373 .addReg(DstLo)
6374 .addImm(AMDGPU::sub0)
6375 .addReg(DstHi)
6376 .addImm(AMDGPU::sub1);
6377 MI.eraseFromParent();
6378 return BB;
6379 }
6380 case AMDGPU::SI_BR_UNDEF: {
6381 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
6382 .add(MI.getOperand(0));
6383 Br->getOperand(1).setIsUndef(); // read undef SCC
6384 MI.eraseFromParent();
6385 return BB;
6386 }
6387 case AMDGPU::ADJCALLSTACKUP:
6388 case AMDGPU::ADJCALLSTACKDOWN: {
6389    const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6390    MachineInstrBuilder MIB(*MF, &MI);
6391 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
6392 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
6393 return BB;
6394 }
6395 case AMDGPU::SI_CALL_ISEL: {
6396 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
6397
6398    MachineInstrBuilder MIB;
6399 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6400
6401 for (const MachineOperand &MO : MI.operands())
6402 MIB.add(MO);
6403
6404 MIB.cloneMemRefs(MI);
6405 MI.eraseFromParent();
6406 return BB;
6407 }
6408 case AMDGPU::V_ADD_CO_U32_e32:
6409 case AMDGPU::V_SUB_CO_U32_e32:
6410 case AMDGPU::V_SUBREV_CO_U32_e32: {
6411 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
6412 unsigned Opc = MI.getOpcode();
6413
6414 bool NeedClampOperand = false;
6415 if (TII->pseudoToMCOpcode(Opc) == -1) {
6416      Opc = AMDGPU::getVOPe64(Opc);
6417      NeedClampOperand = true;
6418 }
6419
6420 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
6421 if (TII->isVOP3(*I)) {
6422 I.addReg(TRI->getVCC(), RegState::Define);
6423 }
6424 I.add(MI.getOperand(1)).add(MI.getOperand(2));
6425 if (NeedClampOperand)
6426 I.addImm(0); // clamp bit for e64 encoding
6427
6428 TII->legalizeOperands(*I);
6429
6430 MI.eraseFromParent();
6431 return BB;
6432 }
6433 case AMDGPU::V_ADDC_U32_e32:
6434 case AMDGPU::V_SUBB_U32_e32:
6435 case AMDGPU::V_SUBBREV_U32_e32:
6436 // These instructions have an implicit use of vcc which counts towards the
6437 // constant bus limit.
6438 TII->legalizeOperands(MI);
6439 return BB;
6440 case AMDGPU::DS_GWS_INIT:
6441 case AMDGPU::DS_GWS_SEMA_BR:
6442 case AMDGPU::DS_GWS_BARRIER:
6443 case AMDGPU::DS_GWS_SEMA_V:
6444 case AMDGPU::DS_GWS_SEMA_P:
6445 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6446    // An s_waitcnt 0 is required to be the instruction immediately following.
6447 if (getSubtarget()->hasGWSAutoReplay()) {
6448      bundleInstWithWaitcnt(MI);
6449      return BB;
6450 }
6451
6452 return emitGWSMemViolTestLoop(MI, BB);
6453 case AMDGPU::S_SETREG_B32: {
6454 // Try to optimize cases that only set the denormal mode or rounding mode.
6455 //
6456 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
6457 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
6458 // instead.
6459 //
6460    // FIXME: This could be predicated on the immediate, but tablegen doesn't
6461    // allow you to have a no-side-effect instruction in the output of a
6462    // side-effecting pattern.
6463 auto [ID, Offset, Width] =
6464 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
6465    if (ID != AMDGPU::Hwreg::ID_MODE)
6466      return BB;
6467
6468 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
6469 const unsigned SetMask = WidthMask << Offset;
6470
6471 if (getSubtarget()->hasDenormModeInst()) {
6472 unsigned SetDenormOp = 0;
6473 unsigned SetRoundOp = 0;
6474
6475 // The dedicated instructions can only set the whole denorm or round mode
6476 // at once, not a subset of bits in either.
6477 if (SetMask ==
6478          (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
6479        // If this fully sets both the round and denorm mode, emit the two
6480 // dedicated instructions for these.
6481 SetRoundOp = AMDGPU::S_ROUND_MODE;
6482 SetDenormOp = AMDGPU::S_DENORM_MODE;
6483 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
6484 SetRoundOp = AMDGPU::S_ROUND_MODE;
6485 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
6486 SetDenormOp = AMDGPU::S_DENORM_MODE;
6487 }
6488
6489 if (SetRoundOp || SetDenormOp) {
6490 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
6491 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6492 unsigned ImmVal = Def->getOperand(1).getImm();
6493 if (SetRoundOp) {
6494 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
6495 .addImm(ImmVal & 0xf);
6496
6497 // If we also have the denorm mode, get just the denorm mode bits.
6498 ImmVal >>= 4;
6499 }
6500
6501 if (SetDenormOp) {
6502 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
6503 .addImm(ImmVal & 0xf);
6504 }
6505
6506 MI.eraseFromParent();
6507 return BB;
6508 }
6509 }
6510 }
6511
6512    // If only FP bits are touched, use the no-side-effects pseudo.
6513 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
6514 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
6515 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
6516
6517 return BB;
6518 }
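// Minimal sketch (assumed values): when the written register is defined by a
// move of a known immediate and the hwreg field is exactly the 4-bit FP round
// mode, e.g. hwreg(HW_REG_MODE, 0, 4) with the value 2, the code above
// replaces the s_setreg_b32 with the side-effect-free
//   S_ROUND_MODE 2
// (and S_DENORM_MODE for the denorm field) on subtargets that have the
// dedicated mode instructions.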
6519 case AMDGPU::S_INVERSE_BALLOT_U32:
6520 case AMDGPU::S_INVERSE_BALLOT_U64:
6521 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
6522 // necessary. After that they are equivalent to a COPY.
6523 MI.setDesc(TII->get(AMDGPU::COPY));
6524 return BB;
6525 case AMDGPU::ENDPGM_TRAP: {
6526 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
6527 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
6528 MI.addOperand(MachineOperand::CreateImm(0));
6529 return BB;
6530 }
6531
6532 // We need a block split to make the real endpgm a terminator. We also don't
6533 // want to break phis in successor blocks, so we can't just delete to the
6534 // end of the block.
6535
6536 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
6537    MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6538    MF->push_back(TrapBB);
6539 // clang-format off
6540 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
6541 .addImm(0);
6542 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
6543 .addMBB(TrapBB);
6544 // clang-format on
6545
6546 BB->addSuccessor(TrapBB);
6547 MI.eraseFromParent();
6548 return SplitBB;
6549 }
6550 case AMDGPU::SIMULATED_TRAP: {
6551 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6552 MachineBasicBlock *SplitBB =
6553 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
6554 MI.eraseFromParent();
6555 return SplitBB;
6556 }
6557 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6558 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6560
6561 // During ISel, it's difficult to propagate the original EXEC mask to use as
6562 // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
6563 MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
6564 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6565 Register OriginalExec = Setup->getOperand(0).getReg();
6566 MF->getRegInfo().clearKillFlags(OriginalExec);
6567 MI.getOperand(0).setReg(OriginalExec);
6568 return BB;
6569 }
6570 default:
6571 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
6572 if (!MI.mayStore())
6573        AddMemOpInit(MI);
6574      return BB;
6575 }
6576    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
6577  }
6578}
6579
6580bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
6581  // This currently forces unfolding various combinations of fsub into fma with
6582 // free fneg'd operands. As long as we have fast FMA (controlled by
6583 // isFMAFasterThanFMulAndFAdd), we should perform these.
6584
6585 // When fma is quarter rate, for f64 where add / sub are at best half rate,
6586 // most of these combines appear to be cycle neutral but save on instruction
6587 // count / code size.
6588 return true;
6589}
6590
6591bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; }
6592
6593EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
6594                                         EVT VT) const {
6595 if (!VT.isVector()) {
6596 return MVT::i1;
6597 }
6598 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
6599}
6600
6601MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
6602  // TODO: Should i16 always be used if legal? For now it would force VALU
6603 // shifts.
6604 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6605}
6606
6607LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const {
6608 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6609 ? Ty.changeElementSize(16)
6610 : Ty.changeElementSize(32);
6611}
6612
6613// Answering this is somewhat tricky and depends on the specific device, since
6614// devices differ in their rates for fma and for f64 operations in general.
6615//
6616// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
6617// regardless of which device (although the number of cycles differs between
6618// devices), so it is always profitable for f64.
6619//
6620// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
6621// only on full rate devices. Normally, we should prefer selecting v_mad_f32
6622// which we can always do even without fused FP ops since it returns the same
6623// result as the separate operations and since it is always full
6624// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
6625// however does not support denormals, so we do report fma as faster if we have
6626// a fast fma device and require denormals.
6627//
6628bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
6629                                                  EVT VT) const {
6630 VT = VT.getScalarType();
6631
6632 switch (VT.getSimpleVT().SimpleTy) {
6633 case MVT::f32: {
6634 // If mad is not available this depends only on if f32 fma is full rate.
6635 if (!Subtarget->hasMadMacF32Insts())
6636 return Subtarget->hasFastFMAF32();
6637
6638 // Otherwise f32 mad is always full rate and returns the same result as
6639 // the separate operations so should be preferred over fma.
6640    // However, mad does not support denormals.
6641    if (!denormalModeIsFlushAllF32(MF))
6642 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6643
6644 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
6645 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6646 }
6647 case MVT::f64:
6648 return true;
6649 case MVT::f16:
6650 case MVT::bf16:
6651 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
6652 default:
6653 break;
6654 }
6655
6656 return false;
6657}
6658
6659bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
6660                                                  LLT Ty) const {
6661 switch (Ty.getScalarSizeInBits()) {
6662 case 16:
6663 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
6664 case 32:
6665 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
6666 case 64:
6667 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
6668 default:
6669 break;
6670 }
6671
6672 return false;
6673}
6674
6675bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const {
6676  if (!Ty.isScalar())
6677 return false;
6678
6679 if (Ty.getScalarSizeInBits() == 16)
6680 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
6681 if (Ty.getScalarSizeInBits() == 32)
6682 return Subtarget->hasMadMacF32Insts() &&
6683 denormalModeIsFlushAllF32(*MI.getMF());
6684
6685 return false;
6686}
6687
6688bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
6689                                   const SDNode *N) const {
6690 // TODO: Check future ftz flag
6691 // v_mad_f32/v_mac_f32 do not support denormals.
6692 EVT VT = N->getValueType(0);
6693 if (VT == MVT::f32)
6694    return Subtarget->hasMadMacF32Insts() &&
6695           denormalModeIsFlushAllF32(DAG.getMachineFunction());
6696 if (VT == MVT::f16) {
6697    return Subtarget->hasMadF16() &&
6698           denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
6699 }
6700
6701 return false;
6702}
6703
6704//===----------------------------------------------------------------------===//
6705// Custom DAG Lowering Operations
6706//===----------------------------------------------------------------------===//
6707
6708// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6709// wider vector type is legal.
6710SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
6711                                             SelectionDAG &DAG) const {
6712 unsigned Opc = Op.getOpcode();
6713 EVT VT = Op.getValueType();
6714 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6715 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6716 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6717 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6718 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6719 VT == MVT::v32bf16);
6720
6721 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
6722
6723 SDLoc SL(Op);
6724 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
6725 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
6726
6727 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6728}
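// Example (restating the code above): a v8f16 fabs is split into two v4f16
// fabs nodes over the low and high halves, which are then rejoined with
// CONCAT_VECTORS, so the wide operation never reaches LegalizeDAG's
// scalarizer.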
6729
6730// Enable lowering of ROTR for vxi32 types. This is a workaround for a
6731// regression whereby extra unnecessary instructions were added to codegen
6732// for rotr operations, caused by legalising v2i32 'or'. This resulted in extra
6733// instructions to extract the result from the vector.
6735 [[maybe_unused]] EVT VT = Op.getValueType();
6736
6737 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6738 VT == MVT::v16i32) &&
6739 "Unexpected ValueType.");
6740
6741 return DAG.UnrollVectorOp(Op.getNode());
6742}
6743
6744// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6745// wider vector type is legal.
6746SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
6747                                              SelectionDAG &DAG) const {
6748 unsigned Opc = Op.getOpcode();
6749 EVT VT = Op.getValueType();
6750 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6751 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6752 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6753 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6754 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6755 VT == MVT::v32bf16);
6756
6757 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
6758 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6759
6760 SDLoc SL(Op);
6761
6762 SDValue OpLo =
6763 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
6764 SDValue OpHi =
6765 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
6766
6767 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6768}
6769
6770SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
6771 SelectionDAG &DAG) const {
6772 unsigned Opc = Op.getOpcode();
6773 EVT VT = Op.getValueType();
6774 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6775 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6776 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6777 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6778 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6779 VT == MVT::v32bf16);
6780
6781 SDValue Op0 = Op.getOperand(0);
6782 auto [Lo0, Hi0] = Op0.getValueType().isVector()
6783 ? DAG.SplitVectorOperand(Op.getNode(), 0)
6784 : std::pair(Op0, Op0);
6785
6786 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6787 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
6788
6789 SDLoc SL(Op);
6790 auto ResVT = DAG.GetSplitDestVTs(VT);
6791
6792 SDValue OpLo =
6793 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
6794 SDValue OpHi =
6795 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
6796
6797 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6798}
6799
6800SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
6801 switch (Op.getOpcode()) {
6802 default:
6803 return AMDGPUTargetLowering::LowerOperation(Op, DAG);
6804 case ISD::BRCOND:
6805 return LowerBRCOND(Op, DAG);
6806 case ISD::RETURNADDR:
6807 return LowerRETURNADDR(Op, DAG);
6808 case ISD::LOAD: {
6809 SDValue Result = LowerLOAD(Op, DAG);
6810 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6811 "Load should return a value and a chain");
6812 return Result;
6813 }
6814 case ISD::FSQRT: {
6815 EVT VT = Op.getValueType();
6816 if (VT == MVT::f32)
6817 return lowerFSQRTF32(Op, DAG);
6818 if (VT == MVT::f64)
6819 return lowerFSQRTF64(Op, DAG);
6820 return SDValue();
6821 }
6822 case ISD::FSIN:
6823 case ISD::FCOS:
6824 return LowerTrig(Op, DAG);
6825 case ISD::SELECT:
6826 return LowerSELECT(Op, DAG);
6827 case ISD::FDIV:
6828 return LowerFDIV(Op, DAG);
6829 case ISD::FFREXP:
6830 return LowerFFREXP(Op, DAG);
6831 case ISD::ATOMIC_CMP_SWAP:
6832 return LowerATOMIC_CMP_SWAP(Op, DAG);
6833 case ISD::STORE:
6834 return LowerSTORE(Op, DAG);
6835 case ISD::GlobalAddress: {
6836 MachineFunction &MF = DAG.getMachineFunction();
6837 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6838 return LowerGlobalAddress(MFI, Op, DAG);
6839 }
6840 case ISD::ExternalSymbol:
6841 return LowerExternalSymbol(Op, DAG);
6842 case ISD::INTRINSIC_WO_CHAIN:
6843 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6844 case ISD::INTRINSIC_W_CHAIN:
6845 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6846 case ISD::INTRINSIC_VOID:
6847 return LowerINTRINSIC_VOID(Op, DAG);
6848 case ISD::ADDRSPACECAST:
6849 return lowerADDRSPACECAST(Op, DAG);
6850 case ISD::INSERT_SUBVECTOR:
6851 return lowerINSERT_SUBVECTOR(Op, DAG);
6852 case ISD::INSERT_VECTOR_ELT:
6853 return lowerINSERT_VECTOR_ELT(Op, DAG);
6854 case ISD::EXTRACT_VECTOR_ELT:
6855 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
6856 case ISD::VECTOR_SHUFFLE:
6857 return lowerVECTOR_SHUFFLE(Op, DAG);
6858 case ISD::SCALAR_TO_VECTOR:
6859 return lowerSCALAR_TO_VECTOR(Op, DAG);
6860 case ISD::BUILD_VECTOR:
6861 return lowerBUILD_VECTOR(Op, DAG);
6862 case ISD::FP_ROUND:
6863 case ISD::STRICT_FP_ROUND:
6864 return lowerFP_ROUND(Op, DAG);
6865 case ISD::TRAP:
6866 return lowerTRAP(Op, DAG);
6867 case ISD::DEBUGTRAP:
6868 return lowerDEBUGTRAP(Op, DAG);
6869 case ISD::ABS:
6870 case ISD::FABS:
6871 case ISD::FNEG:
6872 case ISD::FCANONICALIZE:
6873 case ISD::BSWAP:
6874 return splitUnaryVectorOp(Op, DAG);
6875 case ISD::FMINNUM:
6876 case ISD::FMAXNUM:
6877 return lowerFMINNUM_FMAXNUM(Op, DAG);
6878 case ISD::FMINIMUMNUM:
6879 case ISD::FMAXIMUMNUM:
6880 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
6881 case ISD::FMINIMUM:
6882 case ISD::FMAXIMUM:
6883 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
6884 case ISD::FLDEXP:
6885 case ISD::STRICT_FLDEXP:
6886 return lowerFLDEXP(Op, DAG);
6887 case ISD::FMA:
6888 return splitTernaryVectorOp(Op, DAG);
6889 case ISD::FP_TO_SINT:
6890 case ISD::FP_TO_UINT:
6891 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11 &&
6892 Op.getValueType() == MVT::i16 &&
6893 Op.getOperand(0).getValueType() == MVT::f32) {
6894 // Make f32->i16 legal so we can select V_CVT_PK_[IU]16_F32.
6895 return Op;
6896 }
6897 return LowerFP_TO_INT(Op, DAG);
6898 case ISD::SHL:
6899 case ISD::SRA:
6900 case ISD::SRL:
6901 case ISD::ADD:
6902 case ISD::SUB:
6903 case ISD::SMIN:
6904 case ISD::SMAX:
6905 case ISD::UMIN:
6906 case ISD::UMAX:
6907 case ISD::FADD:
6908 case ISD::FMUL:
6909 case ISD::FMINNUM_IEEE:
6910 case ISD::FMAXNUM_IEEE:
6911 case ISD::UADDSAT:
6912 case ISD::USUBSAT:
6913 case ISD::SADDSAT:
6914 case ISD::SSUBSAT:
6915 return splitBinaryVectorOp(Op, DAG);
6916 case ISD::FCOPYSIGN:
6917 return lowerFCOPYSIGN(Op, DAG);
6918 case ISD::MUL:
6919 return lowerMUL(Op, DAG);
6920 case ISD::SMULO:
6921 case ISD::UMULO:
6922 return lowerXMULO(Op, DAG);
6923 case ISD::SMUL_LOHI:
6924 case ISD::UMUL_LOHI:
6925 return lowerXMUL_LOHI(Op, DAG);
6926 case ISD::DYNAMIC_STACKALLOC:
6927 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6928 case ISD::STACKSAVE:
6929 return LowerSTACKSAVE(Op, DAG);
6930 case ISD::GET_ROUNDING:
6931 return lowerGET_ROUNDING(Op, DAG);
6932 case ISD::SET_ROUNDING:
6933 return lowerSET_ROUNDING(Op, DAG);
6934 case ISD::PREFETCH:
6935 return lowerPREFETCH(Op, DAG);
6936 case ISD::FP_EXTEND:
6937 case ISD::STRICT_FP_EXTEND:
6938 return lowerFP_EXTEND(Op, DAG);
6939 case ISD::GET_FPENV:
6940 return lowerGET_FPENV(Op, DAG);
6941 case ISD::SET_FPENV:
6942 return lowerSET_FPENV(Op, DAG);
6943 case ISD::ROTR:
6944 return lowerROTR(Op, DAG);
6945 }
6946 return SDValue();
6947}
6948
6949// Used for D16: Casts the result of an instruction into the right vector,
6950// packs values if loads return unpacked values.
6951static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
6952 const SDLoc &DL, SelectionDAG &DAG,
6953 bool Unpacked) {
6954 if (!LoadVT.isVector())
6955 return Result;
6956
6957 // Cast back to the original packed type or to a larger type that is a
6958 // multiple of 32 bits for D16. Widening the return type is required for
6959 // legalization.
6960 EVT FittingLoadVT = LoadVT;
6961 if ((LoadVT.getVectorNumElements() % 2) == 1) {
6962 FittingLoadVT =
6963 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
6964 LoadVT.getVectorNumElements() + 1);
6965 }
6966
6967 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6968 // Truncate to v2i16/v4i16.
6969 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6970
6971 // Work around the legalizer not scalarizing the truncate after vector op
6972 // legalization and not creating an intermediate vector trunc.
6973 SmallVector<SDValue, 4> Elts;
6974 DAG.ExtractVectorElements(Result, Elts);
6975 for (SDValue &Elt : Elts)
6976 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6977
6978 // Pad illegal v1i16/v3f16 to v4i16
6979 if ((LoadVT.getVectorNumElements() % 2) == 1)
6980 Elts.push_back(DAG.getPOISON(MVT::i16));
6981
6982 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6983
6984 // Bitcast to original type (v2f16/v4f16).
6985 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6986 }
6987
6988 // Cast back to the original packed type.
6989 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6990}
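
// Host-side sketch of the "unpacked D16" layout handled above: each 16-bit
// element comes back in the low half of its own 32-bit lane, so repacking is
// just a per-lane truncate (illustrative only; a 4-element result is assumed):
#include <cstdint>
static void repackUnpackedD16Sketch(const uint32_t Unpacked[4],
                                    uint16_t Packed[4]) {
  for (unsigned I = 0; I != 4; ++I)
    Packed[I] = static_cast<uint16_t>(Unpacked[I]); // keep the low 16 bits
}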
6991
6992SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6993 SelectionDAG &DAG,
6994 ArrayRef<SDValue> Ops,
6995 bool IsIntrinsic) const {
6996 SDLoc DL(M);
6997
6998 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6999 EVT LoadVT = M->getValueType(0);
7000
7001 EVT EquivLoadVT = LoadVT;
7002 if (LoadVT.isVector()) {
7003 if (Unpacked) {
7004 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
7005 LoadVT.getVectorNumElements());
7006 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
7007 // Widen v3f16 to legal type
7008 EquivLoadVT =
7009 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
7010 LoadVT.getVectorNumElements() + 1);
7011 }
7012 }
7013
7014 // Change from v4f16/v2f16 to EquivLoadVT.
7015 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
7016
7017 SDValue Load = DAG.getMemIntrinsicNode(
7018 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
7019 M->getMemoryVT(), M->getMemOperand());
7020
7021 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
7022
7023 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
7024}
7025
7026SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
7027 SelectionDAG &DAG,
7028 ArrayRef<SDValue> Ops) const {
7029 SDLoc DL(M);
7030 EVT LoadVT = M->getValueType(0);
7031 EVT EltType = LoadVT.getScalarType();
7032 EVT IntVT = LoadVT.changeTypeToInteger();
7033
7034 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
7035
7036 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
7037 bool IsTFE = M->getNumValues() == 3;
7038
7039 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
7040 : AMDGPUISD::BUFFER_LOAD_FORMAT)
7041 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
7042 : AMDGPUISD::BUFFER_LOAD;
7043
7044 if (IsD16) {
7045 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
7046 }
7047
7048 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
7049 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
7050 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
7051 IsTFE);
7052
7053 if (isTypeLegal(LoadVT)) {
7054 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
7055 M->getMemOperand(), DAG);
7056 }
7057
7058 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
7059 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
7060 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
7061 M->getMemOperand(), DAG);
7062 return DAG.getMergeValues(
7063 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
7064 DL);
7065}
7066
7067static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
7068 SelectionDAG &DAG) {
7069 EVT VT = N->getValueType(0);
7070 unsigned CondCode = N->getConstantOperandVal(3);
7071 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
7072 return DAG.getPOISON(VT);
7073
7074 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
7075
7076 SDValue LHS = N->getOperand(1);
7077 SDValue RHS = N->getOperand(2);
7078
7079 SDLoc DL(N);
7080
7081 EVT CmpVT = LHS.getValueType();
7082 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
7083 unsigned PromoteOp =
7084 ICmpInst::isSigned(IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7085 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
7086 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
7087 }
7088
7089 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
7090
7091 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7092 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7093
7094 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
7095 DAG.getCondCode(CCOpcode));
7096 if (VT.bitsEq(CCVT))
7097 return SetCC;
7098 return DAG.getZExtOrTrunc(SetCC, DL, VT);
7099}
7100
7101static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
7102 SelectionDAG &DAG) {
7103 EVT VT = N->getValueType(0);
7104
7105 unsigned CondCode = N->getConstantOperandVal(3);
7106 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
7107 return DAG.getPOISON(VT);
7108
7109 SDValue Src0 = N->getOperand(1);
7110 SDValue Src1 = N->getOperand(2);
7111 EVT CmpVT = Src0.getValueType();
7112 SDLoc SL(N);
7113
7114 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
7115 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7116 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7117 }
7118
7119 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
7120 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
7121 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7122 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7123 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7124 DAG.getCondCode(CCOpcode));
7125 if (VT.bitsEq(CCVT))
7126 return SetCC;
7127 return DAG.getZExtOrTrunc(SetCC, SL, VT);
7128}
7129
7130static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
7131 SelectionDAG &DAG) {
7132 EVT VT = N->getValueType(0);
7133 SDValue Src = N->getOperand(1);
7134 SDLoc SL(N);
7135
7136 if (Src.getOpcode() == ISD::SETCC) {
7137 SDValue Op0 = Src.getOperand(0);
7138 SDValue Op1 = Src.getOperand(1);
7139 // Need to expand bfloat to float for comparison (setcc).
7140 if (Op0.getValueType() == MVT::bf16) {
7141 Op0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0);
7142 Op1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1);
7143 }
7144 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
7145 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Op0, Op1, Src.getOperand(2));
7146 }
7147 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
7148 // (ballot 0) -> 0
7149 if (Arg->isZero())
7150 return DAG.getConstant(0, SL, VT);
7151
7152 // (ballot 1) -> EXEC/EXEC_LO
7153 if (Arg->isOne()) {
7154 Register Exec;
7155 if (VT.getScalarSizeInBits() == 32)
7156 Exec = AMDGPU::EXEC_LO;
7157 else if (VT.getScalarSizeInBits() == 64)
7158 Exec = AMDGPU::EXEC;
7159 else
7160 return SDValue();
7161
7162 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
7163 }
7164 }
7165
7166 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
7167 // ISD::SETNE)
7168 return DAG.getNode(
7169 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
7170 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
7171}
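
// Conceptual host-side model of the ballot semantics being lowered: the i1
// predicate of every active lane is gathered into one wave-wide mask. A
// 32-lane illustration only; the 64-lane case is analogous:
#include <cstdint>
static uint32_t ballot32Sketch(const bool Pred[32], uint32_t ExecMask) {
  uint32_t Mask = 0;
  for (unsigned Lane = 0; Lane != 32; ++Lane)
    if (((ExecMask >> Lane) & 1u) && Pred[Lane])
      Mask |= 1u << Lane;
  return Mask; // ballot(false) == 0 and ballot(true) == exec, as special-cased above
}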
7172
7173static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
7174 SelectionDAG &DAG) {
7175 EVT VT = N->getValueType(0);
7176 unsigned ValSize = VT.getSizeInBits();
7177 unsigned IID = N->getConstantOperandVal(0);
7178 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7179 IID == Intrinsic::amdgcn_permlanex16;
7180 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7181 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7182 SDLoc SL(N);
7183 MVT IntVT = MVT::getIntegerVT(ValSize);
7184 const GCNSubtarget *ST = TLI.getSubtarget();
7185 unsigned SplitSize = 32;
7186 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7187 ST->hasDPALU_DPP() &&
7188 AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
7189 SplitSize = 64;
7190
7191 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
7192 SDValue Src2, MVT ValT) -> SDValue {
7193 SmallVector<SDValue, 8> Operands;
7194 switch (IID) {
7195 case Intrinsic::amdgcn_permlane16:
7196 case Intrinsic::amdgcn_permlanex16:
7197 case Intrinsic::amdgcn_update_dpp:
7198 Operands.push_back(N->getOperand(6));
7199 Operands.push_back(N->getOperand(5));
7200 Operands.push_back(N->getOperand(4));
7201 [[fallthrough]];
7202 case Intrinsic::amdgcn_writelane:
7203 Operands.push_back(Src2);
7204 [[fallthrough]];
7205 case Intrinsic::amdgcn_readlane:
7206 case Intrinsic::amdgcn_set_inactive:
7207 case Intrinsic::amdgcn_set_inactive_chain_arg:
7208 case Intrinsic::amdgcn_mov_dpp8:
7209 Operands.push_back(Src1);
7210 [[fallthrough]];
7211 case Intrinsic::amdgcn_readfirstlane:
7212 case Intrinsic::amdgcn_permlane64:
7213 Operands.push_back(Src0);
7214 break;
7215 default:
7216 llvm_unreachable("unhandled lane op");
7217 }
7218
7219 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
7220 std::reverse(Operands.begin(), Operands.end());
7221
7222 if (SDNode *GL = N->getGluedNode()) {
7223 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7224 GL = GL->getOperand(0).getNode();
7225 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7226 SDValue(GL, 0)));
7227 }
7228
7229 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
7230 };
7231
7232 SDValue Src0 = N->getOperand(1);
7233 SDValue Src1, Src2;
7234 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7235 IID == Intrinsic::amdgcn_mov_dpp8 ||
7236 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7237 Src1 = N->getOperand(2);
7238 if (IID == Intrinsic::amdgcn_writelane ||
7239 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7240 Src2 = N->getOperand(3);
7241 }
7242
7243 if (ValSize == SplitSize) {
7244 // Already legal
7245 return SDValue();
7246 }
7247
7248 if (ValSize < 32) {
7249 bool IsFloat = VT.isFloatingPoint();
7250 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
7251 SL, MVT::i32);
7252
7253 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7254 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
7255 SL, MVT::i32);
7256 }
7257
7258 if (IID == Intrinsic::amdgcn_writelane) {
7259 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
7260 SL, MVT::i32);
7261 }
7262
7263 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7264 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
7265 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
7266 }
7267
7268 if (ValSize % SplitSize != 0)
7269 return SDValue();
7270
7271 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
7272 EVT VT = N->getValueType(0);
7273 unsigned NE = VT.getVectorNumElements();
7274 EVT EltVT = VT.getVectorElementType();
7275 SmallVector<SDValue, 8> Scalars;
7276 unsigned NumOperands = N->getNumOperands();
7277 SmallVector<SDValue, 4> Operands(NumOperands);
7278 SDNode *GL = N->getGluedNode();
7279
7280 // only handle convergencectrl_glue
7281 assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7282
7283 for (unsigned i = 0; i != NE; ++i) {
7284 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7285 ++j) {
7286 SDValue Operand = N->getOperand(j);
7287 EVT OperandVT = Operand.getValueType();
7288 if (OperandVT.isVector()) {
7289 // A vector operand; extract a single element.
7290 EVT OperandEltVT = OperandVT.getVectorElementType();
7291 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
7292 Operand, DAG.getVectorIdxConstant(i, SL));
7293 } else {
7294 // A scalar operand; just use it as is.
7295 Operands[j] = Operand;
7296 }
7297 }
7298
7299 if (GL)
7300 Operands[NumOperands - 1] =
7301 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7302 SDValue(GL->getOperand(0).getNode(), 0));
7303
7304 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
7305 }
7306
7307 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
7308 return DAG.getBuildVector(VecVT, SL, Scalars);
7309 };
7310
7311 if (VT.isVector()) {
7312 switch (MVT::SimpleValueType EltTy =
7313 VT.getVectorElementType().getSimpleVT().SimpleTy) {
7314 case MVT::i32:
7315 case MVT::f32:
7316 if (SplitSize == 32) {
7317 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
7318 return unrollLaneOp(LaneOp.getNode());
7319 }
7320 [[fallthrough]];
7321 case MVT::i16:
7322 case MVT::f16:
7323 case MVT::bf16: {
7324 unsigned SubVecNumElt =
7325 SplitSize / VT.getVectorElementType().getSizeInBits();
7326 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
7327 SmallVector<SDValue, 4> Pieces;
7328 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7329 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7330 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
7331 DAG.getConstant(EltIdx, SL, MVT::i32));
7332
7333 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7334 IsPermLane16)
7335 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
7336 DAG.getConstant(EltIdx, SL, MVT::i32));
7337
7338 if (IID == Intrinsic::amdgcn_writelane)
7339 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
7340 DAG.getConstant(EltIdx, SL, MVT::i32));
7341
7342 Pieces.push_back(
7343 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7344 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7345 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7346 EltIdx += SubVecNumElt;
7347 }
7348 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
7349 }
7350 default:
7351 // Handle all other cases by bitcasting to i32 vectors
7352 break;
7353 }
7354 }
7355
7356 MVT VecVT =
7357 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
7358 Src0 = DAG.getBitcast(VecVT, Src0);
7359
7360 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7361 Src1 = DAG.getBitcast(VecVT, Src1);
7362
7363 if (IID == Intrinsic::amdgcn_writelane)
7364 Src2 = DAG.getBitcast(VecVT, Src2);
7365
7366 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7367 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
7368 return DAG.getBitcast(VT, UnrolledLaneOp);
7369}
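
// Shape of the wide-value handling above, shown for a lane op that acts on
// 32-bit pieces: a 64-bit value is split into lo/hi halves, the op is applied
// per half, and the halves are recombined (host-side illustration only):
#include <cstdint>
template <typename LaneOp32>
static uint64_t laneOp64Sketch(uint64_t V, LaneOp32 Op) {
  uint32_t Lo = Op(static_cast<uint32_t>(V));
  uint32_t Hi = Op(static_cast<uint32_t>(V >> 32));
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}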
7370
7372 SelectionDAG &DAG) {
7373 EVT VT = N->getValueType(0);
7374
7375 if (VT.getSizeInBits() != 32)
7376 return SDValue();
7377
7378 SDLoc SL(N);
7379
7380 SDValue Value = N->getOperand(1);
7381 SDValue Index = N->getOperand(2);
7382
7383 // ds_bpermute requires index to be multiplied by 4
7384 SDValue ShiftAmount = DAG.getShiftAmountConstant(2, MVT::i32, SL);
7385 SDValue ShiftedIndex =
7386 DAG.getNode(ISD::SHL, SL, Index.getValueType(), Index, ShiftAmount);
7387
7388 // The intrinsics used below require an i32 value to operate on
7389 SDValue ValueI32 = DAG.getBitcast(MVT::i32, Value);
7390
7391 auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT,
7392 SmallVector<SDValue> IntrinArgs) -> SDValue {
7393 SmallVector<SDValue> Operands(1);
7394 Operands[0] = DAG.getTargetConstant(IID, SL, MVT::i32);
7395 Operands.append(IntrinArgs);
7396 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, RetVT, Operands);
7397 };
7398
7399 // If we can bpermute across the whole wave, then just do that
7401 SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
7402 {ShiftedIndex, ValueI32});
7403 return DAG.getBitcast(VT, BPermute);
7404 }
7405
7406 assert(TLI.getSubtarget()->isWave64());
7407
7408 // Otherwise, we need to make use of whole wave mode
7409 SDValue PoisonVal = DAG.getPOISON(ValueI32->getValueType(0));
7410
7411 // Set inactive lanes to poison
7412 SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
7413 {ValueI32, PoisonVal});
7414 SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
7415 {ShiftedIndex, PoisonVal});
7416
7417 SDValue Swapped =
7418 MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});
7419
7420 // Get permutation of each half, then we'll select which one to use
7421 SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
7422 {WWMIndex, WWMValue});
7423 SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
7424 MVT::i32, {WWMIndex, Swapped});
7425 SDValue BPermOtherHalfWWM =
7426 MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});
7427
7428 // Select which side to take the permute from
7429 SDValue ThreadIDMask = DAG.getAllOnesConstant(SL, MVT::i32);
7430 // We can get away with only using mbcnt_lo here since we're only
7431 // trying to detect which side of 32 each lane is on, and mbcnt_lo
7432 // returns 32 for lanes 32-63.
7433 SDValue ThreadID =
7434 MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
7435 {ThreadIDMask, DAG.getTargetConstant(0, SL, MVT::i32)});
7436
7437 SDValue SameOrOtherHalf =
7438 DAG.getNode(ISD::AND, SL, MVT::i32,
7439 DAG.getNode(ISD::XOR, SL, MVT::i32, ThreadID, Index),
7440 DAG.getTargetConstant(32, SL, MVT::i32));
7441 SDValue UseSameHalf =
7442 DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf,
7443 DAG.getConstant(0, SL, MVT::i32), ISD::SETEQ);
7444 SDValue Result = DAG.getSelect(SL, MVT::i32, UseSameHalf, BPermSameHalf,
7445 BPermOtherHalfWWM);
7446 return DAG.getBitcast(VT, Result);
7447}
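
// Host-side model of the wave64 emulation built above: ds_bpermute only
// shuffles within a 32-lane half, so the other half's values are obtained via
// a permlane64-style half swap, and the per-lane result is selected on
// ((lane ^ index) & 32), mirroring the mbcnt_lo/XOR/AND sequence. Lane indices
// are used directly here instead of byte addresses (illustration only):
#include <cstdint>
static void bpermuteWave64Sketch(const uint32_t Val[64], const uint32_t Idx[64],
                                 uint32_t Out[64]) {
  uint32_t Swapped[64];
  for (unsigned L = 0; L != 64; ++L)
    Swapped[L] = Val[L ^ 32]; // permlane64: swap the two 32-lane halves
  for (unsigned L = 0; L != 64; ++L) {
    unsigned Src = Idx[L] & 63;
    unsigned InHalf = (L & ~31u) | (Src & 31); // bpermute stays within L's half
    bool SameHalf = ((L ^ Src) & 32) == 0;
    Out[L] = SameHalf ? Val[InHalf] : Swapped[InHalf];
  }
}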
7448
7449void SITargetLowering::ReplaceNodeResults(SDNode *N,
7450 SmallVectorImpl<SDValue> &Results,
7451 SelectionDAG &DAG) const {
7452 switch (N->getOpcode()) {
7453 case ISD::INSERT_VECTOR_ELT: {
7454 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
7455 Results.push_back(Res);
7456 return;
7457 }
7458 case ISD::EXTRACT_VECTOR_ELT: {
7459 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
7460 Results.push_back(Res);
7461 return;
7462 }
7463 case ISD::INTRINSIC_WO_CHAIN: {
7464 unsigned IID = N->getConstantOperandVal(0);
7465 switch (IID) {
7466 case Intrinsic::amdgcn_make_buffer_rsrc:
7467 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
7468 return;
7469 case Intrinsic::amdgcn_cvt_pkrtz: {
7470 SDValue Src0 = N->getOperand(1);
7471 SDValue Src1 = N->getOperand(2);
7472 SDLoc SL(N);
7473 SDValue Cvt =
7474 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
7475 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
7476 return;
7477 }
7478 case Intrinsic::amdgcn_cvt_pknorm_i16:
7479 case Intrinsic::amdgcn_cvt_pknorm_u16:
7480 case Intrinsic::amdgcn_cvt_pk_i16:
7481 case Intrinsic::amdgcn_cvt_pk_u16: {
7482 SDValue Src0 = N->getOperand(1);
7483 SDValue Src1 = N->getOperand(2);
7484 SDLoc SL(N);
7485 unsigned Opcode;
7486
7487 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7488 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
7489 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7490 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
7491 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7492 Opcode = AMDGPUISD::CVT_PK_I16_I32;
7493 else
7494 Opcode = AMDGPUISD::CVT_PK_U16_U32;
7495
7496 EVT VT = N->getValueType(0);
7497 if (isTypeLegal(VT))
7498 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
7499 else {
7500 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
7501 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
7502 }
7503 return;
7504 }
7505 case Intrinsic::amdgcn_s_buffer_load: {
7506 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
7507 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
7508 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
7509 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
7510 // s_buffer_load_i8.
7511 if (!Subtarget->hasScalarSubwordLoads())
7512 return;
7513 SDValue Op = SDValue(N, 0);
7514 SDValue Rsrc = Op.getOperand(1);
7515 SDValue Offset = Op.getOperand(2);
7516 SDValue CachePolicy = Op.getOperand(3);
7517 EVT VT = Op.getValueType();
7518 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
7519 SDLoc DL(Op);
7521 const DataLayout &DataLayout = DAG.getDataLayout();
7522 Align Alignment =
7528 VT.getStoreSize(), Alignment);
7529 SDValue LoadVal;
7530 if (!Offset->isDivergent()) {
7531 SDValue Ops[] = {Rsrc, // source register
7532 Offset, CachePolicy};
7533 SDValue BufferLoad =
7534 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
7535 DAG.getVTList(MVT::i32), Ops, VT, MMO);
7536 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
7537 } else {
7538 SDValue Ops[] = {
7539 DAG.getEntryNode(), // Chain
7540 Rsrc, // rsrc
7541 DAG.getConstant(0, DL, MVT::i32), // vindex
7542 {}, // voffset
7543 {}, // soffset
7544 {}, // offset
7545 CachePolicy, // cachepolicy
7546 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7547 };
7548 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
7549 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
7550 }
7551 Results.push_back(LoadVal);
7552 return;
7553 }
7554 case Intrinsic::amdgcn_dead: {
7555 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
7556 Results.push_back(DAG.getPOISON(N->getValueType(I)));
7557 return;
7558 }
7559 }
7560 break;
7561 }
7562 case ISD::INTRINSIC_W_CHAIN: {
7563 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
7564 if (Res.getOpcode() == ISD::MERGE_VALUES) {
7565 // FIXME: Hacky
7566 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
7567 Results.push_back(Res.getOperand(I));
7568 }
7569 } else {
7570 Results.push_back(Res);
7571 Results.push_back(Res.getValue(1));
7572 }
7573 return;
7574 }
7575
7576 break;
7577 }
7578 case ISD::SELECT: {
7579 SDLoc SL(N);
7580 EVT VT = N->getValueType(0);
7581 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
7582 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
7583 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
7584
7585 EVT SelectVT = NewVT;
7586 if (NewVT.bitsLT(MVT::i32)) {
7587 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
7588 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
7589 SelectVT = MVT::i32;
7590 }
7591
7592 SDValue NewSelect =
7593 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
7594
7595 if (NewVT != SelectVT)
7596 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
7597 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
7598 return;
7599 }
7600 case ISD::FNEG: {
7601 if (N->getValueType(0) != MVT::v2f16)
7602 break;
7603
7604 SDLoc SL(N);
7605 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7606
7607 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
7608 DAG.getConstant(0x80008000, SL, MVT::i32));
7609 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7610 return;
7611 }
7612 case ISD::FABS: {
7613 if (N->getValueType(0) != MVT::v2f16)
7614 break;
7615
7616 SDLoc SL(N);
7617 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7618
7619 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
7620 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
7621 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7622 return;
7623 }
7624 case ISD::FSQRT: {
7625 if (N->getValueType(0) != MVT::f16)
7626 break;
7627 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
7628 break;
7629 }
7630 default:
7631 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
7632 break;
7633 }
7634}
7635
7636/// Helper function for LowerBRCOND
7637static SDNode *findUser(SDValue Value, unsigned Opcode) {
7638
7639 for (SDUse &U : Value->uses()) {
7640 if (U.get() != Value)
7641 continue;
7642
7643 if (U.getUser()->getOpcode() == Opcode)
7644 return U.getUser();
7645 }
7646 return nullptr;
7647}
7648
7649unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7650 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
7651 switch (Intr->getConstantOperandVal(1)) {
7652 case Intrinsic::amdgcn_if:
7653 return AMDGPUISD::IF;
7654 case Intrinsic::amdgcn_else:
7655 return AMDGPUISD::ELSE;
7656 case Intrinsic::amdgcn_loop:
7657 return AMDGPUISD::LOOP;
7658 case Intrinsic::amdgcn_end_cf:
7659 llvm_unreachable("should not occur");
7660 default:
7661 return 0;
7662 }
7663 }
7664
7665 // break, if_break, else_break are all only used as inputs to loop, not
7666 // directly as branch conditions.
7667 return 0;
7668}
7669
7670bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
7671 const Triple &TT = getTargetMachine().getTargetTriple();
7672 return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
7673 GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
7674 AMDGPU::shouldEmitConstantsToTextSection(TT);
7675}
7676
7677bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
7678 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7679 return false;
7680
7681 // FIXME: Either avoid relying on address space here or change the default
7682 // address space for functions to avoid the explicit check.
7683 return (GV->getValueType()->isFunctionTy() ||
7686}
7687
7688bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
7689 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
7690}
7691
7692bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
7693 if (!GV->hasExternalLinkage())
7694 return true;
7695
7696 const auto OS = getTargetMachine().getTargetTriple().getOS();
7697 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
7698}
7699
7700/// This transforms the control flow intrinsics to get the branch destination as
7701/// the last parameter; it also switches the branch target with BR if the need arises.
7702SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
7703 SDLoc DL(BRCOND);
7704
7705 SDNode *Intr = BRCOND.getOperand(1).getNode();
7706 SDValue Target = BRCOND.getOperand(2);
7707 SDNode *BR = nullptr;
7708 SDNode *SetCC = nullptr;
7709
7710 switch (Intr->getOpcode()) {
7711 case ISD::SETCC: {
7712 // As long as we negate the condition everything is fine
7713 SetCC = Intr;
7714 Intr = SetCC->getOperand(0).getNode();
7715 break;
7716 }
7717 case ISD::XOR: {
7718 // Similar to SETCC, if we have (xor c, -1), we will be fine.
7719 SDValue LHS = Intr->getOperand(0);
7720 SDValue RHS = Intr->getOperand(1);
7721 if (auto *C = dyn_cast<ConstantSDNode>(RHS); C && C->getZExtValue()) {
7722 Intr = LHS.getNode();
7723 break;
7724 }
7725 [[fallthrough]];
7726 }
7727 default: {
7728 // Get the target from BR if we don't negate the condition
7729 BR = findUser(BRCOND, ISD::BR);
7730 assert(BR && "brcond missing unconditional branch user");
7731 Target = BR->getOperand(1);
7732 }
7733 }
7734
7735 unsigned CFNode = isCFIntrinsic(Intr);
7736 if (CFNode == 0) {
7737 // This is a uniform branch so we don't need to legalize.
7738 return BRCOND;
7739 }
7740
7741 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
7742 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
7743
7744 assert(!SetCC ||
7745 (SetCC->getConstantOperandVal(1) == 1 &&
7746 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
7747 ISD::SETNE));
7748
7749 // operands of the new intrinsic call
7750 SmallVector<SDValue, 8> Ops;
7751 if (HaveChain)
7752 Ops.push_back(BRCOND.getOperand(0));
7753
7754 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
7755 Ops.push_back(Target);
7756
7757 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
7758
7759 // build the new intrinsic call
7760 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
7761
7762 if (!HaveChain) {
7763 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
7764
7766 }
7767
7768 if (BR) {
7769 // Give the branch instruction our target
7770 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
7771 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
7772 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
7773 }
7774
7775 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
7776
7777 // Copy the intrinsic results to registers
7778 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
7779 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
7780 if (!CopyToReg)
7781 continue;
7782
7783 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
7784 SDValue(Result, i - 1), SDValue());
7785
7786 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
7787 }
7788
7789 // Remove the old intrinsic from the chain
7790 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
7791 Intr->getOperand(0));
7792
7793 return Chain;
7794}
7795
7796SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
7797 MVT VT = Op.getSimpleValueType();
7798 SDLoc DL(Op);
7799 // Checking the depth
7800 if (Op.getConstantOperandVal(0) != 0)
7801 return DAG.getConstant(0, DL, VT);
7802
7803 MachineFunction &MF = DAG.getMachineFunction();
7804 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7805 // Check for kernel and shader functions
7806 if (Info->isEntryFunction())
7807 return DAG.getConstant(0, DL, VT);
7808
7809 MachineFrameInfo &MFI = MF.getFrameInfo();
7810 // There is a call to @llvm.returnaddress in this function
7811 MFI.setReturnAddressIsTaken(true);
7812
7813 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
7814 // Get the return address reg and mark it as an implicit live-in
7815 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
7816 getRegClassFor(VT, Op.getNode()->isDivergent()));
7817
7818 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
7819}
7820
7821SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
7822 const SDLoc &DL, EVT VT) const {
7823 return Op.getValueType().bitsLE(VT)
7824 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
7825 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
7826 DAG.getTargetConstant(0, DL, MVT::i32));
7827}
7828
7829SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
7830 SelectionDAG &DAG) const {
7831 EVT DstVT = Op.getValueType();
7832 unsigned NumElts = DstVT.getVectorNumElements();
7833 assert(NumElts > 2 && isPowerOf2_32(NumElts));
7834
7835 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
7836
7837 SDLoc DL(Op);
7838 unsigned Opc = Op.getOpcode();
7839 SDValue Flags = Op.getOperand(1);
7840 EVT HalfDstVT =
7841 EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
7842 SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
7843 SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
7844
7845 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
7846}
7847
7848SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
7849 SDValue Src = Op.getOperand(0);
7850 EVT SrcVT = Src.getValueType();
7851 EVT DstVT = Op.getValueType();
7852
7853 if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
7854 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
7855 if (SrcVT.getScalarType() != MVT::f32)
7856 return SDValue();
7857 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
7858 }
7859
7860 if (SrcVT.getScalarType() != MVT::f64)
7861 return Op;
7862
7863 SDLoc DL(Op);
7864 if (DstVT == MVT::f16) {
7865 // TODO: Handle strictfp
7866 if (Op.getOpcode() != ISD::FP_ROUND)
7867 return Op;
7868
7869 if (!Subtarget->has16BitInsts()) {
7870 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
7871 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7872 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7873 }
7874 if (Op->getFlags().hasApproximateFuncs()) {
7875 SDValue Flags = Op.getOperand(1);
7876 SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
7877 return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
7878 }
7879 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
7880 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7881 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7882 }
7883
7884 assert(DstVT.getScalarType() == MVT::bf16 &&
7885 "custom lower FP_ROUND for f16 or bf16");
7886 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
7887
7888 // Round-inexact-to-odd f64 to f32, then do the final rounding using the
7889 // hardware f32 -> bf16 instruction.
7890 EVT F32VT = SrcVT.changeElementType(*DAG.getContext(), MVT::f32);
7891 SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
7892 return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
7893 DAG.getTargetConstant(0, DL, MVT::i32));
7894}
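
// The essence of the "round inexact to odd" step used above, shown on plain
// integers: when narrowing, any discarded nonzero bits are folded into the
// result's LSB (a sticky bit) so the following rounding step cannot
// double-round. Integer analogy only; the real expansion works on f64/f32:
#include <cstdint>
static uint32_t narrowRoundToOddSketch(uint64_t X) {
  uint32_t Hi = static_cast<uint32_t>(X >> 32);
  if (static_cast<uint32_t>(X) != 0) // discarded bits nonzero -> inexact
    Hi |= 1u;                        // force the result odd
  return Hi;
}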
7895
7896SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
7897 SelectionDAG &DAG) const {
7898 EVT VT = Op.getValueType();
7899 const MachineFunction &MF = DAG.getMachineFunction();
7900 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7901 bool IsIEEEMode = Info->getMode().IEEE;
7902
7903 // FIXME: Assert during selection that this is only selected for
7904 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
7905 // mode functions, but this happens to be OK since it's only done in cases
7906 // where there is known no sNaN.
7907 if (IsIEEEMode)
7908 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
7909
7910 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7911 VT == MVT::v16bf16)
7912 return splitBinaryVectorOp(Op, DAG);
7913 return Op;
7914}
7915
7916SDValue
7917SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
7918 SelectionDAG &DAG) const {
7919 EVT VT = Op.getValueType();
7920 const MachineFunction &MF = DAG.getMachineFunction();
7921 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7922 bool IsIEEEMode = Info->getMode().IEEE;
7923
7924 if (IsIEEEMode)
7925 return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
7926
7927 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7928 VT == MVT::v16bf16)
7929 return splitBinaryVectorOp(Op, DAG);
7930 return Op;
7931}
7932
7933SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
7934 SelectionDAG &DAG) const {
7935 EVT VT = Op.getValueType();
7936 if (VT.isVector())
7937 return splitBinaryVectorOp(Op, DAG);
7938
7939 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7940 !Subtarget->hasMinimum3Maximum3F16() &&
7941 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7942 "should not need to widen f16 minimum/maximum to v2f16");
7943
7944 // Widen f16 operation to v2f16
7945
7946 // fminimum f16:x, f16:y ->
7947 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
7948 // (v2f16 (scalar_to_vector y))), 0
7949 SDLoc SL(Op);
7950 SDValue WideSrc0 =
7951 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
7952 SDValue WideSrc1 =
7953 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
7954
7955 SDValue Widened =
7956 DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7957
7958 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
7959 DAG.getConstant(0, SL, MVT::i32));
7960}
7961
7962SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
7963 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
7964 EVT VT = Op.getValueType();
7965 assert(VT == MVT::f16);
7966
7967 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
7968 EVT ExpVT = Exp.getValueType();
7969 if (ExpVT == MVT::i16)
7970 return Op;
7971
7972 SDLoc DL(Op);
7973
7974 // Correct the exponent type for f16 to i16.
7975 // Clamp the range of the exponent to the instruction's range.
7976
7977 // TODO: This should be a generic narrowing legalization, and can easily be
7978 // done for GlobalISel.
7979
7980 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
7981 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
7982
7983 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
7984 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
7985
7986 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
7987
7988 if (IsStrict) {
7989 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
7990 {Op.getOperand(0), Op.getOperand(1), TruncExp});
7991 }
7992
7993 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
7994}
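
// Host-side sketch of the exponent clamp performed above with SMAX/SMIN before
// the truncate: saturating to the i16 range preserves the ldexp result, since
// any exponent outside that range already overflows/underflows an f16 value:
#include <algorithm>
#include <cstdint>
static int16_t clampExpToI16Sketch(int32_t Exp) {
  return static_cast<int16_t>(std::clamp<int32_t>(Exp, INT16_MIN, INT16_MAX));
}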
7995
7996static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
7997 switch (Op->getOpcode()) {
7998 case ISD::SRA:
7999 case ISD::SMIN:
8000 case ISD::SMAX:
8001 return ISD::SIGN_EXTEND;
8002 case ISD::SRL:
8003 case ISD::UMIN:
8004 case ISD::UMAX:
8005 return ISD::ZERO_EXTEND;
8006 case ISD::ADD:
8007 case ISD::SUB:
8008 case ISD::AND:
8009 case ISD::OR:
8010 case ISD::XOR:
8011 case ISD::SHL:
8012 case ISD::SELECT:
8013 case ISD::MUL:
8014 // operation result won't be influenced by garbage high bits.
8015 // TODO: are all of those cases correct, and are there more?
8016 return ISD::ANY_EXTEND;
8017 case ISD::SETCC: {
8018 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
8019 return ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
8020 }
8021 default:
8022 llvm_unreachable("unexpected opcode!");
8023 }
8024}
8025
8026SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
8027 DAGCombinerInfo &DCI) const {
8028 const unsigned Opc = Op.getOpcode();
8029 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
8030 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
8031 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
8032 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
8033 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
8034
8035 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
8036 : Op->getOperand(0).getValueType();
8037 auto &DAG = DCI.DAG;
8038 auto ExtTy = OpTy.changeElementType(*DAG.getContext(), MVT::i32);
8039
8040 if (DCI.isBeforeLegalizeOps() ||
8041 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
8042 return SDValue();
8043
8044 SDLoc DL(Op);
8045 SDValue LHS;
8046 SDValue RHS;
8047 if (Opc == ISD::SELECT) {
8048 LHS = Op->getOperand(1);
8049 RHS = Op->getOperand(2);
8050 } else {
8051 LHS = Op->getOperand(0);
8052 RHS = Op->getOperand(1);
8053 }
8054
8055 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
8056 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
8057
8058 // Special case: for shifts, the RHS always needs a zext.
8059 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
8060 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
8061 else
8062 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
8063
8064 // setcc always returns i1/i1 vec so no need to truncate after.
8065 if (Opc == ISD::SETCC) {
8066 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
8067 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
8068 }
8069
8070 // For other ops, we extend the operation's return type as well so we need to
8071 // truncate back to the original type.
8072 SDValue NewVal;
8073 if (Opc == ISD::SELECT)
8074 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
8075 else
8076 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
8077
8078 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
8079}
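
// Host-side sketch of why the promoted i32 operation reproduces the i16
// result, using sra as the example: the LHS gets the extension chosen by
// getExtOpcodeForPromotedOp (sign-extend here), the shift amount is always
// zero-extended, and the result is truncated back. Illustration only; the
// shift amount is kept in range for the sketch:
#include <cstdint>
static int16_t sraViaI32Sketch(int16_t X, uint16_t Amt) {
  int32_t Wide = X;                             // sign-extend the value
  uint32_t WideAmt = Amt & 15u;                 // zero-extend the amount
  return static_cast<int16_t>(Wide >> WideAmt); // truncate back to i16
}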
8080
8081SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
8082 SDValue Mag = Op.getOperand(0);
8083 EVT MagVT = Mag.getValueType();
8084
8085 if (MagVT.getVectorNumElements() > 2)
8086 return splitBinaryVectorOp(Op, DAG);
8087
8088 SDValue Sign = Op.getOperand(1);
8089 EVT SignVT = Sign.getValueType();
8090
8091 if (MagVT == SignVT)
8092 return Op;
8093
8094 // fcopysign v2f16:mag, v2f32:sign ->
8095 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
8096
8097 SDLoc SL(Op);
8098 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
8099 SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32);
8100
8101 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
8102
8103 return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
8104}
8105
8106// Custom lowering for vector multiplications and s_mul_u64.
8107SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
8108 EVT VT = Op.getValueType();
8109
8110 // Split vector operands.
8111 if (VT.isVector())
8112 return splitBinaryVectorOp(Op, DAG);
8113
8114 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
8115
8116 // There are four ways to lower s_mul_u64:
8117 //
8118 // 1. If all the operands are uniform, then we lower it as it is.
8119 //
8120 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
8121 // multiplications because there is not a vector equivalent of s_mul_u64.
8122 //
8123 // 3. If the cost model decides that it is more efficient to use vector
8124 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
8125 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
8126 //
8127 // 4. If the cost model decides to use vector registers and both of the
8128 // operands are zero-extended/sign-extended from 32-bits, then we split the
8129 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
8130 // possible to check if the operands are zero-extended or sign-extended in
8131 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
8132 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
8133 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
8134 // If the cost model decides that we have to use vector registers, then
8135 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
8136 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
8137 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
8138 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
8139 // SIInstrInfo.cpp .
8140
8141 if (Op->isDivergent())
8142 return SDValue();
8143
8144 SDValue Op0 = Op.getOperand(0);
8145 SDValue Op1 = Op.getOperand(1);
8146 // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64
8147 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
8148 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
8149 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
8150 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
8151 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
8152 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
8153 SDLoc SL(Op);
8154 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
8155 return SDValue(
8156 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
8157 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
8158 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
8159 if (Op0SignBits >= 33 && Op1SignBits >= 33)
8160 return SDValue(
8161 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
8162 // If all the operands are uniform, then we lower s_mul_u64 as it is.
8163 return Op;
8164}
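
// The property the known-bits checks above rely on, as a host-side sketch:
// when both operands are zero-extended from 32 bits, the full 64-bit product
// equals a u32 x u32 -> u64 multiply, and likewise for the sign-extended
// case (illustration only):
#include <cstdint>
static uint64_t mulU32U32Sketch(uint32_t A, uint32_t B) {
  return static_cast<uint64_t>(A) * B; // matches (zext A) * (zext B) in 64 bits
}
static int64_t mulI32I32Sketch(int32_t A, int32_t B) {
  return static_cast<int64_t>(A) * B;  // matches (sext A) * (sext B) in 64 bits
}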
8165
8166SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
8167 EVT VT = Op.getValueType();
8168 SDLoc SL(Op);
8169 SDValue LHS = Op.getOperand(0);
8170 SDValue RHS = Op.getOperand(1);
8171 bool isSigned = Op.getOpcode() == ISD::SMULO;
8172
8173 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
8174 const APInt &C = RHSC->getAPIntValue();
8175 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
8176 if (C.isPowerOf2()) {
8177 // smulo(x, signed_min) is same as umulo(x, signed_min).
8178 bool UseArithShift = isSigned && !C.isMinSignedValue();
8179 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
8180 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
8181 SDValue Overflow =
8182 DAG.getSetCC(SL, MVT::i1,
8183 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
8184 Result, ShiftAmt),
8185 LHS, ISD::SETNE);
8186 return DAG.getMergeValues({Result, Overflow}, SL);
8187 }
8188 }
8189
8190 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
8191 SDValue Top =
8192 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
8193
8194 SDValue Sign = isSigned
8195 ? DAG.getNode(ISD::SRA, SL, VT, Result,
8196 DAG.getConstant(VT.getScalarSizeInBits() - 1,
8197 SL, MVT::i32))
8198 : DAG.getConstant(0, SL, VT);
8199 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
8200
8201 return DAG.getMergeValues({Result, Overflow}, SL);
8202}
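
// Host-side sketch of the power-of-two path above, for the signed case with a
// positive constant (so the arithmetic-shift variant is used): shift left,
// then check that shifting back recovers the original value. Shown for 32-bit
// values; the DAG version works in the operation's type:
#include <cstdint>
struct SMulOSketch {
  int32_t Result;
  bool Overflow;
};
static SMulOSketch smuloPow2Sketch(int32_t X, unsigned Log2C) {
  int32_t Shifted = static_cast<int32_t>(static_cast<uint32_t>(X) << Log2C);
  return {Shifted, (Shifted >> Log2C) != X}; // lost significant bits => overflow
}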
8203
8204SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
8205 if (Op->isDivergent()) {
8206 // Select to V_MAD_[IU]64_[IU]32.
8207 return Op;
8208 }
8209 if (Subtarget->hasSMulHi()) {
8210 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
8211 return SDValue();
8212 }
8213 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
8214 // calculate the high part, so we might as well do the whole thing with
8215 // V_MAD_[IU]64_[IU]32.
8216 return Op;
8217}
8218
8219SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
8220 if (!Subtarget->isTrapHandlerEnabled() ||
8221 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
8222 return lowerTrapEndpgm(Op, DAG);
8223
8224 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
8225 : lowerTrapHsaQueuePtr(Op, DAG);
8226}
8227
8228SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
8229 SDLoc SL(Op);
8230 SDValue Chain = Op.getOperand(0);
8231 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8232}
8233
8234SDValue
8235SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
8236 const SDLoc &DL, Align Alignment,
8237 ImplicitParameter Param) const {
8238 MachineFunction &MF = DAG.getMachineFunction();
8239 uint64_t Offset = getImplicitParameterOffset(MF, Param);
8240 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
8241 MachinePointerInfo PtrInfo =
8242 MachinePointerInfo(AMDGPUAS::CONSTANT_ADDRESS);
8243 return DAG.getLoad(
8244 VT, DL, DAG.getEntryNode(), Ptr, PtrInfo.getWithOffset(Offset), Alignment,
8245 MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant);
8246}
8247
8248SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
8249 SelectionDAG &DAG) const {
8250 SDLoc SL(Op);
8251 SDValue Chain = Op.getOperand(0);
8252
8253 SDValue QueuePtr;
8254 // For code object version 5, QueuePtr is passed through implicit kernarg.
8255 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8256 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
8257 QueuePtr =
8258 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
8259 } else {
8260 MachineFunction &MF = DAG.getMachineFunction();
8261 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8262 Register UserSGPR = Info->getQueuePtrUserSGPR();
8263
8264 if (UserSGPR == AMDGPU::NoRegister) {
8265 // We probably are in a function incorrectly marked with
8266 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
8267 // trap, so just use a null pointer.
8268 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
8269 } else {
8270 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
8271 MVT::i64);
8272 }
8273 }
8274
8275 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
8276 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
8277
8278 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8279 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
8280 ToReg.getValue(1)};
8281 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8282}
8283
8284SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
8285 SDLoc SL(Op);
8286 SDValue Chain = Op.getOperand(0);
8287
8288 // We need to simulate the 's_trap 2' instruction on targets that run in
8289 // PRIV=1 (where it is treated as a nop).
8290 if (Subtarget->hasPrivEnabledTrap2NopBug())
8291 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
8292
8293 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8294 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8295 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8296}
8297
8298SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
8299 SDLoc SL(Op);
8300 SDValue Chain = Op.getOperand(0);
8301 MachineFunction &MF = DAG.getMachineFunction();
8302
8303 if (!Subtarget->isTrapHandlerEnabled() ||
8304 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
8305 LLVMContext &Ctx = MF.getFunction().getContext();
8306 Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
8307 "debugtrap handler not supported",
8308 Op.getDebugLoc(), DS_Warning));
8309 return Chain;
8310 }
8311
8312 uint64_t TrapID =
8313 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
8314 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8315 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8316}
8317
8318SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
8319 SelectionDAG &DAG) const {
8320 if (Subtarget->hasApertureRegs()) {
8321 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
8322 ? AMDGPU::SRC_SHARED_BASE
8323 : AMDGPU::SRC_PRIVATE_BASE;
8324 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8325 !Subtarget->hasGloballyAddressableScratch()) &&
8326 "Cannot use src_private_base with globally addressable scratch!");
8327 // Note: this feature (register) is broken. When used as a 32-bit operand,
8328 // it returns a wrong value (all zeroes?). The real value is in the upper 32
8329 // bits.
8330 //
8331 // To work around the issue, emit a 64 bit copy from this register
8332 // then extract the high bits. Note that this shouldn't even result in a
8333 // shift being emitted and simply become a pair of registers (e.g.):
8334 // s_mov_b64 s[6:7], src_shared_base
8335 // v_mov_b32_e32 v1, s7
8336 SDValue Copy =
8337 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ApertureRegNo, MVT::v2i32);
8338 return DAG.getExtractVectorElt(DL, MVT::i32, Copy, 1);
8339 }
8340
8341 // For code object version 5, private_base and shared_base are passed through
8342 // implicit kernargs.
8343 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8344 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
8345 ImplicitParameter Param =
8346 (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
8347 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
8348 }
8349
8350 MachineFunction &MF = DAG.getMachineFunction();
8351 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8352 Register UserSGPR = Info->getQueuePtrUserSGPR();
8353 if (UserSGPR == AMDGPU::NoRegister) {
8354 // We probably are in a function incorrectly marked with
8355 // amdgpu-no-queue-ptr. This is undefined.
8356 return DAG.getPOISON(MVT::i32);
8357 }
8358
8359 SDValue QueuePtr =
8360 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
8361
8362 // Offset into amd_queue_t for group_segment_aperture_base_hi /
8363 // private_segment_aperture_base_hi.
8364 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
8365
8366 SDValue Ptr =
8367 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
8368
8369 // TODO: Use custom target PseudoSourceValue.
8370 // TODO: We should use the value from the IR intrinsic call, but it might not
8371 // be available and how do we get it?
8372 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
8373 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
8374 commonAlignment(Align(64), StructOffset),
8375 MachineMemOperand::MODereferenceable |
8376 MachineMemOperand::MOInvariant);
8377}
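
// How the 32-bit aperture produced here gets used: it supplies the high half
// of the 64-bit flat address built for a local/private pointer (mirroring the
// BUILD_VECTOR + BITCAST done in lowerADDRSPACECAST). Host-side sketch:
#include <cstdint>
static uint64_t segmentToFlatSketch(uint32_t SegmentOffset, uint32_t ApertureHi) {
  return (static_cast<uint64_t>(ApertureHi) << 32) | SegmentOffset;
}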
8378
8379/// Return true if the value is a known valid address, such that a null check is
8380/// not necessary.
8381static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
8382 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
8384 return true;
8385
8386 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
8387 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
8388
8389 // TODO: Search through arithmetic, handle arguments and loads
8390 // marked nonnull.
8391 return false;
8392}
8393
8394SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
8395 SelectionDAG &DAG) const {
8396 SDLoc SL(Op);
8397
8398 const AMDGPUTargetMachine &TM =
8399 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
8400
8401 unsigned DestAS, SrcAS;
8402 SDValue Src;
8403 bool IsNonNull = false;
8404 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
8405 SrcAS = ASC->getSrcAddressSpace();
8406 Src = ASC->getOperand(0);
8407 DestAS = ASC->getDestAddressSpace();
8408 } else {
8409 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
8410 Op.getConstantOperandVal(0) ==
8411 Intrinsic::amdgcn_addrspacecast_nonnull);
8412 Src = Op->getOperand(1);
8413 SrcAS = Op->getConstantOperandVal(2);
8414 DestAS = Op->getConstantOperandVal(3);
8415 IsNonNull = true;
8416 }
8417
8418 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
8419
8420 // flat -> local/private
8421 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
8422 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
8423 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
8424 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8425
8426 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
8427 Subtarget->hasGloballyAddressableScratch()) {
8428 // flat -> private with globally addressable scratch: subtract
8429 // src_flat_scratch_base_lo.
8430 SDValue FlatScratchBaseLo(
8431 DAG.getMachineNode(
8432 AMDGPU::S_MOV_B32, SL, MVT::i32,
8433 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8434 0);
8435 Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
8436 }
8437
8438 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8439 return Ptr;
8440
8441 unsigned NullVal = TM.getNullPointerValue(DestAS);
8442 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8443 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
8444
8445 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
8446 SegmentNullPtr);
8447 }
8448 }
8449
8450 // local/private -> flat
8451 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
8452 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
8453 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
8454 SDValue CvtPtr;
8455 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
8456 Subtarget->hasGloballyAddressableScratch()) {
8457 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
8458 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
8459 SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
8460 SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
8461 ThreadID = DAG.getNode(
8462 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8463 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
8464 AllOnes, ThreadID);
8465 if (Subtarget->isWave64())
8466 ThreadID = DAG.getNode(
8467 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8468 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
8469 AllOnes, ThreadID);
8470 SDValue ShAmt = DAG.getShiftAmountConstant(
8471 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
8472 SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
8473 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
8474 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8475 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
8476 // 64-bit hi:lo value.
8477 SDValue FlatScratchBase = {
8478 DAG.getMachineNode(
8479 AMDGPU::S_MOV_B64, SL, MVT::i64,
8480 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8481 0};
8482 CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8483 } else {
8484 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8485 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
8486 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8487 }
8488
8489 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8490 return CvtPtr;
8491
8492 unsigned NullVal = TM.getNullPointerValue(SrcAS);
8493 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8494
8495 SDValue NonNull =
8496 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
8497
8498 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
8499 FlatNullPtr);
8500 }
8501 }
8502
8503 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8504 Op.getValueType() == MVT::i64) {
8505 const SIMachineFunctionInfo *Info =
8506 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
8507 if (Info->get32BitAddressHighBits() == 0)
8508 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, Src);
8509
8510 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
8511 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
8512 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
8513 }
8514
8515 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8516 Src.getValueType() == MVT::i64)
8517 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8518
8519 // global <-> flat are no-ops and never emitted.
8520
8521 // Invalid casts are poison.
8522 return DAG.getPOISON(Op->getValueType(0));
8523}
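// A standalone sketch of the pointer math above for the local/private cases
// (ignoring the globally addressable scratch path); helper names are
// illustrative, and it assumes the 32-bit segment null is ~0u while the
// 64-bit flat null is 0.
#include <cstdint>
static uint64_t segmentToFlat(uint32_t SegPtr, uint32_t ApertureHi) {
  if (SegPtr == UINT32_MAX)                     // segment null -> flat null
    return 0;
  return (uint64_t(ApertureHi) << 32) | SegPtr; // build_vector + bitcast
}
static uint32_t flatToSegment(uint64_t FlatPtr) {
  if (FlatPtr == 0)                             // flat null -> segment null
    return UINT32_MAX;
  return uint32_t(FlatPtr);                     // truncate to the low 32 bits
}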
8524
8525// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
8526// the small vector and inserting them into the big vector. That is better than
8527// the default expansion of doing it via a stack slot. Even though the use of
8528// the stack slot would be optimized away afterwards, the stack slot itself
8529// remains.
8530SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
8531 SelectionDAG &DAG) const {
8532 SDValue Vec = Op.getOperand(0);
8533 SDValue Ins = Op.getOperand(1);
8534 SDValue Idx = Op.getOperand(2);
8535 EVT VecVT = Vec.getValueType();
8536 EVT InsVT = Ins.getValueType();
8537 EVT EltVT = VecVT.getVectorElementType();
8538 unsigned InsNumElts = InsVT.getVectorNumElements();
8539 unsigned IdxVal = Idx->getAsZExtVal();
8540 SDLoc SL(Op);
8541
8542 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
8543 // Insert 32-bit registers at a time.
8544 assert(InsNumElts % 2 == 0 && "expect legal vector types");
8545
8546 unsigned VecNumElts = VecVT.getVectorNumElements();
8547 EVT NewVecVT =
8548 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
8549 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8550 : EVT::getVectorVT(*DAG.getContext(),
8551 MVT::i32, InsNumElts / 2);
8552
8553 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
8554 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
8555
8556 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
8557 SDValue Elt;
8558 if (InsNumElts == 2) {
8559 Elt = Ins;
8560 } else {
8561 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
8562 DAG.getConstant(I, SL, MVT::i32));
8563 }
8564 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
8565 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
8566 }
8567
8568 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
8569 }
8570
8571 for (unsigned I = 0; I != InsNumElts; ++I) {
8572 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
8573 DAG.getConstant(I, SL, MVT::i32));
8574 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
8575 DAG.getConstant(IdxVal + I, SL, MVT::i32));
8576 }
8577 return Vec;
8578}
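// A scalar sketch of the loop above with plain arrays (names are
// illustrative): each element of the small vector is copied into the big
// vector starting at the insertion index, with no stack slot involved.
#include <cstddef>
static void insertSubvector(int *Vec, const int *Ins, size_t InsNumElts,
                            size_t IdxVal) {
  for (size_t I = 0; I != InsNumElts; ++I)
    Vec[IdxVal + I] = Ins[I]; // extract_vector_elt + insert_vector_elt pair
}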
8579
8580SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
8581 SelectionDAG &DAG) const {
8582 SDValue Vec = Op.getOperand(0);
8583 SDValue InsVal = Op.getOperand(1);
8584 SDValue Idx = Op.getOperand(2);
8585 EVT VecVT = Vec.getValueType();
8586 EVT EltVT = VecVT.getVectorElementType();
8587 unsigned VecSize = VecVT.getSizeInBits();
8588 unsigned EltSize = EltVT.getSizeInBits();
8589 SDLoc SL(Op);
8590
8591 // Specially handle the case of v4i16 with static indexing.
8592 unsigned NumElts = VecVT.getVectorNumElements();
8593 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
8594 if (NumElts == 4 && EltSize == 16 && KIdx) {
8595 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
8596
8597 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8598 DAG.getConstant(0, SL, MVT::i32));
8599 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8600 DAG.getConstant(1, SL, MVT::i32));
8601
8602 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
8603 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
8604
8605 unsigned Idx = KIdx->getZExtValue();
8606 bool InsertLo = Idx < 2;
8607 SDValue InsHalf = DAG.getNode(
8608 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
8609 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
8610 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8611
8612 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
8613
8614 SDValue Concat =
8615 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
8616 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8617
8618 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
8619 }
8620
8621 // Static indexing does not lower to stack access, and hence there is no need
8622 // for special custom lowering to avoid stack access.
8623 if (isa<ConstantSDNode>(Idx))
8624 return SDValue();
8625
8626 // Avoid stack access for dynamic indexing by custom lowering to
8627 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
8628
8629 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
8630
8631 MVT IntVT = MVT::getIntegerVT(VecSize);
8632
8633 // Convert vector index to bit-index and get the required bit mask.
8634 assert(isPowerOf2_32(EltSize));
8635 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
8636 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8637 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8638 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
8639 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
8640
8641 // 1. Create a congruent vector with the target value in each element.
8642 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
8643 DAG.getSplatBuildVector(VecVT, SL, InsVal));
8644
8645 // 2. Mask off all other indices except the required index within (1).
8646 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
8647
8648 // 3. Mask off the required index within the target vector.
8649 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8650 SDValue RHS =
8651 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
8652
8653 // 4. Get (2) and (3) ORed into the target vector.
8654 SDValue BFI =
8655 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
8656
8657 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
8658}
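// A scalar sketch of the v_bfm/v_bfi style blend above, assuming a v4i16
// vector packed into a 64-bit integer and a runtime element index (the
// function name is illustrative).
#include <cstdint>
static uint64_t insertElt16(uint64_t Vec, uint16_t Val, unsigned Idx) {
  uint64_t BFM = 0xFFFFull << (Idx * 16);       // all-ones mask over the lane
  uint64_t Splat = 0x0001000100010001ull * Val; // the value in every lane
  return (Splat & BFM) | (Vec & ~BFM);          // bfi: blend splat and vector
}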
8659
8660SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
8661 SelectionDAG &DAG) const {
8662 SDLoc SL(Op);
8663
8664 EVT ResultVT = Op.getValueType();
8665 SDValue Vec = Op.getOperand(0);
8666 SDValue Idx = Op.getOperand(1);
8667 EVT VecVT = Vec.getValueType();
8668 unsigned VecSize = VecVT.getSizeInBits();
8669 EVT EltVT = VecVT.getVectorElementType();
8670
8671 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
8672
8673 // Make sure we do any optimizations that will make it easier to fold
8674 // source modifiers before obscuring it with bit operations.
8675
8676 // XXX - Why doesn't this get called when vector_shuffle is expanded?
8677 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
8678 return Combined;
8679
8680 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8681 SDValue Lo, Hi;
8682 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
8683
8684 if (VecSize == 128) {
8685 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
8686 Lo = DAG.getBitcast(LoVT,
8687 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8688 DAG.getConstant(0, SL, MVT::i32)));
8689 Hi = DAG.getBitcast(HiVT,
8690 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8691 DAG.getConstant(1, SL, MVT::i32)));
8692 } else if (VecSize == 256) {
8693 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
8694 SDValue Parts[4];
8695 for (unsigned P = 0; P < 4; ++P) {
8696 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8697 DAG.getConstant(P, SL, MVT::i32));
8698 }
8699
8700 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8701 Parts[0], Parts[1]));
8702 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8703 Parts[2], Parts[3]));
8704 } else {
8705 assert(VecSize == 512);
8706
8707 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
8708 SDValue Parts[8];
8709 for (unsigned P = 0; P < 8; ++P) {
8710 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8711 DAG.getConstant(P, SL, MVT::i32));
8712 }
8713
8714 Lo = DAG.getBitcast(LoVT,
8715 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8716 Parts[0], Parts[1], Parts[2], Parts[3]));
8717 Hi = DAG.getBitcast(HiVT,
8718 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8719 Parts[4], Parts[5], Parts[6], Parts[7]));
8720 }
8721
8722 EVT IdxVT = Idx.getValueType();
8723 unsigned NElem = VecVT.getVectorNumElements();
8724 assert(isPowerOf2_32(NElem));
8725 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
8726 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
8727 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
8728 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
8729 }
8730
8731 assert(VecSize <= 64);
8732
8733 MVT IntVT = MVT::getIntegerVT(VecSize);
8734
8735 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
8736 SDValue VecBC = peekThroughBitcasts(Vec);
8737 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8738 SDValue Src = VecBC.getOperand(0);
8739 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8740 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
8741 }
8742
8743 unsigned EltSize = EltVT.getSizeInBits();
8744 assert(isPowerOf2_32(EltSize));
8745
8746 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8747
8748 // Convert vector index to bit-index (* EltSize)
8749 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8750
8751 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8752 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
8753
8754 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8755 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
8756 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
8757 }
8758
8759 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
8760}
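// A scalar sketch of the sub-dword extract above, assuming a v4i16 vector
// packed into a 64-bit integer: shift the wanted element down to bit 0 and
// truncate (function name is illustrative).
#include <cstdint>
static uint16_t extractElt16(uint64_t Vec, unsigned Idx) {
  unsigned BitIdx = Idx * 16;     // shl idx, log2(EltSize)
  return uint16_t(Vec >> BitIdx); // srl + truncate
}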
8761
8762static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
8763 assert(Elt % 2 == 0);
8764 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8765}
8766
8767static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
8768 assert(Elt % 2 == 0);
8769 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8770 !(Mask[Elt + 1] & 1);
8771}
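// A small standalone usage sketch of the two pair predicates above on plain
// int masks (duplicated here only for illustration): <0,1,6,7> is made of
// contiguous even-aligned pairs, while <3,2,7,6> is made of odd-to-even pairs.
#include <cassert>
static bool pairIsContiguous(const int *Mask, int Elt) {
  return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
}
static bool pairIsOddToEven(const int *Mask, int Elt) {
  return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
         !(Mask[Elt + 1] & 1);
}
int main() {
  const int A[] = {0, 1, 6, 7};
  const int B[] = {3, 2, 7, 6};
  assert(pairIsContiguous(A, 0) && pairIsContiguous(A, 2));
  assert(pairIsOddToEven(B, 0) && pairIsOddToEven(B, 2));
  return 0;
}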
8772
8773SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
8774 SelectionDAG &DAG) const {
8775 SDLoc SL(Op);
8776 EVT ResultVT = Op.getValueType();
8777 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
8778 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
8779 const int NewSrcNumElts = 2;
8780 MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
8781 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
8782
8783 // Break up the shuffle into register-sized pieces.
8784 //
8785 // We're trying to form sub-shuffles that the register allocation pipeline
8786 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
8787 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
8788 // pair of copies into a consecutive register copy, so use the ordinary
8789 // extract_vector_elt lowering unless we can use the shuffle.
8790 //
8791 // TODO: This is a bit of a hack, and we should probably always use
8792 // extract_subvector for the largest possible subvector we can (or at least
8793 // use it for PackVT-aligned pieces). However, we have worse support for
8794 // combines on them and don't directly treat extract_subvector /
8795 // insert_subvector as legal. The DAG scheduler also ends up doing a worse
8796 // job with the extract_subvectors.
8797 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
8798
8799 // vector_shuffle <0,1,6,7> lhs, rhs
8800 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
8801 //
8802 // vector_shuffle <6,7,2,3> lhs, rhs
8803 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
8804 //
8805 // vector_shuffle <6,7,0,1> lhs, rhs
8806 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
8807
8808 // Avoid scalarizing when both halves are reading from consecutive elements.
8809
8810 // If we're treating 2 element shuffles as legal, also create odd-to-even
8811 // shuffles of neighboring pairs.
8812 //
8813 // vector_shuffle <3,2,7,6> lhs, rhs
8814 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
8815 // vector_shuffle <1, 0> (extract_subvector rhs, 2)
8816
8818 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
8819 if (ShouldUseConsecutiveExtract &&
8821 const int Idx = SVN->getMaskElt(I);
8822 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8823 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8824 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
8825 SVN->getOperand(VecIdx),
8826 DAG.getConstant(EltIdx, SL, MVT::i32));
8827 Pieces.push_back(SubVec);
8828 } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
8829 isOperationLegal(ISD::VECTOR_SHUFFLE, PackVT)) {
8830 int Idx0 = SVN->getMaskElt(I);
8831 int Idx1 = SVN->getMaskElt(I + 1);
8832
8833 SDValue SrcOp0 = SVN->getOperand(0);
8834 SDValue SrcOp1 = SrcOp0;
8835 if (Idx0 >= SrcNumElts) {
8836 SrcOp0 = SVN->getOperand(1);
8837 Idx0 -= SrcNumElts;
8838 }
8839
8840 if (Idx1 >= SrcNumElts) {
8841 SrcOp1 = SVN->getOperand(1);
8842 Idx1 -= SrcNumElts;
8843 }
8844
8845 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8846 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8847
8848 // Extract nearest even aligned piece.
8849 SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
8850 DAG.getConstant(AlignedIdx0, SL, MVT::i32));
8851 SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
8852 DAG.getConstant(AlignedIdx1, SL, MVT::i32));
8853
8854 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8855 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8856
8857 SDValue Result0 = SubVec0;
8858 SDValue Result1 = SubVec0;
8859
8860 if (SubVec0 != SubVec1) {
8861 NewMaskIdx1 += NewSrcNumElts;
8862 Result1 = SubVec1;
8863 } else {
8864 Result1 = DAG.getPOISON(PackVT);
8865 }
8866
8867 SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
8868 {NewMaskIdx0, NewMaskIdx1});
8869 Pieces.push_back(Shuf);
8870 } else {
8871 const int Idx0 = SVN->getMaskElt(I);
8872 const int Idx1 = SVN->getMaskElt(I + 1);
8873 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8874 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8875 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8876 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8877
8878 SDValue Vec0 = SVN->getOperand(VecIdx0);
8879 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
8880 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
8881
8882 SDValue Vec1 = SVN->getOperand(VecIdx1);
8883 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
8884 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
8885 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
8886 }
8887 }
8888
8889 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
8890}
8891
8892SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
8893 SelectionDAG &DAG) const {
8894 SDValue SVal = Op.getOperand(0);
8895 EVT ResultVT = Op.getValueType();
8896 EVT SValVT = SVal.getValueType();
8897 SDValue UndefVal = DAG.getPOISON(SValVT);
8898 SDLoc SL(Op);
8899
8901 VElts.push_back(SVal);
8902 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
8903 VElts.push_back(UndefVal);
8904
8905 return DAG.getBuildVector(ResultVT, SL, VElts);
8906}
8907
8908SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
8909 SelectionDAG &DAG) const {
8910 SDLoc SL(Op);
8911 EVT VT = Op.getValueType();
8912
8913 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8914 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
8915
8916 SDValue Lo = Op.getOperand(0);
8917 SDValue Hi = Op.getOperand(1);
8918
8919 // Avoid adding defined bits with the zero_extend.
8920 if (Hi.isUndef()) {
8921 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8922 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
8923 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
8924 }
8925
8926 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
8927 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
8928
8929 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
8930 DAG.getConstant(16, SL, MVT::i32));
8931 if (Lo.isUndef())
8932 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
8933
8934 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8935 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
8936
8937 SDValue Or =
8938 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
8939 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
8940 }
8941
8942 // Split into 2-element chunks.
8943 const unsigned NumParts = VT.getVectorNumElements() / 2;
8944 EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
8945 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
8946
8948 for (unsigned P = 0; P < NumParts; ++P) {
8949 SDValue Vec = DAG.getBuildVector(
8950 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
8951 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
8952 }
8953
8954 SDValue Blend =
8955 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
8956 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
8957}
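// A scalar sketch of the two-half packing above (function name is
// illustrative): zero-extend both 16-bit halves and OR the high one in at
// bit 16; the OR is disjoint by construction.
#include <cstdint>
static uint32_t packV2I16(uint16_t Lo, uint16_t Hi) {
  return uint32_t(Lo) | (uint32_t(Hi) << 16);
}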
8958
8960 const GlobalAddressSDNode *GA) const {
8961 // OSes that use ELF REL relocations (instead of RELA) can only store a
8962 // 32-bit addend in the instruction, so it is not safe to allow offset folding
8963 // which can create arbitrary 64-bit addends. (This is only a problem for
8964 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
8965 // the high 32 bits of the addend.)
8966 //
8967 // This should be kept in sync with how HasRelocationAddend is initialized in
8968 // the constructor of ELFAMDGPUAsmBackend.
8969 if (!Subtarget->isAmdHsaOS())
8970 return false;
8971
8972 // We can fold offsets for anything that doesn't require a GOT relocation.
8973 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
8977}
8978
8979static SDValue
8981 const SDLoc &DL, int64_t Offset, EVT PtrVT,
8982 unsigned GAFlags = SIInstrInfo::MO_NONE) {
8983 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
8984 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
8985 // lowered to the following code sequence:
8986 //
8987 // For constant address space:
8988 // s_getpc_b64 s[0:1]
8989 // s_add_u32 s0, s0, $symbol
8990 // s_addc_u32 s1, s1, 0
8991 //
8992 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8993 // a fixup or relocation is emitted to replace $symbol with a literal
8994 // constant, which is a pc-relative offset from the encoding of the $symbol
8995 // operand to the global variable.
8996 //
8997 // For global address space:
8998 // s_getpc_b64 s[0:1]
8999 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
9000 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
9001 //
9002 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
9003 // fixups or relocations are emitted to replace $symbol@*@lo and
9004 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
9005 // which is a 64-bit pc-relative offset from the encoding of the $symbol
9006 // operand to the global variable.
9007 if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
9008 assert(GAFlags != SIInstrInfo::MO_NONE);
9009
9010 SDValue Ptr =
9011 DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2);
9012 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
9013 }
9014
9015 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
9016 SDValue PtrHi;
9017 if (GAFlags == SIInstrInfo::MO_NONE)
9018 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
9019 else
9020 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
9021 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
9022}
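// A scalar sketch of the s_add_u32 / s_addc_u32 pair in the comment above:
// a 64-bit add assembled from two 32-bit halves with carry, assuming the
// rel32 lo/hi literals form a 64-bit pc-relative offset (names illustrative).
#include <cstdint>
static uint64_t add64ViaCarry(uint32_t PcLo, uint32_t PcHi, uint32_t RelLo,
                              uint32_t RelHi) {
  uint32_t Lo = PcLo + RelLo;
  uint32_t Carry = Lo < PcLo ? 1 : 0; // SCC produced by s_add_u32
  uint32_t Hi = PcHi + RelHi + Carry; // s_addc_u32 consumes SCC
  return (uint64_t(Hi) << 32) | Lo;
}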
9023
9024SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
9025 SDValue Op,
9026 SelectionDAG &DAG) const {
9027 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
9028 SDLoc DL(GSD);
9029 EVT PtrVT = Op.getValueType();
9030
9031 const GlobalValue *GV = GSD->getGlobal();
9037 GV->hasExternalLinkage()) {
9038 Type *Ty = GV->getValueType();
9039 // HIP uses an unsized array `extern __shared__ T s[]`, or a similar
9040 // zero-sized type in other languages, to declare dynamic shared
9041 // memory whose size is not known at compile time. Such arrays are
9042 // allocated by the runtime and placed directly after the statically
9043 // allocated ones, so they all share the same offset.
9044 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
9045 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
9046 // Adjust alignment for that dynamic shared memory array.
9049 MFI->setUsesDynamicLDS(true);
9050 return SDValue(
9051 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
9052 }
9053 }
9055 }
9056
9058 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
9060 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
9061 }
9062
9063 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
9064 if (Subtarget->has64BitLiterals()) {
9066 GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64);
9067 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr),
9068 0);
9069 }
9070
9071 SDValue AddrLo = DAG.getTargetGlobalAddress(
9072 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
9073 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
9074
9075 SDValue AddrHi = DAG.getTargetGlobalAddress(
9076 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
9077 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
9078
9079 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
9080 }
9081
9082 if (shouldEmitFixup(GV))
9083 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
9084
9085 if (shouldEmitPCReloc(GV))
9086 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
9088
9089 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
9091 PointerType *PtrTy =
9093 const DataLayout &DataLayout = DAG.getDataLayout();
9094 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
9095 MachinePointerInfo PtrInfo =
9097
9098 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
9101}
9102
9103SDValue SITargetLowering::LowerExternalSymbol(SDValue Op,
9104 SelectionDAG &DAG) const {
9105 // TODO: Handle this. It should be mostly the same as LowerGlobalAddress.
9106 const Function &Fn = DAG.getMachineFunction().getFunction();
9107 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9108 Fn, "unsupported external symbol", Op.getDebugLoc()));
9109 return DAG.getPOISON(Op.getValueType());
9110}
9111
9113 const SDLoc &DL, SDValue V) const {
9114 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
9115 // the destination register.
9116 //
9117 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
9118 // so we will end up with redundant moves to m0.
9119 //
9120 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
9121
9122 // A Null SDValue creates a glue result.
9123 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
9124 V, Chain);
9125 return SDValue(M0, 0);
9126}
9127
9128SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
9129 MVT VT,
9130 unsigned Offset) const {
9131 SDLoc SL(Op);
9132 SDValue Param = lowerKernargMemParameter(
9133 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
9134 // The local size values will have the high 16 bits as zero.
9135 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
9136 DAG.getValueType(VT));
9137}
9138
9140 EVT VT) {
9143 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
9144 return DAG.getPOISON(VT);
9145}
9146
9148 EVT VT) {
9151 "intrinsic not supported on subtarget", DL.getDebugLoc()));
9152 return DAG.getPOISON(VT);
9153}
9154
9156 ArrayRef<SDValue> Elts) {
9157 assert(!Elts.empty());
9158 MVT Type;
9159 unsigned NumElts = Elts.size();
9160
9161 if (NumElts <= 12) {
9162 Type = MVT::getVectorVT(MVT::f32, NumElts);
9163 } else {
9164 assert(Elts.size() <= 16);
9165 Type = MVT::v16f32;
9166 NumElts = 16;
9167 }
9168
9169 SmallVector<SDValue, 16> VecElts(NumElts);
9170 for (unsigned i = 0; i < Elts.size(); ++i) {
9171 SDValue Elt = Elts[i];
9172 if (Elt.getValueType() != MVT::f32)
9173 Elt = DAG.getBitcast(MVT::f32, Elt);
9174 VecElts[i] = Elt;
9175 }
9176 for (unsigned i = Elts.size(); i < NumElts; ++i)
9177 VecElts[i] = DAG.getPOISON(MVT::f32);
9178
9179 if (NumElts == 1)
9180 return VecElts[0];
9181 return DAG.getBuildVector(Type, DL, VecElts);
9182}
9183
9184static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
9185 SDValue Src, int ExtraElts) {
9186 EVT SrcVT = Src.getValueType();
9187
9189
9190 if (SrcVT.isVector())
9191 DAG.ExtractVectorElements(Src, Elts);
9192 else
9193 Elts.push_back(Src);
9194
9195 SDValue Undef = DAG.getPOISON(SrcVT.getScalarType());
9196 while (ExtraElts--)
9197 Elts.push_back(Undef);
9198
9199 return DAG.getBuildVector(CastVT, DL, Elts);
9200}
9201
9202 // Reconstruct the required return value for an image load intrinsic.
9203 // This is more complicated due to the optional use of TexFailCtrl, which
9204 // means the required return type is an aggregate.
9206 ArrayRef<EVT> ResultTypes, bool IsTexFail,
9207 bool Unpacked, bool IsD16, int DMaskPop,
9208 int NumVDataDwords, bool IsAtomicPacked16Bit,
9209 const SDLoc &DL) {
9210 // Determine the required return type. This is the same regardless of the
9211 // IsTexFail flag.
9212 EVT ReqRetVT = ResultTypes[0];
9213 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
9214 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9215 ? (ReqRetNumElts + 1) / 2
9216 : ReqRetNumElts;
9217
9218 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9219
9220 MVT DataDwordVT =
9221 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
9222
9223 MVT MaskPopVT =
9224 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
9225
9226 SDValue Data(Result, 0);
9227 SDValue TexFail;
9228
9229 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
9230 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
9231 if (MaskPopVT.isVector()) {
9232 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
9233 SDValue(Result, 0), ZeroIdx);
9234 } else {
9235 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
9236 SDValue(Result, 0), ZeroIdx);
9237 }
9238 }
9239
9240 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
9241 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
9242 NumDataDwords - MaskPopDwords);
9243
9244 if (IsD16)
9245 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
9246
9247 EVT LegalReqRetVT = ReqRetVT;
9248 if (!ReqRetVT.isVector()) {
9249 if (!Data.getValueType().isInteger())
9250 Data = DAG.getNode(ISD::BITCAST, DL,
9251 Data.getValueType().changeTypeToInteger(), Data);
9252 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
9253 } else {
9254 // We need to widen the return vector to a legal type
9255 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
9256 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
9257 LegalReqRetVT =
9259 ReqRetVT.getVectorNumElements() + 1);
9260 }
9261 }
9262 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
9263
9264 if (IsTexFail) {
9265 TexFail =
9266 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
9267 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
9268
9269 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
9270 }
9271
9272 if (Result->getNumValues() == 1)
9273 return Data;
9274
9275 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
9276}
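// A sketch of the dword accounting used above (helper name is illustrative):
// packed D16 results need half a dword per element, rounded up; everything
// else needs one dword per element.
static int numDataDwords(int ReqRetNumElts, bool IsD16, bool Unpacked,
                         bool IsAtomicPacked16Bit) {
  return ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
             ? (ReqRetNumElts + 1) / 2
             : ReqRetNumElts;
}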
9277
9278static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
9279 SDValue *LWE, bool &IsTexFail) {
9280 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
9281
9282 uint64_t Value = TexFailCtrlConst->getZExtValue();
9283 if (Value) {
9284 IsTexFail = true;
9285 }
9286
9287 SDLoc DL(TexFailCtrlConst);
9288 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
9289 Value &= ~(uint64_t)0x1;
9290 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
9291 Value &= ~(uint64_t)0x2;
9292
9293 return Value == 0;
9294}
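// A standalone sketch of the texfailctrl decoding above (name illustrative):
// bit 0 is TFE, bit 1 is LWE, and any other set bit makes the control invalid.
#include <cstdint>
static bool decodeTexFailCtrl(uint64_t Value, bool &TFE, bool &LWE,
                              bool &IsTexFail) {
  IsTexFail = Value != 0;
  TFE = (Value & 0x1) != 0;
  LWE = (Value & 0x2) != 0;
  return (Value & ~uint64_t(0x3)) == 0; // false -> unsupported bits were set
}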
9295
9297 MVT PackVectorVT,
9298 SmallVectorImpl<SDValue> &PackedAddrs,
9299 unsigned DimIdx, unsigned EndIdx,
9300 unsigned NumGradients) {
9301 SDLoc DL(Op);
9302 for (unsigned I = DimIdx; I < EndIdx; I++) {
9303 SDValue Addr = Op.getOperand(I);
9304
9305 // Gradients are packed with undef for each coordinate.
9306 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
9307 // 1D: undef,dx/dh; undef,dx/dv
9308 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
9309 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
9310 if (((I + 1) >= EndIdx) ||
9311 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
9312 I == DimIdx + NumGradients - 1))) {
9313 if (Addr.getValueType() != MVT::i16)
9314 Addr = DAG.getBitcast(MVT::i16, Addr);
9315 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
9316 } else {
9317 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
9318 I++;
9319 }
9320 Addr = DAG.getBitcast(MVT::f32, Addr);
9321 PackedAddrs.push_back(Addr);
9322 }
9323}
9324
9325SDValue SITargetLowering::lowerImage(SDValue Op,
9327 SelectionDAG &DAG, bool WithChain) const {
9328 SDLoc DL(Op);
9329 MachineFunction &MF = DAG.getMachineFunction();
9330 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
9331 unsigned IntrOpcode = Intr->BaseOpcode;
9332 // For image atomic: use no-return opcode if result is unused.
9333 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode &&
9334 !Op.getNode()->hasAnyUseOfValue(0))
9335 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
9336 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9338 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
9339 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
9340 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9341 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9342
9343 SmallVector<EVT, 3> ResultTypes(Op->values());
9344 SmallVector<EVT, 3> OrigResultTypes(Op->values());
9345 if (BaseOpcode->NoReturn && BaseOpcode->Atomic)
9346 ResultTypes.erase(&ResultTypes[0]);
9347
9348 bool IsD16 = false;
9349 bool IsG16 = false;
9350 bool IsA16 = false;
9351 SDValue VData;
9352 int NumVDataDwords = 0;
9353 bool AdjustRetType = false;
9354 bool IsAtomicPacked16Bit = false;
9355
9356 // Offset of intrinsic arguments
9357 const unsigned ArgOffset = WithChain ? 2 : 1;
9358
9359 unsigned DMask;
9360 unsigned DMaskLanes = 0;
9361
9362 if (BaseOpcode->Atomic) {
9363 VData = Op.getOperand(2);
9364
9365 IsAtomicPacked16Bit =
9366 (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9367 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
9368 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
9369 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
9370
9371 bool Is64Bit = VData.getValueSizeInBits() == 64;
9372 if (BaseOpcode->AtomicX2) {
9373 SDValue VData2 = Op.getOperand(3);
9374 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
9375 {VData, VData2});
9376 if (Is64Bit)
9377 VData = DAG.getBitcast(MVT::v4i32, VData);
9378
9379 if (!BaseOpcode->NoReturn)
9380 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9381
9382 DMask = Is64Bit ? 0xf : 0x3;
9383 NumVDataDwords = Is64Bit ? 4 : 2;
9384 } else {
9385 DMask = Is64Bit ? 0x3 : 0x1;
9386 NumVDataDwords = Is64Bit ? 2 : 1;
9387 }
9388 } else {
9389 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
9390 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
9391
9392 if (BaseOpcode->Store) {
9393 VData = Op.getOperand(2);
9394
9395 MVT StoreVT = VData.getSimpleValueType();
9396 if (StoreVT.getScalarType() == MVT::f16) {
9397 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9398 return Op; // D16 is unsupported for this instruction
9399
9400 IsD16 = true;
9401 VData = handleD16VData(VData, DAG, true);
9402 }
9403
9404 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
9405 } else if (!BaseOpcode->NoReturn) {
9406 // Work out the number of dwords based on the dmask popcount, the
9407 // underlying type, and whether packing is supported.
9408 MVT LoadVT = ResultTypes[0].getSimpleVT();
9409 if (LoadVT.getScalarType() == MVT::f16) {
9410 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9411 return Op; // D16 is unsupported for this instruction
9412
9413 IsD16 = true;
9414 }
9415
9416 // Confirm that the return type is large enough for the dmask specified
9417 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
9418 (!LoadVT.isVector() && DMaskLanes > 1))
9419 return Op;
9420
9421 // The sq block of gfx8 and gfx9 does not estimate register use correctly
9422 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
9423 // instructions.
9424 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9425 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9426 NumVDataDwords = (DMaskLanes + 1) / 2;
9427 else
9428 NumVDataDwords = DMaskLanes;
9429
9430 AdjustRetType = true;
9431 }
9432 }
9433
9434 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
9436
9437 // Check for 16 bit addresses or derivatives and pack if true.
9438 MVT VAddrVT =
9439 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
9440 MVT VAddrScalarVT = VAddrVT.getScalarType();
9441 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9442 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9443
9444 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
9445 VAddrScalarVT = VAddrVT.getScalarType();
9446 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9447 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9448
9449 // Push back extra arguments.
9450 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
9451 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
9452 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
9453 // Special handling of bias when A16 is on. Bias is of type half but
9454 // occupies a full 32 bits.
9455 SDValue Bias = DAG.getBuildVector(
9456 MVT::v2f16, DL,
9457 {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
9458 VAddrs.push_back(Bias);
9459 } else {
9460 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
9461 "Bias needs to be converted to 16 bit in A16 mode");
9462 VAddrs.push_back(Op.getOperand(ArgOffset + I));
9463 }
9464 }
9465
9466 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
9467 // 16 bit gradients are supported, but are tied to the A16 control
9468 // so both gradients and addresses must be 16 bit
9469 LLVM_DEBUG(
9470 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
9471 "require 16 bit args for both gradients and addresses");
9472 return Op;
9473 }
9474
9475 if (IsA16) {
9476 if (!ST->hasA16()) {
9477 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
9478 "support 16 bit addresses\n");
9479 return Op;
9480 }
9481 }
9482
9483 // We've dealt with incorrect input, so we know that if IsA16 or IsG16
9484 // is set then we have to compress/pack operands (either addresses,
9485 // gradients, or both).
9486 // In the case where A16 and gradients are tied (no G16 support), we have
9487 // already verified that both IsA16 and IsG16 are true.
9488 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
9489 // Activate g16
9490 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9492 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
9493 }
9494
9495 // Add gradients (packed or unpacked)
9496 if (IsG16) {
9497 // Pack the gradients
9498 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
9499 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
9500 ArgOffset + Intr->GradientStart,
9501 ArgOffset + Intr->CoordStart, Intr->NumGradients);
9502 } else {
9503 for (unsigned I = ArgOffset + Intr->GradientStart;
9504 I < ArgOffset + Intr->CoordStart; I++)
9505 VAddrs.push_back(Op.getOperand(I));
9506 }
9507
9508 // Add addresses (packed or unpacked)
9509 if (IsA16) {
9510 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
9511 ArgOffset + Intr->CoordStart, VAddrEnd,
9512 0 /* No gradients */);
9513 } else {
9514 // Add uncompressed address
9515 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
9516 VAddrs.push_back(Op.getOperand(I));
9517 }
9518
9519 // If the register allocator cannot place the address registers contiguously
9520 // without introducing moves, then using the non-sequential address encoding
9521 // is always preferable, since it saves VALU instructions and is usually a
9522 // wash in terms of code size or even better.
9523 //
9524 // However, we currently have no way of hinting to the register allocator that
9525 // MIMG addresses should be placed contiguously when it is possible to do so,
9526 // so force non-NSA for the common 2-address case as a heuristic.
9527 //
9528 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
9529 // allocation when possible.
9530 //
9531 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
9532 // set of the remaining addresses.
9533 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
9534 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
9535 const bool UseNSA = ST->hasNSAEncoding() &&
9536 VAddrs.size() >= ST->getNSAThreshold(MF) &&
9537 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
9538 const bool UsePartialNSA =
9539 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
9540
9541 SDValue VAddr;
9542 if (UsePartialNSA) {
9543 VAddr = getBuildDwordsVector(DAG, DL,
9544 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9545 } else if (!UseNSA) {
9546 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
9547 }
9548
9549 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
9550 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
9551 SDValue Unorm;
9552 if (!BaseOpcode->Sampler) {
9553 Unorm = True;
9554 } else {
9555 uint64_t UnormConst =
9556 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
9557
9558 Unorm = UnormConst ? True : False;
9559 }
9560
9561 SDValue TFE;
9562 SDValue LWE;
9563 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
9564 bool IsTexFail = false;
9565 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9566 return Op;
9567
9568 if (IsTexFail) {
9569 if (!DMaskLanes) {
9570 // Expecting to get an error flag since TFC is on and dmask is 0.
9571 // Force dmask to be at least 1, otherwise the instruction will fail.
9572 DMask = 0x1;
9573 DMaskLanes = 1;
9574 NumVDataDwords = 1;
9575 }
9576 NumVDataDwords += 1;
9577 AdjustRetType = true;
9578 }
9579
9580 // Something earlier has tagged that the return type needs adjusting.
9581 // This happens if the instruction is a load or has TexFailCtrl flags set.
9582 if (AdjustRetType) {
9583 // NumVDataDwords reflects the true number of dwords required in the return
9584 // type
9585 if (DMaskLanes == 0 && !BaseOpcode->Store) {
9586 // This is a no-op load. This can be eliminated
9587 SDValue Undef = DAG.getPOISON(Op.getValueType());
9588 if (isa<MemSDNode>(Op))
9589 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
9590 return Undef;
9591 }
9592
9593 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
9594 MVT::i32, NumVDataDwords)
9595 : MVT::i32;
9596
9597 ResultTypes[0] = NewVT;
9598 if (ResultTypes.size() == 3) {
9599 // The original result was an aggregate type used for TexFailCtrl results.
9600 // The actual instruction returns a vector type, which has now been
9601 // created. Remove the aggregate result.
9602 ResultTypes.erase(&ResultTypes[1]);
9603 }
9604 }
9605
9606 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
9607 // Keep GLC only when the atomic's result is actually used.
9608 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
9610 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
9612 return Op;
9613
9615 if (BaseOpcode->Store || BaseOpcode->Atomic)
9616 Ops.push_back(VData); // vdata
9617 if (UsePartialNSA) {
9618 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
9619 Ops.push_back(VAddr);
9620 } else if (UseNSA)
9621 append_range(Ops, VAddrs);
9622 else
9623 Ops.push_back(VAddr);
9624 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
9625 EVT RsrcVT = Rsrc.getValueType();
9626 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9627 return Op;
9628 Ops.push_back(Rsrc);
9629 if (BaseOpcode->Sampler) {
9630 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
9631 if (Samp.getValueType() != MVT::v4i32)
9632 return Op;
9633 Ops.push_back(Samp);
9634 }
9635 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
9636 if (IsGFX10Plus)
9637 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
9638 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9639 Ops.push_back(Unorm);
9640 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
9641 Ops.push_back(IsA16 && // r128, a16 for gfx9
9642 ST->hasFeature(AMDGPU::FeatureR128A16)
9643 ? True
9644 : False);
9645 if (IsGFX10Plus)
9646 Ops.push_back(IsA16 ? True : False);
9647
9648 if (!Subtarget->hasGFX90AInsts())
9649 Ops.push_back(TFE); // tfe
9650 else if (TFE->getAsZExtVal()) {
9651 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9653 "TFE is not supported on this GPU", DL.getDebugLoc()));
9654 }
9655
9656 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9657 Ops.push_back(LWE); // lwe
9658 if (!IsGFX10Plus)
9659 Ops.push_back(DimInfo->DA ? True : False);
9660 if (BaseOpcode->HasD16)
9661 Ops.push_back(IsD16 ? True : False);
9662 if (isa<MemSDNode>(Op))
9663 Ops.push_back(Op.getOperand(0)); // chain
9664
9665 int NumVAddrDwords =
9666 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
9667 int Opcode = -1;
9668
9669 if (IsGFX12Plus) {
9670 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
9671 NumVDataDwords, NumVAddrDwords);
9672 } else if (IsGFX11Plus) {
9673 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9674 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9675 : AMDGPU::MIMGEncGfx11Default,
9676 NumVDataDwords, NumVAddrDwords);
9677 } else if (IsGFX10Plus) {
9678 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9679 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9680 : AMDGPU::MIMGEncGfx10Default,
9681 NumVDataDwords, NumVAddrDwords);
9682 } else {
9683 if (Subtarget->hasGFX90AInsts()) {
9684 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
9685 NumVDataDwords, NumVAddrDwords);
9686 if (Opcode == -1) {
9687 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9689 "requested image instruction is not supported on this GPU",
9690 DL.getDebugLoc()));
9691
9692 unsigned Idx = 0;
9693 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
9694 for (EVT VT : OrigResultTypes) {
9695 if (VT == MVT::Other)
9696 RetValues[Idx++] = Op.getOperand(0); // Chain
9697 else
9698 RetValues[Idx++] = DAG.getPOISON(VT);
9699 }
9700
9701 return DAG.getMergeValues(RetValues, DL);
9702 }
9703 }
9704 if (Opcode == -1 &&
9705 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9706 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
9707 NumVDataDwords, NumVAddrDwords);
9708 if (Opcode == -1)
9709 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
9710 NumVDataDwords, NumVAddrDwords);
9711 }
9712 if (Opcode == -1)
9713 return Op;
9714
9715 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
9716 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
9717 MachineMemOperand *MemRef = MemOp->getMemOperand();
9718 DAG.setNodeMemRefs(NewNode, {MemRef});
9719 }
9720
9721 if (BaseOpcode->NoReturn) {
9722 if (BaseOpcode->Atomic)
9723 return DAG.getMergeValues(
9724 {DAG.getPOISON(OrigResultTypes[0]), SDValue(NewNode, 0)}, DL);
9725
9726 return SDValue(NewNode, 0);
9727 }
9728
9729 if (BaseOpcode->AtomicX2) {
9731 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
9732 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
9733 }
9734
9735 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
9736 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9737 NumVDataDwords, IsAtomicPacked16Bit, DL);
9738}
9739
9740SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
9741 SDValue Offset, SDValue CachePolicy,
9742 SelectionDAG &DAG) const {
9743 MachineFunction &MF = DAG.getMachineFunction();
9744
9745 const DataLayout &DataLayout = DAG.getDataLayout();
9746 Align Alignment =
9747 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
9748
9749 MachineMemOperand *MMO = MF.getMachineMemOperand(
9750 MachinePointerInfo(),
9753 VT.getStoreSize(), Alignment);
9754
9755 if (!Offset->isDivergent()) {
9756 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
9757
9758 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
9759 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
9760 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
9761 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
9762 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9763 SDValue BufferLoad =
9764 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
9765 DAG.getVTList(MVT::i32), Ops, VT, MMO);
9766 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
9767 }
9768
9769 // Widen vec3 load to vec4.
9770 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
9771 !Subtarget->hasScalarDwordx3Loads()) {
9772 EVT WidenedVT =
9774 auto WidenedOp = DAG.getMemIntrinsicNode(
9775 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
9776 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
9777 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
9778 DAG.getVectorIdxConstant(0, DL));
9779 return Subvector;
9780 }
9781
9782 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
9783 DAG.getVTList(VT), Ops, VT, MMO);
9784 }
9785
9786 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
9787 // assume that the buffer is unswizzled.
9788 SDValue Ops[] = {
9789 DAG.getEntryNode(), // Chain
9790 Rsrc, // rsrc
9791 DAG.getConstant(0, DL, MVT::i32), // vindex
9792 {}, // voffset
9793 {}, // soffset
9794 {}, // offset
9795 CachePolicy, // cachepolicy
9796 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9797 };
9798 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9799 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
9800 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
9801 }
9802
9804 unsigned NumLoads = 1;
9805 MVT LoadVT = VT.getSimpleVT();
9806 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
9807 assert((LoadVT.getScalarType() == MVT::i32 ||
9808 LoadVT.getScalarType() == MVT::f32));
9809
9810 if (NumElts == 8 || NumElts == 16) {
9811 NumLoads = NumElts / 4;
9812 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
9813 }
9814
9815 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
9816
9817 // Use the alignment to ensure that the required offsets will fit into the
9818 // immediate offsets.
9819 setBufferOffsets(Offset, DAG, &Ops[3],
9820 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
9821
9822 uint64_t InstOffset = Ops[5]->getAsZExtVal();
9823 for (unsigned i = 0; i < NumLoads; ++i) {
9824 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
9825 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
9826 LoadVT, MMO, DAG));
9827 }
9828
9829 if (NumElts == 8 || NumElts == 16)
9830 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
9831
9832 return Loads[0];
9833}
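// A sketch of the wide-load splitting above (names illustrative): an 8- or
// 16-dword read with a divergent offset is emitted as NumElts/4 dwordx4
// buffer loads whose immediate offsets step by 16 bytes.
#include <cstdint>
#include <vector>
static std::vector<uint32_t> chunkOffsets(uint32_t BaseOffset,
                                          unsigned NumElts) {
  unsigned NumLoads = (NumElts == 8 || NumElts == 16) ? NumElts / 4 : 1;
  std::vector<uint32_t> Offsets;
  for (unsigned I = 0; I < NumLoads; ++I)
    Offsets.push_back(BaseOffset + 16 * I); // 16-byte stride per dwordx4 load
  return Offsets;
}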
9834
9835SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
9836 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
9837 if (!Subtarget->hasArchitectedSGPRs())
9838 return {};
9839 SDLoc SL(Op);
9840 MVT VT = MVT::i32;
9841 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
9842 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
9843 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
9844}
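// A scalar sketch of the bitfield extract above (name illustrative):
// waveIDinGroup is the 5-bit field of TTMP8 starting at bit 25.
#include <cstdint>
static uint32_t waveIDInGroup(uint32_t TTMP8) {
  return (TTMP8 >> 25) & 0x1F; // BFE_U32 with offset 25, width 5
}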
9845
9846SDValue SITargetLowering::lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op,
9847 AMDGPU::Hwreg::Id HwReg,
9848 unsigned LowBit,
9849 unsigned Width) const {
9850 SDLoc SL(Op);
9851 using namespace AMDGPU::Hwreg;
9852 return {DAG.getMachineNode(
9853 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
9854 DAG.getTargetConstant(HwregEncoding::encode(HwReg, LowBit, Width),
9855 SL, MVT::i32)),
9856 0};
9857}
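// A sketch of the hwreg immediate S_GETREG_B32 consumes above, assuming the
// usual SIMM16 layout: register id in bits [5:0], bit offset in [10:6], and
// width-1 in [15:11] (the encoder name is illustrative).
#include <cstdint>
static uint16_t encodeHwreg(unsigned Id, unsigned Offset, unsigned Width) {
  return uint16_t(Id | (Offset << 6) | ((Width - 1) << 11));
}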
9858
9859SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
9860 unsigned Dim,
9861 const ArgDescriptor &Arg) const {
9862 SDLoc SL(Op);
9863 MachineFunction &MF = DAG.getMachineFunction();
9864 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
9865 if (MaxID == 0)
9866 return DAG.getConstant(0, SL, MVT::i32);
9867
9868 // It's undefined behavior if a function marked with the amdgpu-no-*
9869 // attributes uses the corresponding intrinsic.
9870 if (!Arg)
9871 return DAG.getPOISON(Op->getValueType(0));
9872
9873 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
9874 SDLoc(DAG.getEntryNode()), Arg);
9875
9876 // Don't bother inserting AssertZext for packed IDs since we're emitting the
9877 // masking operations anyway.
9878 //
9879 // TODO: We could assert the top bit is 0 for the source copy.
9880 if (Arg.isMasked())
9881 return Val;
9882
9883 // Preserve the known bits after expansion to a copy.
9884 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
9885 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
9886 DAG.getValueType(SmallVT));
9887}
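// A small sketch of the known-bits reasoning above (name illustrative): the
// workitem ID fits in bit_width(MaxID) bits, so AssertZext can claim that
// many low bits; e.g. MaxID == 1023 asserts a 10-bit value.
#include <bit>
#include <cstdint>
static unsigned assertedWidth(uint32_t MaxID) {
  return unsigned(std::bit_width(MaxID));
}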
9888
9889SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
9890 SelectionDAG &DAG) const {
9891 MachineFunction &MF = DAG.getMachineFunction();
9892 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
9893
9894 EVT VT = Op.getValueType();
9895 SDLoc DL(Op);
9896 unsigned IntrinsicID = Op.getConstantOperandVal(0);
9897
9898 // TODO: Should this propagate fast-math-flags?
9899
9900 switch (IntrinsicID) {
9901 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9902 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
9903 return emitNonHSAIntrinsicError(DAG, DL, VT);
9904 return getPreloadedValue(DAG, *MFI, VT,
9906 }
9907 case Intrinsic::amdgcn_dispatch_ptr:
9908 case Intrinsic::amdgcn_queue_ptr: {
9909 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
9910 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9911 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
9912 DL.getDebugLoc()));
9913 return DAG.getPOISON(VT);
9914 }
9915
9916 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9919 return getPreloadedValue(DAG, *MFI, VT, RegID);
9920 }
9921 case Intrinsic::amdgcn_implicitarg_ptr: {
9922 if (MFI->isEntryFunction())
9923 return getImplicitArgPtr(DAG, DL);
9924 return getPreloadedValue(DAG, *MFI, VT,
9926 }
9927 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9928 if (!AMDGPU::isKernel(MF.getFunction())) {
9929 // This only makes sense to call in a kernel, so just lower to null.
9930 return DAG.getConstant(0, DL, VT);
9931 }
9932
9933 return getPreloadedValue(DAG, *MFI, VT,
9935 }
9936 case Intrinsic::amdgcn_dispatch_id: {
9937 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
9938 }
9939 case Intrinsic::amdgcn_rcp:
9940 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
9941 case Intrinsic::amdgcn_rsq:
9942 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9943 case Intrinsic::amdgcn_rsq_legacy:
9944 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9945 return emitRemovedIntrinsicError(DAG, DL, VT);
9946 return SDValue();
9947 case Intrinsic::amdgcn_rcp_legacy:
9948 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9949 return emitRemovedIntrinsicError(DAG, DL, VT);
9950 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
9951 case Intrinsic::amdgcn_rsq_clamp: {
9952 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9953 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
9954
9955 Type *Type = VT.getTypeForEVT(*DAG.getContext());
9956 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
9957 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
9958
9959 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9960 SDValue Tmp =
9961 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
9962 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
9963 DAG.getConstantFP(Min, DL, VT));
9964 }
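// Sketch of the expansion above: on VOLCANIC_ISLANDS and later the rsq_clamp
// instruction no longer exists, so amdgcn.rsq.clamp(x) is emulated as
//   fmaxnum(fminnum(rsq(x), +MAX_FLT), -MAX_FLT)
// where MAX_FLT is APFloat::getLargest for the value type (0x1.fffffep+127
// for f32), clamping infinite results such as rsq(+0.0) back into finite
// range.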
9965 case Intrinsic::r600_read_ngroups_x:
9966 if (Subtarget->isAmdHsaOS())
9967 return emitNonHSAIntrinsicError(DAG, DL, VT);
9968
9969 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9970 SI::KernelInputOffsets::NGROUPS_X, 4,
9971 false);
9972 case Intrinsic::r600_read_ngroups_y:
9973 if (Subtarget->isAmdHsaOS())
9974 return emitNonHSAIntrinsicError(DAG, DL, VT);
9975
9976 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9977 SI::KernelInputOffsets::NGROUPS_Y, 4,
9978 false);
9979 case Intrinsic::r600_read_ngroups_z:
9980 if (Subtarget->isAmdHsaOS())
9981 return emitNonHSAIntrinsicError(DAG, DL, VT);
9982
9983 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9984 SI::KernelInputOffsets::NGROUPS_Z, 4,
9985 false);
9986 case Intrinsic::r600_read_local_size_x:
9987 if (Subtarget->isAmdHsaOS())
9988 return emitNonHSAIntrinsicError(DAG, DL, VT);
9989
9990 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9991 SI::KernelInputOffsets::LOCAL_SIZE_X);
9992 case Intrinsic::r600_read_local_size_y:
9993 if (Subtarget->isAmdHsaOS())
9994 return emitNonHSAIntrinsicError(DAG, DL, VT);
9995
9996 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9997 SI::KernelInputOffsets::LOCAL_SIZE_Y);
9998 case Intrinsic::r600_read_local_size_z:
9999 if (Subtarget->isAmdHsaOS())
10000 return emitNonHSAIntrinsicError(DAG, DL, VT);
10001
10002 return lowerImplicitZextParam(DAG, Op, MVT::i16,
10003 SI::KernelInputOffsets::LOCAL_SIZE_Z);
10004 case Intrinsic::amdgcn_workgroup_id_x:
10005 return lowerWorkGroupId(DAG, *MFI, VT,
10009 case Intrinsic::amdgcn_workgroup_id_y:
10010 return lowerWorkGroupId(DAG, *MFI, VT,
10014 case Intrinsic::amdgcn_workgroup_id_z:
10015 return lowerWorkGroupId(DAG, *MFI, VT,
10019 case Intrinsic::amdgcn_cluster_id_x:
10020 return Subtarget->hasClusters()
10021 ? getPreloadedValue(DAG, *MFI, VT,
10022 AMDGPUFunctionArgInfo::CLUSTER_ID_X)
10023 : DAG.getPOISON(VT);
10024 case Intrinsic::amdgcn_cluster_id_y:
10025 return Subtarget->hasClusters()
10026 ? getPreloadedValue(DAG, *MFI, VT,
10027 AMDGPUFunctionArgInfo::CLUSTER_ID_Y)
10028 : DAG.getPOISON(VT);
10029 case Intrinsic::amdgcn_cluster_id_z:
10030 return Subtarget->hasClusters()
10031 ? getPreloadedValue(DAG, *MFI, VT,
10032 AMDGPUFunctionArgInfo::CLUSTER_ID_Z)
10033 : DAG.getPOISON(VT);
10034 case Intrinsic::amdgcn_cluster_workgroup_id_x:
10035 return Subtarget->hasClusters()
10036 ? getPreloadedValue(
10037 DAG, *MFI, VT,
10038 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X)
10039 : DAG.getPOISON(VT);
10040 case Intrinsic::amdgcn_cluster_workgroup_id_y:
10041 return Subtarget->hasClusters()
10042 ? getPreloadedValue(
10043 DAG, *MFI, VT,
10044 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y)
10045 : DAG.getPOISON(VT);
10046 case Intrinsic::amdgcn_cluster_workgroup_id_z:
10047 return Subtarget->hasClusters()
10048 ? getPreloadedValue(
10049 DAG, *MFI, VT,
10050 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z)
10051 : DAG.getPOISON(VT);
10052 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
10053 return Subtarget->hasClusters()
10054 ? lowerConstHwRegRead(DAG, Op, AMDGPU::Hwreg::ID_IB_STS2, 21, 4)
10055 : SDValue();
10056 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
10057 return Subtarget->hasClusters()
10058 ? getPreloadedValue(
10059 DAG, *MFI, VT,
10060 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X)
10061 : DAG.getPOISON(VT);
10062 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
10063 return Subtarget->hasClusters()
10064 ? getPreloadedValue(
10065 DAG, *MFI, VT,
10066 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y)
10067 : DAG.getPOISON(VT);
10068 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
10069 return Subtarget->hasClusters()
10070 ? getPreloadedValue(
10071 DAG, *MFI, VT,
10072 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z)
10073 : DAG.getPOISON(VT);
10074 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
10075 return Subtarget->hasClusters()
10076 ? getPreloadedValue(
10077 DAG, *MFI, VT,
10078 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID)
10079 : DAG.getPOISON(VT);
10080 case Intrinsic::amdgcn_wave_id:
10081 return lowerWaveID(DAG, Op);
10082 case Intrinsic::amdgcn_lds_kernel_id: {
10083 if (MFI->isEntryFunction())
10084 return getLDSKernelId(DAG, DL);
10085 return getPreloadedValue(DAG, *MFI, VT,
10086 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
10087 }
10088 case Intrinsic::amdgcn_workitem_id_x:
10089 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
10090 case Intrinsic::amdgcn_workitem_id_y:
10091 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
10092 case Intrinsic::amdgcn_workitem_id_z:
10093 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
10094 case Intrinsic::amdgcn_wavefrontsize:
10095 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
10096 SDLoc(Op), MVT::i32);
10097 case Intrinsic::amdgcn_s_buffer_load: {
10098 unsigned CPol = Op.getConstantOperandVal(3);
10099 // s_buffer_load, because of how it's optimized, can't be volatile
10100 // so reject ones with the volatile bit set.
10101 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
10102 ? AMDGPU::CPol::ALL
10103 : AMDGPU::CPol::ALL_pregfx12))
10104 return Op;
10105 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
10106 Op.getOperand(3), DAG);
10107 }
10108 case Intrinsic::amdgcn_fdiv_fast:
10109 return lowerFDIV_FAST(Op, DAG);
10110 case Intrinsic::amdgcn_sin:
10111 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
10112
10113 case Intrinsic::amdgcn_cos:
10114 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
10115
10116 case Intrinsic::amdgcn_mul_u24:
10117 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
10118 Op.getOperand(2));
10119 case Intrinsic::amdgcn_mul_i24:
10120 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
10121 Op.getOperand(2));
10122
10123 case Intrinsic::amdgcn_log_clamp: {
10124 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
10125 return SDValue();
10126
10127 return emitRemovedIntrinsicError(DAG, DL, VT);
10128 }
10129 case Intrinsic::amdgcn_fract:
10130 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
10131
10132 case Intrinsic::amdgcn_class:
10133 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
10134 Op.getOperand(2));
10135 case Intrinsic::amdgcn_div_fmas:
10136 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
10137 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
10138
10139 case Intrinsic::amdgcn_div_fixup:
10140 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
10141 Op.getOperand(2), Op.getOperand(3));
10142
10143 case Intrinsic::amdgcn_div_scale: {
10144 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
10145
10146 // Translate to the operands expected by the machine instruction. The
10147 // first parameter must be the same as the first operand of the instruction.
10148 SDValue Numerator = Op.getOperand(1);
10149 SDValue Denominator = Op.getOperand(2);
10150
10151 // Note this order is opposite of the machine instruction's operations,
10152 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
10153 // intrinsic has the numerator as the first operand to match a normal
10154 // division operation.
10155
10156 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
10157
10158 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
10159 Denominator, Numerator);
10160 }
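// Example of the operand swizzle above (illustrative): for
//   %r = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %num, float %den, i1 true)
// Param is all-ones, so Src0 = %num and the node built is
//   DIV_SCALE %num, %den, %num
// while an "i1 false" selector yields DIV_SCALE %den, %den, %num.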
10161 case Intrinsic::amdgcn_icmp: {
10162 // There is a Pat that handles this variant, so return it as-is.
10163 if (Op.getOperand(1).getValueType() == MVT::i1 &&
10164 Op.getConstantOperandVal(2) == 0 &&
10165 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
10166 return Op;
10167 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
10168 }
10169 case Intrinsic::amdgcn_fcmp: {
10170 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
10171 }
10172 case Intrinsic::amdgcn_ballot:
10173 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
10174 case Intrinsic::amdgcn_fmed3:
10175 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
10176 Op.getOperand(2), Op.getOperand(3));
10177 case Intrinsic::amdgcn_fdot2:
10178 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
10179 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
10180 case Intrinsic::amdgcn_fmul_legacy:
10181 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
10182 Op.getOperand(2));
10183 case Intrinsic::amdgcn_sffbh:
10184 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
10185 case Intrinsic::amdgcn_sbfe:
10186 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
10187 Op.getOperand(2), Op.getOperand(3));
10188 case Intrinsic::amdgcn_ubfe:
10189 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
10190 Op.getOperand(2), Op.getOperand(3));
10191 case Intrinsic::amdgcn_cvt_pkrtz:
10192 case Intrinsic::amdgcn_cvt_pknorm_i16:
10193 case Intrinsic::amdgcn_cvt_pknorm_u16:
10194 case Intrinsic::amdgcn_cvt_pk_i16:
10195 case Intrinsic::amdgcn_cvt_pk_u16: {
10196 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
10197 EVT VT = Op.getValueType();
10198 unsigned Opcode;
10199
10200 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10201 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
10202 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
10203 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
10204 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10205 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
10206 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10207 Opcode = AMDGPUISD::CVT_PK_I16_I32;
10208 else
10209 Opcode = AMDGPUISD::CVT_PK_U16_U32;
10210
10211 if (isTypeLegal(VT))
10212 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
10213
10214 SDValue Node =
10215 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
10216 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
10217 }
10218 case Intrinsic::amdgcn_fmad_ftz:
10219 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
10220 Op.getOperand(2), Op.getOperand(3));
10221
10222 case Intrinsic::amdgcn_if_break:
10223 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
10224 Op->getOperand(1), Op->getOperand(2)),
10225 0);
10226
10227 case Intrinsic::amdgcn_groupstaticsize: {
10228 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
10229 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
10230 return Op;
10231
10232 const Module *M = MF.getFunction().getParent();
10233 const GlobalValue *GV =
10234 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
10235 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
10236 SIInstrInfo::MO_ABS32_LO);
10237 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10238 }
10239 case Intrinsic::amdgcn_is_shared:
10240 case Intrinsic::amdgcn_is_private: {
10241 SDLoc SL(Op);
10242 SDValue SrcVec =
10243 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10244 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
10245 DAG.getConstant(1, SL, MVT::i32));
10246
10247 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10248 ? AMDGPUAS::LOCAL_ADDRESS
10249 : AMDGPUAS::PRIVATE_ADDRESS;
10250 if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
10251 Subtarget->hasGloballyAddressableScratch()) {
10252 SDValue FlatScratchBaseHi(
10253 DAG.getMachineNode(
10254 AMDGPU::S_MOV_B32, DL, MVT::i32,
10255 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10256 0);
10257 // Test bits 63..58 against the aperture address.
10258 return DAG.getSetCC(
10259 SL, MVT::i1,
10260 DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
10261 DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
10262 }
10263
10264 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10265 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
10266 }
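// Illustrative note: for a generic (flat) pointer %p, amdgcn.is.shared
// reduces to comparing the high 32 bits of %p against the LDS aperture base,
// roughly
//   %hi = extractelement <2 x i32> (bitcast of %p), i32 1
//   %is.shared = icmp eq i32 %hi, <aperture>
// The globally-addressable-scratch path instead checks bits 63..58 against
// SRC_FLAT_SCRATCH_BASE_HI by XOR'ing and testing for an unsigned value
// below 2^26.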
10267 case Intrinsic::amdgcn_perm:
10268 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
10269 Op.getOperand(2), Op.getOperand(3));
10270 case Intrinsic::amdgcn_reloc_constant: {
10271 Module *M = MF.getFunction().getParent();
10272 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
10273 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
10274 auto *RelocSymbol = cast<GlobalVariable>(
10275 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
10276 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
10277 SIInstrInfo::MO_ABS32_LO);
10278 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10279 }
10280 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10281 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10282 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10283 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10284 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10285 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10286 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10287 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10288 if (Op.getOperand(4).getValueType() == MVT::i32)
10289 return SDValue();
10290
10291 SDLoc SL(Op);
10292 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
10293 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10294 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10295 Op.getOperand(3), IndexKeyi32);
10296 }
10297 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10298 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10299 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10300 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10301 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10302 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10303 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10304 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10305 if (Op.getOperand(4).getValueType() == MVT::i64)
10306 return SDValue();
10307
10308 SDLoc SL(Op);
10309 auto IndexKeyi64 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64);
10310 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10311 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10312 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10313 Op.getOperand(6)});
10314 }
10315 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10316 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10317 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10318 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10319 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10320 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10321 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10322 ? MVT::i64
10323 : MVT::i32;
10324 if (Op.getOperand(6).getValueType() == IndexKeyTy)
10325 return SDValue();
10326
10327 SDLoc SL(Op);
10328 auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
10329 SmallVector<SDValue, 10> Args{
10330 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10331 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10332 IndexKey, Op.getOperand(7), Op.getOperand(8)};
10333 if (IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8)
10334 Args.push_back(Op.getOperand(9));
10335 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), Args);
10336 }
10337 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10338 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10339 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10340 if (Op.getOperand(6).getValueType() == MVT::i32)
10341 return SDValue();
10342
10343 SDLoc SL(Op);
10344 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
10345 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10346 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10347 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10348 IndexKeyi32, Op.getOperand(7)});
10349 }
10350 case Intrinsic::amdgcn_addrspacecast_nonnull:
10351 return lowerADDRSPACECAST(Op, DAG);
10352 case Intrinsic::amdgcn_readlane:
10353 case Intrinsic::amdgcn_readfirstlane:
10354 case Intrinsic::amdgcn_writelane:
10355 case Intrinsic::amdgcn_permlane16:
10356 case Intrinsic::amdgcn_permlanex16:
10357 case Intrinsic::amdgcn_permlane64:
10358 case Intrinsic::amdgcn_set_inactive:
10359 case Intrinsic::amdgcn_set_inactive_chain_arg:
10360 case Intrinsic::amdgcn_mov_dpp8:
10361 case Intrinsic::amdgcn_update_dpp:
10362 return lowerLaneOp(*this, Op.getNode(), DAG);
10363 case Intrinsic::amdgcn_dead: {
10364 SmallVector<SDValue, 8> Poisons;
10365 for (const EVT ValTy : Op.getNode()->values())
10366 Poisons.push_back(DAG.getPOISON(ValTy));
10367 return DAG.getMergeValues(Poisons, SDLoc(Op));
10368 }
10369 case Intrinsic::amdgcn_wave_shuffle:
10370 return lowerWaveShuffle(*this, Op.getNode(), DAG);
10371 default:
10372 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10373 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
10374 return lowerImage(Op, ImageDimIntr, DAG, false);
10375
10376 return Op;
10377 }
10378}
10379
10380// On targets not supporting constant in soffset field, turn zero to
10381 // SGPR_NULL to avoid generating an extra s_mov with zero.
10382 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
10383 const GCNSubtarget *Subtarget) {
10384 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
10385 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10386 return SOffset;
10387}
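// Usage sketch: the buffer-intrinsic lowerings below route the raw soffset
// operand through this helper before building the node, e.g.
//   auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
// so that on subtargets with a restricted soffset field a literal 0 becomes a
// read of SGPR_NULL instead of materializing the zero in an SGPR.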
10388
10389SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
10390 SelectionDAG &DAG,
10391 unsigned NewOpcode) const {
10392 SDLoc DL(Op);
10393
10394 SDValue VData = Op.getOperand(2);
10395 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10396 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10397 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10398 SDValue Ops[] = {
10399 Op.getOperand(0), // Chain
10400 VData, // vdata
10401 Rsrc, // rsrc
10402 DAG.getConstant(0, DL, MVT::i32), // vindex
10403 VOffset, // voffset
10404 SOffset, // soffset
10405 Offset, // offset
10406 Op.getOperand(6), // cachepolicy
10407 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10408 };
10409
10410 auto *M = cast<MemSDNode>(Op);
10411
10412 EVT MemVT = VData.getValueType();
10413 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10414 M->getMemOperand());
10415}
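// Example (illustrative) of the operand layout built above: a call such as
//   %old = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(
//              float %v, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
// becomes BUFFER_ATOMIC_FADD with vindex = 0 and idxen = 0; the struct
// variant below instead forwards an explicit vindex and sets idxen = 1.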
10416
10417SDValue
10418SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
10419 unsigned NewOpcode) const {
10420 SDLoc DL(Op);
10421
10422 SDValue VData = Op.getOperand(2);
10423 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10424 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10425 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10426 SDValue Ops[] = {
10427 Op.getOperand(0), // Chain
10428 VData, // vdata
10429 Rsrc, // rsrc
10430 Op.getOperand(4), // vindex
10431 VOffset, // voffset
10432 SOffset, // soffset
10433 Offset, // offset
10434 Op.getOperand(7), // cachepolicy
10435 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10436 };
10437
10438 auto *M = cast<MemSDNode>(Op);
10439
10440 EVT MemVT = VData.getValueType();
10441 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10442 M->getMemOperand());
10443}
10444
10445SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
10446 SelectionDAG &DAG) const {
10447 unsigned IntrID = Op.getConstantOperandVal(1);
10448 SDLoc DL(Op);
10449
10450 switch (IntrID) {
10451 case Intrinsic::amdgcn_ds_ordered_add:
10452 case Intrinsic::amdgcn_ds_ordered_swap: {
10453 MemSDNode *M = cast<MemSDNode>(Op);
10454 SDValue Chain = M->getOperand(0);
10455 SDValue M0 = M->getOperand(2);
10456 SDValue Value = M->getOperand(3);
10457 unsigned IndexOperand = M->getConstantOperandVal(7);
10458 unsigned WaveRelease = M->getConstantOperandVal(8);
10459 unsigned WaveDone = M->getConstantOperandVal(9);
10460
10461 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10462 IndexOperand &= ~0x3f;
10463 unsigned CountDw = 0;
10464
10465 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
10466 CountDw = (IndexOperand >> 24) & 0xf;
10467 IndexOperand &= ~(0xf << 24);
10468
10469 if (CountDw < 1 || CountDw > 4) {
10470 const Function &Fn = DAG.getMachineFunction().getFunction();
10471 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10472 Fn, "ds_ordered_count: dword count must be between 1 and 4",
10473 DL.getDebugLoc()));
10474 CountDw = 1;
10475 }
10476 }
10477
10478 if (IndexOperand) {
10479 const Function &Fn = DAG.getMachineFunction().getFunction();
10480 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10481 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
10482 }
10483
10484 if (WaveDone && !WaveRelease) {
10485 // TODO: Move this to IR verifier
10486 const Function &Fn = DAG.getMachineFunction().getFunction();
10487 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10488 Fn, "ds_ordered_count: wave_done requires wave_release",
10489 DL.getDebugLoc()));
10490 }
10491
10492 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10493 unsigned ShaderType =
10494 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
10495 unsigned Offset0 = OrderedCountIndex << 2;
10496 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10497
10498 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
10499 Offset1 |= (CountDw - 1) << 6;
10500
10501 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
10502 Offset1 |= ShaderType << 2;
10503
10504 unsigned Offset = Offset0 | (Offset1 << 8);
10505
10506 SDValue Ops[] = {
10507 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
10508 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
10509 };
10510 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
10511 M->getVTList(), Ops, M->getMemoryVT(),
10512 M->getMemOperand());
10513 }
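// Summary of the immediate built above (illustrative): the 16-bit DS offset is
//   Offset0 | (Offset1 << 8), where
//   Offset0 = OrderedCountIndex << 2
//   Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4)
//             [| (ShaderType << 2) pre-GFX11] [| ((CountDw - 1) << 6) on GFX10+]
// e.g. ds_ordered_add with index 0, wave_release set and wave_done clear packs
// to 0x0100 plus the subtarget-dependent bits.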
10514 case Intrinsic::amdgcn_raw_buffer_load:
10515 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10516 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10517 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10518 case Intrinsic::amdgcn_raw_buffer_load_format:
10519 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10520 const bool IsFormat =
10521 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10522 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10523
10524 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10525 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10526 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10527 SDValue Ops[] = {
10528 Op.getOperand(0), // Chain
10529 Rsrc, // rsrc
10530 DAG.getConstant(0, DL, MVT::i32), // vindex
10531 VOffset, // voffset
10532 SOffset, // soffset
10533 Offset, // offset
10534 Op.getOperand(5), // cachepolicy, swizzled buffer
10535 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10536 };
10537
10538 auto *M = cast<MemSDNode>(Op);
10539 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
10540 }
10541 case Intrinsic::amdgcn_struct_buffer_load:
10542 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10543 case Intrinsic::amdgcn_struct_buffer_load_format:
10544 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10545 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10546 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10547 const bool IsFormat =
10548 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10549 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10550
10551 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10552 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10553 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10554 SDValue Ops[] = {
10555 Op.getOperand(0), // Chain
10556 Rsrc, // rsrc
10557 Op.getOperand(3), // vindex
10558 VOffset, // voffset
10559 SOffset, // soffset
10560 Offset, // offset
10561 Op.getOperand(6), // cachepolicy, swizzled buffer
10562 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10563 };
10564
10565 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
10566 }
10567 case Intrinsic::amdgcn_raw_tbuffer_load:
10568 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10569 MemSDNode *M = cast<MemSDNode>(Op);
10570 EVT LoadVT = Op.getValueType();
10571 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10572 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10573 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10574
10575 SDValue Ops[] = {
10576 Op.getOperand(0), // Chain
10577 Rsrc, // rsrc
10578 DAG.getConstant(0, DL, MVT::i32), // vindex
10579 VOffset, // voffset
10580 SOffset, // soffset
10581 Offset, // offset
10582 Op.getOperand(5), // format
10583 Op.getOperand(6), // cachepolicy, swizzled buffer
10584 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10585 };
10586
10587 if (LoadVT.getScalarType() == MVT::f16)
10588 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10589 Ops);
10590 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10591 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10592 DAG);
10593 }
10594 case Intrinsic::amdgcn_struct_tbuffer_load:
10595 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10596 MemSDNode *M = cast<MemSDNode>(Op);
10597 EVT LoadVT = Op.getValueType();
10598 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10599 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10600 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10601
10602 SDValue Ops[] = {
10603 Op.getOperand(0), // Chain
10604 Rsrc, // rsrc
10605 Op.getOperand(3), // vindex
10606 VOffset, // voffset
10607 SOffset, // soffset
10608 Offset, // offset
10609 Op.getOperand(6), // format
10610 Op.getOperand(7), // cachepolicy, swizzled buffer
10611 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10612 };
10613
10614 if (LoadVT.getScalarType() == MVT::f16)
10615 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10616 Ops);
10617 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10618 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10619 DAG);
10620 }
10621 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10622 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10623 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
10624 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10625 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10626 return lowerStructBufferAtomicIntrin(Op, DAG,
10627 AMDGPUISD::BUFFER_ATOMIC_FADD);
10628 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10629 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10630 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
10631 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10632 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10633 return lowerStructBufferAtomicIntrin(Op, DAG,
10634 AMDGPUISD::BUFFER_ATOMIC_FMIN);
10635 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10636 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10637 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
10638 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10639 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10640 return lowerStructBufferAtomicIntrin(Op, DAG,
10641 AMDGPUISD::BUFFER_ATOMIC_FMAX);
10642 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10643 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10644 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
10645 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10646 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10647 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10648 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10649 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10650 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10651 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10652 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10653 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
10654 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10655 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10656 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
10657 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10658 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10659 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
10660 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10661 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10662 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
10663 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10664 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10665 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10666 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10667 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10668 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10669 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10670 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10671 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10672 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10673 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10674 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10675 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10676 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10677 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10678 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10679 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10680 return lowerStructBufferAtomicIntrin(Op, DAG,
10681 AMDGPUISD::BUFFER_ATOMIC_SWAP);
10682 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10683 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10684 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10685 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10686 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10687 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10688 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10689 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10690 return lowerStructBufferAtomicIntrin(Op, DAG,
10691 AMDGPUISD::BUFFER_ATOMIC_SMIN);
10692 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10693 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10694 return lowerStructBufferAtomicIntrin(Op, DAG,
10695 AMDGPUISD::BUFFER_ATOMIC_UMIN);
10696 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10697 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10698 return lowerStructBufferAtomicIntrin(Op, DAG,
10699 AMDGPUISD::BUFFER_ATOMIC_SMAX);
10700 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10701 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10702 return lowerStructBufferAtomicIntrin(Op, DAG,
10703 AMDGPUISD::BUFFER_ATOMIC_UMAX);
10704 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10705 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10706 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10707 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10708 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10709 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10710 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10711 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10712 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10713 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10714 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10715 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10716 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10717 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10718 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10719 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
10720 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
10721 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB);
10722 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
10723 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
10724 return lowerStructBufferAtomicIntrin(Op, DAG,
10725 AMDGPUISD::BUFFER_ATOMIC_CSUB);
10726 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10727 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
10728 return lowerRawBufferAtomicIntrin(Op, DAG,
10729 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10730 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10731 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
10732 return lowerStructBufferAtomicIntrin(Op, DAG,
10733 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10734 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10735 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10736 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
10737 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10738 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10739 SDValue Ops[] = {
10740 Op.getOperand(0), // Chain
10741 Op.getOperand(2), // src
10742 Op.getOperand(3), // cmp
10743 Rsrc, // rsrc
10744 DAG.getConstant(0, DL, MVT::i32), // vindex
10745 VOffset, // voffset
10746 SOffset, // soffset
10747 Offset, // offset
10748 Op.getOperand(7), // cachepolicy
10749 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10750 };
10751 EVT VT = Op.getValueType();
10752 auto *M = cast<MemSDNode>(Op);
10753
10754 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10755 Op->getVTList(), Ops, VT,
10756 M->getMemOperand());
10757 }
10758 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10759 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10760 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
10761 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
10762 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
10763 SDValue Ops[] = {
10764 Op.getOperand(0), // Chain
10765 Op.getOperand(2), // src
10766 Op.getOperand(3), // cmp
10767 Rsrc, // rsrc
10768 Op.getOperand(5), // vindex
10769 VOffset, // voffset
10770 SOffset, // soffset
10771 Offset, // offset
10772 Op.getOperand(8), // cachepolicy
10773 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10774 };
10775 EVT VT = Op.getValueType();
10776 auto *M = cast<MemSDNode>(Op);
10777
10778 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10779 Op->getVTList(), Ops, VT,
10780 M->getMemOperand());
10781 }
10782 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10783 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10784 MemSDNode *M = cast<MemSDNode>(Op);
10785 SDValue NodePtr = M->getOperand(2);
10786 SDValue RayExtent = M->getOperand(3);
10787 SDValue InstanceMask = M->getOperand(4);
10788 SDValue RayOrigin = M->getOperand(5);
10789 SDValue RayDir = M->getOperand(6);
10790 SDValue Offsets = M->getOperand(7);
10791 SDValue TDescr = M->getOperand(8);
10792
10793 assert(NodePtr.getValueType() == MVT::i64);
10794 assert(RayDir.getValueType() == MVT::v3f32);
10795
10796 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10797 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10798 return SDValue();
10799 }
10800
10801 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10802 const unsigned NumVDataDwords = 10;
10803 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10804 int Opcode = AMDGPU::getMIMGOpcode(
10805 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10806 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10807 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10808 assert(Opcode != -1);
10809
10810 SmallVector<SDValue, 7> Ops;
10811 Ops.push_back(NodePtr);
10812 Ops.push_back(DAG.getBuildVector(
10813 MVT::v2i32, DL,
10814 {DAG.getBitcast(MVT::i32, RayExtent),
10815 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10816 Ops.push_back(RayOrigin);
10817 Ops.push_back(RayDir);
10818 Ops.push_back(Offsets);
10819 Ops.push_back(TDescr);
10820 Ops.push_back(M->getChain());
10821
10822 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10823 MachineMemOperand *MemRef = M->getMemOperand();
10824 DAG.setNodeMemRefs(NewNode, {MemRef});
10825 return SDValue(NewNode, 0);
10826 }
10827 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10828 MemSDNode *M = cast<MemSDNode>(Op);
10829 SDValue NodePtr = M->getOperand(2);
10830 SDValue RayExtent = M->getOperand(3);
10831 SDValue RayOrigin = M->getOperand(4);
10832 SDValue RayDir = M->getOperand(5);
10833 SDValue RayInvDir = M->getOperand(6);
10834 SDValue TDescr = M->getOperand(7);
10835
10836 assert(NodePtr.getValueType() == MVT::i32 ||
10837 NodePtr.getValueType() == MVT::i64);
10838 assert(RayDir.getValueType() == MVT::v3f16 ||
10839 RayDir.getValueType() == MVT::v3f32);
10840
10841 if (!Subtarget->hasGFX10_AEncoding()) {
10842 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10843 return SDValue();
10844 }
10845
10846 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
10847 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
10848 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10849 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
10850 const bool Is64 = NodePtr.getValueType() == MVT::i64;
10851 const unsigned NumVDataDwords = 4;
10852 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10853 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10854 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10855 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
10856 IsGFX12Plus;
10857 const unsigned BaseOpcodes[2][2] = {
10858 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10859 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10860 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10861 int Opcode;
10862 if (UseNSA) {
10863 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10864 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10865 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10866 : AMDGPU::MIMGEncGfx10NSA,
10867 NumVDataDwords, NumVAddrDwords);
10868 } else {
10869 assert(!IsGFX12Plus);
10870 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10871 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10872 : AMDGPU::MIMGEncGfx10Default,
10873 NumVDataDwords, NumVAddrDwords);
10874 }
10875 assert(Opcode != -1);
10876
10877 SmallVector<SDValue, 16> Ops;
10878
10879 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
10880 SmallVector<SDValue, 3> Lanes;
10881 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
10882 if (Lanes[0].getValueSizeInBits() == 32) {
10883 for (unsigned I = 0; I < 3; ++I)
10884 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
10885 } else {
10886 if (IsAligned) {
10887 Ops.push_back(DAG.getBitcast(
10888 MVT::i32,
10889 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
10890 Ops.push_back(Lanes[2]);
10891 } else {
10892 SDValue Elt0 = Ops.pop_back_val();
10893 Ops.push_back(DAG.getBitcast(
10894 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
10895 Ops.push_back(DAG.getBitcast(
10896 MVT::i32,
10897 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
10898 }
10899 }
10900 };
10901
10902 if (UseNSA && IsGFX11Plus) {
10903 Ops.push_back(NodePtr);
10904 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10905 Ops.push_back(RayOrigin);
10906 if (IsA16) {
10907 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
10908 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
10909 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
10910 for (unsigned I = 0; I < 3; ++I) {
10911 MergedLanes.push_back(DAG.getBitcast(
10912 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
10913 {DirLanes[I], InvDirLanes[I]})));
10914 }
10915 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
10916 } else {
10917 Ops.push_back(RayDir);
10918 Ops.push_back(RayInvDir);
10919 }
10920 } else {
10921 if (Is64)
10922 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
10923 2);
10924 else
10925 Ops.push_back(NodePtr);
10926
10927 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10928 packLanes(RayOrigin, true);
10929 packLanes(RayDir, true);
10930 packLanes(RayInvDir, false);
10931 }
10932
10933 if (!UseNSA) {
10934 // Build a single vector containing all the operands so far prepared.
10935 if (NumVAddrDwords > 12) {
10936 SDValue Undef = DAG.getPOISON(MVT::i32);
10937 Ops.append(16 - Ops.size(), Undef);
10938 }
10939 assert(Ops.size() >= 8 && Ops.size() <= 12);
10940 SDValue MergedOps =
10941 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
10942 Ops.clear();
10943 Ops.push_back(MergedOps);
10944 }
10945
10946 Ops.push_back(TDescr);
10947 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
10948 Ops.push_back(M->getChain());
10949
10950 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10951 MachineMemOperand *MemRef = M->getMemOperand();
10952 DAG.setNodeMemRefs(NewNode, {MemRef});
10953 return SDValue(NewNode, 0);
10954 }
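// Illustrative note on packLanes above: with an f16 ray (IsA16) the three
// direction lanes are packed two per dword, so for RayDir = <dx, dy, dz> and
// RayInvDir = <ix, iy, iz> the non-NSA pre-GFX11 layout ends up as
//   ..., bitcast(v2f16 <dx, dy>), bitcast(v2f16 <dz, ix>), bitcast(v2f16 <iy, iz>)
// which is why the "unaligned" path pops the previously pushed element and
// re-packs it with the first lane of the next vector.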
10955 case Intrinsic::amdgcn_global_atomic_fmin_num:
10956 case Intrinsic::amdgcn_global_atomic_fmax_num:
10957 case Intrinsic::amdgcn_flat_atomic_fmin_num:
10958 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10959 MemSDNode *M = cast<MemSDNode>(Op);
10960 SDValue Ops[] = {
10961 M->getOperand(0), // Chain
10962 M->getOperand(2), // Ptr
10963 M->getOperand(3) // Value
10964 };
10965 unsigned Opcode = 0;
10966 switch (IntrID) {
10967 case Intrinsic::amdgcn_global_atomic_fmin_num:
10968 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
10969 Opcode = ISD::ATOMIC_LOAD_FMIN;
10970 break;
10971 }
10972 case Intrinsic::amdgcn_global_atomic_fmax_num:
10973 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10974 Opcode = ISD::ATOMIC_LOAD_FMAX;
10975 break;
10976 }
10977 default:
10978 llvm_unreachable("unhandled atomic opcode");
10979 }
10980 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
10981 Ops, M->getMemOperand());
10982 }
10983 case Intrinsic::amdgcn_s_get_barrier_state:
10984 case Intrinsic::amdgcn_s_get_named_barrier_state: {
10985 SDValue Chain = Op->getOperand(0);
10986 SmallVector<SDValue, 2> Ops;
10987 unsigned Opc;
10988
10989 if (isa<ConstantSDNode>(Op->getOperand(2))) {
10990 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
10991 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
10992 BarID = (BarID >> 4) & 0x3F;
10993 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
10994 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10995 Ops.push_back(K);
10996 Ops.push_back(Chain);
10997 } else {
10998 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
10999 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
11000 SDValue M0Val;
11001 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
11002 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11003 M0Val = SDValue(
11004 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
11005 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11006 0);
11007 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11008 } else
11009 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
11010 }
11011
11012 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11013 return SDValue(NewMI, 0);
11014 }
11015 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
11016 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
11017 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
11018 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11019 SDValue Chain = Op->getOperand(0);
11020 SDValue Ptr = Op->getOperand(2);
11021 EVT VT = Op->getValueType(0);
11022 return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
11023 Chain, Ptr, MII->getMemOperand());
11024 }
11025 default:
11026
11027 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11028 AMDGPU::getImageDimIntrinsicInfo(IntrID))
11029 return lowerImage(Op, ImageDimIntr, DAG, true);
11030
11031 return SDValue();
11032 }
11033}
11034
11035// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
11036// dwordx4 if on SI and handle TFE loads.
11037SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
11038 SDVTList VTList,
11039 ArrayRef<SDValue> Ops, EVT MemVT,
11040 MachineMemOperand *MMO,
11041 SelectionDAG &DAG) const {
11042 LLVMContext &C = *DAG.getContext();
11043 MachineFunction &MF = DAG.getMachineFunction();
11044 EVT VT = VTList.VTs[0];
11045
11046 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
11047 bool IsTFE = VTList.NumVTs == 3;
11048 if (IsTFE) {
11049 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
11050 unsigned NumOpDWords = NumValueDWords + 1;
11051 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
11052 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
11053 MachineMemOperand *OpDWordsMMO =
11054 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
11055 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
11056 OpDWordsVT, OpDWordsMMO, DAG);
11057 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11058 DAG.getVectorIdxConstant(NumValueDWords, DL));
11059 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
11060 SDValue ValueDWords =
11061 NumValueDWords == 1
11062 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
11063 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
11064 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
11065 ZeroIdx);
11066 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
11067 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
11068 }
11069
11070 if (!Subtarget->hasDwordx3LoadStores() &&
11071 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
11072 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
11073 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
11074 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
11075 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
11076 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
11077 WidenedMemVT, WidenedMMO);
11078 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
11079 DAG.getVectorIdxConstant(0, DL));
11080 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
11081 }
11082
11083 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
11084}
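// Example of the TFE widening above (illustrative): a TFE buffer load of
// v3f32 has NumValueDWords = 3, so the intrinsic is re-issued as a v4i32
// load (3 value dwords + 1 status dword); dword 3 is extracted as the status
// result, dwords 0..2 are extracted as a v3i32 subvector and bitcast back to
// v3f32, and both are merged with the chain.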
11085
11086SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
11087 bool ImageStore) const {
11088 EVT StoreVT = VData.getValueType();
11089
11090 // No change for f16 and legal vector D16 types.
11091 if (!StoreVT.isVector())
11092 return VData;
11093
11094 SDLoc DL(VData);
11095 unsigned NumElements = StoreVT.getVectorNumElements();
11096
11097 if (Subtarget->hasUnpackedD16VMem()) {
11098 // We need to unpack the packed data to store.
11099 EVT IntStoreVT = StoreVT.changeTypeToInteger();
11100 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11101
11102 EVT EquivStoreVT =
11103 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
11104 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
11105 return DAG.UnrollVectorOp(ZExt.getNode());
11106 }
11107
11108 // The sq block of gfx8.1 does not estimate register use correctly for d16
11109 // image store instructions. The data operand is computed as if it were not a
11110 // d16 image instruction.
11111 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
11112 // Bitcast to i16
11113 EVT IntStoreVT = StoreVT.changeTypeToInteger();
11114 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11115
11116 // Decompose into scalars
11117 SmallVector<SDValue, 4> Elts;
11118 DAG.ExtractVectorElements(IntVData, Elts);
11119
11120 // Group pairs of i16 into v2i16 and bitcast to i32
11121 SmallVector<SDValue, 4> PackedElts;
11122 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
11123 SDValue Pair =
11124 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
11125 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
11126 PackedElts.push_back(IntPair);
11127 }
11128 if ((NumElements % 2) == 1) {
11129 // Handle v3i16
11130 unsigned I = Elts.size() / 2;
11131 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
11132 {Elts[I * 2], DAG.getPOISON(MVT::i16)});
11133 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
11134 PackedElts.push_back(IntPair);
11135 }
11136
11137 // Pad using UNDEF
11138 PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));
11139
11140 // Build final vector
11141 EVT VecVT =
11142 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
11143 return DAG.getBuildVector(VecVT, DL, PackedElts);
11144 }
11145
11146 if (NumElements == 3) {
11147 EVT IntStoreVT =
11148 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
11149 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11150
11151 EVT WidenedStoreVT = EVT::getVectorVT(
11152 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
11153 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
11154 WidenedStoreVT.getStoreSizeInBits());
11155 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
11156 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
11157 }
11158
11159 assert(isTypeLegal(StoreVT));
11160 return VData;
11161}
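// Example of the paths above (illustrative) for a v3f16 store value:
//  * unpacked-D16 subtargets zero-extend each half to 32 bits, producing
//    three i32 data dwords;
//  * subtargets with the gfx8.1 image-store bug repack the halves as
//    bitcast(v2i16 <e0, e1>), bitcast(v2i16 <e2, poison>) plus poison padding;
//  * otherwise the NumElements == 3 path widens i48 -> i64 and bitcasts to
//    v4f16 so the type is legal for the D16 instruction.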
11162
11163SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
11164 SelectionDAG &DAG) const {
11165 SDLoc DL(Op);
11166 SDValue Chain = Op.getOperand(0);
11167 unsigned IntrinsicID = Op.getConstantOperandVal(1);
11168 MachineFunction &MF = DAG.getMachineFunction();
11169
11170 switch (IntrinsicID) {
11171 case Intrinsic::amdgcn_exp_compr: {
11172 if (!Subtarget->hasCompressedExport()) {
11173 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
11174 DAG.getMachineFunction().getFunction(),
11175 "intrinsic not supported on subtarget", DL.getDebugLoc()));
11176 }
11177 SDValue Src0 = Op.getOperand(4);
11178 SDValue Src1 = Op.getOperand(5);
11179 // Hack around illegal type on SI by directly selecting it.
11180 if (isTypeLegal(Src0.getValueType()))
11181 return SDValue();
11182
11183 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
11184 SDValue Undef = DAG.getPOISON(MVT::f32);
11185 const SDValue Ops[] = {
11186 Op.getOperand(2), // tgt
11187 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
11188 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
11189 Undef, // src2
11190 Undef, // src3
11191 Op.getOperand(7), // vm
11192 DAG.getTargetConstant(1, DL, MVT::i1), // compr
11193 Op.getOperand(3), // en
11194 Op.getOperand(0) // Chain
11195 };
11196
11197 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
11198 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
11199 }
11200
11201 case Intrinsic::amdgcn_struct_tbuffer_store:
11202 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
11203 SDValue VData = Op.getOperand(2);
11204 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11205 if (IsD16)
11206 VData = handleD16VData(VData, DAG);
11207 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11208 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11209 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11210 SDValue Ops[] = {
11211 Chain,
11212 VData, // vdata
11213 Rsrc, // rsrc
11214 Op.getOperand(4), // vindex
11215 VOffset, // voffset
11216 SOffset, // soffset
11217 Offset, // offset
11218 Op.getOperand(7), // format
11219 Op.getOperand(8), // cachepolicy, swizzled buffer
11220 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11221 };
11222 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11223 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11224 MemSDNode *M = cast<MemSDNode>(Op);
11225 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11226 M->getMemoryVT(), M->getMemOperand());
11227 }
11228
11229 case Intrinsic::amdgcn_raw_tbuffer_store:
11230 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11231 SDValue VData = Op.getOperand(2);
11232 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11233 if (IsD16)
11234 VData = handleD16VData(VData, DAG);
11235 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11236 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11237 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11238 SDValue Ops[] = {
11239 Chain,
11240 VData, // vdata
11241 Rsrc, // rsrc
11242 DAG.getConstant(0, DL, MVT::i32), // vindex
11243 VOffset, // voffset
11244 SOffset, // soffset
11245 Offset, // offset
11246 Op.getOperand(6), // format
11247 Op.getOperand(7), // cachepolicy, swizzled buffer
11248 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11249 };
11250 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11251 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11252 MemSDNode *M = cast<MemSDNode>(Op);
11253 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11254 M->getMemoryVT(), M->getMemOperand());
11255 }
11256
11257 case Intrinsic::amdgcn_raw_buffer_store:
11258 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11259 case Intrinsic::amdgcn_raw_buffer_store_format:
11260 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11261 const bool IsFormat =
11262 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11263 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11264
11265 SDValue VData = Op.getOperand(2);
11266 EVT VDataVT = VData.getValueType();
11267 EVT EltType = VDataVT.getScalarType();
11268 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11269 if (IsD16) {
11270 VData = handleD16VData(VData, DAG);
11271 VDataVT = VData.getValueType();
11272 }
11273
11274 if (!isTypeLegal(VDataVT)) {
11275 VData =
11276 DAG.getNode(ISD::BITCAST, DL,
11277 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11278 }
11279
11280 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11281 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11282 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11283 SDValue Ops[] = {
11284 Chain,
11285 VData,
11286 Rsrc,
11287 DAG.getConstant(0, DL, MVT::i32), // vindex
11288 VOffset, // voffset
11289 SOffset, // soffset
11290 Offset, // offset
11291 Op.getOperand(6), // cachepolicy, swizzled buffer
11292 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11293 };
11294 unsigned Opc =
11295 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
11296 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11297 MemSDNode *M = cast<MemSDNode>(Op);
11298
11299 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11300 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11301 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
11302
11303 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11304 M->getMemoryVT(), M->getMemOperand());
11305 }
11306
11307 case Intrinsic::amdgcn_struct_buffer_store:
11308 case Intrinsic::amdgcn_struct_ptr_buffer_store:
11309 case Intrinsic::amdgcn_struct_buffer_store_format:
11310 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11311 const bool IsFormat =
11312 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
11313 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11314
11315 SDValue VData = Op.getOperand(2);
11316 EVT VDataVT = VData.getValueType();
11317 EVT EltType = VDataVT.getScalarType();
11318 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11319
11320 if (IsD16) {
11321 VData = handleD16VData(VData, DAG);
11322 VDataVT = VData.getValueType();
11323 }
11324
11325 if (!isTypeLegal(VDataVT)) {
11326 VData =
11327 DAG.getNode(ISD::BITCAST, DL,
11328 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11329 }
11330
11331 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11332 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11333 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11334 SDValue Ops[] = {
11335 Chain,
11336 VData,
11337 Rsrc,
11338 Op.getOperand(4), // vindex
11339 VOffset, // voffset
11340 SOffset, // soffset
11341 Offset, // offset
11342 Op.getOperand(7), // cachepolicy, swizzled buffer
11343 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11344 };
11345 unsigned Opc =
11346 !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
11347 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11348 MemSDNode *M = cast<MemSDNode>(Op);
11349
11350 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11351 EVT VDataType = VData.getValueType().getScalarType();
11352 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11353 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
11354
11355 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11356 M->getMemoryVT(), M->getMemOperand());
11357 }
11358 case Intrinsic::amdgcn_raw_buffer_load_lds:
11359 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11360 case Intrinsic::amdgcn_struct_buffer_load_lds:
11361 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
11362 if (!Subtarget->hasVMemToLDSLoad())
11363 return SDValue();
11364 unsigned Opc;
11365 bool HasVIndex =
11366 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
11367 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
11368 unsigned OpOffset = HasVIndex ? 1 : 0;
11369 SDValue VOffset = Op.getOperand(5 + OpOffset);
11370 bool HasVOffset = !isNullConstant(VOffset);
11371 unsigned Size = Op->getConstantOperandVal(4);
11372
11373 switch (Size) {
11374 default:
11375 return SDValue();
11376 case 1:
11377 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11378 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11379 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11380 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11381 break;
11382 case 2:
11383 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11384 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11385 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11386 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11387 break;
11388 case 4:
11389 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11390 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11391 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11392 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11393 break;
11394 case 12:
11395 if (!Subtarget->hasLDSLoadB96_B128())
11396 return SDValue();
11397 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11398 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11399 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11400 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11401 break;
11402 case 16:
11403 if (!Subtarget->hasLDSLoadB96_B128())
11404 return SDValue();
11405 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11406 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11407 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11408 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
11409 break;
11410 }
11411
11412 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11413
11414 SmallVector<SDValue, 8> Ops;
11415
11416 if (HasVIndex && HasVOffset)
11417 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
11418 {Op.getOperand(5), // VIndex
11419 VOffset}));
11420 else if (HasVIndex)
11421 Ops.push_back(Op.getOperand(5));
11422 else if (HasVOffset)
11423 Ops.push_back(VOffset);
11424
11425 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11426 Ops.push_back(Rsrc);
11427 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
11428 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
11429 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
11430 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
11431 Ops.push_back(DAG.getTargetConstant(
11432 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
11433 DL, MVT::i8)); // cpol
11434 Ops.push_back(DAG.getTargetConstant(
11435 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
11436 ? 1
11437 : 0,
11438 DL, MVT::i8)); // swz
11439 Ops.push_back(M0Val.getValue(0)); // Chain
11440 Ops.push_back(M0Val.getValue(1)); // Glue
11441
11442 auto *M = cast<MemSDNode>(Op);
11443 MachineMemOperand *LoadMMO = M->getMemOperand();
11444 // Don't set the offset value here because the pointer points to the base of
11445 // the buffer.
11446 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11447
11448 MachinePointerInfo StorePtrI = LoadPtrI;
11449 LoadPtrI.V = PoisonValue::get(
11453
11454 auto F = LoadMMO->getFlags() &
11455 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
11456 LoadMMO =
11457 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
11458 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11459
11460 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11461 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
11462 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11463
11464 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
11465 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11466
11467 return SDValue(Load, 0);
11468 }
11469 // Buffers are handled by LowerBufferFatPointers, and we're going to go
11470 // for "trust me" that the remaining cases are global pointers until
11471 // such time as we can put two mem operands on an intrinsic.
11472 case Intrinsic::amdgcn_load_to_lds:
11473 case Intrinsic::amdgcn_global_load_lds: {
11474 if (!Subtarget->hasVMemToLDSLoad())
11475 return SDValue();
11476
11477 unsigned Opc;
11478 unsigned Size = Op->getConstantOperandVal(4);
11479 switch (Size) {
11480 default:
11481 return SDValue();
11482 case 1:
11483 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11484 break;
11485 case 2:
11486 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11487 break;
11488 case 4:
11489 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11490 break;
11491 case 12:
11492 if (!Subtarget->hasLDSLoadB96_B128())
11493 return SDValue();
11494 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11495 break;
11496 case 16:
11497 if (!Subtarget->hasLDSLoadB96_B128())
11498 return SDValue();
11499 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11500 break;
11501 }
11502
11503 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11504
11505 SmallVector<SDValue, 6> Ops;
11506
11507 SDValue Addr = Op.getOperand(2); // Global ptr
11508 SDValue VOffset;
11509 // Try to split SAddr and VOffset. Global and LDS pointers share the same
11510 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
11511 if (Addr->isDivergent() && Addr->isAnyAdd()) {
11512 SDValue LHS = Addr.getOperand(0);
11513 SDValue RHS = Addr.getOperand(1);
11514
11515 if (LHS->isDivergent())
11516 std::swap(LHS, RHS);
11517
11518 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
11519 RHS.getOperand(0).getValueType() == MVT::i32) {
11520 // add (i64 sgpr), (zero_extend (i32 vgpr))
11521 Addr = LHS;
11522 VOffset = RHS.getOperand(0);
11523 }
11524 }
11525
11526 Ops.push_back(Addr);
11527 if (!Addr->isDivergent()) {
11528 Opc = AMDGPU::getGlobalSaddrOp(Opc);
11529 if (!VOffset)
11530 VOffset =
11531 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
11532 DAG.getTargetConstant(0, DL, MVT::i32)),
11533 0);
11534 Ops.push_back(VOffset);
11535 }
11536
11537 Ops.push_back(Op.getOperand(5)); // Offset
11538
11539 unsigned Aux = Op.getConstantOperandVal(6);
11540 Ops.push_back(DAG.getTargetConstant(Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL,
11541 MVT::i32)); // CPol
11542
11543 Ops.push_back(M0Val.getValue(0)); // Chain
11544 Ops.push_back(M0Val.getValue(1)); // Glue
11545
11546 auto *M = cast<MemSDNode>(Op);
11547 MachineMemOperand *LoadMMO = M->getMemOperand();
11548 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11549 LoadPtrI.Offset = Op->getConstantOperandVal(5);
11550 MachinePointerInfo StorePtrI = LoadPtrI;
11551 LoadPtrI.V = PoisonValue::get(
11552 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
11553 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
11554 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
11555 auto F = LoadMMO->getFlags() &
11556 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
11557 LoadMMO =
11558 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
11559 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11560 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11561 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
11562 LoadMMO->getAAInfo());
11563
11564 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11565 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11566
11567 return SDValue(Load, 0);
11568 }
11569 case Intrinsic::amdgcn_end_cf:
11570 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
11571 Op->getOperand(2), Chain),
11572 0);
11573 case Intrinsic::amdgcn_s_barrier_init:
11574 case Intrinsic::amdgcn_s_barrier_signal_var: {
11575 // these two intrinsics have two operands: barrier pointer and member count
11576 SDValue Chain = Op->getOperand(0);
11577 SmallVector<SDValue, 2> Ops;
11578 SDValue BarOp = Op->getOperand(2);
11579 SDValue CntOp = Op->getOperand(3);
11580 SDValue M0Val;
11581 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11582 ? AMDGPU::S_BARRIER_INIT_M0
11583 : AMDGPU::S_BARRIER_SIGNAL_M0;
11584 // extract the BarrierID from bits 4-9 of BarOp
11585 SDValue BarID;
11586 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11587 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11588 BarID =
11589 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
11590 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11591 0);
11592 // Member count should be put into M0[ShAmt:+6]
11593 // Barrier ID should be put into M0[5:0]
11594 M0Val =
11595 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
11596 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11597 0);
11598 constexpr unsigned ShAmt = 16;
11599 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
11600 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
11601
11602 M0Val = SDValue(
11603 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
11604
11605 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11606
11607 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11608 return SDValue(NewMI, 0);
11609 }
11610 case Intrinsic::amdgcn_s_wakeup_barrier: {
11611 if (!Subtarget->hasSWakeupBarrier())
11612 return SDValue();
11613 [[fallthrough]];
11614 }
11615 case Intrinsic::amdgcn_s_barrier_join: {
11616 // these two intrinsics have one operand: barrier pointer
11617 SDValue Chain = Op->getOperand(0);
11618 SmallVector<SDValue, 2> Ops;
11619 SDValue BarOp = Op->getOperand(2);
11620 unsigned Opc;
11621
11622 if (isa<ConstantSDNode>(BarOp)) {
11623 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
11624 switch (IntrinsicID) {
11625 default:
11626 return SDValue();
11627 case Intrinsic::amdgcn_s_barrier_join:
11628 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11629 break;
11630 case Intrinsic::amdgcn_s_wakeup_barrier:
11631 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
11632 break;
11633 }
11634 // extract the BarrierID from bits 4-9 of the immediate
11635 unsigned BarID = (BarVal >> 4) & 0x3F;
11636 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
11637 Ops.push_back(K);
11638 Ops.push_back(Chain);
11639 } else {
11640 switch (IntrinsicID) {
11641 default:
11642 return SDValue();
11643 case Intrinsic::amdgcn_s_barrier_join:
11644 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11645 break;
11646 case Intrinsic::amdgcn_s_wakeup_barrier:
11647 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
11648 break;
11649 }
11650 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
11651 SDValue M0Val;
11652 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11653 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11654 M0Val =
11655 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
11656 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11657 0);
11658 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11659 }
11660
11661 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11662 return SDValue(NewMI, 0);
11663 }
11664 case Intrinsic::amdgcn_s_prefetch_data: {
11665 // For non-global address space preserve the chain and remove the call.
11666 if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
11667 return Op.getOperand(0);
11668 return Op;
11669 }
11670 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11671 SDValue Ops[] = {
11672 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
11673 Op.getOperand(3), // offset
11674 Op.getOperand(4), // length
11675 };
11676
11677 MemSDNode *M = cast<MemSDNode>(Op);
11678 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
11679 Op->getVTList(), Ops, M->getMemoryVT(),
11680 M->getMemOperand());
11681 }
11682 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11683 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11684 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11685 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11686 SDValue Chain = Op->getOperand(0);
11687 SDValue Ptr = Op->getOperand(2);
11688 SDValue Val = Op->getOperand(3);
11689 return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
11690 Ptr, MII->getMemOperand());
11691 }
11692 default: {
11693 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11694 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
11695 return lowerImage(Op, ImageDimIntr, DAG, true);
11696
11697 return Op;
11698 }
11699 }
11700}
11701
11702 // Return whether the operation has the NoUnsignedWrap property.
11703static bool isNoUnsignedWrap(SDValue Addr) {
11704 return (Addr.getOpcode() == ISD::ADD &&
11705 Addr->getFlags().hasNoUnsignedWrap()) ||
11706 Addr->getOpcode() == ISD::OR;
11707}
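// Example: DAG combines often rewrite "add %x, 8" as "or %x, 8" when the low
// bits of %x are known to be zero (say, a 16-byte aligned base address). Such
// an OR cannot carry out of any bit position, so it is treated here the same
// way as an add carrying the nuw flag.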
11708
11710 EVT PtrVT) const {
11711 return PtrVT == MVT::i64;
11712}
11713
11715 EVT PtrVT) const {
11716 return true;
11717}
11718
11719// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
11720// offset (the offset that is included in bounds checking and swizzling, to be
11721// split between the instruction's voffset and immoffset fields) and soffset
11722// (the offset that is excluded from bounds checking and swizzling, to go in
11723// the instruction's soffset field). This function takes the first kind of
11724// offset and figures out how to split it between voffset and immoffset.
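// Worked example (for illustration, assuming a 12-bit immoffset field, i.e.
// MaxImm == 4095): a combined offset of 8200 is split into voffset = 8192 and
// immoffset = 8. The voffset part is rounded to a large power of two so the
// add that materializes it can be CSEd across neighbouring accesses. If the
// part left for the voffset would be negative, the whole combined offset is
// kept in the voffset and the immoffset is set to 0.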
11725std::pair<SDValue, SDValue>
11726SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
11727 SDLoc DL(Offset);
11728 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
11729 SDValue N0 = Offset;
11730 ConstantSDNode *C1 = nullptr;
11731
11732 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
11733 N0 = SDValue();
11734 else if (DAG.isBaseWithConstantOffset(N0)) {
11735 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
11736 // being added, so we can only safely match a 32-bit addition with no
11737 // unsigned overflow.
11738 bool CheckNUW = AMDGPU::isGFX1250(*Subtarget);
11739 if (!CheckNUW || isNoUnsignedWrap(N0)) {
11740 C1 = cast<ConstantSDNode>(N0.getOperand(1));
11741 N0 = N0.getOperand(0);
11742 }
11743 }
11744
11745 if (C1) {
11746 unsigned ImmOffset = C1->getZExtValue();
11747 // If the immediate value is too big for the immoffset field, put only bits
11748 // that would normally fit in the immoffset field. The remaining value that
11749 // is copied/added for the voffset field is a large power of 2, and it
11750 // stands more chance of being CSEd with the copy/add for another similar
11751 // load/store.
11752 // However, do not do that rounding down if the part that would go in the
11753 // voffset is negative, as it appears to be illegal to have a negative
11754 // offset in the vgpr, even if adding the immediate offset makes it positive.
11755 unsigned Overflow = ImmOffset & ~MaxImm;
11756 ImmOffset -= Overflow;
11757 if ((int32_t)Overflow < 0) {
11758 Overflow += ImmOffset;
11759 ImmOffset = 0;
11760 }
11761 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
11762 if (Overflow) {
11763 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
11764 if (!N0)
11765 N0 = OverflowVal;
11766 else {
11767 SDValue Ops[] = {N0, OverflowVal};
11768 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
11769 }
11770 }
11771 }
11772 if (!N0)
11773 N0 = DAG.getConstant(0, DL, MVT::i32);
11774 if (!C1)
11775 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
11776 return {N0, SDValue(C1, 0)};
11777}
11778
11779// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
11780// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
11781// pointed to by Offsets.
11782void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
11783 SelectionDAG &DAG, SDValue *Offsets,
11784 Align Alignment) const {
11785 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11786 SDLoc DL(CombinedOffset);
11787 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
11788 uint32_t Imm = C->getZExtValue();
11789 uint32_t SOffset, ImmOffset;
11790 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
11791 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
11792 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11793 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11794 return;
11795 }
11796 }
11797 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
11798 SDValue N0 = CombinedOffset.getOperand(0);
11799 SDValue N1 = CombinedOffset.getOperand(1);
11800 uint32_t SOffset, ImmOffset;
11801 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
11802 if (Offset >= 0 &&
11803 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
11804 Offsets[0] = N0;
11805 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11806 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11807 return;
11808 }
11809 }
11810
11811 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
11812 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
11813 : DAG.getConstant(0, DL, MVT::i32);
11814
11815 Offsets[0] = CombinedOffset;
11816 Offsets[1] = SOffsetZero;
11817 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
11818}
11819
11820SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
11821 SelectionDAG &DAG) const {
11822 if (!MaybePointer.getValueType().isScalarInteger())
11823 return MaybePointer;
11824
11825 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
11826 return Rsrc;
11827}
11828
11829// Wrap a global or flat pointer into a buffer intrinsic using the flags
11830// specified in the intrinsic.
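// For illustration, the else branch below produces the familiar 4-dword V#
// layout (targets without 45-bit num_records), assuming the base pointer fits
// in 48 bits:
//   word0 = base[31:0]
//   word1 = base[47:32] | (stride << 16)
//   word2 = num_records
//   word3 = flags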
11831SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
11832 SelectionDAG &DAG) const {
11833 SDLoc Loc(Op);
11834
11835 SDValue Pointer = Op->getOperand(1);
11836 SDValue Stride = Op->getOperand(2);
11837 SDValue NumRecords = Op->getOperand(3);
11838 SDValue Flags = Op->getOperand(4);
11839
11840 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
11841 SDValue Rsrc;
11842
11843 if (Subtarget->has45BitNumRecordsBufferResource()) {
11844 SDValue Zero = DAG.getConstant(0, Loc, MVT::i32);
11845 // Build the lower 64-bit value, which holds the 57-bit base and the low
11846 // 7 bits of num_records.
11847 SDValue ExtPointer = DAG.getAnyExtOrTrunc(Pointer, Loc, MVT::i64);
11848 SDValue NumRecordsLHS =
11849 DAG.getNode(ISD::SHL, Loc, MVT::i64, NumRecords,
11850 DAG.getShiftAmountConstant(57, MVT::i32, Loc));
11851 SDValue LowHalf =
11852 DAG.getNode(ISD::OR, Loc, MVT::i64, ExtPointer, NumRecordsLHS);
11853
11854 // Build the upper 64-bit value: the remaining 38 bits of num_records,
11855 // 6 zero bits (omitted), the 16-bit stride and scale, and the 4-bit flags.
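// Sketch of the resulting 128-bit layout, derived from the shift amounts used
// below (assuming the base fits in 57 bits and num_records in 45 bits):
//   bits [56:0]    base address
//   bits [63:57]   num_records[6:0]
//   bits [101:64]  num_records[44:7]
//   bits [107:102] zero
//   bits [123:108] stride and scale
//   bits [127:124] flags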
11856 SDValue NumRecordsRHS =
11857 DAG.getNode(ISD::SRL, Loc, MVT::i64, NumRecords,
11858 DAG.getShiftAmountConstant(7, MVT::i32, Loc));
11859 SDValue ShiftedStride =
11860 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
11861 DAG.getShiftAmountConstant(12, MVT::i32, Loc));
11862 SDValue ExtShiftedStrideVec =
11863 DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedStride);
11864 SDValue ExtShiftedStride =
11865 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedStrideVec);
11866 SDValue ShiftedFlags =
11867 DAG.getNode(ISD::SHL, Loc, MVT::i32, Flags,
11868 DAG.getShiftAmountConstant(28, MVT::i32, Loc));
11869 SDValue ExtShiftedFlagsVec =
11870 DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedFlags);
11871 SDValue ExtShiftedFlags =
11872 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedFlagsVec);
11873 SDValue CombinedFields =
11874 DAG.getNode(ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
11875 SDValue HighHalf =
11876 DAG.getNode(ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
11877
11878 Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i64, LowHalf, HighHalf);
11879 } else {
11880 NumRecords = DAG.getAnyExtOrTrunc(NumRecords, Loc, MVT::i32);
11881 auto [LowHalf, HighHalf] =
11882 DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11883 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
11884 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
11885 SDValue ShiftedStride =
11886 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
11887 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
11888 SDValue NewHighHalf =
11889 DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
11890
11891 Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf, NewHighHalf,
11892 NumRecords, Flags);
11893 }
11894
11895 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
11896 return RsrcPtr;
11897}
11898
11899// Handle 8 bit and 16 bit buffer loads
11900SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
11901 EVT LoadVT, SDLoc DL,
11902 ArrayRef<SDValue> Ops,
11903 MachineMemOperand *MMO,
11904 bool IsTFE) const {
11905 EVT IntVT = LoadVT.changeTypeToInteger();
11906
11907 if (IsTFE) {
11908 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
11909 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
11910 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
11911 MachineFunction &MF = DAG.getMachineFunction();
11912 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
11913 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
11914 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
11915 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11916 DAG.getConstant(1, DL, MVT::i32));
11917 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11918 DAG.getConstant(0, DL, MVT::i32));
11919 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
11920 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
11921 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
11922 }
11923
11924 unsigned Opc = LoadVT.getScalarType() == MVT::i8
11925 ? AMDGPUISD::BUFFER_LOAD_UBYTE
11926 : AMDGPUISD::BUFFER_LOAD_USHORT;
11927
11928 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
11929 SDValue BufferLoad =
11930 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
11931 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
11932 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
11933
11934 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
11935}
11936
11937// Handle 8 bit and 16 bit buffer stores
11938SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
11939 EVT VDataType, SDLoc DL,
11940 SDValue Ops[],
11941 MemSDNode *M) const {
11942 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11943 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
11944
11945 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
11946 Ops[1] = BufferStoreExt;
11947 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
11948 : AMDGPUISD::BUFFER_STORE_SHORT;
11949 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
11950 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
11951 M->getMemOperand());
11952}
11953
11954 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
11955 SDValue Op, const SDLoc &SL, EVT VT) {
11956 if (VT.bitsLT(Op.getValueType()))
11957 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
11958
11959 switch (ExtType) {
11960 case ISD::SEXTLOAD:
11961 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
11962 case ISD::ZEXTLOAD:
11963 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
11964 case ISD::EXTLOAD:
11965 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
11966 case ISD::NON_EXTLOAD:
11967 return Op;
11968 }
11969
11970 llvm_unreachable("invalid ext type");
11971}
11972
11973// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
11974// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
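// For example, a uniform "load i8, align 4" from the constant address space
// has no sub-dword SMEM form on most subtargets, so it is widened here into a
// 32-bit load followed by the matching truncate or extend-in-reg of the low
// bits.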
11975SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
11976 DAGCombinerInfo &DCI) const {
11977 SelectionDAG &DAG = DCI.DAG;
11978 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
11979 return SDValue();
11980
11981 // FIXME: Constant loads should all be marked invariant.
11982 unsigned AS = Ld->getAddressSpace();
11983 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
11984 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
11985 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
11986 return SDValue();
11987
11988 // Don't do this early, since it may interfere with adjacent load merging for
11989 // illegal types. We can avoid losing alignment information for exotic types
11990 // pre-legalize.
11991 EVT MemVT = Ld->getMemoryVT();
11992 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
11993 MemVT.getSizeInBits() >= 32)
11994 return SDValue();
11995
11996 SDLoc SL(Ld);
11997
11998 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
11999 "unexpected vector extload");
12000
12001 // TODO: Drop only high part of range.
12002 SDValue Ptr = Ld->getBasePtr();
12003 SDValue NewLoad = DAG.getLoad(
12004 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
12005 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
12006 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
12007 nullptr); // Drop ranges
12008
12009 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
12010 if (MemVT.isFloatingPoint()) {
12012 "unexpected fp extload");
12013 TruncVT = MemVT.changeTypeToInteger();
12014 }
12015
12016 SDValue Cvt = NewLoad;
12017 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
12018 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
12019 DAG.getValueType(TruncVT));
12020 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
12021 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
12022 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
12023 } else {
12024 assert(Ld->getExtensionType() == ISD::EXTLOAD);
12025 }
12026
12027 EVT VT = Ld->getValueType(0);
12028 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
12029
12030 DCI.AddToWorklist(Cvt.getNode());
12031
12032 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
12033 // the appropriate extension from the 32-bit load.
12034 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
12035 DCI.AddToWorklist(Cvt.getNode());
12036
12037 // Handle conversion back to floating point if necessary.
12038 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
12039
12040 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
12041}
12042
12043 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
12044 const SIMachineFunctionInfo &Info) {
12045 // TODO: Should check if the address can definitely not access stack.
12046 if (Info.isEntryFunction())
12047 return Info.getUserSGPRInfo().hasFlatScratchInit();
12048 return true;
12049}
12050
12051SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
12052 SDLoc DL(Op);
12053 LoadSDNode *Load = cast<LoadSDNode>(Op);
12054 ISD::LoadExtType ExtType = Load->getExtensionType();
12055 EVT MemVT = Load->getMemoryVT();
12056 MachineMemOperand *MMO = Load->getMemOperand();
12057
12058 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
12059 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
12060 return SDValue();
12061
12062 // FIXME: Copied from PPC
12063 // First, load into 32 bits, then truncate to 1 bit.
12064
12065 SDValue Chain = Load->getChain();
12066 SDValue BasePtr = Load->getBasePtr();
12067
12068 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
12069
12070 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
12071 RealMemVT, MMO);
12072
12073 if (!MemVT.isVector()) {
12074 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
12075 NewLD.getValue(1)};
12076
12077 return DAG.getMergeValues(Ops, DL);
12078 }
12079
12080 SmallVector<SDValue, 3> Elts;
12081
12082 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
12083 DAG.getConstant(I, DL, MVT::i32));
12084
12085 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
12086 }
12087
12088 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
12089
12090 return DAG.getMergeValues(Ops, DL);
12091 }
12092
12093 if (!MemVT.isVector())
12094 return SDValue();
12095
12096 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
12097 "Custom lowering for non-i32 vectors hasn't been implemented.");
12098
12099 Align Alignment = Load->getAlign();
12100 unsigned AS = Load->getAddressSpace();
12101 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
12102 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
12103 return SplitVectorLoad(Op, DAG);
12104 }
12105
12106 MachineFunction &MF = DAG.getMachineFunction();
12107 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12108 // If there is a possibility that flat instruction access scratch memory
12109 // then we need to use the same legalization rules we use for private.
12110 if (AS == AMDGPUAS::FLAT_ADDRESS &&
12111 !Subtarget->hasMultiDwordFlatScratchAddressing())
12112 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
12113 ? AMDGPUAS::PRIVATE_ADDRESS
12114 : AMDGPUAS::GLOBAL_ADDRESS;
12115
12116 unsigned NumElements = MemVT.getVectorNumElements();
12117
12118 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
12119 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
12120 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
12121 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
12122 (Load->isInvariant() || isMemOpHasNoClobberedMemOperand(Load)))) {
12123 if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
12124 Alignment >= Align(4) && NumElements < 32) {
12125 if (MemVT.isPow2VectorType() ||
12126 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
12127 return SDValue();
12128 return WidenOrSplitVectorLoad(Op, DAG);
12129 }
12130 // Non-uniform loads will be selected to MUBUF instructions, so they
12131 // have the same legalization requirements as global and private
12132 // loads.
12133 //
12134 }
12135 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
12136 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
12137 AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
12138 if (NumElements > 4)
12139 return SplitVectorLoad(Op, DAG);
12140 // v3 loads not supported on SI.
12141 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12142 return WidenOrSplitVectorLoad(Op, DAG);
12143
12144 // v3 and v4 loads are supported for private and global memory.
12145 return SDValue();
12146 }
12147 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
12148 // Depending on the setting of the private_element_size field in the
12149 // resource descriptor, we can only make private accesses up to a certain
12150 // size.
12151 switch (Subtarget->getMaxPrivateElementSize()) {
12152 case 4: {
12153 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
12154 return DAG.getMergeValues({Op0, Op1}, DL);
12155 }
12156 case 8:
12157 if (NumElements > 2)
12158 return SplitVectorLoad(Op, DAG);
12159 return SDValue();
12160 case 16:
12161 // Same as global/flat
12162 if (NumElements > 4)
12163 return SplitVectorLoad(Op, DAG);
12164 // v3 loads not supported on SI.
12165 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12166 return WidenOrSplitVectorLoad(Op, DAG);
12167
12168 return SDValue();
12169 default:
12170 llvm_unreachable("unsupported private_element_size");
12171 }
12172 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
12173 unsigned Fast = 0;
12174 auto Flags = Load->getMemOperand()->getFlags();
12175 if (allowsMisalignedMemoryAccesses(MemVT, AS,
12176 Load->getAlign(), Flags, &Fast) &&
12177 Fast > 1)
12178 return SDValue();
12179
12180 if (MemVT.isVector())
12181 return SplitVectorLoad(Op, DAG);
12182 }
12183
12184 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
12185 MemVT, *Load->getMemOperand())) {
12186 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
12187 return DAG.getMergeValues({Op0, Op1}, DL);
12188 }
12189
12190 return SDValue();
12191}
12192
12193SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
12194 EVT VT = Op.getValueType();
12195 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
12196 VT.getSizeInBits() == 512)
12197 return splitTernaryVectorOp(Op, DAG);
12198
12199 assert(VT.getSizeInBits() == 64);
12200
12201 SDLoc DL(Op);
12202 SDValue Cond = DAG.getFreeze(Op.getOperand(0));
12203
12204 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
12205 SDValue One = DAG.getConstant(1, DL, MVT::i32);
12206
12207 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
12208 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
12209
12210 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
12211 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
12212
12213 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
12214
12215 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
12216 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
12217
12218 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
12219
12220 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
12221 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
12222}
12223
12224// Catch division cases where we can use shortcuts with rcp and rsq
12225// instructions.
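// Roughly, with approximate-function fast math (afn):
//   1.0 / x  ->  rcp(x)
//  -1.0 / x  ->  rcp(-x)
//   x / y    ->  x * rcp(y)
// For f16 and bf16 the +/-1.0 shortcuts are taken even without afn, since the
// hardware reciprocal is accurate enough at that precision (see below).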
12226SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
12227 SelectionDAG &DAG) const {
12228 SDLoc SL(Op);
12229 SDValue LHS = Op.getOperand(0);
12230 SDValue RHS = Op.getOperand(1);
12231 EVT VT = Op.getValueType();
12232 const SDNodeFlags Flags = Op->getFlags();
12233
12234 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
12235
12236 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
12237 // Without !fpmath accuracy information, we can't do more because we don't
12238 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
12239 // f16 is always accurate enough
12240 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
12241 return SDValue();
12242
12243 if (CLHS->isExactlyValue(1.0)) {
12244 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
12245 // the CI documentation have a worst-case error of 1 ulp.
12246 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
12247 // use it as long as we aren't trying to use denormals.
12248 //
12249 // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
12250
12251 // 1.0 / sqrt(x) -> rsq(x)
12252
12253 // XXX - Is afn sufficient to do this for f64? The maximum ULP
12254 // error seems really high at 2^29 ULP.
12255 // 1.0 / x -> rcp(x)
12256 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12257 }
12258
12259 // Same as for 1.0, but expand the sign out of the constant.
12260 if (CLHS->isExactlyValue(-1.0)) {
12261 // -1.0 / x -> rcp (fneg x)
12262 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
12263 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
12264 }
12265 }
12266
12267 // For f16 and bf16 require afn or arcp.
12268 // For f32 require afn.
12269 if (!AllowInaccurateRcp &&
12270 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
12271 return SDValue();
12272
12273 // Turn into multiply by the reciprocal.
12274 // x / y -> x * (1.0 / y)
12275 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12276 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
12277}
12278
12279SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
12280 SelectionDAG &DAG) const {
12281 SDLoc SL(Op);
12282 SDValue X = Op.getOperand(0);
12283 SDValue Y = Op.getOperand(1);
12284 EVT VT = Op.getValueType();
12285 const SDNodeFlags Flags = Op->getFlags();
12286
12287 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
12288 if (!AllowInaccurateDiv)
12289 return SDValue();
12290
12291 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
12292 SDValue One = DAG.getConstantFP(1.0, SL, VT);
12293
12294 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
12295 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12296
12297 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
12298 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12299 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
12300 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
12301 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
12302 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
12303}
12304
12305static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12306 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
12307 SDNodeFlags Flags) {
12308 if (GlueChain->getNumValues() <= 1) {
12309 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
12310 }
12311
12312 assert(GlueChain->getNumValues() == 3);
12313
12314 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12315 switch (Opcode) {
12316 default:
12317 llvm_unreachable("no chain equivalent for opcode");
12318 case ISD::FMUL:
12319 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12320 break;
12321 }
12322
12323 return DAG.getNode(Opcode, SL, VTList,
12324 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
12325 Flags);
12326}
12327
12328static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12329 EVT VT, SDValue A, SDValue B, SDValue C,
12330 SDValue GlueChain, SDNodeFlags Flags) {
12331 if (GlueChain->getNumValues() <= 1) {
12332 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
12333 }
12334
12335 assert(GlueChain->getNumValues() == 3);
12336
12337 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12338 switch (Opcode) {
12339 default:
12340 llvm_unreachable("no chain equivalent for opcode");
12341 case ISD::FMA:
12342 Opcode = AMDGPUISD::FMA_W_CHAIN;
12343 break;
12344 }
12345
12346 return DAG.getNode(Opcode, SL, VTList,
12347 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
12348 Flags);
12349}
12350
12351SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
12352 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12353 return FastLowered;
12354
12355 SDLoc SL(Op);
12356 EVT VT = Op.getValueType();
12357 SDValue LHS = Op.getOperand(0);
12358 SDValue RHS = Op.getOperand(1);
12359
12360 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
12361 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
12362
12363 if (VT == MVT::bf16) {
12364 SDValue ExtDiv =
12365 DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags());
12366 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv,
12367 DAG.getTargetConstant(0, SL, MVT::i32));
12368 }
12369
12370 assert(VT == MVT::f16);
12371
12372 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
12373 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
12374 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
12375 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
12376 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12377 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
12378 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12379 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
12380 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
12381 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
12382 // q16.u = opx(V_CVT_F16_F32, q32.u);
12383 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
12384
12385 // We will use ISD::FMA on targets that don't support ISD::FMAD.
12386 unsigned FMADOpCode =
12387 isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA;
12388 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
12389 SDValue Rcp =
12390 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
12391 SDValue Quot =
12392 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
12393 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12394 Op->getFlags());
12395 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
12396 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12397 Op->getFlags());
12398 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
12399 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
12400 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
12401 DAG.getConstant(0xff800000, SL, MVT::i32));
12402 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
12403 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
12404 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
12405 DAG.getTargetConstant(0, SL, MVT::i32));
12406 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
12407 Op->getFlags());
12408}
12409
12410// Faster 2.5 ULP division that does not support denormals.
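// Sketch of the scaling below: let s = (|y| > 2^96) ? 2^-32 : 1.0. Then
//   x / y == s * (x * rcp(y * s))
// Pre-scaling the denominator keeps rcp's input in range when |y| is huge,
// and the trailing multiply by s cancels the scale again.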
12411SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
12412 SDNodeFlags Flags = Op->getFlags();
12413 SDLoc SL(Op);
12414 SDValue LHS = Op.getOperand(1);
12415 SDValue RHS = Op.getOperand(2);
12416
12417 // TODO: The combiner should probably handle elimination of redundant fabs.
12419 ? RHS
12420 : DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
12421
12422 const APFloat K0Val(0x1p+96f);
12423 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
12424
12425 const APFloat K1Val(0x1p-32f);
12426 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
12427
12428 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12429
12430 EVT SetCCVT =
12431 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
12432
12433 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
12434
12435 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
12436
12437 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
12438
12439 // rcp does not support denormals.
12440 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
12441
12442 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
12443
12444 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
12445}
12446
12447// Returns immediate value for setting the F32 denorm mode when using the
12448// S_DENORM_MODE instruction.
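// The 4-bit immediate packs both denormal controls in the same order as the
// MODE register's fp_denorm field: bits [1:0] hold the FP32 (SP) mode and
// bits [3:2] the FP64/FP16 (DP) mode, hence SPDenormMode | (DPDenormMode << 2)
// below.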
12449 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
12450 const SIMachineFunctionInfo *Info,
12451 const GCNSubtarget *ST) {
12452 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
12453 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
12454 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
12455 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
12456}
12457
12458SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
12459 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12460 return FastLowered;
12461
12462 // The selection matcher assumes anything with a chain is selected to a
12463 // mayRaiseFPException machine instruction. Since we're introducing a chain
12464 // here, we need to explicitly report nofpexcept for the regular fdiv
12465 // lowering.
12466 SDNodeFlags Flags = Op->getFlags();
12467 Flags.setNoFPExcept(true);
12468
12469 SDLoc SL(Op);
12470 SDValue LHS = Op.getOperand(0);
12471 SDValue RHS = Op.getOperand(1);
12472
12473 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12474
12475 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
12476
12477 SDValue DenominatorScaled =
12478 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
12479 SDValue NumeratorScaled =
12480 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
12481
12482 // Denominator is scaled to not be denormal, so using rcp is ok.
12483 SDValue ApproxRcp =
12484 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
12485 SDValue NegDivScale0 =
12486 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
12487
12488 using namespace AMDGPU::Hwreg;
12489 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
12490 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
12491
12492 const MachineFunction &MF = DAG.getMachineFunction();
12493 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
12494 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
12495
12496 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
12497 const bool HasDynamicDenormals =
12498 (DenormMode.Input == DenormalMode::Dynamic) ||
12499 (DenormMode.Output == DenormalMode::Dynamic);
12500
12501 SDValue SavedDenormMode;
12502
12503 if (!PreservesDenormals) {
12504 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
12505 // lowering. The chain dependence is insufficient, and we need glue. We do
12506 // not need the glue variants in a strictfp function.
12507
12508 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12509
12510 SDValue Glue = DAG.getEntryNode();
12511 if (HasDynamicDenormals) {
12512 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
12513 DAG.getVTList(MVT::i32, MVT::Glue),
12514 {BitField, Glue});
12515 SavedDenormMode = SDValue(GetReg, 0);
12516
12517 Glue = DAG.getMergeValues(
12518 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
12519 }
12520
12521 SDNode *EnableDenorm;
12522 if (Subtarget->hasDenormModeInst()) {
12523 const SDValue EnableDenormValue =
12524 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
12525
12526 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
12527 EnableDenormValue)
12528 .getNode();
12529 } else {
12530 const SDValue EnableDenormValue =
12531 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
12532 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12533 {EnableDenormValue, BitField, Glue});
12534 }
12535
12536 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
12537 SDValue(EnableDenorm, 1)};
12538
12539 NegDivScale0 = DAG.getMergeValues(Ops, SL);
12540 }
12541
12542 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
12543 ApproxRcp, One, NegDivScale0, Flags);
12544
12545 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
12546 ApproxRcp, Fma0, Flags);
12547
12548 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
12549 Fma1, Flags);
12550
12551 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
12552 NumeratorScaled, Mul, Flags);
12553
12554 SDValue Fma3 =
12555 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
12556
12557 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
12558 NumeratorScaled, Fma3, Flags);
12559
12560 if (!PreservesDenormals) {
12561 SDNode *DisableDenorm;
12562 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12563 const SDValue DisableDenormValue = getSPDenormModeValue(
12564 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
12565
12566 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12567 DisableDenorm =
12568 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
12569 Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2))
12570 .getNode();
12571 } else {
12572 assert(HasDynamicDenormals == (bool)SavedDenormMode);
12573 const SDValue DisableDenormValue =
12574 HasDynamicDenormals
12575 ? SavedDenormMode
12576 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
12577
12578 DisableDenorm = DAG.getMachineNode(
12579 AMDGPU::S_SETREG_B32, SL, MVT::Other,
12580 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
12581 }
12582
12583 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
12584 SDValue(DisableDenorm, 0), DAG.getRoot());
12585 DAG.setRoot(OutputChain);
12586 }
12587
12588 SDValue Scale = NumeratorScaled.getValue(1);
12589 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
12590 {Fma4, Fma1, Fma3, Scale}, Flags);
12591
12592 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
12593}
12594
12595SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
12596 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
12597 return FastLowered;
12598
12599 SDLoc SL(Op);
12600 SDValue X = Op.getOperand(0);
12601 SDValue Y = Op.getOperand(1);
12602
12603 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
12604
12605 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
12606
12607 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
12608
12609 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
12610
12611 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
12612
12613 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
12614
12615 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
12616
12617 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
12618
12619 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
12620
12621 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
12622 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
12623
12624 SDValue Fma4 =
12625 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
12626
12627 SDValue Scale;
12628
12629 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12630 // Work around a hardware bug on SI where the condition output from div_scale
12631 // is not usable.
12632
12633 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
12634
12635 // Figure out which scale to use for div_fmas.
12636 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
12637 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
12638 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
12639 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
12640
12641 SDValue NumHi =
12642 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
12643 SDValue DenHi =
12644 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
12645
12646 SDValue Scale0Hi =
12647 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
12648 SDValue Scale1Hi =
12649 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
12650
12651 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
12652 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
12653 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
12654 } else {
12655 Scale = DivScale1.getValue(1);
12656 }
12657
12658 SDValue Fmas =
12659 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
12660
12661 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
12662}
12663
12664SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
12665 EVT VT = Op.getValueType();
12666
12667 if (VT == MVT::f32)
12668 return LowerFDIV32(Op, DAG);
12669
12670 if (VT == MVT::f64)
12671 return LowerFDIV64(Op, DAG);
12672
12673 if (VT == MVT::f16 || VT == MVT::bf16)
12674 return LowerFDIV16(Op, DAG);
12675
12676 llvm_unreachable("Unexpected type for fdiv");
12677}
12678
12679SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
12680 SDLoc dl(Op);
12681 SDValue Val = Op.getOperand(0);
12682 EVT VT = Val.getValueType();
12683 EVT ResultExpVT = Op->getValueType(1);
12684 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12685
12686 SDValue Mant = DAG.getNode(
12687 ISD::INTRINSIC_WO_CHAIN, dl, VT,
12688 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
12689
12690 SDValue Exp = DAG.getNode(
12691 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
12692 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
12693
12694 if (Subtarget->hasFractBug()) {
12695 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
12696 SDValue Inf =
12697 DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT);
12698
12699 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
12700 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
12701 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
12702 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
12703 }
12704
12705 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
12706 return DAG.getMergeValues({Mant, CastExp}, dl);
12707}
12708
12709SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
12710 SDLoc DL(Op);
12711 StoreSDNode *Store = cast<StoreSDNode>(Op);
12712 EVT VT = Store->getMemoryVT();
12713
12714 if (VT == MVT::i1) {
12715 return DAG.getTruncStore(
12716 Store->getChain(), DL,
12717 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
12718 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
12719 }
12720
12721 assert(VT.isVector() &&
12722 Store->getValue().getValueType().getScalarType() == MVT::i32);
12723
12724 unsigned AS = Store->getAddressSpace();
12725 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
12726 Store->getAlign().value() < VT.getStoreSize() &&
12727 VT.getSizeInBits() > 32) {
12728 return SplitVectorStore(Op, DAG);
12729 }
12730
12731 MachineFunction &MF = DAG.getMachineFunction();
12732 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12733 // If there is a possibility that flat instruction access scratch memory
12734 // then we need to use the same legalization rules we use for private.
12735 if (AS == AMDGPUAS::FLAT_ADDRESS &&
12736 !Subtarget->hasMultiDwordFlatScratchAddressing())
12737 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
12738 ? AMDGPUAS::PRIVATE_ADDRESS
12739 : AMDGPUAS::GLOBAL_ADDRESS;
12740
12741 unsigned NumElements = VT.getVectorNumElements();
12742 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
12743 if (NumElements > 4)
12744 return SplitVectorStore(Op, DAG);
12745 // v3 stores not supported on SI.
12746 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12747 return SplitVectorStore(Op, DAG);
12748
12749 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
12750 VT, *Store->getMemOperand()))
12751 return expandUnalignedStore(Store, DAG);
12752
12753 return SDValue();
12754 }
12755 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
12756 switch (Subtarget->getMaxPrivateElementSize()) {
12757 case 4:
12758 return scalarizeVectorStore(Store, DAG);
12759 case 8:
12760 if (NumElements > 2)
12761 return SplitVectorStore(Op, DAG);
12762 return SDValue();
12763 case 16:
12764 if (NumElements > 4 ||
12765 (NumElements == 3 && !Subtarget->enableFlatScratch()))
12766 return SplitVectorStore(Op, DAG);
12767 return SDValue();
12768 default:
12769 llvm_unreachable("unsupported private_element_size");
12770 }
12771 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
12772 unsigned Fast = 0;
12773 auto Flags = Store->getMemOperand()->getFlags();
12774 if (allowsMisalignedMemoryAccesses(VT, AS,
12775 Store->getAlign(), Flags, &Fast) &&
12776 Fast > 1)
12777 return SDValue();
12778
12779 if (VT.isVector())
12780 return SplitVectorStore(Op, DAG);
12781
12782 return expandUnalignedStore(Store, DAG);
12783 }
12784
12785 // Probably an invalid store. If so we'll end up emitting a selection error.
12786 return SDValue();
12787}
12788
12789// Avoid the full correct expansion for f32 sqrt when promoting from f16.
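// The idea: the result only needs f16 precision, so the plain 1 ulp
// v_sqrt_f32 on the extended operand, rounded back to f16, is already
// accurate enough, and the denormal scaling and correction steps of the full
// f32 lowering below would be wasted work.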
12790SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
12791 SDLoc SL(Op);
12792 assert(!Subtarget->has16BitInsts());
12793 SDNodeFlags Flags = Op->getFlags();
12794 SDValue Ext =
12795 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
12796
12797 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
12798 SDValue Sqrt =
12799 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
12800
12801 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
12802 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
12803}
12804
12805SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
12806 SDLoc DL(Op);
12807 SDNodeFlags Flags = Op->getFlags();
12808 MVT VT = Op.getValueType().getSimpleVT();
12809 const SDValue X = Op.getOperand(0);
12810
12811 if (allowApproxFunc(DAG, Flags)) {
12812 // Instruction is 1ulp but ignores denormals.
12813 return DAG.getNode(
12814 ISD::INTRINSIC_WO_CHAIN, DL, VT,
12815 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
12816 }
12817
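// Scaling sketch: if x < 2^-96 the intermediate results may be flushed as
// denormals, so compute sqrt(x * 2^32) instead and multiply the result by
// 2^-16 afterwards, since sqrt(x * 2^32) == sqrt(x) * 2^16.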
12818 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
12819 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
12820
12821 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
12822
12823 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
12824
12825 SDValue SqrtX =
12826 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
12827
12828 SDValue SqrtS;
12829 if (needsDenormHandlingF32(DAG, X, Flags)) {
12830 SDValue SqrtID =
12831 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
12832 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
12833
12834 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
12835 SDValue SqrtSNextDownInt =
12836 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12837 DAG.getAllOnesConstant(DL, MVT::i32));
12838 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
12839
12840 SDValue NegSqrtSNextDown =
12841 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
12842
12843 SDValue SqrtVP =
12844 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
12845
12846 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12847 DAG.getConstant(1, DL, MVT::i32));
12848 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
12849
12850 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
12851 SDValue SqrtVS =
12852 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
12853
12854 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
12855 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
12856
12857 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
12858 Flags);
12859
12860 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
12861 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
12862 Flags);
12863 } else {
12864 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
12865
12866 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
12867
12868 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
12869 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
12870 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
12871
12872 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
12873 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
12874 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
12875
12876 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
12877 SDValue SqrtD =
12878 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
12879 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
12880 }
12881
12882 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
12883
12884 SDValue ScaledDown =
12885 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
12886
12887 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
12888 SDValue IsZeroOrInf =
12889 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12890 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12891
12892 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
12893}
12894
12895SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
12896 // For double type, the SQRT and RSQ instructions don't have required
12897 // precision, we apply Goldschmidt's algorithm to improve the result:
12898 //
12899 // y0 = rsq(x)
12900 // g0 = x * y0
12901 // h0 = 0.5 * y0
12902 //
12903 // r0 = 0.5 - h0 * g0
12904 // g1 = g0 * r0 + g0
12905 // h1 = h0 * r0 + h0
12906 //
12907 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
12908 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
12909 // h2 = h1 * r1 + h1
12910 //
12911 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
12912 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
12913 //
12914 // sqrt(x) = g3
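 // Throughout the iteration g converges to sqrt(x) and h to 1/(2*sqrt(x));
 // each r (or d) term measures the remaining error, so every fused
 // multiply-add step roughly doubles the number of correct bits.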
12915
12916 SDNodeFlags Flags = Op->getFlags();
12917
12918 SDLoc DL(Op);
12919
12920 SDValue X = Op.getOperand(0);
12921 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
12922
12923 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
12924
12925 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
12926
12927 // Scale up input if it is too small.
12928 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
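 // Scaling by 2^256 only adjusts the exponent, and since
 // sqrt(x * 2^256) == sqrt(x) * 2^128, the matching rescale below is an
 // ldexp by -128.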
12929 SDValue ScaleUp =
12930 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
12931 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
12932
12933 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
12934
12935 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
12936
12937 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
12938 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
12939
12940 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
12941 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
12942
12943 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
12944
12945 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
12946
12947 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
12948 SDValue SqrtD0 =
12949 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
12950
12951 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
12952
12953 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
12954 SDValue SqrtD1 =
12955 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
12956
12957 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
12958
12959 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
12960 SDValue ScaleDown =
12961 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
12962 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
12963
12964 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
12965 // with finite only or nsz because rsq(+/-0) = +/-inf
12966
12967 // TODO: Check for DAZ and expand to subnormals
12968 SDValue IsZeroOrInf =
12969 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12970 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12971
12972 // If x is +INF, +0, or -0, use its original value
12973 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
12974 Flags);
12975}
12976
12977SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
12978 SDLoc DL(Op);
12979 EVT VT = Op.getValueType();
12980 SDValue Arg = Op.getOperand(0);
12981 SDValue TrigVal;
12982
12983 // Propagate fast-math flags so that the multiply we introduce can be folded
12984 // if Arg is already the result of a multiply by constant.
12985 auto Flags = Op->getFlags();
12986
12987 // AMDGPUISD nodes of vector type must be unrolled here since
12988 // they will not be expanded elsewhere.
12989 auto UnrollIfVec = [&DAG](SDValue V) -> SDValue {
12990 if (!V.getValueType().isVector())
12991 return V;
12992
12993 return DAG.UnrollVectorOp(cast<SDNode>(V));
12994 };
12995
12996 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
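 // The hardware sin/cos take their argument in revolutions rather than
 // radians, hence the multiply by 1/(2*pi). On subtargets with a reduced
 // argument range the value is additionally wrapped into [0, 1) with FRACT.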
12997
12998 if (Subtarget->hasTrigReducedRange()) {
12999 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
13000 TrigVal = UnrollIfVec(DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags));
13001 } else {
13002 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
13003 }
13004
13005 switch (Op.getOpcode()) {
13006 case ISD::FCOS:
13007 TrigVal = DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
13008 break;
13009 case ISD::FSIN:
13010 TrigVal = DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
13011 break;
13012 default:
13013 llvm_unreachable("Wrong trig opcode");
13014 }
13015
13016 return UnrollIfVec(TrigVal);
13017}
13018
13019SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
13020 SelectionDAG &DAG) const {
13021 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
13022 assert(AtomicNode->isCompareAndSwap());
13023 unsigned AS = AtomicNode->getAddressSpace();
13024
13025 // No custom lowering required for local address space
13026 if (!AMDGPU::isFlatGlobalAddrSpace(AS))
13027 return Op;
13028
13029 // Non-local address space requires custom lowering for atomic compare
13030 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
13031 SDLoc DL(Op);
13032 SDValue ChainIn = Op.getOperand(0);
13033 SDValue Addr = Op.getOperand(1);
13034 SDValue Old = Op.getOperand(2);
13035 SDValue New = Op.getOperand(3);
13036 EVT VT = Op.getValueType();
13037 MVT SimpleVT = VT.getSimpleVT();
13038 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
13039
13040 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
13041 SDValue Ops[] = {ChainIn, Addr, NewOld};
13042
13043 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
13044 Op->getVTList(), Ops, VT,
13045 AtomicNode->getMemOperand());
13046}
13047
13048//===----------------------------------------------------------------------===//
13049// Custom DAG optimizations
13050//===----------------------------------------------------------------------===//
13051
13052SDValue
13053SITargetLowering::performUCharToFloatCombine(SDNode *N,
13054 DAGCombinerInfo &DCI) const {
13055 EVT VT = N->getValueType(0);
13056 EVT ScalarVT = VT.getScalarType();
13057 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
13058 return SDValue();
13059
13060 SelectionDAG &DAG = DCI.DAG;
13061 SDLoc DL(N);
13062
13063 SDValue Src = N->getOperand(0);
13064 EVT SrcVT = Src.getValueType();
13065
13066 // TODO: We could try to match extracting the higher bytes, which would be
13067 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
13068 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
13069 // about in practice.
13070 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
13071 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
13072 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
13073 DCI.AddToWorklist(Cvt.getNode());
13074
13075 // For the f16 case, fold to a cast to f32 and then cast back to f16.
13076 if (ScalarVT != MVT::f32) {
13077 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
13078 DAG.getTargetConstant(0, DL, MVT::i32));
13079 }
13080 return Cvt;
13081 }
13082 }
13083
13084 return SDValue();
13085}
13086
13087SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
13088 DAGCombinerInfo &DCI) const {
13089 SDValue MagnitudeOp = N->getOperand(0);
13090 SDValue SignOp = N->getOperand(1);
13091
13092 // The generic combine for fcopysign + fp cast is too conservative with
13093 // vectors, and also gets confused by the splitting we will perform here, so
13094 // peek through FP casts.
13095 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
13096 SignOp.getOpcode() == ISD::FP_ROUND)
13097 SignOp = SignOp.getOperand(0);
13098
13099 SelectionDAG &DAG = DCI.DAG;
13100 SDLoc DL(N);
13101 EVT SignVT = SignOp.getValueType();
13102
13103 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
13104 // lower half with a copy.
13105 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
13106 EVT MagVT = MagnitudeOp.getValueType();
13107
13108 unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
13109
13110 if (MagVT.getScalarType() == MVT::f64) {
13111 EVT F32VT = MagVT.isVector()
13112 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
13113 : MVT::v2f32;
13114
13115 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
13116
13117 SmallVector<SDValue, 8> NewElts;
13118 for (unsigned I = 0; I != NumElts; ++I) {
13119 SDValue MagLo =
13120 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
13121 DAG.getConstant(2 * I, DL, MVT::i32));
13122 SDValue MagHi =
13123 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
13124 DAG.getConstant(2 * I + 1, DL, MVT::i32));
13125
13126 SDValue SignOpElt =
13127 MagVT.isVector()
13128 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(),
13129 SignOp, DAG.getConstant(I, DL, MVT::i32))
13130 : SignOp;
13131
13132 SDValue HiOp =
13133 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
13134
13135 SDValue Vector =
13136 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
13137
13138 SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
13139 NewElts.push_back(NewElt);
13140 }
13141
13142 if (NewElts.size() == 1)
13143 return NewElts[0];
13144
13145 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
13146 }
13147
13148 if (SignVT.getScalarType() != MVT::f64)
13149 return SDValue();
13150
13151 // Reduce width of sign operand, we only need the highest bit.
13152 //
13153 // fcopysign f64:x, f64:y ->
13154 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
13155 // TODO: In some cases it might make sense to go all the way to f16.
13156
13157 EVT F32VT = MagVT.isVector()
13158 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
13159 : MVT::v2f32;
13160
13161 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
13162
13163 SmallVector<SDValue, 8> F32Signs;
13164 for (unsigned I = 0; I != NumElts; ++I) {
13165 // Take sign from odd elements of cast vector
13166 SDValue SignAsF32 =
13167 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
13168 DAG.getConstant(2 * I + 1, DL, MVT::i32));
13169 F32Signs.push_back(SignAsF32);
13170 }
13171
13172 SDValue NewSign =
13173 NumElts == 1
13174 ? F32Signs.back()
13175 : DAG.getNode(ISD::BUILD_VECTOR, DL,
13176 EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
13177 F32Signs);
13178
13179 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
13180 NewSign);
13181}
13182
13183// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
13184// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
13185// bits
13186
13187// This is a variant of
13188// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
13189//
13190// The normal DAG combiner will do this, but only if the add has one use since
13191// that would increase the number of instructions.
13192//
13193// This prevents us from seeing a constant offset that can be folded into a
13194// memory instruction's addressing mode. If we know the resulting add offset of
13195// a pointer can be folded into an addressing offset, we can replace the pointer
13196// operand with the add of new constant offset. This eliminates one of the uses,
13197// and may allow the remaining use to also be simplified.
13198//
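// For example, (shl (add x, 4), 2) used as an address becomes
// (add (shl x, 2), 16), and the constant 16 can then be folded into the
// memory instruction's immediate offset (subject to isLegalAddressingMode).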
13199SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
13200 EVT MemVT,
13201 DAGCombinerInfo &DCI) const {
13202 SDValue N0 = N->getOperand(0);
13203 SDValue N1 = N->getOperand(1);
13204
13205 // We only do this to handle cases where it's profitable when there are
13206 // multiple uses of the add, so defer to the standard combine.
13207 if ((!N0->isAnyAdd() && N0.getOpcode() != ISD::OR) || N0->hasOneUse())
13208 return SDValue();
13209
13210 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
13211 if (!CN1)
13212 return SDValue();
13213
13214 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
13215 if (!CAdd)
13216 return SDValue();
13217
13218 SelectionDAG &DAG = DCI.DAG;
13219
13220 if (N0->getOpcode() == ISD::OR &&
13221 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
13222 return SDValue();
13223
13224 // If the resulting offset is too large, we can't fold it into the
13225 // addressing mode offset.
13226 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
13227 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
13228
13229 AddrMode AM;
13230 AM.HasBaseReg = true;
13231 AM.BaseOffs = Offset.getSExtValue();
13232 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
13233 return SDValue();
13234
13235 SDLoc SL(N);
13236 EVT VT = N->getValueType(0);
13237
13238 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
13239 SDValue COffset = DAG.getConstant(Offset, SL, VT);
13240
13241 SDNodeFlags Flags;
13242 Flags.setNoUnsignedWrap(
13243 N->getFlags().hasNoUnsignedWrap() &&
13244 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
13245
13246 // Use ISD::ADD even if the original operation was ISD::PTRADD, since we can't
13247 // be sure that the new left operand is a proper base pointer.
13248 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
13249}
13250
13251/// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset
13252/// by the chain and intrinsic ID. Theoretically we would also need to check the
13253/// specific intrinsic, but they all place the pointer operand first.
13254static unsigned getBasePtrIndex(const MemSDNode *N) {
13255 switch (N->getOpcode()) {
13256 case ISD::STORE:
13257 case ISD::INTRINSIC_W_CHAIN:
13258 case ISD::INTRINSIC_VOID:
13259 return 2;
13260 default:
13261 return 1;
13262 }
13263}
13264
13265SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
13266 DAGCombinerInfo &DCI) const {
13267 SelectionDAG &DAG = DCI.DAG;
13268
13269 unsigned PtrIdx = getBasePtrIndex(N);
13270 SDValue Ptr = N->getOperand(PtrIdx);
13271
13272 // TODO: We could also do this for multiplies.
13273 if (Ptr.getOpcode() == ISD::SHL) {
13274 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
13275 N->getMemoryVT(), DCI);
13276 if (NewPtr) {
13277 SmallVector<SDValue, 8> NewOps(N->ops());
13278
13279 NewOps[PtrIdx] = NewPtr;
13280 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
13281 }
13282 }
13283
13284 return SDValue();
13285}
13286
13287static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
13288 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13289 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13290 (Opc == ISD::XOR && Val == 0);
13291}
13292
13293// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
13294// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
13295// integer combine opportunities since most 64-bit operations are decomposed
13296// this way. TODO: We won't want this for SALU especially if it is an inline
13297// immediate.
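// For example, (and i64:x, 0x00000000ffffffff) splits into
// (and lo_32(x), 0xffffffff) and (and hi_32(x), 0), which fold to lo_32(x)
// and 0 respectively, avoiding a materialized 64-bit mask.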
13298SDValue SITargetLowering::splitBinaryBitConstantOp(
13299 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
13300 const ConstantSDNode *CRHS) const {
13301 uint64_t Val = CRHS->getZExtValue();
13302 uint32_t ValLo = Lo_32(Val);
13303 uint32_t ValHi = Hi_32(Val);
13304 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13305
13306 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
13307 bitOpWithConstantIsReducible(Opc, ValHi)) ||
13308 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
13309 // We have 64-bit scalar and/or/xor, but do not have vector forms.
13310 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
13311 !CRHS->user_begin()->isDivergent())
13312 return SDValue();
13313
13314 // If we need to materialize a 64-bit immediate, it will be split up later
13315 // anyway. Avoid creating the harder to understand 64-bit immediate
13316 // materialization.
13317 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
13318 }
13319
13320 return SDValue();
13321}
13322
13323bool llvm::isBoolSGPR(SDValue V) {
13324 if (V.getValueType() != MVT::i1)
13325 return false;
13326 switch (V.getOpcode()) {
13327 default:
13328 break;
13329 case ISD::SETCC:
13330 case ISD::IS_FPCLASS:
13331 case AMDGPUISD::FP_CLASS:
13332 return true;
13333 case ISD::AND:
13334 case ISD::OR:
13335 case ISD::XOR:
13336 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
13337 case ISD::SADDO:
13338 case ISD::UADDO:
13339 case ISD::SSUBO:
13340 case ISD::USUBO:
13341 case ISD::SMULO:
13342 case ISD::UMULO:
13343 return V.getResNo() == 1;
13344 case ISD::INTRINSIC_WO_CHAIN: {
13345 unsigned IntrinsicID = V.getConstantOperandVal(0);
13346 switch (IntrinsicID) {
13347 case Intrinsic::amdgcn_is_shared:
13348 case Intrinsic::amdgcn_is_private:
13349 return true;
13350 default:
13351 return false;
13352 }
13353
13354 return false;
13355 }
13356 }
13357 return false;
13358}
13359
13360// If a constant has all zeroes or all ones within each byte return it.
13361// Otherwise return 0.
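// For example, 0x00ff00ff is returned unchanged (every byte is 0x00 or 0xff),
// while 0x12340000 returns 0 because its upper bytes are only partially set.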
13362static uint32_t getConstantPermuteMask(uint32_t C) {
13363 // 0xff for any zero byte in the mask
13364 uint32_t ZeroByteMask = 0;
13365 if (!(C & 0x000000ff))
13366 ZeroByteMask |= 0x000000ff;
13367 if (!(C & 0x0000ff00))
13368 ZeroByteMask |= 0x0000ff00;
13369 if (!(C & 0x00ff0000))
13370 ZeroByteMask |= 0x00ff0000;
13371 if (!(C & 0xff000000))
13372 ZeroByteMask |= 0xff000000;
13373 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
13374 if ((NonZeroByteMask & C) != NonZeroByteMask)
13375 return 0; // Partial bytes selected.
13376 return C;
13377}
13378
13379// Check if a node selects whole bytes from its operand 0 starting at a byte
13380// boundary while masking the rest. Returns the select mask as used by
13381// v_perm_b32, or ~0 if it does not.
13382// Note byte select encoding:
13383// value 0-3 selects corresponding source byte;
13384// value 0xc selects zero;
13385// value 0xff selects 0xff.
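// For example, (shl x, 16) yields 0x01000c0c: the two high result bytes take
// source bytes 1 and 0, and the two low bytes select zero.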
13386static uint32_t getPermuteMask(SDValue V) {
13387 assert(V.getValueSizeInBits() == 32);
13388
13389 if (V.getNumOperands() != 2)
13390 return ~0;
13391
13392 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
13393 if (!N1)
13394 return ~0;
13395
13396 uint32_t C = N1->getZExtValue();
13397
13398 switch (V.getOpcode()) {
13399 default:
13400 break;
13401 case ISD::AND:
13402 if (uint32_t ConstMask = getConstantPermuteMask(C))
13403 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13404 break;
13405
13406 case ISD::OR:
13407 if (uint32_t ConstMask = getConstantPermuteMask(C))
13408 return (0x03020100 & ~ConstMask) | ConstMask;
13409 break;
13410
13411 case ISD::SHL:
13412 if (C % 8)
13413 return ~0;
13414
13415 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
13416
13417 case ISD::SRL:
13418 if (C % 8)
13419 return ~0;
13420
13421 return uint32_t(0x0c0c0c0c03020100ull >> C);
13422 }
13423
13424 return ~0;
13425}
13426
13427SDValue SITargetLowering::performAndCombine(SDNode *N,
13428 DAGCombinerInfo &DCI) const {
13429 if (DCI.isBeforeLegalize())
13430 return SDValue();
13431
13432 SelectionDAG &DAG = DCI.DAG;
13433 EVT VT = N->getValueType(0);
13434 SDValue LHS = N->getOperand(0);
13435 SDValue RHS = N->getOperand(1);
13436
13437 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
13438 if (VT == MVT::i64 && CRHS) {
13439 if (SDValue Split =
13440 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
13441 return Split;
13442 }
13443
13444 if (CRHS && VT == MVT::i32) {
13445 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
13446 // nb = number of trailing zeroes in mask
13447 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
13448 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
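 // For example, (and (srl x, 8), 0xff00) has Bits == 8, Shift == 8 and
 // NB == 8, so it becomes (shl (bfe x, 16, 8), 8): an 8-bit field extracted
 // at bit 16 and shifted back into position.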
13449 uint64_t Mask = CRHS->getZExtValue();
13450 unsigned Bits = llvm::popcount(Mask);
13451 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
13452 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
13453 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
13454 unsigned Shift = CShift->getZExtValue();
13455 unsigned NB = CRHS->getAPIntValue().countr_zero();
13456 unsigned Offset = NB + Shift;
13457 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
13458 SDLoc SL(N);
13459 SDValue BFE =
13460 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
13461 DAG.getConstant(Offset, SL, MVT::i32),
13462 DAG.getConstant(Bits, SL, MVT::i32));
13463 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
13464 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
13465 DAG.getValueType(NarrowVT));
13466 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
13467 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
13468 return Shl;
13469 }
13470 }
13471 }
13472
13473 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13474 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
13475 isa<ConstantSDNode>(LHS.getOperand(2))) {
13476 uint32_t Sel = getConstantPermuteMask(Mask);
13477 if (!Sel)
13478 return SDValue();
13479
13480 // Select 0xc for all zero bytes
13481 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
13482 SDLoc DL(N);
13483 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13484 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13485 }
13486 }
13487
13488 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
13489 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
13490 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
13491 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13492 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
13493
13494 SDValue X = LHS.getOperand(0);
13495 SDValue Y = RHS.getOperand(0);
13496 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
13497 !isTypeLegal(X.getValueType()))
13498 return SDValue();
13499
13500 if (LCC == ISD::SETO) {
13501 if (X != LHS.getOperand(1))
13502 return SDValue();
13503
13504 if (RCC == ISD::SETUNE) {
13505 const ConstantFPSDNode *C1 =
13506 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
13507 if (!C1 || !C1->isInfinity() || C1->isNegative())
13508 return SDValue();
13509
13510 const uint32_t Mask = SIInstrFlags::N_NORMAL |
13511 SIInstrFlags::N_SUBNORMAL | SIInstrFlags::N_ZERO |
13512 SIInstrFlags::P_ZERO | SIInstrFlags::P_SUBNORMAL |
13513 SIInstrFlags::P_NORMAL;
13514
13515 static_assert(
13516 ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
13517 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
13518 0x3ff) == Mask,
13519 "mask not equal");
13520
13521 SDLoc DL(N);
13522 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
13523 DAG.getConstant(Mask, DL, MVT::i32));
13524 }
13525 }
13526 }
13527
13528 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
13529 std::swap(LHS, RHS);
13530
13531 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13532 RHS.hasOneUse()) {
13533 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13534 // and (fcmp seto), (fp_class x, mask) ->
13535 //   fp_class x, mask & ~(p_nan | n_nan)
13536 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
13537 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13538 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
13539 (RHS.getOperand(0) == LHS.getOperand(0) &&
13540 LHS.getOperand(0) == LHS.getOperand(1))) {
13541 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
13542 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
13543 : Mask->getZExtValue() & OrdMask;
13544
13545 SDLoc DL(N);
13546 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
13547 DAG.getConstant(NewMask, DL, MVT::i32));
13548 }
13549 }
13550
13551 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
13552 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
13553 // and x, (sext cc from i1) => select cc, x, 0
13554 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
13555 std::swap(LHS, RHS);
13556 if (isBoolSGPR(RHS.getOperand(0)))
13557 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
13558 DAG.getConstant(0, SDLoc(N), MVT::i32));
13559 }
13560
13561 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13562 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13563 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13564 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13565 uint32_t LHSMask = getPermuteMask(LHS);
13566 uint32_t RHSMask = getPermuteMask(RHS);
13567 if (LHSMask != ~0u && RHSMask != ~0u) {
13568 // Canonicalize the expression in an attempt to have fewer unique masks
13569 // and therefore fewer registers used to hold the masks.
13570 if (LHSMask > RHSMask) {
13571 std::swap(LHSMask, RHSMask);
13572 std::swap(LHS, RHS);
13573 }
13574
13575 // Select 0xc for each lane used from a source operand. Zero bytes have 0xc
13576 // in the mask, 0xff bytes have 0xff, and actual lanes are in the 0-3 range.
13577 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13578 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13579
13580 // Check if we need to combine values from two sources within a byte.
13581 if (!(LHSUsedLanes & RHSUsedLanes) &&
13582 // If we select high and lower word keep it for SDWA.
13583 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
13584 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13585 // Each byte in each mask is either selector mask 0-3, or has higher
13586 // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
13587 // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
13588 // mask which is not 0xff wins. By anding both masks we have a correct
13589 // result except that 0x0c shall be corrected to give 0x0c only.
13590 uint32_t Mask = LHSMask & RHSMask;
13591 for (unsigned I = 0; I < 32; I += 8) {
13592 uint32_t ByteSel = 0xff << I;
13593 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13594 Mask &= (0x0c << I) & 0xffffffff;
13595 }
13596
13597 // Add 4 to each active LHS lane. It will not affect any existing 0xff
13598 // or 0x0c.
13599 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
13600 SDLoc DL(N);
13601
13602 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13603 RHS.getOperand(0),
13604 DAG.getConstant(Sel, DL, MVT::i32));
13605 }
13606 }
13607 }
13608
13609 return SDValue();
13610}
13611
13612// A key component of v_perm is a mapping between byte position of the src
13613// operands, and the byte position of the dest. To provide such, we need: 1. the
13614// node that provides byte x of the dest of the OR, and 2. the byte of the node
13615// used to provide that byte. calculateByteProvider finds which node provides
13616// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
13617// and finds an ultimate src and byte position. For example, the supported
13618// LoadCombine pattern for vector loads is as follows
13619// t1
13620// or
13621// / \
13622// t2 t3
13623// zext shl
13624// | | \
13625// t4 t5 16
13626// or anyext
13627// / \ |
13628// t6 t7 t8
13629// srl shl or
13630// / | / \ / \
13631// t9 t10 t11 t12 t13 t14
13632// trunc* 8 trunc* 8 and and
13633// | | / | | \
13634// t15 t16 t17 t18 t19 t20
13635// trunc* 255 srl -256
13636// | / \
13637// t15 t15 16
13638//
13639// *In this example, the truncs are from i32->i16
13640//
13641// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
13642// respectively. calculateSrcByte would find (given node) -> ultimate src &
13643// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
13644// After finding the mapping, we can combine the tree into vperm t15, t16,
13645// 0x05000407
13646
13647// Find the source and byte position from a node.
13648// \p DestByte is the byte position of the dest of the or that the src
13649// ultimately provides. \p SrcIndex is the byte of the src that maps to this
13650// dest of the or byte. \p Depth tracks how many recursive iterations we have
13651// performed.
13652static const std::optional<ByteProvider<SDValue>>
13653calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
13654 unsigned Depth = 0) {
13655 // We may need to recursively traverse a series of SRLs
13656 if (Depth >= 6)
13657 return std::nullopt;
13658
13659 if (Op.getValueSizeInBits() < 8)
13660 return std::nullopt;
13661
13662 if (Op.getValueType().isVector())
13663 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13664
13665 switch (Op->getOpcode()) {
13666 case ISD::TRUNCATE: {
13667 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13668 }
13669
13670 case ISD::SIGN_EXTEND:
13671 case ISD::ZERO_EXTEND:
13672 case ISD::SIGN_EXTEND_INREG: {
13673 SDValue NarrowOp = Op->getOperand(0);
13674 auto NarrowVT = NarrowOp.getValueType();
13675 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
13676 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13677 NarrowVT = VTSign->getVT();
13678 }
13679 if (!NarrowVT.isByteSized())
13680 return std::nullopt;
13681 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
13682
13683 if (SrcIndex >= NarrowByteWidth)
13684 return std::nullopt;
13685 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13686 }
13687
13688 case ISD::SRA:
13689 case ISD::SRL: {
13690 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13691 if (!ShiftOp)
13692 return std::nullopt;
13693
13694 uint64_t BitShift = ShiftOp->getZExtValue();
13695
13696 if (BitShift % 8 != 0)
13697 return std::nullopt;
13698
13699 SrcIndex += BitShift / 8;
13700
13701 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13702 }
13703
13704 default: {
13705 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13706 }
13707 }
13708 llvm_unreachable("fully handled switch");
13709}
13710
13711// For a byte position in the result of an Or, traverse the tree and find the
13712// node (and the byte of the node) which ultimately provides this {Or,
13713// BytePosition}. \p Op is the operand we are currently examining. \p Index is
13714// the byte position of the Op that corresponds with the originally requested
13715// byte of the Or \p Depth tracks how many recursive iterations we have
13716// performed. \p StartingIndex is the originally requested byte of the Or
13717static const std::optional<ByteProvider<SDValue>>
13718calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
13719 unsigned StartingIndex = 0) {
13720 // Finding Src tree of RHS of or typically requires at least 1 additional
13721 // depth
13722 if (Depth > 6)
13723 return std::nullopt;
13724
13725 unsigned BitWidth = Op.getScalarValueSizeInBits();
13726 if (BitWidth % 8 != 0)
13727 return std::nullopt;
13728 if (Index > BitWidth / 8 - 1)
13729 return std::nullopt;
13730
13731 bool IsVec = Op.getValueType().isVector();
13732 switch (Op.getOpcode()) {
13733 case ISD::OR: {
13734 if (IsVec)
13735 return std::nullopt;
13736
13737 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
13738 StartingIndex);
13739 if (!RHS)
13740 return std::nullopt;
13741 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13742 StartingIndex);
13743 if (!LHS)
13744 return std::nullopt;
13745 // A well formed Or will have two ByteProviders for each byte, one of which
13746 // is constant zero
13747 if (!LHS->isConstantZero() && !RHS->isConstantZero())
13748 return std::nullopt;
13749 if (!LHS || LHS->isConstantZero())
13750 return RHS;
13751 if (!RHS || RHS->isConstantZero())
13752 return LHS;
13753 return std::nullopt;
13754 }
13755
13756 case ISD::AND: {
13757 if (IsVec)
13758 return std::nullopt;
13759
13760 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13761 if (!BitMaskOp)
13762 return std::nullopt;
13763
13764 uint32_t BitMask = BitMaskOp->getZExtValue();
13765 // Bits we expect for our StartingIndex
13766 uint32_t IndexMask = 0xFF << (Index * 8);
13767
13768 if ((IndexMask & BitMask) != IndexMask) {
13769 // If the result of the and partially provides the byte, then it
13770 // is not well formatted
13771 if (IndexMask & BitMask)
13772 return std::nullopt;
13773 return ByteProvider<SDValue>::getConstantZero();
13774 }
13775
13776 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
13777 }
13778
13779 case ISD::FSHR: {
13780 if (IsVec)
13781 return std::nullopt;
13782
13783 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
13784 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13785 if (!ShiftOp || Op.getValueType().isVector())
13786 return std::nullopt;
13787
13788 uint64_t BitsProvided = Op.getValueSizeInBits();
13789 if (BitsProvided % 8 != 0)
13790 return std::nullopt;
13791
13792 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13793 if (BitShift % 8)
13794 return std::nullopt;
13795
13796 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13797 uint64_t ByteShift = BitShift / 8;
13798
13799 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13800 uint64_t BytesProvided = BitsProvided / 8;
13801 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13802 NewIndex %= BytesProvided;
13803 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
13804 }
13805
13806 case ISD::SRA:
13807 case ISD::SRL: {
13808 if (IsVec)
13809 return std::nullopt;
13810
13811 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13812 if (!ShiftOp)
13813 return std::nullopt;
13814
13815 uint64_t BitShift = ShiftOp->getZExtValue();
13816 if (BitShift % 8)
13817 return std::nullopt;
13818
13819 auto BitsProvided = Op.getScalarValueSizeInBits();
13820 if (BitsProvided % 8 != 0)
13821 return std::nullopt;
13822
13823 uint64_t BytesProvided = BitsProvided / 8;
13824 uint64_t ByteShift = BitShift / 8;
13825 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
13826 // If the byte we are trying to provide (as tracked by index) falls in this
13827 // range, then the SRL provides the byte. The byte of interest of the src of
13828 // the SRL is Index + ByteShift
13829 return BytesProvided - ByteShift > Index
13830 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
13831 Index + ByteShift)
13832 : ByteProvider<SDValue>::getConstantZero();
13833 }
13834
13835 case ISD::SHL: {
13836 if (IsVec)
13837 return std::nullopt;
13838
13839 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13840 if (!ShiftOp)
13841 return std::nullopt;
13842
13843 uint64_t BitShift = ShiftOp->getZExtValue();
13844 if (BitShift % 8 != 0)
13845 return std::nullopt;
13846 uint64_t ByteShift = BitShift / 8;
13847
13848 // If we are shifting by an amount greater than (or equal to)
13849 // the index we are trying to provide, then it provides 0s. If not,
13850 // then this bytes are not definitively 0s, and the corresponding byte
13851 // of interest is Index - ByteShift of the src
13852 return Index < ByteShift
13853 ? ByteProvider<SDValue>::getConstantZero()
13854 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
13855 Depth + 1, StartingIndex);
13856 }
13857 case ISD::ANY_EXTEND:
13858 case ISD::SIGN_EXTEND:
13859 case ISD::ZERO_EXTEND:
13860 case ISD::SIGN_EXTEND_INREG:
13861 case ISD::AssertZext:
13862 case ISD::AssertSext: {
13863 if (IsVec)
13864 return std::nullopt;
13865
13866 SDValue NarrowOp = Op->getOperand(0);
13867 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
13868 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
13869 Op->getOpcode() == ISD::AssertZext ||
13870 Op->getOpcode() == ISD::AssertSext) {
13871 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13872 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13873 }
13874 if (NarrowBitWidth % 8 != 0)
13875 return std::nullopt;
13876 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13877
13878 if (Index >= NarrowByteWidth)
13879 return Op.getOpcode() == ISD::ZERO_EXTEND
13880 ? std::optional<ByteProvider<SDValue>>(
13881 ByteProvider<SDValue>::getConstantZero())
13882 : std::nullopt;
13883 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
13884 }
13885
13886 case ISD::TRUNCATE: {
13887 if (IsVec)
13888 return std::nullopt;
13889
13890 uint64_t NarrowByteWidth = BitWidth / 8;
13891
13892 if (NarrowByteWidth >= Index) {
13893 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13894 StartingIndex);
13895 }
13896
13897 return std::nullopt;
13898 }
13899
13900 case ISD::CopyFromReg: {
13901 if (BitWidth / 8 > Index)
13902 return calculateSrcByte(Op, StartingIndex, Index);
13903
13904 return std::nullopt;
13905 }
13906
13907 case ISD::LOAD: {
13908 auto *L = cast<LoadSDNode>(Op.getNode());
13909
13910 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13911 if (NarrowBitWidth % 8 != 0)
13912 return std::nullopt;
13913 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13914
13915 // If the width of the load does not reach the byte we are trying to provide
13916 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
13917 // question.
13918 if (Index >= NarrowByteWidth) {
13919 return L->getExtensionType() == ISD::ZEXTLOAD
13920 ? std::optional<ByteProvider<SDValue>>(
13921 ByteProvider<SDValue>::getConstantZero())
13922 : std::nullopt;
13923 }
13924
13925 if (NarrowByteWidth > Index) {
13926 return calculateSrcByte(Op, StartingIndex, Index);
13927 }
13928
13929 return std::nullopt;
13930 }
13931
13932 case ISD::BSWAP: {
13933 if (IsVec)
13934 return std::nullopt;
13935
13936 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
13937 Depth + 1, StartingIndex);
13938 }
13939
13940 case ISD::EXTRACT_VECTOR_ELT: {
13941 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13942 if (!IdxOp)
13943 return std::nullopt;
13944 auto VecIdx = IdxOp->getZExtValue();
13945 auto ScalarSize = Op.getScalarValueSizeInBits();
13946 if (ScalarSize < 32)
13947 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13948 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
13949 StartingIndex, Index);
13950 }
13951
13952 case AMDGPUISD::PERM: {
13953 if (IsVec)
13954 return std::nullopt;
13955
13956 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13957 if (!PermMask)
13958 return std::nullopt;
13959
13960 auto IdxMask =
13961 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13962 if (IdxMask > 0x07 && IdxMask != 0x0c)
13963 return std::nullopt;
13964
13965 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13966 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13967
13968 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
13969 : ByteProvider<SDValue>(
13970 ByteProvider<SDValue>::getConstantZero());
13971 }
13972
13973 default: {
13974 return std::nullopt;
13975 }
13976 }
13977
13978 llvm_unreachable("fully handled switch");
13979}
13980
13981// Returns true if the Operand is a scalar and is 16 bits
13982static bool isExtendedFrom16Bits(SDValue &Operand) {
13983
13984 switch (Operand.getOpcode()) {
13985 case ISD::ANY_EXTEND:
13986 case ISD::SIGN_EXTEND:
13987 case ISD::ZERO_EXTEND: {
13988 auto OpVT = Operand.getOperand(0).getValueType();
13989 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
13990 }
13991 case ISD::LOAD: {
13992 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
13993 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
13994 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
13995 ExtType == ISD::EXTLOAD) {
13996 auto MemVT = L->getMemoryVT();
13997 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
13998 }
13999 return L->getMemoryVT().getSizeInBits() == 16;
14000 }
14001 default:
14002 return false;
14003 }
14004}
14005
14006// Returns true if the mask matches consecutive bytes, and the first byte
14007 // begins at an even (16-bit aligned) byte offset from the 0th byte.
14008static bool addresses16Bits(int Mask) {
14009 int Low8 = Mask & 0xff;
14010 int Hi8 = (Mask & 0xff00) >> 8;
14011
14012 assert(Low8 < 8 && Hi8 < 8);
14013 // Are the bytes contiguous in the order of increasing addresses.
14014 bool IsConsecutive = (Hi8 - Low8 == 1);
14015 // Is the first byte at location that is aligned for 16 bit instructions.
14016 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
14017 // In this case, we still need code to extract the 16 bit operand, so it
14018 // is better to use i8 v_perm
14019 bool Is16Aligned = !(Low8 % 2);
14020
14021 return IsConsecutive && Is16Aligned;
14022}
14023
14024// Do not lower into v_perm if the operands are actually 16 bit
14025// and the selected bits (based on PermMask) correspond with two
14026// easily addressable 16 bit operands.
14027static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
14028 SDValue &OtherOp) {
14029 int Low16 = PermMask & 0xffff;
14030 int Hi16 = (PermMask & 0xffff0000) >> 16;
14031
14032 auto TempOp = peekThroughBitcasts(Op);
14033 auto TempOtherOp = peekThroughBitcasts(OtherOp);
14034
14035 auto OpIs16Bit =
14036 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
14037 if (!OpIs16Bit)
14038 return true;
14039
14040 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
14041 isExtendedFrom16Bits(TempOtherOp);
14042 if (!OtherOpIs16Bit)
14043 return true;
14044
14045 // Do we cleanly address both
14046 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
14047}
14048
14049static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
14050 unsigned DWordOffset) {
14051 SDValue Ret;
14052
14053 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
14054 // ByteProvider must be at least 8 bits
14055 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
14056
14057 if (TypeSize <= 32)
14058 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
14059
14060 if (Src.getValueType().isVector()) {
14061 auto ScalarTySize = Src.getScalarValueSizeInBits();
14062 auto ScalarTy = Src.getValueType().getScalarType();
14063 if (ScalarTySize == 32) {
14064 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
14065 DAG.getConstant(DWordOffset, SL, MVT::i32));
14066 }
14067 if (ScalarTySize > 32) {
14068 Ret = DAG.getNode(
14069 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
14070 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
14071 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
14072 if (ShiftVal)
14073 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
14074 DAG.getConstant(ShiftVal, SL, MVT::i32));
14075 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14076 }
14077
14078 assert(ScalarTySize < 32);
14079 auto NumElements = TypeSize / ScalarTySize;
14080 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
14081 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
14082 auto NumElementsIn32 = 32 / ScalarTySize;
14083 auto NumAvailElements = DWordOffset < Trunc32Elements
14084 ? NumElementsIn32
14085 : NumElements - NormalizedTrunc;
14086
14087 SmallVector<SDValue, 4> VecSrcs;
14088 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
14089 NumAvailElements);
14090
14091 Ret = DAG.getBuildVector(
14092 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
14093 VecSrcs);
14094 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14095 }
14096
14097 /// Scalar Type
14098 auto ShiftVal = 32 * DWordOffset;
14099 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
14100 DAG.getConstant(ShiftVal, SL, MVT::i32));
14101 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14102}
14103
14104static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
14105 SelectionDAG &DAG = DCI.DAG;
14106 [[maybe_unused]] EVT VT = N->getValueType(0);
14107 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
14108
14109 // VT is known to be MVT::i32, so we need to provide 4 bytes.
14110 assert(VT == MVT::i32);
14111 for (int i = 0; i < 4; i++) {
14112 // Find the ByteProvider that provides the ith byte of the result of OR
14113 std::optional<ByteProvider<SDValue>> P =
14114 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
14115 // TODO support constantZero
14116 if (!P || P->isConstantZero())
14117 return SDValue();
14118
14119 PermNodes.push_back(*P);
14120 }
14121 if (PermNodes.size() != 4)
14122 return SDValue();
14123
14124 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
14125 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
14126 uint64_t PermMask = 0x00000000;
14127 for (size_t i = 0; i < PermNodes.size(); i++) {
14128 auto PermOp = PermNodes[i];
14129 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
14130 // by sizeof(Src2) = 4
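 // e.g. a byte taken from byte 2 of the first source uses selector 6 (2 + 4),
 // while bytes taken from the second source use selectors 0-3 directly.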
14131 int SrcByteAdjust = 4;
14132
14133 // If the Src uses a byte from a different DWORD, then it corresponds
14134 // with a different source.
14135 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
14136 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
14137 if (SecondSrc)
14138 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
14139 ((PermOp.SrcOffset / 4) != SecondSrc->second))
14140 return SDValue();
14141
14142 // Set the index of the second distinct Src node
14143 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
14144 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
14145 SrcByteAdjust = 0;
14146 }
14147 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
14148 assert(!DAG.getDataLayout().isBigEndian());
14149 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
14150 }
14151 SDLoc DL(N);
14152 SDValue Op = *PermNodes[FirstSrc.first].Src;
14153 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
14154 assert(Op.getValueSizeInBits() == 32);
14155
14156 // Check that we are not just extracting the bytes in order from an op
14157 if (!SecondSrc) {
14158 int Low16 = PermMask & 0xffff;
14159 int Hi16 = (PermMask & 0xffff0000) >> 16;
14160
14161 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
14162 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
14163
14164 // The perm op would really just produce Op. So combine into Op
14165 if (WellFormedLow && WellFormedHi)
14166 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
14167 }
14168
14169 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
14170
14171 if (SecondSrc) {
14172 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
14173 assert(OtherOp.getValueSizeInBits() == 32);
14174 }
14175
14176 // Check that we haven't just recreated the same FSHR node.
14177 if (N->getOpcode() == ISD::FSHR &&
14178 (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) &&
14179 (N->getOperand(1) == Op || N->getOperand(1) == OtherOp))
14180 return SDValue();
14181
14182 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
14183
14184 assert(Op.getValueType().isByteSized() &&
14185 OtherOp.getValueType().isByteSized());
14186
14187 // If the ultimate src is less than 32 bits, then we will only be
14188 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
14189 // CalculateByteProvider would not have returned Op as source if we
14190 // used a byte that is outside its ValueType. Thus, we are free to
14191 // ANY_EXTEND as the extended bits are dont-cares.
14192 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
14193 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
14194
14195 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
14196 DAG.getConstant(PermMask, DL, MVT::i32));
14197 }
14198 return SDValue();
14199}
14200
14201SDValue SITargetLowering::performOrCombine(SDNode *N,
14202 DAGCombinerInfo &DCI) const {
14203 SelectionDAG &DAG = DCI.DAG;
14204 SDValue LHS = N->getOperand(0);
14205 SDValue RHS = N->getOperand(1);
14206
14207 EVT VT = N->getValueType(0);
14208 if (VT == MVT::i1) {
14209 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
14210 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
14211 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
14212 SDValue Src = LHS.getOperand(0);
14213 if (Src != RHS.getOperand(0))
14214 return SDValue();
14215
14216 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
14217 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
14218 if (!CLHS || !CRHS)
14219 return SDValue();
14220
14221 // Only 10 bits are used.
14222 static const uint32_t MaxMask = 0x3ff;
14223
14224 uint32_t NewMask =
14225 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
14226 SDLoc DL(N);
14227 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
14228 DAG.getConstant(NewMask, DL, MVT::i32));
14229 }
14230
14231 return SDValue();
14232 }
14233
14234 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
14235 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
14236 LHS.getOpcode() == AMDGPUISD::PERM &&
14237 isa<ConstantSDNode>(LHS.getOperand(2))) {
14238 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
14239 if (!Sel)
14240 return SDValue();
14241
14242 Sel |= LHS.getConstantOperandVal(2);
14243 SDLoc DL(N);
14244 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14245 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
14246 }
14247
14248 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
14249 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14250 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
14251 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14252
14253 // If all the uses of an or need to extract the individual elements, do not
14254 // attempt to lower into v_perm
14255 auto usesCombinedOperand = [](SDNode *OrUse) {
14256 // If we have any non-vectorized use, then it is a candidate for v_perm
14257 if (OrUse->getOpcode() != ISD::BITCAST ||
14258 !OrUse->getValueType(0).isVector())
14259 return true;
14260
14261 // If we have any non-vectorized use, then it is a candidate for v_perm
14262 for (auto *VUser : OrUse->users()) {
14263 if (!VUser->getValueType(0).isVector())
14264 return true;
14265
14266 // If the use of a vector is a store, then combining via a v_perm
14267 // is beneficial.
14268 // TODO -- whitelist more uses
14269 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
14270 if (VUser->getOpcode() == VectorwiseOp)
14271 return true;
14272 }
14273 return false;
14274 };
14275
14276 if (!any_of(N->users(), usesCombinedOperand))
14277 return SDValue();
14278
14279 uint32_t LHSMask = getPermuteMask(LHS);
14280 uint32_t RHSMask = getPermuteMask(RHS);
14281
14282 if (LHSMask != ~0u && RHSMask != ~0u) {
14283 // Canonicalize the expression in an attempt to have fewer unique masks
14284 // and therefore fewer registers used to hold the masks.
14285 if (LHSMask > RHSMask) {
14286 std::swap(LHSMask, RHSMask);
14287 std::swap(LHS, RHS);
14288 }
14289
14290 // Select 0xc for each lane used from a source operand. Zero bytes have 0xc
14291 // in the mask, 0xff bytes have 0xff, and actual lanes are in the 0-3 range.
14292 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14293 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14294
14295 // Check if we need to combine values from two sources within a byte.
14296 if (!(LHSUsedLanes & RHSUsedLanes) &&
14297 // If we select high and lower word keep it for SDWA.
14298 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
14299 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14300 // Kill zero bytes selected by other mask. Zero value is 0xc.
14301 LHSMask &= ~RHSUsedLanes;
14302 RHSMask &= ~LHSUsedLanes;
14303 // Add 4 to each active LHS lane
14304 LHSMask |= LHSUsedLanes & 0x04040404;
14305 // Combine masks
14306 uint32_t Sel = LHSMask | RHSMask;
14307 SDLoc DL(N);
14308
14309 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14310 RHS.getOperand(0),
14311 DAG.getConstant(Sel, DL, MVT::i32));
14312 }
14313 }
14314 if (LHSMask == ~0u || RHSMask == ~0u) {
14315 if (SDValue Perm = matchPERM(N, DCI))
14316 return Perm;
14317 }
14318 }
14319
14320 // Detect identity v2i32 OR and replace with identity source node.
14321 // Specifically an Or that has operands constructed from the same source node
14322 // via extract_vector_elt and build_vector, i.e.
14323 // v2i32 or(
14324 // v2i32 build_vector(
14325 // i32 extract_elt(%IdentitySrc, 0),
14326 // i32 0
14327 // ),
14328 // v2i32 build_vector(
14329 // i32 0,
14330 // i32 extract_elt(%IdentitySrc, 1)
14331 // ) )
14332 // =>
14333 // v2i32 %IdentitySrc
14334
14335 if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
14336 RHS->getOpcode() == ISD::BUILD_VECTOR) {
14337
14338 ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
14339 ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
14340
14341 // Test for and normalise build vectors.
14342 if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
14343
14344 // Get the extract_vector_element operands.
14345 SDValue LEVE = LHS->getOperand(0);
14346 SDValue REVE = RHS->getOperand(1);
14347
14348 if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
14349 REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
14350 // Check that different elements from the same vector are
14351 // extracted.
14352 if (LEVE->getOperand(0) == REVE->getOperand(0) &&
14353 LEVE->getOperand(1) != REVE->getOperand(1)) {
14354 SDValue IdentitySrc = LEVE.getOperand(0);
14355 return IdentitySrc;
14356 }
14357 }
14358 }
14359 }
14360
14361 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14362 return SDValue();
14363
14364 // TODO: This could be a generic combine with a predicate for extracting the
14365 // high half of an integer being free.
14366
14367 // (or i64:x, (zero_extend i32:y)) ->
14368 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
14369 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
14370 RHS.getOpcode() != ISD::ZERO_EXTEND)
14371 std::swap(LHS, RHS);
14372
14373 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
14374 SDValue ExtSrc = RHS.getOperand(0);
14375 EVT SrcVT = ExtSrc.getValueType();
14376 if (SrcVT == MVT::i32) {
14377 SDLoc SL(N);
14378 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
14379 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
14380
14381 DCI.AddToWorklist(LowOr.getNode());
14382 DCI.AddToWorklist(HiBits.getNode());
14383
14384 SDValue Vec =
14385 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
14386 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
14387 }
14388 }
14389
14390 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
14391 if (CRHS) {
14392 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
14393 N->getOperand(0), CRHS))
14394 return Split;
14395 }
14396
14397 return SDValue();
14398}
14399
14400SDValue SITargetLowering::performXorCombine(SDNode *N,
14401 DAGCombinerInfo &DCI) const {
14402 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
14403 return RV;
14404
14405 SDValue LHS = N->getOperand(0);
14406 SDValue RHS = N->getOperand(1);
14407
14408 const ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
14409 SelectionDAG &DAG = DCI.DAG;
14410
14411 EVT VT = N->getValueType(0);
14412 if (CRHS && VT == MVT::i64) {
14413 if (SDValue Split =
14414 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
14415 return Split;
14416 }
14417
14418 // v2i32 (xor (vselect cc, x, y), K) ->
14419 // (v2i32 vselect cc, (xor x, K), (xor y, K)). This enables the xor to be
14420 // replaced with source modifiers when the select is lowered to CNDMASK.
14421 unsigned Opc = LHS.getOpcode();
14422 if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
14423 (Opc == ISD::SELECT && VT == MVT::i64)) &&
14424 CRHS && CRHS->getAPIntValue().isSignMask()) {
14425 SDValue CC = LHS->getOperand(0);
14426 SDValue TRUE = LHS->getOperand(1);
14427 SDValue FALSE = LHS->getOperand(2);
14428 SDValue XTrue = DAG.getNode(ISD::XOR, SDLoc(N), VT, TRUE, RHS);
14429 SDValue XFalse = DAG.getNode(ISD::XOR, SDLoc(N), VT, FALSE, RHS);
14430 SDValue XSelect =
14431 DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse);
14432 return XSelect;
14433 }
14434
14435 // Make sure to apply the 64-bit constant splitting fold before trying to fold
14436 // fneg-like xors into 64-bit select.
14437 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
14438 // This looks like an fneg, try to fold as a source modifier.
14439 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
14440 shouldFoldFNegIntoSrc(N, LHS)) {
14441 // xor (select c, a, b), 0x80000000 ->
14442 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
14443 SDLoc DL(N);
14444 SDValue CastLHS =
14445 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
14446 SDValue CastRHS =
14447 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
14448 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
14449 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
14450 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
14451 LHS->getOperand(0), FNegLHS, FNegRHS);
14452 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
14453 }
14454 }
14455
14456 return SDValue();
14457}
14458
14459SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
14460 DAGCombinerInfo &DCI) const {
14461 if (!Subtarget->has16BitInsts() ||
14462 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14463 return SDValue();
14464
14465 EVT VT = N->getValueType(0);
14466 if (VT != MVT::i32)
14467 return SDValue();
14468
14469 SDValue Src = N->getOperand(0);
14470 if (Src.getValueType() != MVT::i16)
14471 return SDValue();
14472
14473 return SDValue();
14474}
14475
14476SDValue
14477SITargetLowering::performSignExtendInRegCombine(SDNode *N,
14478 DAGCombinerInfo &DCI) const {
14479 SDValue Src = N->getOperand(0);
14480 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
14481
14482 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
14483 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
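// For example:
//   (sext_inreg (SBUFFER_LOAD_UBYTE rsrc, offset, cachepolicy), i8)
//     -> (SBUFFER_LOAD_BYTE rsrc, offset, cachepolicy)
// The signed load already returns a sign-extended i32 result, making the
// explicit sign extension redundant.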
14484 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
14485 VTSign->getVT() == MVT::i8) ||
14486 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
14487 VTSign->getVT() == MVT::i16))) {
14488 assert(Subtarget->hasScalarSubwordLoads() &&
14489 "s_buffer_load_{u8, i8} are supported "
14490 "in GFX12 (or newer) architectures.");
14491 EVT VT = Src.getValueType();
14492 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
14493 ? AMDGPUISD::SBUFFER_LOAD_BYTE
14494 : AMDGPUISD::SBUFFER_LOAD_SHORT;
14495 SDLoc DL(N);
14496 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
14497 SDValue Ops[] = {
14498 Src.getOperand(0), // source register
14499 Src.getOperand(1), // offset
14500 Src.getOperand(2) // cachePolicy
14501 };
14502 auto *M = cast<MemSDNode>(Src);
14503 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
14504 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14505 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
14506 return LoadVal;
14507 }
14508 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
14509 VTSign->getVT() == MVT::i8) ||
14510 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
14511 VTSign->getVT() == MVT::i16)) &&
14512 Src.hasOneUse()) {
14513 auto *M = cast<MemSDNode>(Src);
14514 SDValue Ops[] = {Src.getOperand(0), // Chain
14515 Src.getOperand(1), // rsrc
14516 Src.getOperand(2), // vindex
14517 Src.getOperand(3), // voffset
14518 Src.getOperand(4), // soffset
14519 Src.getOperand(5), // offset
14520 Src.getOperand(6), Src.getOperand(7)};
14521 // replace with BUFFER_LOAD_BYTE/SHORT
14522 SDVTList ResList =
14523 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
14524 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
14525 ? AMDGPUISD::BUFFER_LOAD_BYTE
14526 : AMDGPUISD::BUFFER_LOAD_SHORT;
14527 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
14528 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14529 return DCI.DAG.getMergeValues(
14530 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
14531 }
14532 return SDValue();
14533}
14534
14535SDValue SITargetLowering::performClassCombine(SDNode *N,
14536 DAGCombinerInfo &DCI) const {
14537 SelectionDAG &DAG = DCI.DAG;
14538 SDValue Mask = N->getOperand(1);
14539
14540 // fp_class x, 0 -> false
14541 if (isNullConstant(Mask))
14542 return DAG.getConstant(0, SDLoc(N), MVT::i1);
14543
14544 if (N->getOperand(0).isUndef())
14545 return DAG.getUNDEF(MVT::i1);
14546
14547 return SDValue();
14548}
14549
14550SDValue SITargetLowering::performRcpCombine(SDNode *N,
14551 DAGCombinerInfo &DCI) const {
14552 EVT VT = N->getValueType(0);
14553 SDValue N0 = N->getOperand(0);
14554
14555 if (N0.isUndef()) {
14556 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
14557 SDLoc(N), VT);
14558 }
14559
14560 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
14561 N0.getOpcode() == ISD::SINT_TO_FP)) {
14562 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
14563 N->getFlags());
14564 }
14565
14566 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
14567 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
14568 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
14569 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
14570 N->getFlags());
14571 }
14572
14573 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
14574}
14575
14576bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
14577 unsigned MaxDepth) const {
14578 unsigned Opcode = Op.getOpcode();
14579 if (Opcode == ISD::FCANONICALIZE)
14580 return true;
14581
14582 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14583 const auto &F = CFP->getValueAPF();
14584 if (F.isNaN() && F.isSignaling())
14585 return false;
14586 if (!F.isDenormal())
14587 return true;
14588
14589 DenormalMode Mode =
14590 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
14591 return Mode == DenormalMode::getIEEE();
14592 }
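// For example, the f32 constant with bit pattern 0x00000001 is a denormal:
// it only counts as already canonical when the function's f32 denormal mode
// is IEEE, since a flush mode would require fcanonicalize to flush it to
// zero.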
14593
14594 // If source is a result of another standard FP operation it is already in
14595 // canonical form.
14596 if (MaxDepth == 0)
14597 return false;
14598
14599 switch (Opcode) {
14600 // These will flush denorms if required.
14601 case ISD::FADD:
14602 case ISD::FSUB:
14603 case ISD::FMUL:
14604 case ISD::FCEIL:
14605 case ISD::FFLOOR:
14606 case ISD::FMA:
14607 case ISD::FMAD:
14608 case ISD::FSQRT:
14609 case ISD::FDIV:
14610 case ISD::FREM:
14611 case ISD::FP_ROUND:
14612 case ISD::FP_EXTEND:
14613 case ISD::FP16_TO_FP:
14614 case ISD::FP_TO_FP16:
14615 case ISD::BF16_TO_FP:
14616 case ISD::FP_TO_BF16:
14617 case ISD::FLDEXP:
14618 case AMDGPUISD::FMUL_LEGACY:
14619 case AMDGPUISD::FMAD_FTZ:
14620 case AMDGPUISD::RCP:
14621 case AMDGPUISD::RSQ:
14622 case AMDGPUISD::RSQ_CLAMP:
14623 case AMDGPUISD::RCP_LEGACY:
14624 case AMDGPUISD::RCP_IFLAG:
14625 case AMDGPUISD::LOG:
14626 case AMDGPUISD::EXP:
14627 case AMDGPUISD::DIV_SCALE:
14628 case AMDGPUISD::DIV_FMAS:
14629 case AMDGPUISD::DIV_FIXUP:
14630 case AMDGPUISD::FRACT:
14631 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14632 case AMDGPUISD::CVT_F32_UBYTE0:
14633 case AMDGPUISD::CVT_F32_UBYTE1:
14634 case AMDGPUISD::CVT_F32_UBYTE2:
14635 case AMDGPUISD::CVT_F32_UBYTE3:
14636 case AMDGPUISD::FP_TO_FP16:
14637 case AMDGPUISD::SIN_HW:
14638 case AMDGPUISD::COS_HW:
14639 return true;
14640
14641 // It can/will be lowered or combined as a bit operation.
14642 // Need to check their input recursively to handle.
14643 case ISD::FNEG:
14644 case ISD::FABS:
14645 case ISD::FCOPYSIGN:
14646 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14647
14648 case ISD::AND:
14649 if (Op.getValueType() == MVT::i32) {
14650 // Be careful as we only know it is a bitcast floating point type. It
14651 // could be f32 or v2f16; we have no way of knowing. Luckily the constant
14652 // value that we optimize for, which comes up in fp32 to bf16 conversions,
14653 // is valid to optimize for all types.
14654 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
14655 if (RHS->getZExtValue() == 0xffff0000) {
14656 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14657 }
14658 }
14659 }
14660 break;
14661
14662 case ISD::FSIN:
14663 case ISD::FCOS:
14664 case ISD::FSINCOS:
14665 return Op.getValueType().getScalarType() != MVT::f16;
14666
14667 case ISD::FMINNUM:
14668 case ISD::FMAXNUM:
14669 case ISD::FMINNUM_IEEE:
14670 case ISD::FMAXNUM_IEEE:
14671 case ISD::FMINIMUM:
14672 case ISD::FMAXIMUM:
14673 case ISD::FMINIMUMNUM:
14674 case ISD::FMAXIMUMNUM:
14675 case AMDGPUISD::CLAMP:
14676 case AMDGPUISD::FMED3:
14677 case AMDGPUISD::FMAX3:
14678 case AMDGPUISD::FMIN3:
14679 case AMDGPUISD::FMAXIMUM3:
14680 case AMDGPUISD::FMINIMUM3: {
14681 // FIXME: Shouldn't treat the generic operations differently based on these.
14682 // However, we aren't really required to flush the result from
14683 // minnum/maxnum.
14684
14685 // snans will be quieted, so we only need to worry about denormals.
14686 if (Subtarget->supportsMinMaxDenormModes() ||
14687 // FIXME: denormalsEnabledForType is broken for dynamic
14688 denormalsEnabledForType(DAG, Op.getValueType()))
14689 return true;
14690
14691 // Flushing may be required.
14692 // On pre-GFX9 targets, V_MIN_F32 and others do not flush denorms. For such
14693 // targets we need to check their inputs recursively.
14694
14695 // FIXME: Does this apply with clamp? It's implemented with max.
14696 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
14697 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
14698 return false;
14699 }
14700
14701 return true;
14702 }
14703 case ISD::SELECT: {
14704 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
14705 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
14706 }
14707 case ISD::BUILD_VECTOR: {
14708 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
14709 SDValue SrcOp = Op.getOperand(i);
14710 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
14711 return false;
14712 }
14713
14714 return true;
14715 }
14716 case ISD::EXTRACT_VECTOR_ELT:
14717 case ISD::EXTRACT_SUBVECTOR: {
14718 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14719 }
14720 case ISD::INSERT_VECTOR_ELT: {
14721 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
14722 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
14723 }
14724 case ISD::UNDEF:
14725 // Could be anything.
14726 return false;
14727
14728 case ISD::BITCAST:
14729 // TODO: This is incorrect as it loses track of the operand's type. We may
14730 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
14731 // same bits that are canonicalized in one type need not be in the other.
14732 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14733 case ISD::TRUNCATE: {
14734 // Hack round the mess we make when legalizing extract_vector_elt
14735 if (Op.getValueType() == MVT::i16) {
14736 SDValue TruncSrc = Op.getOperand(0);
14737 if (TruncSrc.getValueType() == MVT::i32 &&
14738 TruncSrc.getOpcode() == ISD::BITCAST &&
14739 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
14740 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
14741 }
14742 }
14743 return false;
14744 }
14745 case ISD::INTRINSIC_WO_CHAIN: {
14746 unsigned IntrinsicID = Op.getConstantOperandVal(0);
14747 // TODO: Handle more intrinsics
14748 switch (IntrinsicID) {
14749 case Intrinsic::amdgcn_cvt_pkrtz:
14750 case Intrinsic::amdgcn_cubeid:
14751 case Intrinsic::amdgcn_frexp_mant:
14752 case Intrinsic::amdgcn_fdot2:
14753 case Intrinsic::amdgcn_rcp:
14754 case Intrinsic::amdgcn_rsq:
14755 case Intrinsic::amdgcn_rsq_clamp:
14756 case Intrinsic::amdgcn_rcp_legacy:
14757 case Intrinsic::amdgcn_rsq_legacy:
14758 case Intrinsic::amdgcn_trig_preop:
14759 case Intrinsic::amdgcn_tanh:
14760 case Intrinsic::amdgcn_log:
14761 case Intrinsic::amdgcn_exp2:
14762 case Intrinsic::amdgcn_sqrt:
14763 return true;
14764 default:
14765 break;
14766 }
14767
14768 break;
14769 }
14770 default:
14771 break;
14772 }
14773
14774 // FIXME: denormalsEnabledForType is broken for dynamic
14775 return denormalsEnabledForType(DAG, Op.getValueType()) &&
14776 DAG.isKnownNeverSNaN(Op);
14777}
14778
14779bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
14780 unsigned MaxDepth) const {
14781 const MachineRegisterInfo &MRI = MF.getRegInfo();
14782 MachineInstr *MI = MRI.getVRegDef(Reg);
14783 unsigned Opcode = MI->getOpcode();
14784
14785 if (Opcode == AMDGPU::G_FCANONICALIZE)
14786 return true;
14787
14788 std::optional<FPValueAndVReg> FCR;
14789 // Constant splat (can be padded with undef) or scalar constant.
14790 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
14791 if (FCR->Value.isSignaling())
14792 return false;
14793 if (!FCR->Value.isDenormal())
14794 return true;
14795
14796 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
14797 return Mode == DenormalMode::getIEEE();
14798 }
14799
14800 if (MaxDepth == 0)
14801 return false;
14802
14803 switch (Opcode) {
14804 case AMDGPU::G_FADD:
14805 case AMDGPU::G_FSUB:
14806 case AMDGPU::G_FMUL:
14807 case AMDGPU::G_FCEIL:
14808 case AMDGPU::G_FFLOOR:
14809 case AMDGPU::G_FRINT:
14810 case AMDGPU::G_FNEARBYINT:
14811 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
14812 case AMDGPU::G_INTRINSIC_TRUNC:
14813 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
14814 case AMDGPU::G_FMA:
14815 case AMDGPU::G_FMAD:
14816 case AMDGPU::G_FSQRT:
14817 case AMDGPU::G_FDIV:
14818 case AMDGPU::G_FREM:
14819 case AMDGPU::G_FPOW:
14820 case AMDGPU::G_FPEXT:
14821 case AMDGPU::G_FLOG:
14822 case AMDGPU::G_FLOG2:
14823 case AMDGPU::G_FLOG10:
14824 case AMDGPU::G_FPTRUNC:
14825 case AMDGPU::G_AMDGPU_RCP_IFLAG:
14826 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
14827 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
14828 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
14829 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14830 return true;
14831 case AMDGPU::G_FNEG:
14832 case AMDGPU::G_FABS:
14833 case AMDGPU::G_FCOPYSIGN:
14834 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
14835 case AMDGPU::G_FMINNUM:
14836 case AMDGPU::G_FMAXNUM:
14837 case AMDGPU::G_FMINNUM_IEEE:
14838 case AMDGPU::G_FMAXNUM_IEEE:
14839 case AMDGPU::G_FMINIMUM:
14840 case AMDGPU::G_FMAXIMUM:
14841 case AMDGPU::G_FMINIMUMNUM:
14842 case AMDGPU::G_FMAXIMUMNUM: {
14843 if (Subtarget->supportsMinMaxDenormModes() ||
14844 // FIXME: denormalsEnabledForType is broken for dynamic
14845 denormalsEnabledForType(MRI.getType(Reg), MF))
14846 return true;
14847
14848 [[fallthrough]];
14849 }
14850 case AMDGPU::G_BUILD_VECTOR:
14851 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
14852 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
14853 return false;
14854 return true;
14855 case AMDGPU::G_INTRINSIC:
14856 case AMDGPU::G_INTRINSIC_CONVERGENT:
14857 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
14858 case Intrinsic::amdgcn_fmul_legacy:
14859 case Intrinsic::amdgcn_fmad_ftz:
14860 case Intrinsic::amdgcn_sqrt:
14861 case Intrinsic::amdgcn_fmed3:
14862 case Intrinsic::amdgcn_sin:
14863 case Intrinsic::amdgcn_cos:
14864 case Intrinsic::amdgcn_log:
14865 case Intrinsic::amdgcn_exp2:
14866 case Intrinsic::amdgcn_log_clamp:
14867 case Intrinsic::amdgcn_rcp:
14868 case Intrinsic::amdgcn_rcp_legacy:
14869 case Intrinsic::amdgcn_rsq:
14870 case Intrinsic::amdgcn_rsq_clamp:
14871 case Intrinsic::amdgcn_rsq_legacy:
14872 case Intrinsic::amdgcn_div_scale:
14873 case Intrinsic::amdgcn_div_fmas:
14874 case Intrinsic::amdgcn_div_fixup:
14875 case Intrinsic::amdgcn_fract:
14876 case Intrinsic::amdgcn_cvt_pkrtz:
14877 case Intrinsic::amdgcn_cubeid:
14878 case Intrinsic::amdgcn_cubema:
14879 case Intrinsic::amdgcn_cubesc:
14880 case Intrinsic::amdgcn_cubetc:
14881 case Intrinsic::amdgcn_frexp_mant:
14882 case Intrinsic::amdgcn_fdot2:
14883 case Intrinsic::amdgcn_trig_preop:
14884 case Intrinsic::amdgcn_tanh:
14885 return true;
14886 default:
14887 break;
14888 }
14889
14890 [[fallthrough]];
14891 default:
14892 return false;
14893 }
14894
14895 llvm_unreachable("invalid operation");
14896}
14897
14898// Constant fold canonicalize.
14899SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
14900 const SDLoc &SL, EVT VT,
14901 const APFloat &C) const {
14902 // Flush denormals to 0 if not enabled.
14903 if (C.isDenormal()) {
14904 DenormalMode Mode =
14905 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
14906 if (Mode == DenormalMode::getPreserveSign()) {
14907 return DAG.getConstantFP(
14908 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
14909 }
14910
14911 if (Mode != DenormalMode::getIEEE())
14912 return SDValue();
14913 }
14914
14915 if (C.isNaN()) {
14916 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
14917 if (C.isSignaling()) {
14918 // Quiet a signaling NaN.
14919 // FIXME: Is this supposed to preserve payload bits?
14920 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14921 }
14922
14923 // Make sure it is the canonical NaN bitpattern.
14924 //
14925 // TODO: Can we use -1 as the canonical NaN value since it's an inline
14926 // immediate?
14927 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
14928 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14929 }
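// For f32 this means, for example, that a signaling NaN constant such as
// 0x7fa00000, or a quiet NaN other than the default pattern, is replaced by
// the canonical 0x7fc00000 qNaN.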
14930
14931 // Already canonical.
14932 return DAG.getConstantFP(C, SL, VT);
14933}
14934
14935static bool vectorEltWillFoldAway(SDValue Op) {
14936 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
14937}
14938
14939SDValue
14940SITargetLowering::performFCanonicalizeCombine(SDNode *N,
14941 DAGCombinerInfo &DCI) const {
14942 SelectionDAG &DAG = DCI.DAG;
14943 SDValue N0 = N->getOperand(0);
14944 EVT VT = N->getValueType(0);
14945
14946 // fcanonicalize undef -> qnan
14947 if (N0.isUndef()) {
14948 APFloat QNaN = APFloat::getQNaN(VT.getFltSemantics());
14949 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
14950 }
14951
14952 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
14953 EVT VT = N->getValueType(0);
14954 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
14955 }
14956
14957 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
14958 // (fcanonicalize k)
14959 //
14960 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
14961
14962 // TODO: This could be better with wider vectors that will be split to v2f16,
14963 // and to consider uses since there aren't that many packed operations.
14964 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
14965 isTypeLegal(MVT::v2f16)) {
14966 SDLoc SL(N);
14967 SDValue NewElts[2];
14968 SDValue Lo = N0.getOperand(0);
14969 SDValue Hi = N0.getOperand(1);
14970 EVT EltVT = Lo.getValueType();
14971
14972 if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
14973 for (unsigned I = 0; I != 2; ++I) {
14974 SDValue Op = N0.getOperand(I);
14975 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14976 NewElts[I] =
14977 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14978 } else if (Op.isUndef()) {
14979 // Handled below based on what the other operand is.
14980 NewElts[I] = Op;
14981 } else {
14982 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
14983 }
14984 }
14985
14986 // If one half is undef, and one is constant, prefer a splat vector rather
14987 // than the normal qNaN. If it's a register, prefer 0.0 since that's
14988 // cheaper to use and may be free with a packed operation.
14989 if (NewElts[0].isUndef()) {
14990 if (isa<ConstantFPSDNode>(NewElts[1]))
14991 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
14992 ? NewElts[1]
14993 : DAG.getConstantFP(0.0f, SL, EltVT);
14994 }
14995
14996 if (NewElts[1].isUndef()) {
14997 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
14998 ? NewElts[0]
14999 : DAG.getConstantFP(0.0f, SL, EltVT);
15000 }
15001
15002 return DAG.getBuildVector(VT, SL, NewElts);
15003 }
15004 }
15005
15006 return SDValue();
15007}
15008
15009static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
15010 switch (Opc) {
15011 case ISD::FMAXNUM:
15012 case ISD::FMAXNUM_IEEE:
15013 case ISD::FMAXIMUMNUM:
15014 return AMDGPUISD::FMAX3;
15015 case ISD::FMAXIMUM:
15016 return AMDGPUISD::FMAXIMUM3;
15017 case ISD::SMAX:
15018 return AMDGPUISD::SMAX3;
15019 case ISD::UMAX:
15020 return AMDGPUISD::UMAX3;
15021 case ISD::FMINNUM:
15022 case ISD::FMINNUM_IEEE:
15023 case ISD::FMINIMUMNUM:
15024 return AMDGPUISD::FMIN3;
15025 case ISD::FMINIMUM:
15026 return AMDGPUISD::FMINIMUM3;
15027 case ISD::SMIN:
15028 return AMDGPUISD::SMIN3;
15029 case ISD::UMIN:
15030 return AMDGPUISD::UMIN3;
15031 default:
15032 llvm_unreachable("Not a min/max opcode");
15033 }
15034}
15035
15036SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
15037 const SDLoc &SL, SDValue Src,
15038 SDValue MinVal,
15039 SDValue MaxVal,
15040 bool Signed) const {
15041
15042 // med3 comes from
15043 // min(max(x, K0), K1), K0 < K1
15044 // max(min(x, K0), K1), K1 < K0
15045 //
15046 // "MinVal" and "MaxVal" respectively refer to the rhs of the
15047 // min/max op.
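// For example, in the signed case:
//   (smin (smax x, -3), 7) with K0 = -3 < K1 = 7 becomes (smed3 x, -3, 7).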
15048 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
15049 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
15050
15051 if (!MinK || !MaxK)
15052 return SDValue();
15053
15054 if (Signed) {
15055 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
15056 return SDValue();
15057 } else {
15058 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
15059 return SDValue();
15060 }
15061
15062 EVT VT = MinK->getValueType(0);
15063 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
15064 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
15065 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
15066
15067 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
15068 // not available, but this is unlikely to be profitable as constants
15069 // will often need to be materialized & extended, especially on
15070 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
15071 return SDValue();
15072}
15073
15074static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
15075 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
15076 return C;
15077
15078 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
15079 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
15080 return C;
15081 }
15082
15083 return nullptr;
15084}
15085
15086SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
15087 const SDLoc &SL, SDValue Op0,
15088 SDValue Op1) const {
15089 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
15090 if (!K1)
15091 return SDValue();
15092
15093 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
15094 if (!K0)
15095 return SDValue();
15096
15097 // Ordered >= (although NaN inputs should have folded away by now).
15098 if (K0->getValueAPF() > K1->getValueAPF())
15099 return SDValue();
15100
15101 // med3 with a nan input acts like
15102 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
15103 //
15104 // So the result depends on whether the IEEE mode bit is enabled or not with a
15105 // signaling nan input.
15106 // ieee=1
15107 // s0 snan: yields s2
15108 // s1 snan: yields s2
15109 // s2 snan: qnan
15110
15111 // s0 qnan: min(s1, s2)
15112 // s1 qnan: min(s0, s2)
15113 // s2 qnan: min(s0, s1)
15114
15115 // ieee=0
15116 // s0 snan: min(s1, s2)
15117 // s1 snan: min(s0, s2)
15118 // s2 snan: qnan
15119
15120 // s0 qnan: min(s1, s2)
15121 // s1 qnan: min(s0, s2)
15122 // s2 qnan: min(s0, s1)
15123 const MachineFunction &MF = DAG.getMachineFunction();
15124 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15125
15126 // TODO: Check IEEE bit enabled. We can form fmed3 with IEEE=0 regardless of
15127 // whether the input is a signaling nan if op0 is fmaximum or fmaximumnum. We
15128 // can only form it if op0 is fmaxnum_ieee and IEEE=1.
15129 EVT VT = Op0.getValueType();
15130 if (Info->getMode().DX10Clamp) {
15131 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
15132 // hardware fmed3 behavior converting to a min.
15133 // FIXME: Should this be allowing -0.0?
15134 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
15135 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
15136 }
15137
15138 // med3 for f16 is only available on gfx9+, and not available for v2f16.
15139 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
15140 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
15141 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
15142 // then give the other result, which is different from med3 with a NaN
15143 // input.
15144 SDValue Var = Op0.getOperand(0);
15145 if (!DAG.isKnownNeverSNaN(Var))
15146 return SDValue();
15147
15148 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15149
15150 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
15151 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
15152 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
15153 SDValue(K0, 0), SDValue(K1, 0));
15154 }
15155 }
15156
15157 return SDValue();
15158}
15159
15160/// \return true if the subtarget supports minimum3 and maximum3 with the given
15161/// base min/max opcode \p Opc for type \p VT.
15162static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
15163 EVT VT) {
15164 switch (Opc) {
15165 case ISD::FMINNUM:
15166 case ISD::FMAXNUM:
15167 case ISD::FMINNUM_IEEE:
15168 case ISD::FMAXNUM_IEEE:
15169 case ISD::FMINIMUMNUM:
15170 case ISD::FMAXIMUMNUM:
15171 case AMDGPUISD::FMIN_LEGACY:
15172 case AMDGPUISD::FMAX_LEGACY:
15173 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
15174 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
15175 case ISD::FMINIMUM:
15176 case ISD::FMAXIMUM:
15177 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
15178 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
15179 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
15180 case ISD::SMAX:
15181 case ISD::SMIN:
15182 case ISD::UMAX:
15183 case ISD::UMIN:
15184 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
15185 default:
15186 return false;
15187 }
15188
15189 llvm_unreachable("not a min/max opcode");
15190}
15191
15192SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
15193 DAGCombinerInfo &DCI) const {
15194 SelectionDAG &DAG = DCI.DAG;
15195
15196 EVT VT = N->getValueType(0);
15197 unsigned Opc = N->getOpcode();
15198 SDValue Op0 = N->getOperand(0);
15199 SDValue Op1 = N->getOperand(1);
15200
15201 // Only do this if the inner op has one use since this will just increase
15202 // register pressure for no benefit.
15203
15204 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
15205 // max(max(a, b), c) -> max3(a, b, c)
15206 // min(min(a, b), c) -> min3(a, b, c)
15207 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
15208 SDLoc DL(N);
15209 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
15210 Op0.getOperand(0), Op0.getOperand(1), Op1);
15211 }
15212
15213 // Try commuted.
15214 // max(a, max(b, c)) -> max3(a, b, c)
15215 // min(a, min(b, c)) -> min3(a, b, c)
15216 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
15217 SDLoc DL(N);
15218 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
15219 Op0, Op1.getOperand(0), Op1.getOperand(1));
15220 }
15221 }
15222
15223 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
15224 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
15225 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
15226 if (SDValue Med3 = performIntMed3ImmCombine(
15227 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
15228 return Med3;
15229 }
15230 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
15231 if (SDValue Med3 = performIntMed3ImmCombine(
15232 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
15233 return Med3;
15234 }
15235
15236 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
15237 if (SDValue Med3 = performIntMed3ImmCombine(
15238 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
15239 return Med3;
15240 }
15241 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
15242 if (SDValue Med3 = performIntMed3ImmCombine(
15243 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
15244 return Med3;
15245 }
15246
15247 // if !is_snan(x):
15248 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15249 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15250 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15251 // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15252 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
15253 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
15254 (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
15255 (Opc == AMDGPUISD::FMIN_LEGACY &&
15256 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
15257 (VT == MVT::f32 || VT == MVT::f64 ||
15258 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
15259 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
15260 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
15261 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
15262 Op0.hasOneUse()) {
15263 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
15264 return Res;
15265 }
15266
15267 // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
15268 // for some types, but at a higher cost since it's implemented with a 3
15269 // operand form.
15270 const SDNodeFlags Flags = N->getFlags();
15271 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
15272 !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) {
15273 unsigned NewOpc =
15274 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
15275 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
15276 }
15277
15278 return SDValue();
15279}
15280
15281static bool isClampZeroToOne(SDValue A, SDValue B) {
15282 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
15283 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
15284 // FIXME: Should this be allowing -0.0?
15285 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
15286 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
15287 }
15288 }
15289
15290 return false;
15291}
15292
15293// FIXME: Should only worry about snans for version with chain.
15294SDValue SITargetLowering::performFMed3Combine(SDNode *N,
15295 DAGCombinerInfo &DCI) const {
15296 EVT VT = N->getValueType(0);
15297 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
15298 // NaNs. With a NaN input, the order of the operands may change the result.
15299
15300 SelectionDAG &DAG = DCI.DAG;
15301 SDLoc SL(N);
15302
15303 SDValue Src0 = N->getOperand(0);
15304 SDValue Src1 = N->getOperand(1);
15305 SDValue Src2 = N->getOperand(2);
15306
15307 if (isClampZeroToOne(Src0, Src1)) {
15308 // const_a, const_b, x -> clamp is safe in all cases including signaling
15309 // nans.
15310 // FIXME: Should this be allowing -0.0?
15311 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
15312 }
15313
15314 const MachineFunction &MF = DAG.getMachineFunction();
15315 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15316
15317 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
15318 // handling no dx10-clamp?
15319 if (Info->getMode().DX10Clamp) {
15320 // If NaNs are clamped to 0, we are free to reorder the inputs.
15321
15322 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15323 std::swap(Src0, Src1);
15324
15325 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
15326 std::swap(Src1, Src2);
15327
15328 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15329 std::swap(Src0, Src1);
15330
15331 if (isClampZeroToOne(Src1, Src2))
15332 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
15333 }
15334
15335 return SDValue();
15336}
15337
15338SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
15339 DAGCombinerInfo &DCI) const {
15340 SDValue Src0 = N->getOperand(0);
15341 SDValue Src1 = N->getOperand(1);
15342 if (Src0.isUndef() && Src1.isUndef())
15343 return DCI.DAG.getUNDEF(N->getValueType(0));
15344 return SDValue();
15345}
15346
15347// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
15348// expanded into a set of cmp/select instructions.
15349bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
15350 unsigned NumElem,
15351 bool IsDivergentIdx,
15352 const GCNSubtarget *Subtarget) {
15353 if (UseDivergentRegisterIndexing)
15354 return false;
15355
15356 unsigned VecSize = EltSize * NumElem;
15357
15358 // Sub-dword vectors that fit in two dwords or less have a better implementation.
15359 if (VecSize <= 64 && EltSize < 32)
15360 return false;
15361
15362 // Always expand the remaining sub-dword cases, otherwise they will be
15363 // lowered via memory.
15364 if (EltSize < 32)
15365 return true;
15366
15367 // Always do this if var-idx is divergent, otherwise it will become a loop.
15368 if (IsDivergentIdx)
15369 return true;
15370
15371 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
15372 unsigned NumInsts = NumElem /* Number of compares */ +
15373 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
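// Worked example for a uniform index into v8i32: NumInsts = 8 compares +
// 8 cndmasks = 16, so the expansion is used in VGPR-index mode (16 <= 16)
// but not when movrel is available (16 > 15).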
15374
15375 // On some architectures (GFX9) movrel is not available and it's better
15376 // to expand.
15377 if (Subtarget->useVGPRIndexMode())
15378 return NumInsts <= 16;
15379
15380 // If movrel is available, use it instead of expanding for vector of 8
15381 // elements.
15382 if (Subtarget->hasMovrel())
15383 return NumInsts <= 15;
15384
15385 return true;
15386}
15387
15388bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
15389 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
15390 if (isa<ConstantSDNode>(Idx))
15391 return false;
15392
15393 SDValue Vec = N->getOperand(0);
15394 EVT VecVT = Vec.getValueType();
15395 EVT EltVT = VecVT.getVectorElementType();
15396 unsigned EltSize = EltVT.getSizeInBits();
15397 unsigned NumElem = VecVT.getVectorNumElements();
15398
15399 return SITargetLowering::shouldExpandVectorDynExt(
15400 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
15401}
15402
15403SDValue
15404SITargetLowering::performExtractVectorEltCombine(SDNode *N,
15405 DAGCombinerInfo &DCI) const {
15406 SDValue Vec = N->getOperand(0);
15407 SelectionDAG &DAG = DCI.DAG;
15408
15409 EVT VecVT = Vec.getValueType();
15410 EVT VecEltVT = VecVT.getVectorElementType();
15411 EVT ResVT = N->getValueType(0);
15412
15413 unsigned VecSize = VecVT.getSizeInBits();
15414 unsigned VecEltSize = VecEltVT.getSizeInBits();
15415
15416 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
15417 allUsesHaveSourceMods(N)) {
15418 SDLoc SL(N);
15419 SDValue Idx = N->getOperand(1);
15420 SDValue Elt =
15421 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
15422 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
15423 }
15424
15425 // (extract_vector_element (and {y0, y1}, (build_vector 0x1f, 0x1f)), index)
15426 // -> (and (extract_vector_element {y0, y1}, index), 0x1f)
15427 // There are optimisations to transform 64-bit shifts into 32-bit shifts
15428 // depending on the shift operand. See e.g. performSraCombine().
15429 // This combine ensures that the optimisation is compatible with v2i32
15430 // legalised AND.
15431 if (VecVT == MVT::v2i32 && Vec->getOpcode() == ISD::AND &&
15432 Vec->getOperand(1)->getOpcode() == ISD::BUILD_VECTOR) {
15433
15434 auto *C = dyn_cast<ConstantSDNode>(Vec->getOperand(1)->getOperand(0));
15435 if (!C || C->getZExtValue() != 0x1f)
15436 return SDValue();
15437
15438 SDLoc SL(N);
15439 SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
15440 SDValue EVE = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
15441 Vec->getOperand(0), N->getOperand(1));
15442 SDValue A = DAG.getNode(ISD::AND, SL, MVT::i32, EVE, AndMask);
15443 DAG.ReplaceAllUsesWith(N, A.getNode());
15444 }
15445
15446 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
15447 // =>
15448 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
15449 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
15450 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
15451 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15452 SDLoc SL(N);
15453 SDValue Idx = N->getOperand(1);
15454 unsigned Opc = Vec.getOpcode();
15455
15456 switch (Opc) {
15457 default:
15458 break;
15459 // TODO: Support other binary operations.
15460 case ISD::FADD:
15461 case ISD::FSUB:
15462 case ISD::FMUL:
15463 case ISD::ADD:
15464 case ISD::UMIN:
15465 case ISD::UMAX:
15466 case ISD::SMIN:
15467 case ISD::SMAX:
15468 case ISD::FMAXNUM:
15469 case ISD::FMINNUM:
15470 case ISD::FMAXNUM_IEEE:
15471 case ISD::FMINNUM_IEEE:
15472 case ISD::FMAXIMUM:
15473 case ISD::FMINIMUM: {
15474 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15475 Vec.getOperand(0), Idx);
15476 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15477 Vec.getOperand(1), Idx);
15478
15479 DCI.AddToWorklist(Elt0.getNode());
15480 DCI.AddToWorklist(Elt1.getNode());
15481 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
15482 }
15483 }
15484 }
15485
15486 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
15487 if (shouldExpandVectorDynExt(N)) {
15488 SDLoc SL(N);
15489 SDValue Idx = N->getOperand(1);
15490 SDValue V;
15491 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15492 SDValue IC = DAG.getVectorIdxConstant(I, SL);
15493 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
15494 if (I == 0)
15495 V = Elt;
15496 else
15497 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
15498 }
15499 return V;
15500 }
15501
15502 if (!DCI.isBeforeLegalize())
15503 return SDValue();
15504
15505 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
15506 // elements. This exposes more load reduction opportunities by replacing
15507 // multiple small extract_vector_elements with a single 32-bit extract.
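// For example, extracting element 5 of a loaded v8i8: the vector is bitcast
// to the equivalent v2i32, BitIndex = 40, EltIdx = 1, LeftoverBitIdx = 8, so
// dword 1 is extracted, shifted right by 8, and truncated back to i8.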
15508 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
15509 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
15510 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15511 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
15512
15513 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15514 unsigned EltIdx = BitIndex / 32;
15515 unsigned LeftoverBitIdx = BitIndex % 32;
15516 SDLoc SL(N);
15517
15518 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
15519 DCI.AddToWorklist(Cast.getNode());
15520
15521 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
15522 DAG.getConstant(EltIdx, SL, MVT::i32));
15523 DCI.AddToWorklist(Elt.getNode());
15524 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
15525 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
15526 DCI.AddToWorklist(Srl.getNode());
15527
15528 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
15529 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
15530 DCI.AddToWorklist(Trunc.getNode());
15531
15532 if (VecEltVT == ResVT) {
15533 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
15534 }
15535
15536 assert(ResVT.isScalarInteger());
15537 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
15538 }
15539
15540 return SDValue();
15541}
15542
15543SDValue
15544SITargetLowering::performInsertVectorEltCombine(SDNode *N,
15545 DAGCombinerInfo &DCI) const {
15546 SDValue Vec = N->getOperand(0);
15547 SDValue Idx = N->getOperand(2);
15548 EVT VecVT = Vec.getValueType();
15549 EVT EltVT = VecVT.getVectorElementType();
15550
15551 // INSERT_VECTOR_ELT (<n x e>, var-idx)
15552 // => BUILD_VECTOR n x select (e, const-idx)
15553 if (!shouldExpandVectorDynExt(N))
15554 return SDValue();
15555
15556 SelectionDAG &DAG = DCI.DAG;
15557 SDLoc SL(N);
15558 SDValue Ins = N->getOperand(1);
15559 EVT IdxVT = Idx.getValueType();
15560
15561 SmallVector<SDValue, 16> Ops;
15562 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15563 SDValue IC = DAG.getConstant(I, SL, IdxVT);
15564 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
15565 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
15566 Ops.push_back(V);
15567 }
15568
15569 return DAG.getBuildVector(VecVT, SL, Ops);
15570}
15571
15572/// Return the source of an fp_extend from f16 to f32, or a converted FP
15573/// constant.
15574static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
15575 if (Src.getOpcode() == ISD::FP_EXTEND &&
15576 Src.getOperand(0).getValueType() == MVT::f16) {
15577 return Src.getOperand(0);
15578 }
15579
15580 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
15581 APFloat Val = CFP->getValueAPF();
15582 bool LosesInfo = true;
15583 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
15584 if (!LosesInfo)
15585 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
15586 }
15587
15588 return SDValue();
15589}
15590
15591SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
15592 DAGCombinerInfo &DCI) const {
15593 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15594 "combine only useful on gfx8");
15595
15596 SDValue TruncSrc = N->getOperand(0);
15597 EVT VT = N->getValueType(0);
15598 if (VT != MVT::f16)
15599 return SDValue();
15600
15601 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
15602 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
15603 return SDValue();
15604
15605 SelectionDAG &DAG = DCI.DAG;
15606 SDLoc SL(N);
15607
15608 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
15609 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
15610 // casting back.
15611
15612 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
15613 // fmin(fmax(a, b), fmax(fmin(a, b), c))
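// (This computes the median of a, b and c using only two-operand min/max.)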
15614 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
15615 if (!A)
15616 return SDValue();
15617
15618 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
15619 if (!B)
15620 return SDValue();
15621
15622 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
15623 if (!C)
15624 return SDValue();
15625
15626 // This changes signaling nan behavior. If an input is a signaling nan, it
15627 // would have been quieted by the fpext originally. We don't care because
15628 // these are unconstrained ops. If we needed to insert quieting canonicalizes
15629 // we would be worse off than just doing the promotion.
15630 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
15631 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
15632 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
15633 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
15634}
15635
15636unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
15637 const SDNode *N0,
15638 const SDNode *N1) const {
15639 EVT VT = N0->getValueType(0);
15640
15641 // Only do this if we are not trying to support denormals. v_mad_f32 does not
15642 // support denormals ever.
15643 if (((VT == MVT::f32 &&
15644 denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
15645 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15646 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
15647 isOperationLegal(ISD::FMAD, VT))
15648 return ISD::FMAD;
15649
15650 const TargetOptions &Options = DAG.getTarget().Options;
15651 if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
15652 (N0->getFlags().hasAllowContract() &&
15653 N1->getFlags().hasAllowContract())) &&
15654 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
15655 return ISD::FMA;
15656 }
15657
15658 return 0;
15659}
15660
15661// For a reassociatable opcode perform:
15662// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
15663SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
15664 SelectionDAG &DAG) const {
15665 EVT VT = N->getValueType(0);
15666 if (VT != MVT::i32 && VT != MVT::i64)
15667 return SDValue();
15668
15669 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
15670 return SDValue();
15671
15672 unsigned Opc = N->getOpcode();
15673 SDValue Op0 = N->getOperand(0);
15674 SDValue Op1 = N->getOperand(1);
15675
15676 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
15677 return SDValue();
15678
15679 if (Op0->isDivergent())
15680 std::swap(Op0, Op1);
15681
15682 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
15683 return SDValue();
15684
15685 SDValue Op2 = Op1.getOperand(1);
15686 Op1 = Op1.getOperand(0);
15687 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
15688 return SDValue();
15689
15690 if (Op1->isDivergent())
15691 std::swap(Op1, Op2);
15692
15693 SDLoc SL(N);
15694 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
15695 return DAG.getNode(Opc, SL, VT, Add1, Op2);
15696}
15697
15698static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
15699 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
15700 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
15701 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
15702 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
15703 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
15704}
15705
15706// Fold
15707// y = lshr i64 x, 32
15708// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
15709// with Const.hi == -1
15710// To
15711// res = mad_u64_u32 y.lo, Const.lo, x.lo
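// Why this is safe modulo 2^64: Const.hi == 0xffffffff means
//   Const = Const.lo - 2^32 (mod 2^64), and with y = x >> 32 (so y == x.hi)
//   y * Const + x = x.hi * Const.lo - (x - x.lo) + x = x.hi * Const.lo + x.lo,
// which is exactly mad_u64_u32(x.hi, Const.lo, zext(x.lo)).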
15712static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
15713 SDValue MulLHS, SDValue MulRHS,
15714 SDValue AddRHS) {
15715 if (MulRHS.getOpcode() == ISD::SRL)
15716 std::swap(MulLHS, MulRHS);
15717
15718 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
15719 return SDValue();
15720
15721 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
15722 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
15723 MulLHS.getOperand(0) != AddRHS)
15724 return SDValue();
15725
15727 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
15728 return SDValue();
15729
15730 SDValue ConstMul =
15731 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
15732 return getMad64_32(DAG, SL, MVT::i64,
15733 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
15734 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
15735}
15736
15737// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
15738// multiplies, if any.
15739//
15740// Full 64-bit multiplies that feed into an addition are lowered here instead
15741// of using the generic expansion. The generic expansion ends up with
15742// a tree of ADD nodes that prevents us from using the "add" part of the
15743// MAD instruction. The expansion produced here results in a chain of ADDs
15744// instead of a tree.
15745SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
15746 DAGCombinerInfo &DCI) const {
15747 assert(N->isAnyAdd());
15748
15749 SelectionDAG &DAG = DCI.DAG;
15750 EVT VT = N->getValueType(0);
15751 SDLoc SL(N);
15752 SDValue LHS = N->getOperand(0);
15753 SDValue RHS = N->getOperand(1);
15754
15755 if (VT.isVector())
15756 return SDValue();
15757
15758 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
15759 // result in scalar registers for uniform values.
15760 if (!N->isDivergent() && Subtarget->hasSMulHi())
15761 return SDValue();
15762
15763 unsigned NumBits = VT.getScalarSizeInBits();
15764 if (NumBits <= 32 || NumBits > 64)
15765 return SDValue();
15766
15767 if (LHS.getOpcode() != ISD::MUL) {
15768 assert(RHS.getOpcode() == ISD::MUL);
15769 std::swap(LHS, RHS);
15770 }
15771
15772 // Avoid the fold if it would unduly increase the number of multiplies due to
15773 // multiple uses, except on hardware with full-rate multiply-add (which is
15774 // part of full-rate 64-bit ops).
15775 if (!Subtarget->hasFullRate64Ops()) {
15776 unsigned NumUsers = 0;
15777 for (SDNode *User : LHS->users()) {
15778 // There is a use that does not feed into addition, so the multiply can't
15779 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
15780 if (!User->isAnyAdd())
15781 return SDValue();
15782
15783 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
15784 // MUL + 3xADD + 3xADDC over 3xMAD.
15785 ++NumUsers;
15786 if (NumUsers >= 3)
15787 return SDValue();
15788 }
15789 }
15790
15791 SDValue MulLHS = LHS.getOperand(0);
15792 SDValue MulRHS = LHS.getOperand(1);
15793 SDValue AddRHS = RHS;
15794
15795 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
15796 return FoldedMAD;
15797
15798 // Always check whether operands are small unsigned values, since that
15799 // knowledge is useful in more cases. Check for small signed values only if
15800 // doing so can unlock a shorter code sequence.
15801 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
15802 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
15803
15804 bool MulSignedLo = false;
15805 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
15806 MulSignedLo =
15807 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
15808 }
15809
15810 // The operands and final result all have the same number of bits. If
15811 // operands need to be extended, they can be extended with garbage. The
15812 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
15813 // truncated away in the end.
15814 if (VT != MVT::i64) {
15815 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
15816 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
15817 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
15818 }
15819
15820 // The basic code generated is conceptually straightforward. Pseudo code:
15821 //
15822 // accum = mad_64_32 lhs.lo, rhs.lo, accum
15823 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
15824 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
15825 //
15826 // The second and third lines are optional, depending on whether the factors
15827 // are {sign,zero}-extended or not.
15828 //
15829 // The actual DAG is noisier than the pseudo code, but only due to
15830 // instructions that disassemble values into low and high parts, and
15831 // assemble the final result.
15832 SDValue One = DAG.getConstant(1, SL, MVT::i32);
15833
15834 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
15835 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
15836 SDValue Accum =
15837 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
15838
15839 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
15840 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
15841
15842 if (!MulLHSUnsigned32) {
15843 auto MulLHSHi =
15844 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
15845 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
15846 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15847 }
15848
15849 if (!MulRHSUnsigned32) {
15850 auto MulRHSHi =
15851 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
15852 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
15853 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15854 }
15855
15856 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
15857 Accum = DAG.getBitcast(MVT::i64, Accum);
15858 }
15859
15860 if (VT != MVT::i64)
15861 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
15862 return Accum;
15863}
15864
15865SDValue
15866SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
15867 DAGCombinerInfo &DCI) const {
15868 SDValue RHS = N->getOperand(1);
15869 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
15870 if (!CRHS)
15871 return SDValue();
15872
15873 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
15874 // common.
15875 uint64_t Val = CRHS->getZExtValue();
15876 if (countr_zero(Val) >= 32) {
15877 SelectionDAG &DAG = DCI.DAG;
15878 SDLoc SL(N);
15879 SDValue LHS = N->getOperand(0);
15880
15881 // Avoid carry machinery if we know the low half of the add does not
15882 // contribute to the final result.
15883 //
15884 // add i64:x, K if computeTrailingZeros(K) >= 32
15885 // => build_pair (add x.hi, K.hi), x.lo
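// For example: (add i64 %x, 0x123400000000) has countr_zero(K) = 34 >= 32,
// so only the high half changes:
//   => build_pair (add x.hi, 0x1234), x.lo
// and no carry chain is needed.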
15886
15887 // Breaking the 64-bit add here with this strange constant is unlikely
15888 // to interfere with addressing mode patterns.
15889
15890 SDValue Hi = getHiHalf64(LHS, DAG);
15891 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
15892 unsigned Opcode = N->getOpcode();
15893 if (Opcode == ISD::PTRADD)
15894 Opcode = ISD::ADD;
15895 SDValue AddHi =
15896 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
15897
15898 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
15899 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
15900 }
15901
15902 return SDValue();
15903}
15904
15905// Collect the ultimate src of each of the mul node's operands, and confirm
15906// each operand is effectively only 8 bits wide.
15907static std::optional<ByteProvider<SDValue>>
15908handleMulOperand(const SDValue &MulOperand) {
15909 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
15910 if (!Byte0 || Byte0->isConstantZero()) {
15911 return std::nullopt;
15912 }
15913 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
15914 if (Byte1 && !Byte1->isConstantZero()) {
15915 return std::nullopt;
15916 }
15917 return Byte0;
15918}
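// addPermMasks below merges two v_perm byte-select masks in which the
// selector 0x0c marks a byte as unused (constant zero). Each byte position
// must be unused in at least one of the inputs, and the real selector wins:
//   addPermMasks(0x020c0c0c, 0x0c030c0c) == 0x02030c0c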
15919
15920static unsigned addPermMasks(unsigned First, unsigned Second) {
15921 unsigned FirstCs = First & 0x0c0c0c0c;
15922 unsigned SecondCs = Second & 0x0c0c0c0c;
15923 unsigned FirstNoCs = First & ~0x0c0c0c0c;
15924 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
15925
15926 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
15927 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
15928 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
15929 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
15930
15931 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
15932}
15933
15934struct DotSrc {
15935 SDValue SrcOp;
15936 int64_t PermMask;
15937 int64_t DWordOffset;
15938};
15939
15940static void placeSources(ByteProvider<SDValue> &Src0,
15941 ByteProvider<SDValue> &Src1,
15942 SmallVectorImpl<DotSrc> &Src0s,
15943 SmallVectorImpl<DotSrc> &Src1s, int Step) {
15944
15945 assert(Src0.Src.has_value() && Src1.Src.has_value());
15946 // Src0s and Src1s are empty, just place arbitrarily.
15947 if (Step == 0) {
15948 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
15949 Src0.SrcOffset / 4});
15950 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
15951 Src1.SrcOffset / 4});
15952 return;
15953 }
15954
15955 for (int BPI = 0; BPI < 2; BPI++) {
15956 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
15957 if (BPI == 1) {
15958 BPP = {Src1, Src0};
15959 }
15960 unsigned ZeroMask = 0x0c0c0c0c;
15961 unsigned FMask = 0xFF << (8 * (3 - Step));
15962
15963 unsigned FirstMask =
15964 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15965 unsigned SecondMask =
15966 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15967 // Attempt to find the Src vector which contains our SDValue; if found, add
15968 // our perm mask to the existing one. If we are unable to find a match for
15969 // the first SDValue, attempt to find a match for the second.
15970 int FirstGroup = -1;
15971 for (int I = 0; I < 2; I++) {
15972 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
15973 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
15974 return IterElt.SrcOp == *BPP.first.Src &&
15975 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15976 };
15977
15978 auto *Match = llvm::find_if(Srcs, MatchesFirst);
15979 if (Match != Srcs.end()) {
15980 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
15981 FirstGroup = I;
15982 break;
15983 }
15984 }
15985 if (FirstGroup != -1) {
15986 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
15987 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
15988 return IterElt.SrcOp == *BPP.second.Src &&
15989 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15990 };
15991 auto *Match = llvm::find_if(Srcs, MatchesSecond);
15992 if (Match != Srcs.end()) {
15993 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
15994 } else
15995 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15996 return;
15997 }
15998 }
15999
16000 // If we have made it here, then we could not find a match in Src0s or Src1s
16001 // for either Src0 or Src1, so just place them arbitrarily.
16002
16003 unsigned ZeroMask = 0x0c0c0c0c;
16004 unsigned FMask = 0xFF << (8 * (3 - Step));
16005
16006 Src0s.push_back(
16007 {*Src0.Src,
16008 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
16009 Src0.SrcOffset / 4});
16010 Src1s.push_back(
16011 {*Src1.Src,
16012 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
16013 Src1.SrcOffset / 4});
16014}
16015
16016static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
16017 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
16018 bool IsAny) {
16019
16020 // If we just have one source, just permute it accordingly.
16021 if (Srcs.size() == 1) {
16022 auto *Elt = Srcs.begin();
16023 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
16024
16025 // v_perm will produce the original value
16026 if (Elt->PermMask == 0x3020100)
16027 return EltOp;
16028
16029 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
16030 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
16031 }
16032
16033 auto *FirstElt = Srcs.begin();
16034 auto *SecondElt = std::next(FirstElt);
16035
16036 SmallVector<SDValue, 2> Perms;
16037
16038 // If we have multiple sources in the chain, combine them via perms (using
16039 // calculated perm mask) and Ors.
16040 while (true) {
16041 auto FirstMask = FirstElt->PermMask;
16042 auto SecondMask = SecondElt->PermMask;
16043
16044 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
16045 unsigned FirstPlusFour = FirstMask | 0x04040404;
16046 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
16047 // original 0x0C.
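// The PERM node built below is (PERM FirstVal, SecondVal, mask); v_perm
// selectors 4..7 pick bytes from the first source and 0..3 from the second,
// so FirstElt's selectors are bumped by 4, and the 0x0c (zero) selectors are
// restored afterwards by or-ing FirstCs back in.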
16048 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
16049
16050 auto PermMask = addPermMasks(FirstMask, SecondMask);
16051 auto FirstVal =
16052 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
16053 auto SecondVal =
16054 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
16055
16056 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
16057 SecondVal,
16058 DAG.getConstant(PermMask, SL, MVT::i32)));
16059
16060 FirstElt = std::next(SecondElt);
16061 if (FirstElt == Srcs.end())
16062 break;
16063
16064 SecondElt = std::next(FirstElt);
16065 // If we only have a FirstElt, then just combine that into the cumulative
16066 // source node.
16067 if (SecondElt == Srcs.end()) {
16068 auto EltOp =
16069 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
16070
16071 Perms.push_back(
16072 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
16073 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
16074 break;
16075 }
16076 }
16077
16078 assert(Perms.size() == 1 || Perms.size() == 2);
16079 return Perms.size() == 2
16080 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
16081 : Perms[0];
16082}
16083
16084static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
16085 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
16086 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
16087 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
16088 EntryMask += ZeroMask;
16089 }
16090}
16091
16092static bool isMul(const SDValue Op) {
16093 auto Opcode = Op.getOpcode();
16094
16095 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
16096 Opcode == AMDGPUISD::MUL_I24);
16097}
16098
16099static std::optional<bool>
16100checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
16101 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
16102 const SDValue &S1Op, const SelectionDAG &DAG) {
16103 // If both ops are i8s (pre legalize-dag), then the signedness semantics
16104 // of the dot4 are irrelevant.
16105 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
16106 return false;
16107
16108 auto Known0 = DAG.computeKnownBits(S0Op, 0);
16109 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
16110 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
16111 auto Known1 = DAG.computeKnownBits(S1Op, 0);
16112 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
16113 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
16114
16115 assert(!(S0IsUnsigned && S0IsSigned));
16116 assert(!(S1IsUnsigned && S1IsSigned));
16117
16118 // There are 9 possible permutations of
16119 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
16120
16121 // In two permutations, the sign bits are known to be the same for both Ops,
16122 // so simply return Signed / Unsigned corresponding to the MSB
16123
16124 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
16125 return S0IsSigned;
16126
16127 // In another two permutations, the sign bits are known to be opposite. In
16128 // this case return std::nullopt to indicate a bad match.
16129
16130 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
16131 return std::nullopt;
16132
16133 // In the remaining five permutations, we don't know the value of the sign
16134 // bit for at least one Op. Since we have a valid ByteProvider, we know that
16135 // the upper bits must be extension bits. Thus, the only ways for the sign
16136 // bit to be unknown are if it was sign extended from an unknown value, or if
16137 // it was any-extended. In either case, it is correct to use the signed
16138 // version of the signedness semantics of dot4.
16139
16140 // In two such permutations, we know the sign bit is set for
16141 // one op, and the other is unknown. It is okay to use the signed version of
16142 // dot4.
16143 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
16144 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
16145 return true;
16146
16147 // In one such permutation, we don't know either of the sign bits. It is okay
16148 // to use the signed version of dot4.
16149 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
16150 return true;
16151
16152 // In two such permutations, we know the sign bit is unset for
16153 // one op, and the other is unknown. Return std::nullopt to indicate a
16154 // bad match.
16155 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
16156 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
16157 return std::nullopt;
16158
16159 llvm_unreachable("Fully covered condition");
16160}
16161
16162SDValue SITargetLowering::performAddCombine(SDNode *N,
16163 DAGCombinerInfo &DCI) const {
16164 SelectionDAG &DAG = DCI.DAG;
16165 EVT VT = N->getValueType(0);
16166 SDLoc SL(N);
16167 SDValue LHS = N->getOperand(0);
16168 SDValue RHS = N->getOperand(1);
16169
16170 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
16171 if (Subtarget->hasMad64_32()) {
16172 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16173 return Folded;
16174 }
16175 }
16176
16177 if (SDValue V = reassociateScalarOps(N, DAG)) {
16178 return V;
16179 }
16180
16181 if (VT == MVT::i64) {
16182 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16183 return Folded;
16184 }
16185
16186 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
16187 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
16188 SDValue TempNode(N, 0);
16189 std::optional<bool> IsSigned;
16190 SmallVector<DotSrc, 4> Src0s;
16191 SmallVector<DotSrc, 4> Src1s;
16192 SmallVector<SDValue, 4> Src2s;
16193
16194 // Match the v_dot4 tree, while collecting src nodes.
16195 int ChainLength = 0;
16196 for (int I = 0; I < 4; I++) {
16197 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
16198 if (MulIdx == -1)
16199 break;
16200 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
16201 if (!Src0)
16202 break;
16203 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
16204 if (!Src1)
16205 break;
16206
16207 auto IterIsSigned = checkDot4MulSignedness(
16208 TempNode->getOperand(MulIdx), *Src0, *Src1,
16209 TempNode->getOperand(MulIdx)->getOperand(0),
16210 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
16211 if (!IterIsSigned)
16212 break;
16213 if (!IsSigned)
16214 IsSigned = *IterIsSigned;
16215 if (*IterIsSigned != *IsSigned)
16216 break;
16217 placeSources(*Src0, *Src1, Src0s, Src1s, I);
16218 auto AddIdx = 1 - MulIdx;
16219 // Allow the special case where add (add (mul24, 0), mul24) has been folded
16220 // into add (mul24, mul24).
16221 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
16222 Src2s.push_back(TempNode->getOperand(AddIdx));
16223 auto Src0 =
16224 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
16225 if (!Src0)
16226 break;
16227 auto Src1 =
16228 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
16229 if (!Src1)
16230 break;
16231 auto IterIsSigned = checkDot4MulSignedness(
16232 TempNode->getOperand(AddIdx), *Src0, *Src1,
16233 TempNode->getOperand(AddIdx)->getOperand(0),
16234 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
16235 if (!IterIsSigned)
16236 break;
16237 assert(IsSigned);
16238 if (*IterIsSigned != *IsSigned)
16239 break;
16240 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
16241 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
16242 ChainLength = I + 2;
16243 break;
16244 }
16245
16246 TempNode = TempNode->getOperand(AddIdx);
16247 Src2s.push_back(TempNode);
16248 ChainLength = I + 1;
16249 if (TempNode->getNumOperands() < 2)
16250 break;
16251 LHS = TempNode->getOperand(0);
16252 RHS = TempNode->getOperand(1);
16253 }
16254
16255 if (ChainLength < 2)
16256 return SDValue();
16257
16258 // Masks were constructed with the assumption that we would find a chain
16259 // of length 4. If not, then we need to zero out the unused high bytes (via
16260 // a perm selector of 0x0c) so they do not affect the dot calculation.
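 // (Illustrative example: with a chain of length 2, fixMasks turns a mask
 // like 0x05040100 into 0x0c0c0504: the two used selectors are shifted down
 // and the two unused high bytes become 0x0c, i.e. constant zero.)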
16261 if (ChainLength < 4) {
16262 fixMasks(Src0s, ChainLength);
16263 fixMasks(Src1s, ChainLength);
16264 }
16265
16266 SDValue Src0, Src1;
16267
16268 // If we are just using a single source for both, and have permuted the
16269 // bytes consistently, we can just use the sources without permuting
16270 // (commutation).
16271 bool UseOriginalSrc = false;
16272 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
16273 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
16274 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
16275 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
16276 SmallVector<unsigned, 4> SrcBytes;
16277 auto Src0Mask = Src0s.begin()->PermMask;
16278 SrcBytes.push_back(Src0Mask & 0xFF000000);
16279 bool UniqueEntries = true;
16280 for (auto I = 1; I < 4; I++) {
16281 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
16282
16283 if (is_contained(SrcBytes, NextByte)) {
16284 UniqueEntries = false;
16285 break;
16286 }
16287 SrcBytes.push_back(NextByte);
16288 }
16289
16290 if (UniqueEntries) {
16291 UseOriginalSrc = true;
16292
16293 auto *FirstElt = Src0s.begin();
16294 auto FirstEltOp =
16295 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
16296
16297 auto *SecondElt = Src1s.begin();
16298 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
16299 SecondElt->DWordOffset);
16300
16301 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
16302 MVT::getIntegerVT(32));
16303 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
16304 MVT::getIntegerVT(32));
16305 }
16306 }
16307
16308 if (!UseOriginalSrc) {
16309 Src0 = resolveSources(DAG, SL, Src0s, false, true);
16310 Src1 = resolveSources(DAG, SL, Src1s, false, true);
16311 }
16312
16313 assert(IsSigned);
16314 SDValue Src2 =
16315 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
16316
16317 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
16318 : Intrinsic::amdgcn_udot4,
16319 SL, MVT::i64);
16320
16321 assert(!VT.isVector());
16322 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
16323 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
16324
16325 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
16326 }
16327
16328 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16329 return SDValue();
16330
16331 // add x, zext (setcc) => uaddo_carry x, 0, setcc
16332 // add x, sext (setcc) => usubo_carry x, 0, setcc
16333 unsigned Opc = LHS.getOpcode();
16334 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
16335 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
16336 std::swap(RHS, LHS);
16337
16338 Opc = RHS.getOpcode();
16339 switch (Opc) {
16340 default:
16341 break;
16342 case ISD::ZERO_EXTEND:
16343 case ISD::SIGN_EXTEND:
16344 case ISD::ANY_EXTEND: {
16345 auto Cond = RHS.getOperand(0);
16346 // If this won't be a real VOPC output, we would still need to insert an
16347 // extra instruction anyway.
16348 if (!isBoolSGPR(Cond))
16349 break;
16350 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16351 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16352 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
16353 return DAG.getNode(Opc, SL, VTList, Args);
16354 }
16355 case ISD::UADDO_CARRY: {
16356 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
16357 if (!isNullConstant(RHS.getOperand(1)))
16358 break;
16359 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
16360 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
16361 }
16362 }
16363 return SDValue();
16364}
16365
16366SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
16367 DAGCombinerInfo &DCI) const {
16368 SelectionDAG &DAG = DCI.DAG;
16369 SDLoc DL(N);
16370 EVT VT = N->getValueType(0);
16371 SDValue N0 = N->getOperand(0);
16372 SDValue N1 = N->getOperand(1);
16373
16374 // The following folds transform PTRADDs into regular arithmetic in cases
16375 // where the PTRADD wouldn't be folded as an immediate offset into memory
16376 // instructions anyway. They are target-specific in that other targets might
16377 // prefer to not lose information about the pointer arithmetic.
16378
16379 // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
16380 // Adapted from DAGCombiner::visitADDLikeCommutative.
16381 SDValue V, K;
16382 if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
16383 SDNodeFlags ShlFlags = N1->getFlags();
16384 // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
16385 // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
16386 // preserved.
16387 SDNodeFlags NewShlFlags =
16388 ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
16389 ? SDNodeFlags::NoSignedWrap
16390 : SDNodeFlags();
16391 SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
16392 DCI.AddToWorklist(Inner.getNode());
16393 return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
16394 }
16395
16396 // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
16397 // performAddCombine.
16398 if (N1.getOpcode() == ISD::MUL) {
16399 if (Subtarget->hasMad64_32()) {
16400 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16401 return Folded;
16402 }
16403 }
16404
16405 // If the 32 low bits of the constant are all zero, there is nothing to fold
16406 // into an immediate offset, so it's better to eliminate the unnecessary
16407 // addition for the lower 32 bits than to preserve the PTRADD.
16408 // Analogous to a fold in performAddCombine.
16409 if (VT == MVT::i64) {
16410 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16411 return Folded;
16412 }
16413
16414 if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
16415 return SDValue();
16416
16417 SDValue X = N0;
16418 SDValue Y = N1.getOperand(0);
16419 SDValue Z = N1.getOperand(1);
16420 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
16421 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
16422
16423 if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
16424 Y->isDivergent() != Z->isDivergent()) {
16425 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
16426 // y are uniform and z isn't.
16427 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
16428 // z are uniform and y isn't.
16429 // The goal is to push uniform operands up in the computation, so that they
16430 // can be handled with scalar operations. We can't use reassociateScalarOps
16431 // for this since it requires two identical commutative operations to
16432 // reassociate.
16433 if (Y->isDivergent())
16434 std::swap(Y, Z);
16435 // If both additions in the original were NUW, reassociation preserves that.
16436 SDNodeFlags ReassocFlags =
16437 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
16438 SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
16439 DCI.AddToWorklist(UniformInner.getNode());
16440 return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
16441 }
16442
16443 return SDValue();
16444}
16445
16446SDValue SITargetLowering::performSubCombine(SDNode *N,
16447 DAGCombinerInfo &DCI) const {
16448 SelectionDAG &DAG = DCI.DAG;
16449 EVT VT = N->getValueType(0);
16450
16451 if (VT == MVT::i64) {
16452 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16453 return Folded;
16454 }
16455
16456 if (VT != MVT::i32)
16457 return SDValue();
16458
16459 SDLoc SL(N);
16460 SDValue LHS = N->getOperand(0);
16461 SDValue RHS = N->getOperand(1);
16462
16463 // sub x, zext (setcc) => usubo_carry x, 0, setcc
16464 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
16465 unsigned Opc = RHS.getOpcode();
16466 switch (Opc) {
16467 default:
16468 break;
16469 case ISD::ZERO_EXTEND:
16470 case ISD::SIGN_EXTEND:
16471 case ISD::ANY_EXTEND: {
16472 auto Cond = RHS.getOperand(0);
16473 // If this won't be a real VOPC output, we would still need to insert an
16474 // extra instruction anyway.
16475 if (!isBoolSGPR(Cond))
16476 break;
16477 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16478 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16479 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
16480 return DAG.getNode(Opc, SL, VTList, Args);
16481 }
16482 }
16483
16484 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
16485 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
16486 if (!isNullConstant(LHS.getOperand(1)))
16487 return SDValue();
16488 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
16489 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
16490 }
16491 return SDValue();
16492}
16493
16494SDValue
16495SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
16496 DAGCombinerInfo &DCI) const {
16497
16498 if (N->getValueType(0) != MVT::i32)
16499 return SDValue();
16500
16501 if (!isNullConstant(N->getOperand(1)))
16502 return SDValue();
16503
16504 SelectionDAG &DAG = DCI.DAG;
16505 SDValue LHS = N->getOperand(0);
16506
16507 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
16508 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
16509 unsigned LHSOpc = LHS.getOpcode();
16510 unsigned Opc = N->getOpcode();
16511 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
16512 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
16513 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
16514 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
16515 }
16516 return SDValue();
16517}
16518
16519SDValue SITargetLowering::performFAddCombine(SDNode *N,
16520 DAGCombinerInfo &DCI) const {
16521 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16522 return SDValue();
16523
16524 SelectionDAG &DAG = DCI.DAG;
16525 EVT VT = N->getValueType(0);
16526
16527 SDLoc SL(N);
16528 SDValue LHS = N->getOperand(0);
16529 SDValue RHS = N->getOperand(1);
16530
16531 // These should really be instruction patterns, but writing patterns with
16532 // source modifiers is a pain.
16533
16534 // fadd (fadd (a, a), b) -> mad 2.0, a, b
16535 if (LHS.getOpcode() == ISD::FADD) {
16536 SDValue A = LHS.getOperand(0);
16537 if (A == LHS.getOperand(1)) {
16538 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16539 if (FusedOp != 0) {
16540 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16541 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
16542 }
16543 }
16544 }
16545
16546 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
16547 if (RHS.getOpcode() == ISD::FADD) {
16548 SDValue A = RHS.getOperand(0);
16549 if (A == RHS.getOperand(1)) {
16550 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16551 if (FusedOp != 0) {
16552 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16553 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
16554 }
16555 }
16556 }
16557
16558 return SDValue();
16559}
16560
16561SDValue SITargetLowering::performFSubCombine(SDNode *N,
16562 DAGCombinerInfo &DCI) const {
16563 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16564 return SDValue();
16565
16566 SelectionDAG &DAG = DCI.DAG;
16567 SDLoc SL(N);
16568 EVT VT = N->getValueType(0);
16569 assert(!VT.isVector());
16570
16571 // Try to get the fneg to fold into the source modifier. This undoes generic
16572 // DAG combines and folds them into the mad.
16573 //
16574 // Only do this if we are not trying to support denormals. v_mad_f32 does
16575 // not support denormals ever.
16576 SDValue LHS = N->getOperand(0);
16577 SDValue RHS = N->getOperand(1);
16578 if (LHS.getOpcode() == ISD::FADD) {
16579 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
16580 SDValue A = LHS.getOperand(0);
16581 if (A == LHS.getOperand(1)) {
16582 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16583 if (FusedOp != 0) {
16584 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16585 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
16586
16587 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
16588 }
16589 }
16590 }
16591
16592 if (RHS.getOpcode() == ISD::FADD) {
16593 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
16594
16595 SDValue A = RHS.getOperand(0);
16596 if (A == RHS.getOperand(1)) {
16597 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16598 if (FusedOp != 0) {
16599 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
16600 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
16601 }
16602 }
16603 }
16604
16605 return SDValue();
16606}
16607
16608SDValue SITargetLowering::performFDivCombine(SDNode *N,
16609 DAGCombinerInfo &DCI) const {
16610 SelectionDAG &DAG = DCI.DAG;
16611 SDLoc SL(N);
16612 EVT VT = N->getValueType(0);
16613
16614 // fsqrt legality correlates to rsq availability.
16615 if ((VT != MVT::f16 && VT != MVT::bf16) || !isOperationLegal(ISD::FSQRT, VT))
16616 return SDValue();
16617
16618 SDValue LHS = N->getOperand(0);
16619 SDValue RHS = N->getOperand(1);
16620
16621 SDNodeFlags Flags = N->getFlags();
16622 SDNodeFlags RHSFlags = RHS->getFlags();
16623 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
16624 !RHS->hasOneUse())
16625 return SDValue();
16626
16627 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
16628 bool IsNegative = false;
16629 if (CLHS->isExactlyValue(1.0) ||
16630 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16631 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
16632 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
16633 if (RHS.getOpcode() == ISD::FSQRT) {
16634 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
16635 SDValue Rsq =
16636 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
16637 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
16638 }
16639 }
16640 }
16641
16642 return SDValue();
16643}
16644
16645SDValue SITargetLowering::performFMulCombine(SDNode *N,
16646 DAGCombinerInfo &DCI) const {
16647 SelectionDAG &DAG = DCI.DAG;
16648 EVT VT = N->getValueType(0);
16649 EVT ScalarVT = VT.getScalarType();
16650 EVT IntVT = VT.changeElementType(*DAG.getContext(), MVT::i32);
16651
16652 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
16653 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16654 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
16655 return SDValue();
16656 }
16657
16658 SDValue LHS = N->getOperand(0);
16659 SDValue RHS = N->getOperand(1);
16660
16661 // It is cheaper to realize i32 inline constants than to materialize f16 or
16662 // f64 (or even non-inline f32) values; this is possible via ldexp, as shown
16663 // below:
16664 //
16665 // Given: A = 2^a and B = 2^b, where a and b are integers,
16666 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
16667 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
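 // (Illustrative example: fmul x, (select y, 8.0, 0.5)
 //   -> ldexp(x, (select i32 y, 3, -1)), since 8.0 = 2^3 and 0.5 = 2^-1.)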
16668 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16669 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
16670 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
16671 if (!TrueNode)
16672 return SDValue();
16673 const ConstantFPSDNode *FalseNode =
16674 isConstOrConstSplatFP(RHS.getOperand(2));
16675 if (!FalseNode)
16676 return SDValue();
16677
16678 if (TrueNode->isNegative() != FalseNode->isNegative())
16679 return SDValue();
16680
16681 // For f32, only non-inline constants should be transformed.
16682 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16683 if (ScalarVT == MVT::f32 &&
16684 TII->isInlineConstant(TrueNode->getValueAPF()) &&
16685 TII->isInlineConstant(FalseNode->getValueAPF()))
16686 return SDValue();
16687
16688 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
16689 if (TrueNodeExpVal == INT_MIN)
16690 return SDValue();
16691 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
16692 if (FalseNodeExpVal == INT_MIN)
16693 return SDValue();
16694
16695 SDLoc SL(N);
16696 SDValue SelectNode =
16697 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
16698 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
16699 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
16700
16701 LHS = TrueNode->isNegative()
16702 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
16703 : LHS;
16704
16705 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
16706 }
16707
16708 return SDValue();
16709}
16710
16711SDValue SITargetLowering::performFMACombine(SDNode *N,
16712 DAGCombinerInfo &DCI) const {
16713 SelectionDAG &DAG = DCI.DAG;
16714 EVT VT = N->getValueType(0);
16715 SDLoc SL(N);
16716
16717 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16718 return SDValue();
16719
16720 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
16721 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
16722 SDValue Op1 = N->getOperand(0);
16723 SDValue Op2 = N->getOperand(1);
16724 SDValue FMA = N->getOperand(2);
16725
16726 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
16727 Op2.getOpcode() != ISD::FP_EXTEND)
16728 return SDValue();
16729
16730 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
16731 // regardless of the denorm mode setting. Therefore,
16732 // fp-contract is sufficient to allow generating fdot2.
16733 const TargetOptions &Options = DAG.getTarget().Options;
16734 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
16735 (N->getFlags().hasAllowContract() &&
16736 FMA->getFlags().hasAllowContract())) {
16737 Op1 = Op1.getOperand(0);
16738 Op2 = Op2.getOperand(0);
16739 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16740 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16741 return SDValue();
16742
16743 SDValue Vec1 = Op1.getOperand(0);
16744 SDValue Idx1 = Op1.getOperand(1);
16745 SDValue Vec2 = Op2.getOperand(0);
16746
16747 SDValue FMAOp1 = FMA.getOperand(0);
16748 SDValue FMAOp2 = FMA.getOperand(1);
16749 SDValue FMAAcc = FMA.getOperand(2);
16750
16751 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
16752 FMAOp2.getOpcode() != ISD::FP_EXTEND)
16753 return SDValue();
16754
16755 FMAOp1 = FMAOp1.getOperand(0);
16756 FMAOp2 = FMAOp2.getOperand(0);
16757 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16758 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16759 return SDValue();
16760
16761 SDValue Vec3 = FMAOp1.getOperand(0);
16762 SDValue Vec4 = FMAOp2.getOperand(0);
16763 SDValue Idx2 = FMAOp1.getOperand(1);
16764
16765 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
16766 // Idx1 and Idx2 cannot be the same.
16767 Idx1 == Idx2)
16768 return SDValue();
16769
16770 if (Vec1 == Vec2 || Vec3 == Vec4)
16771 return SDValue();
16772
16773 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
16774 return SDValue();
16775
16776 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16777 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
16778 DAG.getTargetConstant(0, SL, MVT::i1));
16779 }
16780 }
16781 return SDValue();
16782}
16783
16784SDValue SITargetLowering::performSetCCCombine(SDNode *N,
16785 DAGCombinerInfo &DCI) const {
16786 SelectionDAG &DAG = DCI.DAG;
16787 SDLoc SL(N);
16788
16789 SDValue LHS = N->getOperand(0);
16790 SDValue RHS = N->getOperand(1);
16791 EVT VT = LHS.getValueType();
16792 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
16793
16794 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
16795 if (!CRHS) {
16796 CRHS = dyn_cast<ConstantSDNode>(LHS);
16797 if (CRHS) {
16798 std::swap(LHS, RHS);
16799 CC = getSetCCSwappedOperands(CC);
16800 }
16801 }
16802
16803 if (CRHS) {
16804 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
16805 isBoolSGPR(LHS.getOperand(0))) {
16806 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
16807 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
16808 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
16809 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
16810 if ((CRHS->isAllOnes() &&
16811 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
16812 (CRHS->isZero() &&
16813 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
16814 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16815 DAG.getAllOnesConstant(SL, MVT::i1));
16816 if ((CRHS->isAllOnes() &&
16817 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
16818 (CRHS->isZero() &&
16819 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
16820 return LHS.getOperand(0);
16821 }
16822
16823 const APInt &CRHSVal = CRHS->getAPIntValue();
16824 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
16825 LHS.getOpcode() == ISD::SELECT &&
16826 isa<ConstantSDNode>(LHS.getOperand(1)) &&
16827 isa<ConstantSDNode>(LHS.getOperand(2)) &&
16828 isBoolSGPR(LHS.getOperand(0))) {
16829 // Given CT != FT:
16830 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
16831 // setcc (select cc, CT, CF), CF, ne => cc
16832 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
16833 // setcc (select cc, CT, CF), CT, eq => cc
16834 const APInt &CT = LHS.getConstantOperandAPInt(1);
16835 const APInt &CF = LHS.getConstantOperandAPInt(2);
16836
16837 if (CT != CF) {
16838 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
16839 (CT == CRHSVal && CC == ISD::SETNE))
16840 return DAG.getNOT(SL, LHS.getOperand(0), MVT::i1);
16841 if ((CF == CRHSVal && CC == ISD::SETNE) ||
16842 (CT == CRHSVal && CC == ISD::SETEQ))
16843 return LHS.getOperand(0);
16844 }
16845 }
16846 }
16847
16848 // Eliminate setcc by using carryout from add/sub instruction
16849
16850 // LHS = ADD i64 RHS, Z LHSlo = UADDO i32 RHSlo, Zlo
16851 // setcc LHS ult RHS -> LHSHi = UADDO_CARRY i32 RHShi, Zhi
16852 // similarly for subtraction
16853
16854 // LHS = ADD i64 Y, 1 LHSlo = UADDO i32 Ylo, 1
16855 // setcc LHS eq 0 -> LHSHi = UADDO_CARRY i32 Yhi, 0
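 // In other words, the i64 compare is replaced by the carry-out of the
 // corresponding split 32-bit add/sub, and the original i64 result is
 // rebuilt from the two 32-bit halves below.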
16856
16857 if (VT == MVT::i64 && ((CC == ISD::SETULT &&
16858 sd_match(LHS, m_Add(m_Specific(RHS), m_Value()))) ||
16859 (CC == ISD::SETUGT &&
16860 sd_match(LHS, m_Sub(m_Specific(RHS), m_Value()))) ||
16861 (CC == ISD::SETEQ && CRHS && CRHS->isZero() &&
16862 sd_match(LHS, m_Add(m_Value(), m_One()))))) {
16863 bool IsAdd = LHS.getOpcode() == ISD::ADD;
16864
16865 SDValue Op0 = LHS.getOperand(0);
16866 SDValue Op1 = LHS.getOperand(1);
16867
16868 SDValue Op0Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op0);
16869 SDValue Op1Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op1);
16870
16871 SDValue Op0Hi = getHiHalf64(Op0, DAG);
16872 SDValue Op1Hi = getHiHalf64(Op1, DAG);
16873
16874 SDValue NodeLo =
16875 DAG.getNode(IsAdd ? ISD::UADDO : ISD::USUBO, SL,
16876 DAG.getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
16877
16878 SDValue CarryInHi = NodeLo.getValue(1);
16879 SDValue NodeHi = DAG.getNode(IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY,
16880 SL, DAG.getVTList(MVT::i32, MVT::i1),
16881 {Op0Hi, Op1Hi, CarryInHi});
16882
16883 SDValue ResultLo = NodeLo.getValue(0);
16884 SDValue ResultHi = NodeHi.getValue(0);
16885
16886 SDValue JoinedResult =
16887 DAG.getBuildVector(MVT::v2i32, SL, {ResultLo, ResultHi});
16888
16889 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, JoinedResult);
16890 SDValue Overflow = NodeHi.getValue(1);
16891 DCI.CombineTo(LHS.getNode(), Result);
16892 return Overflow;
16893 }
16894
16895 if (VT != MVT::f32 && VT != MVT::f64 &&
16896 (!Subtarget->has16BitInsts() || VT != MVT::f16))
16897 return SDValue();
16898
16899 // Match isinf/isfinite pattern
16900 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
16901 // (fcmp one (fabs x), inf) -> (fp_class x,
16902 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
16903 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
16904 LHS.getOpcode() == ISD::FABS) {
16905 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
16906 if (!CRHS)
16907 return SDValue();
16908
16909 const APFloat &APF = CRHS->getValueAPF();
16910 if (APF.isInfinity() && !APF.isNegative()) {
16911 const unsigned IsInfMask =
16912 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
16913 const unsigned IsFiniteMask =
16914 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
16915 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
16916 SIInstrFlags::P_SUBNORMAL;
16917 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
16918 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
16919 DAG.getConstant(Mask, SL, MVT::i32));
16920 }
16921 }
16922
16923 return SDValue();
16924}
16925
16926SDValue
16927SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
16928 DAGCombinerInfo &DCI) const {
16929 SelectionDAG &DAG = DCI.DAG;
16930 SDLoc SL(N);
16931 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
16932
16933 SDValue Src = N->getOperand(0);
16934 SDValue Shift = N->getOperand(0);
16935
16936 // TODO: Extend type shouldn't matter (assuming legal types).
16937 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
16938 Shift = Shift.getOperand(0);
16939
16940 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
16941 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
16942 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
16943 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
16944 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
16945 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
16946 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
16947 SDValue Shifted = DAG.getZExtOrTrunc(
16948 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
16949
16950 unsigned ShiftOffset = 8 * Offset;
16951 if (Shift.getOpcode() == ISD::SHL)
16952 ShiftOffset -= C->getZExtValue();
16953 else
16954 ShiftOffset += C->getZExtValue();
16955
16956 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16957 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
16958 MVT::f32, Shifted);
16959 }
16960 }
16961 }
16962
16963 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16964 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
16965 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
16966 // We simplified Src. If this node is not dead, visit it again so it is
16967 // folded properly.
16968 if (N->getOpcode() != ISD::DELETED_NODE)
16969 DCI.AddToWorklist(N);
16970 return SDValue(N, 0);
16971 }
16972
16973 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
16974 if (SDValue DemandedSrc =
16975 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
16976 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
16977
16978 return SDValue();
16979}
16980
16981SDValue SITargetLowering::performClampCombine(SDNode *N,
16982 DAGCombinerInfo &DCI) const {
16983 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
16984 if (!CSrc)
16985 return SDValue();
16986
16987 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16988 const APFloat &F = CSrc->getValueAPF();
16989 APFloat Zero = APFloat::getZero(F.getSemantics());
16990 if (F < Zero ||
16991 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16992 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
16993 }
16994
16995 APFloat One(F.getSemantics(), "1.0");
16996 if (F > One)
16997 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
16998
16999 return SDValue(CSrc, 0);
17000}
17001
17002SDValue SITargetLowering::performSelectCombine(SDNode *N,
17003 DAGCombinerInfo &DCI) const {
17004
17005 // Try to fold CMP + SELECT patterns with shared constants (both FP and
17006 // integer).
17007 // Detect when CMP and SELECT use the same constant and fold them to avoid
17008 // loading the constant twice. Specifically handles patterns like:
17009 // %cmp = icmp eq i32 %val, 4242
17010 // %sel = select i1 %cmp, i32 4242, i32 %other
17011 // It can be optimized to reuse %val instead of 4242 in select.
17012 SDValue Cond = N->getOperand(0);
17013 SDValue TrueVal = N->getOperand(1);
17014 SDValue FalseVal = N->getOperand(2);
17015
17016 // Check if condition is a comparison.
17017 if (Cond.getOpcode() != ISD::SETCC)
17018 return SDValue();
17019
17020 SDValue LHS = Cond.getOperand(0);
17021 SDValue RHS = Cond.getOperand(1);
17022 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
17023
17024 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
17025 bool isInteger = LHS.getValueType().isInteger();
17026
17027 // Handle simple floating-point and integer types only.
17028 if (!isFloatingPoint && !isInteger)
17029 return SDValue();
17030
17031 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
17032 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
17033 if (!isEquality && !isNonEquality)
17034 return SDValue();
17035
17036 SDValue ArgVal, ConstVal;
17037 if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
17038 (isInteger && isa<ConstantSDNode>(RHS))) {
17039 ConstVal = RHS;
17040 ArgVal = LHS;
17041 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
17042 (isInteger && isa<ConstantSDNode>(LHS))) {
17043 ConstVal = LHS;
17044 ArgVal = RHS;
17045 } else {
17046 return SDValue();
17047 }
17048
17049 // Skip optimization for inlinable immediates.
17050 if (isFloatingPoint) {
17051 const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
17052 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
17053 return SDValue();
17054 } else {
17055 if (AMDGPU::isInlinableIntLiteral(
17056 cast<ConstantSDNode>(ConstVal)->getSExtValue()))
17057 return SDValue();
17058 }
17059
17060 // For equality and non-equality comparisons, patterns:
17061 // select (setcc x, const), const, y -> select (setcc x, const), x, y
17062 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
17063 if (!(isEquality && TrueVal == ConstVal) &&
17064 !(isNonEquality && FalseVal == ConstVal))
17065 return SDValue();
17066
17067 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
17068 SDValue SelectRHS =
17069 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
17070 return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
17071 SelectLHS, SelectRHS);
17072}
17073
17074SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
17075 DAGCombinerInfo &DCI) const {
17076 switch (N->getOpcode()) {
17077 case ISD::ADD:
17078 case ISD::SUB:
17079 case ISD::SHL:
17080 case ISD::SRL:
17081 case ISD::SRA:
17082 case ISD::AND:
17083 case ISD::OR:
17084 case ISD::XOR:
17085 case ISD::MUL:
17086 case ISD::SETCC:
17087 case ISD::SELECT:
17088 case ISD::SMIN:
17089 case ISD::SMAX:
17090 case ISD::UMIN:
17091 case ISD::UMAX:
17092 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
17093 return Res;
17094 break;
17095 default:
17096 break;
17097 }
17098
17099 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
17100 return SDValue();
17101
17102 switch (N->getOpcode()) {
17103 case ISD::ADD:
17104 return performAddCombine(N, DCI);
17105 case ISD::PTRADD:
17106 return performPtrAddCombine(N, DCI);
17107 case ISD::SUB:
17108 return performSubCombine(N, DCI);
17109 case ISD::UADDO_CARRY:
17110 case ISD::USUBO_CARRY:
17111 return performAddCarrySubCarryCombine(N, DCI);
17112 case ISD::FADD:
17113 return performFAddCombine(N, DCI);
17114 case ISD::FSUB:
17115 return performFSubCombine(N, DCI);
17116 case ISD::FDIV:
17117 return performFDivCombine(N, DCI);
17118 case ISD::FMUL:
17119 return performFMulCombine(N, DCI);
17120 case ISD::SETCC:
17121 return performSetCCCombine(N, DCI);
17122 case ISD::SELECT:
17123 if (auto Res = performSelectCombine(N, DCI))
17124 return Res;
17125 break;
17126 case ISD::FMAXNUM:
17127 case ISD::FMINNUM:
17128 case ISD::FMAXNUM_IEEE:
17129 case ISD::FMINNUM_IEEE:
17130 case ISD::FMAXIMUM:
17131 case ISD::FMINIMUM:
17132 case ISD::FMAXIMUMNUM:
17133 case ISD::FMINIMUMNUM:
17134 case ISD::SMAX:
17135 case ISD::SMIN:
17136 case ISD::UMAX:
17137 case ISD::UMIN:
17138 case AMDGPUISD::FMIN_LEGACY:
17139 case AMDGPUISD::FMAX_LEGACY:
17140 return performMinMaxCombine(N, DCI);
17141 case ISD::FMA:
17142 return performFMACombine(N, DCI);
17143 case ISD::AND:
17144 return performAndCombine(N, DCI);
17145 case ISD::OR:
17146 return performOrCombine(N, DCI);
17147 case ISD::FSHR: {
17148 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17149 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
17150 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
17151 return matchPERM(N, DCI);
17152 }
17153 break;
17154 }
17155 case ISD::XOR:
17156 return performXorCombine(N, DCI);
17157 case ISD::ZERO_EXTEND:
17158 return performZeroExtendCombine(N, DCI);
17159 case ISD::SIGN_EXTEND_INREG:
17160 return performSignExtendInRegCombine(N, DCI);
17161 case AMDGPUISD::FP_CLASS:
17162 return performClassCombine(N, DCI);
17163 case ISD::FCANONICALIZE:
17164 return performFCanonicalizeCombine(N, DCI);
17165 case AMDGPUISD::RCP:
17166 return performRcpCombine(N, DCI);
17167 case ISD::FLDEXP:
17168 case AMDGPUISD::FRACT:
17169 case AMDGPUISD::RSQ:
17170 case AMDGPUISD::RCP_LEGACY:
17171 case AMDGPUISD::RCP_IFLAG:
17172 case AMDGPUISD::RSQ_CLAMP: {
17173 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
17174 SDValue Src = N->getOperand(0);
17175 if (Src.isUndef())
17176 return Src;
17177 break;
17178 }
17179 case ISD::SINT_TO_FP:
17180 case ISD::UINT_TO_FP:
17181 return performUCharToFloatCombine(N, DCI);
17182 case ISD::FCOPYSIGN:
17183 return performFCopySignCombine(N, DCI);
17184 case AMDGPUISD::CVT_F32_UBYTE0:
17185 case AMDGPUISD::CVT_F32_UBYTE1:
17186 case AMDGPUISD::CVT_F32_UBYTE2:
17187 case AMDGPUISD::CVT_F32_UBYTE3:
17188 return performCvtF32UByteNCombine(N, DCI);
17189 case AMDGPUISD::FMED3:
17190 return performFMed3Combine(N, DCI);
17191 case AMDGPUISD::CVT_PKRTZ_F16_F32:
17192 return performCvtPkRTZCombine(N, DCI);
17193 case AMDGPUISD::CLAMP:
17194 return performClampCombine(N, DCI);
17195 case ISD::SCALAR_TO_VECTOR: {
17196 SelectionDAG &DAG = DCI.DAG;
17197 EVT VT = N->getValueType(0);
17198
17199 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
17200 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
17201 SDLoc SL(N);
17202 SDValue Src = N->getOperand(0);
17203 EVT EltVT = Src.getValueType();
17204 if (EltVT != MVT::i16)
17205 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
17206
17207 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
17208 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
17209 }
17210
17211 break;
17212 }
17213 case ISD::EXTRACT_VECTOR_ELT:
17214 return performExtractVectorEltCombine(N, DCI);
17215 case ISD::INSERT_VECTOR_ELT:
17216 return performInsertVectorEltCombine(N, DCI);
17217 case ISD::FP_ROUND:
17218 return performFPRoundCombine(N, DCI);
17219 case ISD::LOAD: {
17220 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
17221 return Widened;
17222 [[fallthrough]];
17223 }
17224 default: {
17225 if (!DCI.isBeforeLegalize()) {
17226 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
17227 return performMemSDNodeCombine(MemNode, DCI);
17228 }
17229
17230 break;
17231 }
17232 }
17233
17234 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
17235}
17236
17237/// Helper function for adjustWritemask
17238static unsigned SubIdx2Lane(unsigned Idx) {
17239 switch (Idx) {
17240 default:
17241 return ~0u;
17242 case AMDGPU::sub0:
17243 return 0;
17244 case AMDGPU::sub1:
17245 return 1;
17246 case AMDGPU::sub2:
17247 return 2;
17248 case AMDGPU::sub3:
17249 return 3;
17250 case AMDGPU::sub4:
17251 return 4; // Possible with TFE/LWE
17252 }
17253}
17254
17255/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
17256SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
17257 SelectionDAG &DAG) const {
17258 unsigned Opcode = Node->getMachineOpcode();
17259
17260 // Subtract 1 because the vdata output is not a MachineSDNode operand.
17261 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
17262 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
17263 return Node; // not implemented for D16
17264
17265 SDNode *Users[5] = {nullptr};
17266 unsigned Lane = 0;
17267 unsigned DmaskIdx =
17268 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
17269 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
17270 unsigned NewDmask = 0;
17271 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
17272 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
17273 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
17274 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
17275 unsigned TFCLane = 0;
17276 bool HasChain = Node->getNumValues() > 1;
17277
17278 if (OldDmask == 0) {
17279 // These are folded out, but on the chance it happens don't assert.
17280 return Node;
17281 }
17282
17283 unsigned OldBitsSet = llvm::popcount(OldDmask);
17284 // Work out which is the TFE/LWE lane if that is enabled.
17285 if (UsesTFC) {
17286 TFCLane = OldBitsSet;
17287 }
17288
17289 // Try to figure out the used register components
17290 for (SDUse &Use : Node->uses()) {
17291
17292 // Don't look at users of the chain.
17293 if (Use.getResNo() != 0)
17294 continue;
17295
17296 SDNode *User = Use.getUser();
17297
17298 // Abort if we can't understand the usage
17299 if (!User->isMachineOpcode() ||
17300 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
17301 return Node;
17302
17303 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
17304 // Note that subregs are packed, i.e. Lane==0 is the first bit set
17305 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
17306 // set, etc.
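 // (Illustrative example: with OldDmask == 0b1010, Lane 0 (sub0) maps to
 // component 1 and Lane 1 (sub1) maps to component 3.)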
17307 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
17308 if (Lane == ~0u)
17309 return Node;
17310
17311 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
17312 if (UsesTFC && Lane == TFCLane) {
17313 Users[Lane] = User;
17314 } else {
17315 // Set which texture component corresponds to the lane.
17316 unsigned Comp;
17317 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17318 Comp = llvm::countr_zero(Dmask);
17319 Dmask &= ~(1 << Comp);
17320 }
17321
17322 // Abort if we have more than one user per component.
17323 if (Users[Lane])
17324 return Node;
17325
17326 Users[Lane] = User;
17327 NewDmask |= 1 << Comp;
17328 }
17329 }
17330
17331 // Don't allow 0 dmask, as hardware assumes one channel enabled.
17332 bool NoChannels = !NewDmask;
17333 if (NoChannels) {
17334 if (!UsesTFC) {
17335 // No uses of the result and not using TFC. Then do nothing.
17336 return Node;
17337 }
17338 // If the original dmask has one channel - then nothing to do
17339 if (OldBitsSet == 1)
17340 return Node;
17341 // Use an arbitrary dmask - required for the instruction to work
17342 NewDmask = 1;
17343 }
17344 // Abort if there's no change
17345 if (NewDmask == OldDmask)
17346 return Node;
17347
17348 unsigned BitsSet = llvm::popcount(NewDmask);
17349
17350 // Check for TFE or LWE - increase the number of channels by one to account
17351 // for the extra return value
17352 // This will need adjustment for D16 if this is also included in
17353 // adjustWriteMask (this function) but at present D16 are excluded.
17354 unsigned NewChannels = BitsSet + UsesTFC;
17355
17356 int NewOpcode =
17357 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
17358 assert(NewOpcode != -1 &&
17359 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17360 "failed to find equivalent MIMG op");
17361
17362 // Adjust the writemask in the node
17363 SmallVector<SDValue, 12> Ops;
17364 llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
17365 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
17366 llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));
17367
17368 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
17369
17370 MVT ResultVT = NewChannels == 1
17371 ? SVT
17372 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
17373 : NewChannels == 5 ? 8
17374 : NewChannels);
17375 SDVTList NewVTList =
17376 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
17377
17378 MachineSDNode *NewNode =
17379 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
17380
17381 if (HasChain) {
17382 // Update chain.
17383 DAG.setNodeMemRefs(NewNode, Node->memoperands());
17384 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
17385 }
17386
17387 if (NewChannels == 1) {
17388 assert(Node->hasNUsesOfValue(1, 0));
17389 SDNode *Copy =
17390 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
17391 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
17392 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
17393 return nullptr;
17394 }
17395
17396 // Update the users of the node with the new indices
17397 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17398 SDNode *User = Users[i];
17399 if (!User) {
17400 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
17401 // Users[0] is still nullptr because channel 0 doesn't really have a use.
17402 if (i || !NoChannels)
17403 continue;
17404 } else {
17405 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
17406 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
17407 if (NewUser != User) {
17408 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
17409 DAG.RemoveDeadNode(User);
17410 }
17411 }
17412
17413 switch (Idx) {
17414 default:
17415 break;
17416 case AMDGPU::sub0:
17417 Idx = AMDGPU::sub1;
17418 break;
17419 case AMDGPU::sub1:
17420 Idx = AMDGPU::sub2;
17421 break;
17422 case AMDGPU::sub2:
17423 Idx = AMDGPU::sub3;
17424 break;
17425 case AMDGPU::sub3:
17426 Idx = AMDGPU::sub4;
17427 break;
17428 }
17429 }
17430
17431 DAG.RemoveDeadNode(Node);
17432 return nullptr;
17433}
17434
17435static bool isFrameIndexOp(SDValue Op) {
17436 if (Op.getOpcode() == ISD::AssertZext)
17437 Op = Op.getOperand(0);
17438
17439 return isa<FrameIndexSDNode>(Op);
17440}
17441
17442/// Legalize target independent instructions (e.g. INSERT_SUBREG)
17443/// with frame index operands.
17444/// LLVM assumes that inputs to these instructions are registers.
17445SDNode *
17446SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
17447 SelectionDAG &DAG) const {
17448 if (Node->getOpcode() == ISD::CopyToReg) {
17449 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
17450 SDValue SrcVal = Node->getOperand(2);
17451
17452 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
17453 // to try understanding copies to physical registers.
17454 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
17455 SDLoc SL(Node);
17456 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
17457 SDValue VReg = DAG.getRegister(
17458 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17459
17460 SDNode *Glued = Node->getGluedNode();
17461 SDValue ToVReg = DAG.getCopyToReg(
17462 Node->getOperand(0), SL, VReg, SrcVal,
17463 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
17464 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
17465 VReg, ToVReg.getValue(1));
17466 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
17467 DAG.RemoveDeadNode(Node);
17468 return ToResultReg.getNode();
17469 }
17470 }
17471
17472 SmallVector<SDValue, 8> Ops;
17473 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
17474 if (!isFrameIndexOp(Node->getOperand(i))) {
17475 Ops.push_back(Node->getOperand(i));
17476 continue;
17477 }
17478
17479 SDLoc DL(Node);
17480 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
17481 Node->getOperand(i).getValueType(),
17482 Node->getOperand(i)),
17483 0));
17484 }
17485
17486 return DAG.UpdateNodeOperands(Node, Ops);
17487}
17488
17489/// Fold the instructions after selecting them.
17490/// Returns null if users were already updated.
17491SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
17492 SelectionDAG &DAG) const {
17493 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17494 unsigned Opcode = Node->getMachineOpcode();
17495
17496 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17497 !TII->isGather4(Opcode) &&
17498 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
17499 return adjustWritemask(Node, DAG);
17500 }
17501
17502 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17503 legalizeTargetIndependentNode(Node, DAG);
17504 return Node;
17505 }
17506
17507 switch (Opcode) {
17508 case AMDGPU::V_DIV_SCALE_F32_e64:
17509 case AMDGPU::V_DIV_SCALE_F64_e64: {
17510 // Satisfy the operand register constraint when one of the inputs is
17511 // undefined. Ordinarily each undef value will have its own implicit_def of
17512 // a vreg, so force these to use a single register.
17513 SDValue Src0 = Node->getOperand(1);
17514 SDValue Src1 = Node->getOperand(3);
17515 SDValue Src2 = Node->getOperand(5);
17516
17517 if ((Src0.isMachineOpcode() &&
17518 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
17519 (Src0 == Src1 || Src0 == Src2))
17520 break;
17521
17522 MVT VT = Src0.getValueType().getSimpleVT();
17523 const TargetRegisterClass *RC =
17524 getRegClassFor(VT, Src0.getNode()->isDivergent());
17525
17526 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
17527 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
17528
17529 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
17530 Src0, SDValue());
17531
17532 // src0 must be the same register as src1 or src2, even if the value is
17533 // undefined, so make sure we don't violate this constraint.
17534 if (Src0.isMachineOpcode() &&
17535 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
17536 if (Src1.isMachineOpcode() &&
17537 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17538 Src0 = Src1;
17539 else if (Src2.isMachineOpcode() &&
17540 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17541 Src0 = Src2;
17542 else {
17543 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
17544 Src0 = UndefReg;
17545 Src1 = UndefReg;
17546 }
17547 } else
17548 break;
17549
17550 SmallVector<SDValue, 9> Ops(Node->ops());
17551 Ops[1] = Src0;
17552 Ops[3] = Src1;
17553 Ops[5] = Src2;
17554 Ops.push_back(ImpDef.getValue(1));
17555 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
17556 }
17557 default:
17558 break;
17559 }
17560
17561 return Node;
17562}
17563
17564// Any MIMG instructions that use tfe or lwe require an initialization of the
17565// result register that will be written in the case of a memory access failure.
17566// The required code is also added to tie this init code to the result of the
17567// img instruction.
17568void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
17569 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17570 const SIRegisterInfo &TRI = TII->getRegisterInfo();
17571 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
17572 MachineBasicBlock &MBB = *MI.getParent();
17573
17574 int DstIdx =
17575 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
17576 unsigned InitIdx = 0;
17577
17578 if (TII->isImage(MI)) {
17579 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
17580 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
17581 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
17582
17583 if (!TFE && !LWE) // intersect_ray
17584 return;
17585
17586 unsigned TFEVal = TFE ? TFE->getImm() : 0;
17587 unsigned LWEVal = LWE ? LWE->getImm() : 0;
17588 unsigned D16Val = D16 ? D16->getImm() : 0;
17589
17590 if (!TFEVal && !LWEVal)
17591 return;
17592
17593 // At least one of TFE or LWE is non-zero
17594 // We have to insert a suitable initialization of the result value and
17595 // tie this to the dest of the image instruction.
17596
17597 // Calculate which dword we have to initialize to 0.
17598 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
17599
17600 // check that dmask operand is found.
17601 assert(MO_Dmask && "Expected dmask operand in instruction");
17602
17603 unsigned dmask = MO_Dmask->getImm();
17604 // Determine the number of active lanes taking into account the
17605 // Gather4 special case
17606 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
17607
17608 bool Packed = !Subtarget->hasUnpackedD16VMem();
17609
17610 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
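 // (Illustrative example: 4 active lanes need 4 + 1 = 5 dwords, or
 // (4 + 1) / 2 + 1 = 3 dwords when the D16 results are packed.)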
17611
17612 // Abandon attempt if the dst size isn't large enough
17613 // - this is in fact an error but this is picked up elsewhere and
17614 // reported correctly.
17615 const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
17616
17617 uint32_t DstSize = TRI.getRegSizeInBits(*DstRC) / 32;
17618 if (DstSize < InitIdx)
17619 return;
17620 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
17621 const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
17622 InitIdx = TRI.getRegSizeInBits(*DstRC) / 32;
17623 } else {
17624 return;
17625 }
17626
17627 const DebugLoc &DL = MI.getDebugLoc();
17628
17629 // Create a register for the initialization value.
17630 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
17631 unsigned NewDst = 0; // Final initialized value will be in here
17632
17633 // If PRTStrictNull feature is enabled (the default) then initialize
17634 // all the result registers to 0, otherwise just the error indication
17635 // register (VGPRn+1)
17636 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17637 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
17638
17639 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
17640 for (; SizeLeft; SizeLeft--, CurrIdx++) {
17641 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
17642 // Initialize dword
17643 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
17644 // clang-format off
17645 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
17646 .addImm(0);
17647 // clang-format on
17648 // Insert into the super-reg
17649 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
17650 .addReg(PrevDst)
17651 .addReg(SubReg)
17652 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
17653
17654 PrevDst = NewDst;
17655 }
17656
17657 // Add as an implicit operand
17658 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
17659
17660 // Tie the just added implicit operand to the dst
17661 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
17662}
17663
17664/// Assign the register class depending on the number of
17665/// bits set in the writemask
17666void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
17667 SDNode *Node) const {
17668 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17669
17670 MachineFunction *MF = MI.getMF();
17671 MachineRegisterInfo &MRI = MF->getRegInfo();
17672
17673 if (TII->isVOP3(MI.getOpcode())) {
17674 // Make sure constant bus requirements are respected.
17675 TII->legalizeOperandsVOP3(MRI, MI);
17676
17677 if (TII->isMAI(MI)) {
17678 // The ordinary src0, src1, src2 were legalized above.
17679 //
17680 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
17681 // as a separate instruction.
17682 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17683 AMDGPU::OpName::scale_src0);
17684 if (Src0Idx != -1) {
17685 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17686 AMDGPU::OpName::scale_src1);
17687 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
17688 TII->usesConstantBus(MRI, MI, Src1Idx))
17689 TII->legalizeOpWithMove(MI, Src1Idx);
17690 }
17691 }
17692
17693 return;
17694 }
17695
17696 if (TII->isImage(MI))
17697 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
17698}
17699
17700static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
17701 uint64_t Val) {
17702 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
17703 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
17704}
17705
17706MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
17707 const SDLoc &DL,
17708 SDValue Ptr) const {
17709 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17710
17711 // Build the half of the subregister with the constants before building the
17712 // full 128-bit register. If we are building multiple resource descriptors,
17713 // this will allow CSEing of the 2-component register.
17714 const SDValue Ops0[] = {
17715 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
17716 buildSMovImm32(DAG, DL, 0),
17717 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17718 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
17719 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
17720
17721 SDValue SubRegHi = SDValue(
17722 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
17723
17724 // Combine the constants and the pointer.
17725 const SDValue Ops1[] = {
17726 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
17727 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
17728 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
17729
17730 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
17731}
17732
17733/// Return a resource descriptor with the 'Add TID' bit enabled
17734/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
17735/// of the resource descriptor) to create an offset, which is added to
17736/// the resource pointer.
17737MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
17738 SDValue Ptr, uint32_t RsrcDword1,
17739 uint64_t RsrcDword2And3) const {
17740 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
17741 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
17742 if (RsrcDword1) {
17743 PtrHi =
17744 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
17745 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
17746 0);
17747 }
17748
17749 SDValue DataLo =
17750 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
17751 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
17752
17753 const SDValue Ops[] = {
17754 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
17755 PtrLo,
17756 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17757 PtrHi,
17758 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
17759 DataLo,
17760 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
17761 DataHi,
17762 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
17763
17764 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
17765}
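// Illustrative note (not from the original source): buildRSRC packs the four
// resource-descriptor dwords as
//   sub0 = PtrLo, sub1 = PtrHi | RsrcDword1,
//   sub2 = RsrcDword2And3 & 0xFFFFFFFF, sub3 = RsrcDword2And3 >> 32,
// matching the 'Add TID' / stride description in the comment above.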
17766
17767//===----------------------------------------------------------------------===//
17768// SI Inline Assembly Support
17769//===----------------------------------------------------------------------===//
17770
17771std::pair<unsigned, const TargetRegisterClass *>
17772SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
17773 StringRef Constraint,
17774 MVT VT) const {
17775 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
17776
17777 const TargetRegisterClass *RC = nullptr;
17778 if (Constraint.size() == 1) {
17779 // Check if we cannot determine the bit size of the given value type. This
17780 // can happen, for example, in a situation where we have an empty struct
17781 // (size 0): `call void asm "", "v"({} poison)`.
17782 if (VT == MVT::Other)
17783 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17784 const unsigned BitWidth = VT.getSizeInBits();
17785 switch (Constraint[0]) {
17786 default:
17787 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17788 case 's':
17789 case 'r':
17790 switch (BitWidth) {
17791 case 16:
17792 RC = &AMDGPU::SReg_32RegClass;
17793 break;
17794 case 64:
17795 RC = &AMDGPU::SGPR_64RegClass;
17796 break;
17797 default:
17798 RC = TRI->getSGPRClassForBitWidth(BitWidth);
17799 if (!RC)
17800 return std::pair(0U, nullptr);
17801 break;
17802 }
17803 break;
17804 case 'v':
17805 switch (BitWidth) {
17806 case 1:
17807 return std::pair(0U, nullptr);
17808 case 16:
17809 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17810 : &AMDGPU::VGPR_32_Lo256RegClass;
17811 break;
17812 default:
17813 RC = Subtarget->has1024AddressableVGPRs()
17814 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
17815 : TRI->getVGPRClassForBitWidth(BitWidth);
17816 if (!RC)
17817 return std::pair(0U, nullptr);
17818 break;
17819 }
17820 break;
17821 case 'a':
17822 if (!Subtarget->hasMAIInsts())
17823 break;
17824 switch (BitWidth) {
17825 case 1:
17826 return std::pair(0U, nullptr);
17827 case 16:
17828 RC = &AMDGPU::AGPR_32RegClass;
17829 break;
17830 default:
17831 RC = TRI->getAGPRClassForBitWidth(BitWidth);
17832 if (!RC)
17833 return std::pair(0U, nullptr);
17834 break;
17835 }
17836 break;
17837 }
17838 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
17839 const unsigned BitWidth = VT.getSizeInBits();
17840 switch (BitWidth) {
17841 case 16:
17842 RC = &AMDGPU::AV_32RegClass;
17843 break;
17844 default:
17845 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
17846 if (!RC)
17847 return std::pair(0U, nullptr);
17848 break;
17849 }
17850 }
17851
17852 // We actually support i128, i16 and f16 as inline parameters
17853 // even if they are not reported as legal
17854 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
17855 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
17856 return std::pair(0U, RC);
17857
17858 auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
17859 if (Kind != '\0') {
17860 if (Kind == 'v') {
17861 RC = &AMDGPU::VGPR_32_Lo256RegClass;
17862 } else if (Kind == 's') {
17863 RC = &AMDGPU::SGPR_32RegClass;
17864 } else if (Kind == 'a') {
17865 RC = &AMDGPU::AGPR_32RegClass;
17866 }
17867
17868 if (RC) {
17869 if (NumRegs > 1) {
17870 if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
17871 return std::pair(0U, nullptr);
17872
17873 uint32_t Width = NumRegs * 32;
17874 // Prohibit constraints for register ranges with a width that does not
17875 // match the required type.
17876 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
17877 return std::pair(0U, nullptr);
17878
17879 MCRegister Reg = RC->getRegister(Idx);
17880 if (SIRegisterInfo::isVGPRClass(RC))
17881 RC = TRI->getVGPRClassForBitWidth(Width);
17882 else if (SIRegisterInfo::isSGPRClass(RC))
17883 RC = TRI->getSGPRClassForBitWidth(Width);
17884 else if (SIRegisterInfo::isAGPRClass(RC))
17885 RC = TRI->getAGPRClassForBitWidth(Width);
17886 if (RC) {
17887 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17888 if (!Reg) {
17889 // The register class does not contain the requested register,
17890 // e.g., because it is an SGPR pair that would violate alignment
17891 // requirements.
17892 return std::pair(0U, nullptr);
17893 }
17894 return std::pair(Reg, RC);
17895 }
17896 }
17897
17898 // Check for lossy scalar/vector conversions.
17899 if (VT.isVector() && VT.getSizeInBits() != 32)
17900 return std::pair(0U, nullptr);
17901 if (Idx < RC->getNumRegs())
17902 return std::pair(RC->getRegister(Idx), RC);
17903 return std::pair(0U, nullptr);
17904 }
17905 }
17906
17907 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17908 if (Ret.first)
17909 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
17910
17911 return Ret;
17912}
17913
17914static bool isImmConstraint(StringRef Constraint) {
17915 if (Constraint.size() == 1) {
17916 switch (Constraint[0]) {
17917 default:
17918 break;
17919 case 'I':
17920 case 'J':
17921 case 'A':
17922 case 'B':
17923 case 'C':
17924 return true;
17925 }
17926 } else if (Constraint == "DA" || Constraint == "DB") {
17927 return true;
17928 }
17929 return false;
17930}
17931
17932SITargetLowering::ConstraintType
17933SITargetLowering::getConstraintType(StringRef Constraint) const {
17934 if (Constraint.size() == 1) {
17935 switch (Constraint[0]) {
17936 default:
17937 break;
17938 case 's':
17939 case 'v':
17940 case 'a':
17941 return C_RegisterClass;
17942 }
17943 } else if (Constraint.size() == 2) {
17944 if (Constraint == "VA")
17945 return C_RegisterClass;
17946 }
17947 if (isImmConstraint(Constraint)) {
17948 return C_Other;
17949 }
17950 return TargetLowering::getConstraintType(Constraint);
17951}
17952
17953static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
17955 Val = Val & maskTrailingOnes<uint64_t>(Size);
17956 }
17957 return Val;
17958}
17959
17960void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
17961 StringRef Constraint,
17962 std::vector<SDValue> &Ops,
17963 SelectionDAG &DAG) const {
17964 if (isImmConstraint(Constraint)) {
17965 uint64_t Val;
17966 if (getAsmOperandConstVal(Op, Val) &&
17967 checkAsmConstraintVal(Op, Constraint, Val)) {
17968 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
17969 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
17970 }
17971 } else {
17972 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
17974}
17975
17976bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
17977 unsigned Size = Op.getScalarValueSizeInBits();
17978 if (Size > 64)
17979 return false;
17980
17981 if (Size == 16 && !Subtarget->has16BitInsts())
17982 return false;
17983
17984 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
17985 Val = C->getSExtValue();
17986 return true;
17987 }
17988 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
17989 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17990 return true;
17991 }
17992 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
17993 if (Size != 16 || Op.getNumOperands() != 2)
17994 return false;
17995 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17996 return false;
17997 if (ConstantSDNode *C = V->getConstantSplatNode()) {
17998 Val = C->getSExtValue();
17999 return true;
18000 }
18001 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
18002 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
18003 return true;
18004 }
18005 }
18006
18007 return false;
18008}
18009
18010bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
18011 uint64_t Val) const {
18012 if (Constraint.size() == 1) {
18013 switch (Constraint[0]) {
18014 case 'I':
18015 return AMDGPU::isInlinableIntLiteral(Val);
18016 case 'J':
18017 return isInt<16>(Val);
18018 case 'A':
18019 return checkAsmConstraintValA(Op, Val);
18020 case 'B':
18021 return isInt<32>(Val);
18022 case 'C':
18023 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
18025 default:
18026 break;
18027 }
18028 } else if (Constraint.size() == 2) {
18029 if (Constraint == "DA") {
18030 int64_t HiBits = static_cast<int32_t>(Val >> 32);
18031 int64_t LoBits = static_cast<int32_t>(Val);
18032 return checkAsmConstraintValA(Op, HiBits, 32) &&
18033 checkAsmConstraintValA(Op, LoBits, 32);
18034 }
18035 if (Constraint == "DB") {
18036 return true;
18037 }
18038 }
18039 llvm_unreachable("Invalid asm constraint");
18040}
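// Worked example (illustrative, not part of the original source): for the
// "DA" constraint a 64-bit immediate is split into its two 32-bit halves,
// e.g. Val = 0x0000000100000040 gives HiBits = 1 and LoBits = 64, and each
// half must independently pass the 32-bit inline-constant check above.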
18041
18042bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
18043 unsigned MaxSize) const {
18044 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
18045 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
18046 if (Size == 16) {
18047 MVT VT = Op.getSimpleValueType();
18048 switch (VT.SimpleTy) {
18049 default:
18050 return false;
18051 case MVT::i16:
18052 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
18053 case MVT::f16:
18054 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
18055 case MVT::bf16:
18056 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
18057 case MVT::v2i16:
18058 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
18059 case MVT::v2f16:
18060 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
18061 case MVT::v2bf16:
18062 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
18063 }
18064 }
18065 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
18066 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
18067 return true;
18068 return false;
18069}
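// Example (illustrative, not in the original file): for a 16-bit f16 operand
// with the inv-2pi inline immediate available, Val = 0x3C00 (half 1.0) is
// accepted via isInlinableLiteralFP16, while an arbitrary bit pattern such as
// 0x1234 is rejected and must be materialized as a literal.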
18070
18071static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
18072 switch (UnalignedClassID) {
18073 case AMDGPU::VReg_64RegClassID:
18074 return AMDGPU::VReg_64_Align2RegClassID;
18075 case AMDGPU::VReg_96RegClassID:
18076 return AMDGPU::VReg_96_Align2RegClassID;
18077 case AMDGPU::VReg_128RegClassID:
18078 return AMDGPU::VReg_128_Align2RegClassID;
18079 case AMDGPU::VReg_160RegClassID:
18080 return AMDGPU::VReg_160_Align2RegClassID;
18081 case AMDGPU::VReg_192RegClassID:
18082 return AMDGPU::VReg_192_Align2RegClassID;
18083 case AMDGPU::VReg_224RegClassID:
18084 return AMDGPU::VReg_224_Align2RegClassID;
18085 case AMDGPU::VReg_256RegClassID:
18086 return AMDGPU::VReg_256_Align2RegClassID;
18087 case AMDGPU::VReg_288RegClassID:
18088 return AMDGPU::VReg_288_Align2RegClassID;
18089 case AMDGPU::VReg_320RegClassID:
18090 return AMDGPU::VReg_320_Align2RegClassID;
18091 case AMDGPU::VReg_352RegClassID:
18092 return AMDGPU::VReg_352_Align2RegClassID;
18093 case AMDGPU::VReg_384RegClassID:
18094 return AMDGPU::VReg_384_Align2RegClassID;
18095 case AMDGPU::VReg_512RegClassID:
18096 return AMDGPU::VReg_512_Align2RegClassID;
18097 case AMDGPU::VReg_1024RegClassID:
18098 return AMDGPU::VReg_1024_Align2RegClassID;
18099 case AMDGPU::AReg_64RegClassID:
18100 return AMDGPU::AReg_64_Align2RegClassID;
18101 case AMDGPU::AReg_96RegClassID:
18102 return AMDGPU::AReg_96_Align2RegClassID;
18103 case AMDGPU::AReg_128RegClassID:
18104 return AMDGPU::AReg_128_Align2RegClassID;
18105 case AMDGPU::AReg_160RegClassID:
18106 return AMDGPU::AReg_160_Align2RegClassID;
18107 case AMDGPU::AReg_192RegClassID:
18108 return AMDGPU::AReg_192_Align2RegClassID;
18109 case AMDGPU::AReg_256RegClassID:
18110 return AMDGPU::AReg_256_Align2RegClassID;
18111 case AMDGPU::AReg_512RegClassID:
18112 return AMDGPU::AReg_512_Align2RegClassID;
18113 case AMDGPU::AReg_1024RegClassID:
18114 return AMDGPU::AReg_1024_Align2RegClassID;
18115 default:
18116 return -1;
18117 }
18118}
18119
18120// Figure out which registers should be reserved for stack access. Only after
18121// the function is legalized do we know all of the non-spill stack objects or if
18122// calls are present.
18123void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
18124 MachineRegisterInfo &MRI = MF.getRegInfo();
18125 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
18126 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
18127 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18128 const SIInstrInfo *TII = ST.getInstrInfo();
18129
18130 if (Info->isEntryFunction()) {
18131 // Callable functions have fixed registers used for stack access.
18132 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
18133 }
18134
18135 // TODO: Move this logic to getReservedRegs()
18136 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
18137 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
18138 Register SReg = ST.isWave32()
18139 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
18140 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
18141 &AMDGPU::SGPR_64RegClass);
18142 Info->setSGPRForEXECCopy(SReg);
18143
18144 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
18145 Info->getStackPtrOffsetReg()));
18146 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
18147 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
18148
18149 // We need to worry about replacing the default register with itself in case
18150 // of MIR testcases missing the MFI.
18151 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
18152 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
18153
18154 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
18155 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
18156
18157 Info->limitOccupancy(MF);
18158
18159 if (ST.isWave32() && !MF.empty()) {
18160 for (auto &MBB : MF) {
18161 for (auto &MI : MBB) {
18162 TII->fixImplicitOperands(MI);
18163 }
18164 }
18165 }
18166
18167 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
18168 // classes if required. Ideally the register class constraints would differ
18169 // per-subtarget, but there's no easy way to achieve that right now. This is
18170 // not a problem for VGPRs because the correctly aligned VGPR class is implied
18171 // from using them as the register class for legal types.
18172 if (ST.needsAlignedVGPRs()) {
18173 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
18174 const Register Reg = Register::index2VirtReg(I);
18175 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
18176 if (!RC)
18177 continue;
18178 int NewClassID = getAlignedAGPRClassID(RC->getID());
18179 if (NewClassID != -1)
18180 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
18181 }
18182 }
18183
18184 TargetLoweringBase::finalizeLowering(MF);
18185}
18186
18187void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
18188 KnownBits &Known,
18189 const APInt &DemandedElts,
18190 const SelectionDAG &DAG,
18191 unsigned Depth) const {
18192 Known.resetAll();
18193 unsigned Opc = Op.getOpcode();
18194 switch (Opc) {
18195 case ISD::INTRINSIC_WO_CHAIN: {
18196 unsigned IID = Op.getConstantOperandVal(0);
18197 switch (IID) {
18198 case Intrinsic::amdgcn_mbcnt_lo:
18199 case Intrinsic::amdgcn_mbcnt_hi: {
18200 const GCNSubtarget &ST =
18201 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
18202 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18203 // most 31 + src1.
18204 Known.Zero.setBitsFrom(
18205 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
18206 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
18207 Known = KnownBits::add(Known, Known2);
18208 return;
18209 }
18210 }
18211 break;
18212 }
18213 }
18214 return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
18215 Op, Known, DemandedElts, DAG, Depth);
18216}
18217
18218void SITargetLowering::computeKnownBitsForFrameIndex(
18219 const int FI, KnownBits &Known, const MachineFunction &MF) const {
18220 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
18221
18222 // Set the high bits to zero based on the maximum allowed scratch size per
18223 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
18224 // calculation won't overflow, so assume the sign bit is never set.
18225 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
18226}
18227
18228static void knownBitsForWorkitemID(const GCNSubtarget &ST,
18229 GISelValueTracking &VT, KnownBits &Known,
18230 unsigned Dim) {
18231 unsigned MaxValue =
18232 ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim);
18233 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
18234}
18235
18236static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT,
18237 KnownBits &Known, const APInt &DemandedElts,
18238 unsigned BFEWidth, bool SExt, unsigned Depth) {
18239 const MachineRegisterInfo &MRI = VT.getMachineFunction().getRegInfo();
18240 const MachineOperand &Src1 = MI.getOperand(2);
18241
18242 unsigned Src1Cst = 0;
18243 if (Src1.isImm()) {
18244 Src1Cst = Src1.getImm();
18245 } else if (Src1.isReg()) {
18246 auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI);
18247 if (!Cst)
18248 return;
18249 Src1Cst = Cst->Value.getZExtValue();
18250 } else {
18251 return;
18252 }
18253
18254 // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
18255 // Width is always [22:16].
18256 const unsigned Offset =
18257 Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
18258 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
18259
18260 if (Width >= BFEWidth) // Ill-formed.
18261 return;
18262
18263 VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
18264 Depth + 1);
18265
18266 Known = Known.extractBits(Width, Offset);
18267
18268 if (SExt)
18269 Known = Known.sext(BFEWidth);
18270 else
18271 Known = Known.zext(BFEWidth);
18272}
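// Worked example (illustrative, not in the original source): for S_BFE_U32
// with Src1Cst = 0x00080004 the decode above gives Offset = 4 and Width = 8,
// so the known bits of the source are narrowed to bits [11:4] and then
// zero-extended back to 32 bits.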
18273
18274void SITargetLowering::computeKnownBitsForTargetInstr(
18275 GISelValueTracking &VT, Register R, KnownBits &Known,
18276 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
18277 unsigned Depth) const {
18278 Known.resetAll();
18279 const MachineInstr *MI = MRI.getVRegDef(R);
18280 switch (MI->getOpcode()) {
18281 case AMDGPU::S_BFE_I32:
18282 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18283 /*SExt=*/true, Depth);
18284 case AMDGPU::S_BFE_U32:
18285 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18286 /*SExt=*/false, Depth);
18287 case AMDGPU::S_BFE_I64:
18288 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18289 /*SExt=*/true, Depth);
18290 case AMDGPU::S_BFE_U64:
18291 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18292 /*SExt=*/false, Depth);
18293 case AMDGPU::G_INTRINSIC:
18294 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18295 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
18296 switch (IID) {
18297 case Intrinsic::amdgcn_workitem_id_x:
18298 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0);
18299 break;
18300 case Intrinsic::amdgcn_workitem_id_y:
18301 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1);
18302 break;
18303 case Intrinsic::amdgcn_workitem_id_z:
18304 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2);
18305 break;
18306 case Intrinsic::amdgcn_mbcnt_lo:
18307 case Intrinsic::amdgcn_mbcnt_hi: {
18308 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18309 // most 31 + src1.
18310 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
18311 ? getSubtarget()->getWavefrontSizeLog2()
18312 : 5);
18313 KnownBits Known2;
18314 VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
18315 Depth + 1);
18316 Known = KnownBits::add(Known, Known2);
18317 break;
18318 }
18319 case Intrinsic::amdgcn_groupstaticsize: {
18320 // We can report everything over the maximum size as 0. We can't report
18321 // based on the actual size because we don't know if it's accurate or not
18322 // at any given point.
18323 Known.Zero.setHighBits(
18324 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
18325 break;
18326 }
18327 }
18328 break;
18329 }
18330 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18331 Known.Zero.setHighBits(24);
18332 break;
18333 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18334 Known.Zero.setHighBits(16);
18335 break;
18336 case AMDGPU::G_AMDGPU_SMED3:
18337 case AMDGPU::G_AMDGPU_UMED3: {
18338 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18339
18340 KnownBits Known2;
18341 VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
18342 if (Known2.isUnknown())
18343 break;
18344
18345 KnownBits Known1;
18346 VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
18347 if (Known1.isUnknown())
18348 break;
18349
18350 KnownBits Known0;
18351 VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
18352 if (Known0.isUnknown())
18353 break;
18354
18355 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
18356 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
18357 Known.One = Known0.One & Known1.One & Known2.One;
18358 break;
18359 }
18360 }
18361}
18362
18363Align SITargetLowering::computeKnownAlignForTargetInstr(
18364 GISelValueTracking &VT, Register R, const MachineRegisterInfo &MRI,
18365 unsigned Depth) const {
18366 const MachineInstr *MI = MRI.getVRegDef(R);
18367 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
18368 // FIXME: Can this move to generic code? What about the case where the call
18369 // site specifies a lower alignment?
18370 Intrinsic::ID IID = GI->getIntrinsicID();
18372 AttributeList Attrs =
18373 Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
18374 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18375 return *RetAlign;
18376 }
18377 return Align(1);
18378}
18379
18380Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
18381 const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
18382 const Align CacheLineAlign = Align(64);
18383
18384 // Pre-GFX10 target did not benefit from loop alignment
18385 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
18386 getSubtarget()->hasInstFwdPrefetchBug())
18387 return PrefAlign;
18388
18389 // On GFX10 I$ is 4 x 64 bytes cache lines.
18390 // By default prefetcher keeps one cache line behind and reads two ahead.
18391 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
18392 // behind and one ahead.
18393 // Therefore we can benefit from aligning loop headers if the loop fits in 192 bytes.
18394 // If the loop fits in 64 bytes it always spans no more than two cache lines and
18395 // does not need an alignment.
18396 // Else if the loop is at most 128 bytes we do not need to modify the prefetch,
18397 // Else if the loop is at most 192 bytes we need two lines behind.
18398
18399 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
18400 const MachineBasicBlock *Header = ML->getHeader();
18401 if (Header->getAlignment() != PrefAlign)
18402 return Header->getAlignment(); // Already processed.
18403
18404 unsigned LoopSize = 0;
18405 for (const MachineBasicBlock *MBB : ML->blocks()) {
18406 // If an inner loop block is aligned, assume on average half of the alignment
18407 // size is added as nops.
18408 if (MBB != Header)
18409 LoopSize += MBB->getAlignment().value() / 2;
18410
18411 for (const MachineInstr &MI : *MBB) {
18412 LoopSize += TII->getInstSizeInBytes(MI);
18413 if (LoopSize > 192)
18414 return PrefAlign;
18415 }
18416 }
18417
18418 if (LoopSize <= 64)
18419 return PrefAlign;
18420
18421 if (LoopSize <= 128)
18422 return CacheLineAlign;
18423
18424 // If any of the parent loops is surrounded by prefetch instructions, do not
18425 // insert new ones for the inner loop, which would reset the parent's settings.
18426 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
18427 if (MachineBasicBlock *Exit = P->getExitBlock()) {
18428 auto I = Exit->getFirstNonDebugInstr();
18429 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18430 return CacheLineAlign;
18431 }
18432 }
18433
18434 MachineBasicBlock *Pre = ML->getLoopPreheader();
18435 MachineBasicBlock *Exit = ML->getExitBlock();
18436
18437 if (Pre && Exit) {
18438 auto PreTerm = Pre->getFirstTerminator();
18439 if (PreTerm == Pre->begin() ||
18440 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18441 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18442 .addImm(1); // prefetch 2 lines behind PC
18443
18444 auto ExitHead = Exit->getFirstNonDebugInstr();
18445 if (ExitHead == Exit->end() ||
18446 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18447 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18448 .addImm(2); // prefetch 1 line behind PC
18449 }
18450
18451 return CacheLineAlign;
18452}
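// Summary of the policy above (illustrative comment, not in the original
// file): loops of at most 64 bytes keep the default alignment, loops of at
// most 128 bytes are aligned to a 64-byte cache line, and larger loops up to
// 192 bytes are additionally bracketed with S_INST_PREFETCH to keep two lines
// behind / one ahead of the PC.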
18453
18454[[maybe_unused]]
18455static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
18456 assert(N->getOpcode() == ISD::CopyFromReg);
18457 do {
18458 // Follow the chain until we find an INLINEASM node.
18459 N = N->getOperand(0).getNode();
18460 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
18461 return true;
18462 } while (N->getOpcode() == ISD::CopyFromReg);
18463 return false;
18464}
18465
18466bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
18467 FunctionLoweringInfo *FLI,
18468 UniformityInfo *UA) const {
18469 switch (N->getOpcode()) {
18470 case ISD::CopyFromReg: {
18471 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
18472 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
18473 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18474 Register Reg = R->getReg();
18475
18476 // FIXME: Why does this need to consider isLiveIn?
18477 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
18478 return !TRI->isSGPRReg(MRI, Reg);
18479
18480 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
18481 return UA->isDivergent(V);
18482
18484 return !TRI->isSGPRReg(MRI, Reg);
18485 }
18486 case ISD::LOAD: {
18487 const LoadSDNode *L = cast<LoadSDNode>(N);
18488 unsigned AS = L->getAddressSpace();
18489 // A flat load may access private memory.
18490 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
18491 }
18492 case ISD::CALLSEQ_END:
18493 return true;
18494 case ISD::INTRINSIC_WO_CHAIN:
18495 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
18496 case ISD::INTRINSIC_W_CHAIN:
18497 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
18498 case AMDGPUISD::ATOMIC_CMP_SWAP:
18499 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
18500 case AMDGPUISD::BUFFER_ATOMIC_ADD:
18501 case AMDGPUISD::BUFFER_ATOMIC_SUB:
18502 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
18503 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
18504 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
18505 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
18506 case AMDGPUISD::BUFFER_ATOMIC_AND:
18507 case AMDGPUISD::BUFFER_ATOMIC_OR:
18508 case AMDGPUISD::BUFFER_ATOMIC_XOR:
18509 case AMDGPUISD::BUFFER_ATOMIC_INC:
18510 case AMDGPUISD::BUFFER_ATOMIC_DEC:
18511 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
18512 case AMDGPUISD::BUFFER_ATOMIC_FADD:
18513 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
18514 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
18515 // Target-specific read-modify-write atomics are sources of divergence.
18516 return true;
18517 default:
18518 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
18519 // Generic read-modify-write atomics are sources of divergence.
18520 return A->readMem() && A->writeMem();
18521 }
18522 return false;
18523 }
18524}
18525
18526bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
18527 EVT VT) const {
18528 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
18529 case MVT::f32:
18530 return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
18531 case MVT::f64:
18532 case MVT::f16:
18533 return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
18534 default:
18535 return false;
18536 }
18537}
18538
18539bool SITargetLowering::denormalsEnabledForType(
18540 LLT Ty, const MachineFunction &MF) const {
18541 switch (Ty.getScalarSizeInBits()) {
18542 case 32:
18543 return !denormalModeIsFlushAllF32(MF);
18544 case 64:
18545 case 16:
18546 return !denormalModeIsFlushAllF64F16(MF);
18547 default:
18548 return false;
18549 }
18550}
18551
18552bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
18553 const APInt &DemandedElts,
18554 const SelectionDAG &DAG,
18555 bool SNaN,
18556 unsigned Depth) const {
18557 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
18558 const MachineFunction &MF = DAG.getMachineFunction();
18559 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
18560
18561 if (Info->getMode().DX10Clamp)
18562 return true; // Clamped to 0.
18563 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
18564 }
18565
18566 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DemandedElts,
18567 DAG, SNaN, Depth);
18568}
18569
18570// On older subtargets, global FP atomic instructions have a hardcoded FP mode
18571// and do not support FP32 denormals, and only support v2f16/f64 denormals.
18573 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
18574 return true;
18575
18576 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
18577 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
18578 if (DenormMode == DenormalMode::getPreserveSign())
18579 return true;
18580
18581 // TODO: Remove this.
18582 return RMW->getFunction()
18583 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
18584 .getValueAsBool();
18585}
18586
18587static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
18588 LLVMContext &Ctx = RMW->getContext();
18589 StringRef MemScope =
18590 Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system");
18591
18592 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
18593 << "Hardware instruction generated for atomic "
18594 << RMW->getOperationName(RMW->getOperation())
18595 << " operation at memory scope " << MemScope;
18596}
18597
18598static bool isV2F16OrV2BF16(Type *Ty) {
18599 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
18600 Type *EltTy = VT->getElementType();
18601 return VT->getNumElements() == 2 &&
18602 (EltTy->isHalfTy() || EltTy->isBFloatTy());
18603 }
18604
18605 return false;
18606}
18607
18608static bool isV2F16(Type *Ty) {
18609 auto *VT = dyn_cast<FixedVectorType>(Ty);
18610 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
18611}
18612
18613static bool isV2BF16(Type *Ty) {
18614 auto *VT = dyn_cast<FixedVectorType>(Ty);
18615 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
18616}
18617
18618/// \return true if atomicrmw integer ops work for the type.
18619static bool isAtomicRMWLegalIntTy(Type *Ty) {
18620 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
18621 unsigned BW = IT->getBitWidth();
18622 return BW == 32 || BW == 64;
18623 }
18624
18625 return false;
18626}
18627
18628/// \return true if this atomicrmw xchg type can be selected.
18629static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
18630 Type *Ty = RMW->getType();
18631 if (isAtomicRMWLegalIntTy(Ty))
18632 return true;
18633
18634 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
18635 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
18636 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18637 return BW == 32 || BW == 64;
18638 }
18639
18640 if (Ty->isFloatTy() || Ty->isDoubleTy())
18641 return true;
18642
18643 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
18644 return VT->getNumElements() == 2 &&
18645 VT->getElementType()->getPrimitiveSizeInBits() == 16;
18646 }
18647
18648 return false;
18649}
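// Example (illustrative, not part of the original source): i32, i64,
// 32/64-bit pointers, float, double, and 2 x 16-bit vectors (e.g. <2 x half>)
// are all selectable xchg types here; i16 or <4 x half> would fall through
// and return false.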
18650
18651/// \returns true if it's valid to emit a native instruction for \p RMW, based
18652/// on the properties of the target memory.
18653static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
18654 const AtomicRMWInst *RMW,
18655 bool HasSystemScope) {
18656 // The remote/fine-grained access logic is different from the integer
18657 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
18658 // fine-grained access does not work, even for a device local allocation.
18659 //
18660 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
18661 // allocations work.
18662 if (HasSystemScope) {
18663 if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
18664 RMW->hasMetadata("amdgpu.no.remote.memory"))
18665 return true;
18666 if (Subtarget.hasEmulatedSystemScopeAtomics())
18667 return true;
18668 } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18669 return true;
18670
18671 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18672}
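// Usage note (illustrative, not in the original file): front-ends can opt
// into the native FP atomic path by attaching !"amdgpu.no.fine.grained.memory"
// (and, for system scope, !"amdgpu.no.remote.memory") metadata to the
// atomicrmw, asserting that the address never targets fine-grained or remote
// allocations.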
18673
18674/// \return Action to perform on AtomicRMWInsts for integer operations.
18675static TargetLowering::AtomicExpansionKind
18676atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
18677 return isAtomicRMWLegalIntTy(RMW->getType())
18678 ? TargetLowering::AtomicExpansionKind::None
18679 : TargetLowering::AtomicExpansionKind::CmpXChg;
18680}
18681
18682/// Return if a flat address space atomicrmw can access private memory.
18683static bool flatInstrMayAccessPrivate(const Instruction *I) {
18684 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18685 return !MD ||
18687}
18688
18696
18697TargetLowering::AtomicExpansionKind
18698SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
18699 unsigned AS = RMW->getPointerAddressSpace();
18700 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
18701 return AtomicExpansionKind::NotAtomic;
18702
18703 // 64-bit flat atomics that dynamically reside in private memory will silently
18704 // be dropped.
18705 //
18706 // Note that we will emit a new copy of the original atomic in the expansion,
18707 // which will be incrementally relegalized.
18708 const DataLayout &DL = RMW->getFunction()->getDataLayout();
18709 if (AS == AMDGPUAS::FLAT_ADDRESS &&
18710 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18711 flatInstrMayAccessPrivate(RMW))
18712 return AtomicExpansionKind::CustomExpand;
18713
18714 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
18715 OptimizationRemarkEmitter ORE(RMW->getFunction());
18716 ORE.emit([=]() {
18717 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
18718 });
18719 return Kind;
18720 };
18721
18722 auto SSID = RMW->getSyncScopeID();
18723 bool HasSystemScope =
18724 SSID == SyncScope::System ||
18725 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
18726
18727 auto Op = RMW->getOperation();
18728 switch (Op) {
18729 case AtomicRMWInst::Xchg:
18730 // PCIe supports add and xchg for system atomics.
18731 return isAtomicRMWLegalXChgTy(RMW)
18734 case AtomicRMWInst::Add:
18735 // PCIe supports add and xchg for system atomics.
18736 return atomicSupportedIfLegalIntType(RMW);
18737 case AtomicRMWInst::Sub:
18738 case AtomicRMWInst::And:
18739 case AtomicRMWInst::Or:
18740 case AtomicRMWInst::Xor:
18741 case AtomicRMWInst::Max:
18742 case AtomicRMWInst::Min:
18743 case AtomicRMWInst::UMax:
18744 case AtomicRMWInst::UMin:
18745 case AtomicRMWInst::UIncWrap:
18746 case AtomicRMWInst::UDecWrap:
18747 case AtomicRMWInst::USubCond:
18748 case AtomicRMWInst::USubSat: {
18749 if (Op == AtomicRMWInst::USubCond && !Subtarget->hasCondSubInsts())
18750 return AtomicExpansionKind::CmpXChg;
18751 if (Op == AtomicRMWInst::USubSat && !Subtarget->hasSubClampInsts())
18752 return AtomicExpansionKind::CmpXChg;
18753 if (Op == AtomicRMWInst::USubCond || Op == AtomicRMWInst::USubSat) {
18754 auto *IT = dyn_cast<IntegerType>(RMW->getType());
18755 if (!IT || IT->getBitWidth() != 32)
18756 return AtomicExpansionKind::CmpXChg;
18757 }
18758
18761 if (Subtarget->hasEmulatedSystemScopeAtomics())
18762 return atomicSupportedIfLegalIntType(RMW);
18763
18764 // On most subtargets, for atomicrmw operations other than add/xchg,
18765 // whether or not the instructions will behave correctly depends on where
18766 // the address physically resides and what interconnect is used in the
18767 // system configuration. On some targets the instruction will nop,
18768 // and in others synchronization will only occur at degraded device scope.
18769 //
18770 // If the allocation is known local to the device, the instructions should
18771 // work correctly.
18772 if (RMW->hasMetadata("amdgpu.no.remote.memory"))
18773 return atomicSupportedIfLegalIntType(RMW);
18774
18775 // If fine-grained remote memory works at device scope, we don't need to
18776 // do anything.
18777 if (!HasSystemScope &&
18778 Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18779 return atomicSupportedIfLegalIntType(RMW);
18780
18781 // If we are targeting a remote allocated address, it depends what kind of
18782 // allocation the address belongs to.
18783 //
18784 // If the allocation is fine-grained (in host memory, or in PCIe peer
18785 // device memory), the operation will fail depending on the target.
18786 //
18787 // Note fine-grained host memory access does work on APUs or if XGMI is
18788 // used, but we do not know if we are targeting an APU or the system
18789 // configuration from the ISA version/target-cpu.
18790 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
18791 return atomicSupportedIfLegalIntType(RMW);
18792
18793 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
18794 Op == AtomicRMWInst::Xor) {
18795 // Atomic sub/or/xor do not work over PCI express, but atomic add
18796 // does. InstCombine transforms these with 0 to or, so undo that.
18797 if (const Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
18798 ConstVal && ConstVal->isNullValue())
18799 return AtomicExpansionKind::Expand;
18800 }
18801
18802 // If the allocation could be in remote, fine-grained memory, the rmw
18803 // instructions may fail. cmpxchg should work, so emit that. On some
18804 // system configurations, PCIe atomics aren't supported so cmpxchg won't
18805 // even work, so you're out of luck anyway.
18806
18807 // In summary:
18808 //
18809 // Cases that may fail:
18810 // - fine-grained pinned host memory
18811 // - fine-grained migratable host memory
18812 // - fine-grained PCIe peer device
18813 //
18814 // Cases that should work, but may be treated overly conservatively.
18815 // - fine-grained host memory on an APU
18816 // - fine-grained XGMI peer device
18817 return AtomicExpansionKind::CmpXChg;
18818 }
18819
18821 }
18822 case AtomicRMWInst::FAdd: {
18823 Type *Ty = RMW->getType();
18824
18825 // TODO: Handle REGION_ADDRESS
18826 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18827 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
18828 // is fixed to round-to-nearest-even.
18829 //
18830 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
18831 // round-to-nearest-even.
18832 //
18833 // We ignore the rounding mode problem, even in strictfp. The C++ standard
18834 // suggests it is OK if the floating-point mode may not match the calling
18835 // thread.
18836 if (Ty->isFloatTy()) {
18837 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
18838 : AtomicExpansionKind::CmpXChg;
18839 }
18840
18841 if (Ty->isDoubleTy()) {
18842 // Ignores denormal mode, but we don't consider flushing mandatory.
18843 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
18844 : AtomicExpansionKind::CmpXChg;
18845 }
18846
18847 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18848 return AtomicExpansionKind::None;
18849
18850 return AtomicExpansionKind::CmpXChg;
18851 }
18852
18853 // LDS atomics respect the denormal mode from the mode register.
18854 //
18855 // Traditionally f32 global/buffer memory atomics would unconditionally
18856 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
18857 // flush.
18858 //
18859 // On targets with flat atomic fadd, denormals would flush depending on
18860 // whether the target address resides in LDS or global memory. We consider
18861 // this flat-maybe-flush as will-flush.
18862 if (Ty->isFloatTy() &&
18863 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18866
18867 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
18868 // safe. The message phrasing also should be better.
18869 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18870 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18871 // gfx942, gfx12
18872 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18873 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18874 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
18875 // gfx90a, gfx942, gfx12
18876 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18877 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18878
18879 // gfx942, gfx12
18880 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18881 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18882 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
18883 // gfx90a, gfx942, gfx12
18884 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18885 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18886
18887 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
18888 // buffer. gfx12 does have the buffer version.
18889 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18890 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18891 }
18892
18893 // global and flat atomic fadd f64: gfx90a, gfx942.
18894 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18895 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18896
18897 if (AS != AMDGPUAS::FLAT_ADDRESS) {
18898 if (Ty->isFloatTy()) {
18899 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
18900 // gfx11+.
18901 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18902 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18903 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
18904 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18905 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18906 } else {
18907 // gfx908
18908 if (RMW->use_empty() &&
18909 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18910 isV2F16(Ty))
18911 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18912 }
18913 }
18914
18915 // flat atomic fadd f32: gfx942, gfx11+.
18916 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
18917 if (Subtarget->hasFlatAtomicFaddF32Inst())
18918 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18919
18920 // If it is in flat address space, and the type is float, we will try to
18921 // expand it, if the target supports global and lds atomic fadd. The
18922 // reason we need that is, in the expansion, we emit the check of
18923 // address space. If it is in global address space, we emit the global
18924 // atomic fadd; if it is in shared address space, we emit the LDS atomic
18925 // fadd.
18926 if (Subtarget->hasLDSFPAtomicAddF32()) {
18927 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18928 return AtomicExpansionKind::Expand;
18929 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18930 return AtomicExpansionKind::Expand;
18931 }
18932 }
18933 }
18934
18935 return AtomicExpansionKind::CmpXChg;
18936 }
18937 case AtomicRMWInst::FMin:
18938 case AtomicRMWInst::FMax: {
18939 Type *Ty = RMW->getType();
18940
18941 // LDS float and double fmin/fmax were always supported.
18942 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18943 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
18944 : AtomicExpansionKind::CmpXChg;
18945 }
18946
18947 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18948 // For flat and global cases:
18949 // float, double in gfx7. Manual claims denormal support.
18950 // Removed in gfx8.
18951 // float, double restored in gfx10.
18952 // double removed again in gfx11, so only f32 for gfx11/gfx12.
18953 //
18954 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
18955 // no f32.
18956 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18957 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18958 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18959 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18960 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18961 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
18962 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
18963 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18964 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18965 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18966 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18967 }
18968 }
18969
18970 return AtomicExpansionKind::CmpXChg;
18971 }
18972 case AtomicRMWInst::Nand:
18973 case AtomicRMWInst::FSub:
18974 default:
18975 return AtomicExpansionKind::CmpXChg;
18976 }
18977
18978 llvm_unreachable("covered atomicrmw op switch");
18979}
18980
18987
18994
18995TargetLowering::AtomicExpansionKind
18996SITargetLowering::shouldExpandAtomicCmpXchgInIR(
18997 const AtomicCmpXchgInst *CmpX) const {
18998 unsigned AddrSpace = CmpX->getPointerAddressSpace();
18999 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
19001
19002 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
19004
19005 const DataLayout &DL = CmpX->getDataLayout();
19006
19007 Type *ValTy = CmpX->getNewValOperand()->getType();
19008
19009 // If a 64-bit flat atomic may alias private, we need to avoid using the
19010 // atomic in the private case.
19011 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
19013}
19014
19015const TargetRegisterClass *
19016SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
19017 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
19018 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
19019 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
19020 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
19021 : &AMDGPU::SReg_32RegClass;
19022 if (!TRI->isSGPRClass(RC) && !isDivergent)
19023 return TRI->getEquivalentSGPRClass(RC);
19024 if (TRI->isSGPRClass(RC) && isDivergent) {
19025 if (Subtarget->hasGFX90AInsts())
19026 return TRI->getEquivalentAVClass(RC);
19027 return TRI->getEquivalentVGPRClass(RC);
19028 }
19029
19030 return RC;
19031}
19032
19033// FIXME: This is a workaround for DivergenceAnalysis not understanding always
19034// uniform values (as produced by the mask results of control flow intrinsics)
19035// used outside of divergent blocks. The phi users need to also be treated as
19036// always uniform.
19037//
19038// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
19039static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
19040 unsigned WaveSize) {
19041 // FIXME: We assume we never cast the mask results of a control flow
19042 // intrinsic.
19043 // Early exit if the type won't be consistent as a compile time hack.
19044 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
19045 if (!IT || IT->getBitWidth() != WaveSize)
19046 return false;
19047
19048 if (!isa<Instruction>(V))
19049 return false;
19050 if (!Visited.insert(V).second)
19051 return false;
19052 bool Result = false;
19053 for (const auto *U : V->users()) {
19054 if (const auto *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
19055 if (V == U->getOperand(1)) {
19056 switch (Intrinsic->getIntrinsicID()) {
19057 default:
19058 Result = false;
19059 break;
19060 case Intrinsic::amdgcn_if_break:
19061 case Intrinsic::amdgcn_if:
19062 case Intrinsic::amdgcn_else:
19063 Result = true;
19064 break;
19065 }
19066 }
19067 if (V == U->getOperand(0)) {
19068 switch (Intrinsic->getIntrinsicID()) {
19069 default:
19070 Result = false;
19071 break;
19072 case Intrinsic::amdgcn_end_cf:
19073 case Intrinsic::amdgcn_loop:
19074 Result = true;
19075 break;
19076 }
19077 }
19078 } else {
19079 Result = hasCFUser(U, Visited, WaveSize);
19080 }
19081 if (Result)
19082 break;
19083 }
19084 return Result;
19085}
19086
19087bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
19088 const Value *V) const {
19089 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
19090 if (CI->isInlineAsm()) {
19091 // FIXME: This cannot give a correct answer. This should only trigger in
19092 // the case where inline asm returns mixed SGPR and VGPR results, used
19093 // outside the defining block. We don't have a specific result to
19094 // consider, so this assumes if any value is SGPR, the overall register
19095 // also needs to be SGPR.
19096 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
19097 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
19098 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
19099 for (auto &TC : TargetConstraints) {
19100 if (TC.Type == InlineAsm::isOutput) {
19101 ComputeConstraintToUse(TC, SDValue());
19102 const TargetRegisterClass *RC =
19103 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
19104 TC.ConstraintVT)
19105 .second;
19106 if (RC && SIRI->isSGPRClass(RC))
19107 return true;
19108 }
19109 }
19110 }
19111 }
19112 SmallPtrSet<const Value *, 16> Visited;
19113 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
19114}
19115
19116bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
19117 for (SDUse &Use : N->uses()) {
19118 if (MemSDNode *M = dyn_cast<MemSDNode>(Use.getUser())) {
19119 if (getBasePtrIndex(M) == Use.getOperandNo())
19120 return true;
19121 }
19122 }
19123 return false;
19124}
19125
19126bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
19127 SDValue N1) const {
19128 if (!N0.hasOneUse())
19129 return false;
19130 // Take care of the opportunity to keep N0 uniform
19131 if (N0->isDivergent() || !N1->isDivergent())
19132 return true;
19133 // Check if we have a good chance to form the memory access pattern with the
19134 // base and offset
19135 return (DAG.isBaseWithConstantOffset(N0) &&
19136 hasMemSDNodeUser(*N0->user_begin()));
19137}
19138
19139bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
19140 Register N0, Register N1) const {
19141 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
19142}
19143
19144MachineMemOperand::Flags
19145SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
19146 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
19147 MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
19148 if (I.getMetadata("amdgpu.noclobber"))
19149 Flags |= MONoClobber;
19150 if (I.getMetadata("amdgpu.last.use"))
19151 Flags |= MOLastUse;
19152 return Flags;
19153}
19154
19155void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
19156 Instruction *AI) const {
19157 // Given: atomicrmw fadd ptr %addr, float %val ordering
19158 //
19159 // With this expansion we produce the following code:
19160 // [...]
19161 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
19162 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
19163 //
19164 // atomicrmw.shared:
19165 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
19166 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
19167 // float %val ordering
19168 // br label %atomicrmw.phi
19169 //
19170 // atomicrmw.check.private:
19171 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
19172 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
19173 //
19174 // atomicrmw.private:
19175 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
19176 // %loaded.private = load float, ptr addrspace(5) %cast.private
19177 // %val.new = fadd float %loaded.private, %val
19178 // store float %val.new, ptr addrspace(5) %cast.private
19179 // br label %atomicrmw.phi
19180 //
19181 // atomicrmw.global:
19182 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
19183 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
19184 // float %val ordering
19185 // br label %atomicrmw.phi
19186 //
19187 // atomicrmw.phi:
19188 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
19189 // [ %loaded.private, %atomicrmw.private ],
19190 // [ %loaded.global, %atomicrmw.global ]
19191 // br label %atomicrmw.end
19192 //
19193 // atomicrmw.end:
19194 // [...]
19195 //
19196 //
19197 // For 64-bit atomics which may reside in private memory, we perform a simpler
19198 // version that only inserts the private check, and uses the flat operation.
19199
19200 IRBuilder<> Builder(AI);
19201 LLVMContext &Ctx = Builder.getContext();
19202
19203 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
19204 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
19205 : AtomicCmpXchgInst::getPointerOperandIndex();
19206 Value *Addr = AI->getOperand(PtrOpIdx);
19207
19208 /// TODO: Only need to check private, then emit flat-known-not private (no
19209 /// need for shared block, or cast to global).
19210 auto *CX = dyn_cast<AtomicCmpXchgInst>(AI);
19211
19212 Align Alignment;
19213 if (RMW)
19214 Alignment = RMW->getAlign();
19215 else if (CX)
19216 Alignment = CX->getAlign();
19217 else
19218 llvm_unreachable("unhandled atomic operation");
19219
19220 // FullFlatEmulation is true if we need to issue the private, shared, and
19221 // global cases.
19222 //
19223 // If this is false, we are only dealing with the flat-targeting-private case,
19224 // where we only insert a check for private and still use the flat instruction
19225 // for global and shared.
19226
19227 bool FullFlatEmulation =
19228 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
19229 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
19230 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
19231 RMW->getType()->isDoubleTy()));
19232
19233 // If the return value isn't used, do not introduce a false use in the phi.
19234 bool ReturnValueIsUsed = !AI->use_empty();
19235
19236 BasicBlock *BB = Builder.GetInsertBlock();
19237 Function *F = BB->getParent();
19238 BasicBlock *ExitBB =
19239 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
19240 BasicBlock *SharedBB = nullptr;
19241
19242 BasicBlock *CheckPrivateBB = BB;
19243 if (FullFlatEmulation) {
19244 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
19245 CheckPrivateBB =
19246 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
19247 }
19248
19249 BasicBlock *PrivateBB =
19250 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
19251 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
19252 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
19253
19254 std::prev(BB->end())->eraseFromParent();
19255 Builder.SetInsertPoint(BB);
19256
19257 Value *LoadedShared = nullptr;
19258 if (FullFlatEmulation) {
19259 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
19260 {Addr}, nullptr, "is.shared");
19261 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
19262 Builder.SetInsertPoint(SharedBB);
19263 Value *CastToLocal = Builder.CreateAddrSpaceCast(
19264 Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
19265
19266 Instruction *Clone = AI->clone();
19267 Clone->insertInto(SharedBB, SharedBB->end());
19268 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
19269 LoadedShared = Clone;
19270
19271 Builder.CreateBr(PhiBB);
19272 Builder.SetInsertPoint(CheckPrivateBB);
19273 }
19274
19275 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19276 {Addr}, nullptr, "is.private");
19277 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19278
19279 Builder.SetInsertPoint(PrivateBB);
19280
19281 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19282 Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
19283
19284 Value *LoadedPrivate;
19285 if (RMW) {
19286 LoadedPrivate = Builder.CreateAlignedLoad(
19287 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
19288
19289 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
19290 LoadedPrivate, RMW->getValOperand());
19291
19292 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19293 } else {
19294 auto [ResultLoad, Equal] =
19295 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
19296 CX->getNewValOperand(), CX->getAlign());
19297
19298 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
19299 ResultLoad, 0);
19300 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19301 }
19302
19303 Builder.CreateBr(PhiBB);
19304
19305 Builder.SetInsertPoint(GlobalBB);
19306
19307 // Continue using a flat instruction if we only emitted the check for private.
19308 Instruction *LoadedGlobal = AI;
19309 if (FullFlatEmulation) {
19310 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19311 Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
19312 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
19313 }
19314
19315 AI->removeFromParent();
19316 AI->insertInto(GlobalBB, GlobalBB->end());
19317
19318 // The new atomicrmw may go through another round of legalization later.
19319 if (!FullFlatEmulation) {
19320 // We inserted the runtime check already, make sure we do not try to
19321 // re-expand this.
19322 // TODO: Should union with any existing metadata.
19323 MDBuilder MDB(F->getContext());
19324 MDNode *RangeNotPrivate =
19325 MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
19326 APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
19327 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
19328 RangeNotPrivate);
19329 }
19330
19331 Builder.CreateBr(PhiBB);
19332
19333 Builder.SetInsertPoint(PhiBB);
19334
19335 if (ReturnValueIsUsed) {
19336 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
19337 AI->replaceAllUsesWith(Loaded);
19338 if (FullFlatEmulation)
19339 Loaded->addIncoming(LoadedShared, SharedBB);
19340 Loaded->addIncoming(LoadedPrivate, PrivateBB);
19341 Loaded->addIncoming(LoadedGlobal, GlobalBB);
19342 Loaded->takeName(AI);
19343 }
19344
19345 Builder.CreateBr(ExitBB);
19346}
19347
19348static void convertScratchAtomicToFlatAtomic(Instruction *I,
19349 unsigned PtrOpIdx) {
19350 Value *PtrOp = I->getOperand(PtrOpIdx);
19353
19354 Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
19355 Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
19356 I->getIterator());
19357 I->setOperand(PtrOpIdx, ASCast);
19358}
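// Example (illustrative, not part of the original source): an atomic load
// from ptr addrspace(5) is rewritten so its pointer operand becomes
// 'addrspacecast ptr addrspace(5) %p to ptr', letting the later flat atomic
// path handle what started as a scratch access.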
19359
19362
19365
19368 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
19369 ConstVal && ConstVal->isNullValue()) {
19370 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
19371 AI->setOperation(AtomicRMWInst::Add);
19372
19373 // We may still need the private-alias-flat handling below.
19374
19375 // TODO: Skip this for cases where we cannot access remote memory.
19376 }
19377 }
19378
19379 // The non-flat expansions should only perform the de-canonicalization of
19380 // identity values.
19381 if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
19382 return;
19383
19384 emitExpandAtomicAddrSpacePredicate(AI);
19385}
19386
19393
19397
19398 llvm_unreachable(
19399 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19400}
19401
19402void SITargetLowering::emitExpandAtomicStore(StoreInst *SI) const {
19403 if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19404 return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
19405
19406 llvm_unreachable(
19407 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19408}
19409
19410LoadInst *
19411SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
19412 IRBuilder<> Builder(AI);
19413 auto Order = AI->getOrdering();
19414
19415 // The optimization removes the store aspect of the atomicrmw. Therefore, the
19416 // cache must be flushed if the atomic ordering had release semantics. This is
19417 // not necessarily a fence; a release fence just happens to do that flush.
19418 // Avoid replacing an atomicrmw that has release semantics.
19419 if (isReleaseOrStronger(Order))
19420 return nullptr;
19421
19422 LoadInst *LI = Builder.CreateAlignedLoad(
19423 AI->getType(), AI->getPointerOperand(), AI->getAlign());
19424 LI->setAtomic(Order, AI->getSyncScopeID());
19425 LI->copyMetadata(*AI);
19426 LI->takeName(AI);
19427 AI->replaceAllUsesWith(LI);
19428 AI->eraseFromParent();
19429 return LI;
19430}
static bool isMul(MachineInstr *MI)
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
return SDValue()
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
constexpr LLT S32
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
dxil translate DXIL Translate Metadata
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
#define P(N)
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:39
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:57
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:51
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:45
#define FP_DENORM_FLUSH_NONE
Definition SIDefines.h:1258
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition SIDefines.h:1255
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static void getCoopAtomicOperandsInfo(const CallBase &CI, bool IsLoad, TargetLoweringBase::IntrinsicInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return whether a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool isCopyFromRegOfInlineAsm(const SDNode *N)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isFloatingPointWaveReduceOperation(unsigned Opc)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:487
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
The Input class is used to parse a yaml document into in-memory structs and vectors.
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, ...
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1183
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:5975
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1560
bool isNegative() const
Definition APFloat.h:1512
bool isNormal() const
Definition APFloat.h:1516
APInt bitcastToAPInt() const
Definition APFloat.h:1416
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1201
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1161
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1142
bool isInfinity() const
Definition APFloat.h:1509
Class for arbitrary precision integers.
Definition APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1400
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1394
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:467
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1648
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
Definition APInt.h:367
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1238
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1222
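As a companion to the APInt entries above, a minimal sketch of the bit-manipulation helpers; it is not from this file and apIntDemo is a hypothetical name.
#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;

// Hypothetical demo of the APInt bit-manipulation helpers listed above.
static void apIntDemo() {
  APInt Mask = APInt::getBitsSet(/*numBits=*/32, /*loBit=*/8, /*hiBit=*/16);
  assert(Mask.countr_zero() == 8); // bits [8, 16) are set
  APInt V(/*numBits=*/32, /*val=*/0);
  V.setBitsFrom(24);                         // set bits [24, 32)
  assert(!V.isZero() && !V.isOneBitSet(24)); // several bits set, not just one
  APInt Hi = APInt::getHighBitsSet(/*numBits=*/64, /*hiBitsSet=*/16);
  assert(Hi.countr_zero() == 48);
}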
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition Function.cpp:339
const Function * getParent() const
Definition Argument.h:44
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
An instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents a cmpxchg atomic operation, false otherwise.
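A minimal sketch of inspecting an AtomicRMWInst with the accessors above; it is not from this file, and isRemoteScopeFPMinMax is a hypothetical predicate, not the check this lowering performs.
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

// Hypothetical predicate: a system-scope floating-point min/max RMW on a
// non-default address space.
static bool isRemoteScopeFPMinMax(const AtomicRMWInst *RMW) {
  switch (RMW->getOperation()) {
  case AtomicRMWInst::FMin:
  case AtomicRMWInst::FMax:
    return RMW->getPointerAddressSpace() != 0 &&
           RMW->getSyncScopeID() == SyncScope::System;
  default:
    return false;
  }
}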
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:361
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:483
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
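A minimal sketch of the usual CCState/CCValAssign flow behind the entries above; it is not from this file, analyzeArgs and its parameters are placeholders, and Fn stands for whichever CCAssignFn the calling convention selects.
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
using namespace llvm;

// Hypothetical walk over formal-argument locations.
static void analyzeArgs(MachineFunction &MF, LLVMContext &Ctx,
                        CallingConv::ID CC, bool IsVarArg,
                        const SmallVectorImpl<ISD::InputArg> &Ins,
                        CCAssignFn *Fn) {
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, IsVarArg, MF, ArgLocs, Ctx);
  CCInfo.AnalyzeFormalArguments(Ins, Fn);
  for (const CCValAssign &VA : ArgLocs) {
    if (VA.isRegLoc())
      (void)VA.getLocReg();       // argument passed in a register
    else
      (void)VA.getLocMemOffset(); // argument passed on the stack at this offset
  }
}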
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_NE
not equal
Definition InstrTypes.h:698
bool isSigned() const
Definition InstrTypes.h:930
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:770
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:219
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
bool isBigEndian() const
Definition DataLayout.h:215
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
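A minimal sketch of the two DataLayout queries above; it is not from this file, and the helper name and the v3f32 type choice are illustrative only.
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

// Hypothetical helper contrasting ABI alignment with allocation size.
static void dataLayoutDemo(const DataLayout &DL, LLVMContext &Ctx) {
  Type *V3F32 = FixedVectorType::get(Type::getFloatTy(Ctx), 3);
  Align ABIAlign = DL.getABITypeAlign(V3F32); // minimum ABI-required alignment
  TypeSize Size = DL.getTypeAllocSize(V3F32); // object-to-object stride, with padding
  (void)ABIAlign;
  (void)Size;
}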
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowering::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
iterator_range< arg_iterator > args()
Definition Function.h:890
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:765
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition Function.cpp:806
Argument * getArg(unsigned i) const
Definition Function.h:884
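A minimal sketch of the Function/Argument queries listed above; it is not from this file, and argIsPreloadCandidate is a hypothetical predicate, not the check this lowering performs.
#include "llvm/IR/Function.h"
using namespace llvm;

// Hypothetical predicate: an 'inreg' argument of an AMDGPU kernel.
static bool argIsPreloadCandidate(const Function &F, unsigned Idx) {
  const Argument *A = F.getArg(Idx);
  return F.getCallingConv() == CallingConv::AMDGPU_KERNEL &&
         A->hasAttribute(Attribute::InReg);
}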
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMadF16() const
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
bool supportsWaveWideBPermute() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool isWave64() const
bool hasPrivateSegmentBuffer() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2762
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
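A minimal sketch of the LLT constructors and queries above; it is not from this file, the address-space number 1 is only an example, and the header path assumes a recent LLVM tree.
#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;

// Hypothetical demo of LLT scalar/pointer construction.
static void lltDemo() {
  LLT S16 = LLT::scalar(16);
  LLT P1 = LLT::pointer(/*AddressSpace=*/1, /*SizeInBits=*/64);
  unsigned Bits = S16.getScalarSizeInBits(); // 16
  LLT S32 = S16.changeElementSize(32);       // widen the scalar to s32
  (void)P1;
  (void)Bits;
  (void)S32;
}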
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition MDBuilder.cpp:96
Metadata node.
Definition Metadata.h:1078
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1442
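A minimal sketch of building !range metadata with MDBuilder::createRange, which produces the MDNode shape referenced above; it is not from this file, and makeRange and the 32-bit width are illustrative.
#include "llvm/ADT/APInt.h"
#include "llvm/IR/MDBuilder.h"
using namespace llvm;

// Hypothetical helper producing !range metadata for the half-open range [Lo, Hi).
static MDNode *makeRange(LLVMContext &Ctx, uint64_t Lo, uint64_t Hi) {
  MDBuilder MDB(Ctx);
  return MDB.createRange(APInt(32, Lo), APInt(32, Hi));
}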
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
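A minimal sketch of the MVT helpers above; it is not from this file, and the header path assumes a recent LLVM tree.
#include "llvm/CodeGenTypes/MachineValueType.h"
using namespace llvm;

// Hypothetical demo of MVT construction and queries.
static void mvtDemo() {
  MVT V4I16 = MVT::getVectorVT(MVT::i16, 4);
  MVT Elt = V4I16.getScalarType();  // MVT::i16
  MVT I64 = MVT::getIntegerVT(64);  // MVT::i64
  bool Fits = Elt.bitsLE(MVT::i32); // true: i16 has no more bits than i32
  (void)I64;
  (void)Fits;
}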
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
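A minimal sketch of the MachineInstrBuilder chaining pattern listed above; it is not from this file, and Desc, DstReg, SrcReg and MemMI are placeholders supplied by a caller.
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;

// Hypothetical emission helper: build one instruction and copy its memory
// operands from an existing memory access.
static void emitWithMemRefs(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator InsertPt,
                            const DebugLoc &DL, const MCInstrDesc &Desc,
                            Register DstReg, Register SrcReg,
                            const MachineInstr &MemMI) {
  BuildMI(MBB, InsertPt, DL, Desc, DstReg)
      .addReg(SrcReg)
      .addImm(0)
      .cloneMemRefs(MemMI);
}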
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value, ...
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
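A minimal sketch of building a MachineMemOperand with the flags above, via the LLT-based getMachineMemOperand overload listed earlier; it is not from this file, and the pointer info, memory type and alignment are placeholders.
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;

// Hypothetical helper: an invariant, dereferenceable 32-bit load description.
static MachineMemOperand *makeInvariantLoadMMO(MachineFunction &MF) {
  MachineMemOperand::Flags Flags = MachineMemOperand::MOLoad |
                                   MachineMemOperand::MODereferenceable |
                                   MachineMemOperand::MOInvariant;
  return MF.getMachineMemOperand(MachinePointerInfo(), Flags, LLT::scalar(32),
                                 Align(4));
}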
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition ModRef.h:226
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:220
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition ModRef.h:223
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition Register.h:72
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
Get the SDNode which holds the desired result.
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
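A minimal sketch of the SDValue/SDNode accessors above, shaped like a typical combiner guard; it is not from this file, and isShlByOne is a hypothetical helper.
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Hypothetical guard: a single-use shift-left by the constant 1.
static bool isShlByOne(SDValue V) {
  if (V.getOpcode() != ISD::SHL || !V.hasOneUse())
    return false;
  auto *C = dyn_cast<ConstantSDNode>(V.getOperand(1).getNode());
  return C && C->getAPIntValue() == 1;
}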
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if N can be combined with other nodes to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g. ...
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g. ...
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store using a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load using a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known never to be any NaN; if SNaN is true, returns true if Op is known never to be a signaling NaN.
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallBase &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns true if it is reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(const AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
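For illustration only, a minimal sketch of using this helper inside a DAG combine; DAG, DL, LHS and RHS are assumed to be in scope and are not taken from the file above:
  // Sketch: build (setcc ult LHS, RHS), producing an i1 condition.
  // The result would feed a select or branch built elsewhere.
  SDValue Cond = DAG.getSetCC(DL, MVT::i1, LHS, RHS, ISD::SETULT);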
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI bool SignBitIsZeroFP(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero, for a floating-point value.
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
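A one-line sketch, assuming a 32-bit SDValue named Mask is in scope:
  // Invert the mask; this is emitted as (xor Mask, -1).
  SDValue Inverted = DAG.getNOT(DL, Mask, MVT::i32);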
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
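A hedged sketch of building a small constant vector with this helper (DAG and DL assumed in scope; the values are arbitrary):
  // Materialize <4 x i32> <0, 1, 2, 3> as a BUILD_VECTOR node.
  SmallVector<SDValue, 4> Elts;
  for (unsigned I = 0; I != 4; ++I)
    Elts.push_back(DAG.getConstant(I, DL, MVT::i32));
  SDValue Vec = DAG.getBuildVector(MVT::v4i32, DL, Elts);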
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
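Continuing the getSetCC sketch above, the condition can feed this helper directly (again purely illustrative):
  // Select the smaller of LHS and RHS, i.e. an unsigned minimum.
  SDValue Min = DAG.getSelect(DL, MVT::i32, Cond, LHS, RHS);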
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns the sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
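An illustrative round trip through the getLoad/getStore overloads listed here (Chain, Ptr and OtherPtr are assumed SDValues; the MachinePointerInfo values are deliberately left unspecific):
  // Load an i32 through Ptr, then store it through OtherPtr with 4-byte
  // alignment, threading the chain through both memory operations.
  SDValue Loaded = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo());
  SDValue NewChain = DAG.getStore(Loaded.getValue(1), DL, Loaded, OtherPtr,
                                  MachinePointerInfo(), Align(4));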
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node that starts a new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
MachineFunctionAnalysisManager * getMFAM()
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
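A small sketch of a typical query, assuming Op is a 32-bit SDValue:
  // Prove that the high 16 bits of Op are zero, e.g. to justify narrowing.
  KnownBits Known = DAG.computeKnownBits(Op);
  bool High16AreZero = Known.countMinLeadingZeros() >= 16;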
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
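A one-line sketch, assuming an integer SDValue Idx of unknown width:
  // Normalize the index to i32: narrower values are zero-extended,
  // wider values are truncated.
  SDValue Idx32 = DAG.getZExtOrTrunc(Idx, DL, MVT::i32);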
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
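For illustration, a generic use of StringSwitch (Constraint is an assumed StringRef; the strings and values are made up, not the constraints handled by this file):
  // Map a constraint letter to a small numeric kind; unknown strings map to 0.
  unsigned Kind = StringSwitch<unsigned>(Constraint)
                      .Case("v", 1)
                      .Case("s", 2)
                      .Case("a", 3)
                      .Default(0);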
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
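A generic illustration of this hook as it would appear in some TargetLowering subclass constructor; the operations and actions below are arbitrary examples, not the actual SI configuration:
  // Lower f32 division through target-provided custom code, and let the
  // legalizer expand 64-bit population count into shifts and adds.
  setOperationAction(ISD::FDIV, MVT::f32, Custom);
  setOperationAction(ISD::CTPOP, MVT::i64, Expand);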
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparisons with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:427
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition Type.h:258
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:106
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
LLVM_ABI void set(Value *Val)
Definition Value.h:905
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
const Use & getOperandUse(unsigned i) const
Definition User.h:220
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:259
iterator_range< user_iterator > users()
Definition Value.h:426
bool use_empty() const
Definition Value.h:346
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:403
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isZero() const
Definition TypeSize.h:153
self_iterator getIterator()
Definition ilist_node.h:123
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:818
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:261
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:787
@ PTRADD
PTRADD represents pointer arithmetic semantics, for targets that opt in using shouldPreservePtrArith(...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:600
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:778
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:522
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:852
@ ATOMIC_LOAD_USUB_COND
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:220
@ GlobalAddress
Definition ISDOpcodes.h:88
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:879
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:584
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:746
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:992
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ ATOMIC_LOAD_USUB_SAT
@ SET_ROUNDING
Set rounding mode.
Definition ISDOpcodes.h:974
@ CONVERGENCECTRL_GLUE
This does not correspond to any convergence control intrinsic.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:843
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:664
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readfixedcounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:786
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:352
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:541
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:548
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:374
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:795
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:233
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:247
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:230
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:348
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
Definition ISDOpcodes.h:969
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:703
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:764
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:614
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:849
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:810
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values, following IEEE-754 definition...
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:356
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:887
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:726
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:977
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:804
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ ATOMIC_LOAD_UDEC_WRAP
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:500
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:925
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:505
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:738
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:565
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ ExternalSymbol
Definition ISDOpcodes.h:93
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:958
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition ISDOpcodes.h:996
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:855
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:832
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ ATOMIC_LOAD_UINC_WRAP
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:534
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:365
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that are the same as FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
Offsets
Offsets in bytes from the start of the input buffer.
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition Analysis.cpp:237
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
OuterAnalysisManagerProxy< ModuleAnalysisManager, MachineFunction > ModuleAnalysisManagerMachineFunctionProxy
Provide the ModuleAnalysisManager to Function proxy.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
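A hedged sketch of the builder pattern inside a custom inserter; MBB, MI, TII and DstReg are assumed to be in scope, and the opcode is just one plausible example:
  // Insert an S_MOV_B32 of zero into DstReg before MI.
  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), DstReg)
      .addImm(0);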
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:839
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
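A short sketch of the usual immediate-range checks (Imm is an assumed int64_t taken from a constant node):
  // Can the immediate be encoded as a signed 16-bit or unsigned 12-bit field?
  bool FitsSImm16 = isInt<16>(Imm);
  bool FitsUImm12 = isUInt<12>(static_cast<uint64_t>(Imm));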
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Done
Definition Threading.h:60
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition MathExtras.h:223
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:303
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2198
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is Skew mod Align.
Definition MathExtras.h:546
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
Definition ModRef.h:301
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:202
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:273
bool isReleaseOrStronger(AtomicOrdering AO)
constexpr T MinAlign(U A, V B)
A and B are either alignments or offsets.
Definition MathExtras.h:357
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
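A minimal sketch combining this with isPowerOf2_32, e.g. to turn a power-of-two element count into a shift amount; the helper name is hypothetical:
  // Callers are expected to have verified the power-of-two precondition.
  static unsigned shiftForCount(uint32_t NumElts) {
    assert(isPowerOf2_32(NumElts) && "expected a power of two");
    return Log2_32(NumElts);
  }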
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant, stopping at the first 1.
Definition bit.h:236
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition Analysis.cpp:203
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
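A one-line sketch, assuming SizeInBytes is a uint64_t:
  // Round the size up to the next 16-byte boundary.
  uint64_t Padded = alignTo(SizeInBytes, Align(16));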
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition MathExtras.h:232
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:434
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1770
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
constexpr unsigned getUndefRegState(bool B)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1945
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
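A short sketch, assuming NumBits (<= 64) and Val are in scope:
  // Keep only the low NumBits bits of Val.
  uint64_t LowMask = maskTrailingOnes<uint64_t>(NumBits);
  uint64_t LowPart = Val & LowMask;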
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition SCCPSolver.h:42
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
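A small sketch contrasting the two factory functions above; the function name is illustrative.
#include "llvm/ADT/FloatingPointMode.h"
void denormalModeExample() {
  // Flush subnormal inputs/outputs to +/-0 vs. keep full IEEE behavior.
  llvm::DenormalMode PS = llvm::DenormalMode::getPreserveSign();
  llvm::DenormalMode IEEE = llvm::DenormalMode::getIEEE();
  bool SameMode = (PS == IEEE); // false
  (void)SameMode;
}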
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
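A small sketch of building extended and vector value types with the two factories above; the function name is illustrative.
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;
void evtExample(LLVMContext &Ctx) {
  EVT I65 = EVT::getIntegerVT(Ctx, 65);            // no MVT for i65
  EVT V3F32 = EVT::getVectorVT(Ctx, MVT::f32, 3);  // 3 x f32
  bool Simple = I65.isSimple();                    // false: extended type
  unsigned NumElts = V3F32.getVectorNumElements(); // 3
  (void)Simple;
  (void)NumElts;
}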
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
EVT changeElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that is chosen by the caller.
Definition ValueTypes.h:113
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument or incoming (from the perspective of the caller) return value virtual register.
MVT VT
Legalized type of this argument part.
unsigned getOrigArgIndex() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing (from the perspective of the caller) return value virtual register.
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:175
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:238
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:183
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:360
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:261
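A small sketch exercising the KnownBits operations listed above; the function name is illustrative.
#include "llvm/Support/KnownBits.h"
using llvm::KnownBits;
void knownBitsExample() {
  KnownBits LHS(8), RHS(8);
  LHS.Zero.setHighBits(4);   // LHS known to fit in its low 4 bits
  RHS.Zero.setHighBits(4);   // likewise for RHS
  KnownBits Sum = KnownBits::add(LHS, RHS);   // sum is at most 30
  unsigned LZ = Sum.countMinLeadingZeros();   // at least 3
  KnownBits Wide = Sum.zext(16);              // upper 8 bits become known zero
  (void)LZ;
  (void)Wide;
}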
This class contains a discriminated union of information about pointers in memory operands, relating them back to LLVM IR or to virtual locations (such as frame indices) that are exposed during codegen.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
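A sketch of describing a fixed stack slot and a location offset from it; the function name is illustrative.
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
using namespace llvm;
// Describe frame index FI, then the location 8 bytes past it.
static MachinePointerInfo describeSlotPlus8(MachineFunction &MF, int FI) {
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  return PtrInfo.getWithOffset(8);
}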
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs