LLVM 22.0.0git
SIISelLowering.cpp
Go to the documentation of this file.
1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
19#include "AMDGPUTargetMachine.h"
20#include "GCNSubtarget.h"
23#include "SIRegisterInfo.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/Statistic.h"
40#include "llvm/IR/IRBuilder.h"
42#include "llvm/IR/IntrinsicsAMDGPU.h"
43#include "llvm/IR/IntrinsicsR600.h"
44#include "llvm/IR/MDBuilder.h"
47#include "llvm/Support/ModRef.h"
49#include <optional>
50
51using namespace llvm;
52using namespace llvm::SDPatternMatch;
53
54#define DEBUG_TYPE "si-lower"
55
56STATISTIC(NumTailCalls, "Number of tail calls");
57
// Command-line escape hatch ("amdgpu-disable-loop-alignment"): when set, the
// backend does not align or prefetch loops. Defaults to false, i.e. loop
// alignment/prefetching stays enabled unless explicitly disabled.
58 static cl::opt<bool>
59 DisableLoopAlignment("amdgpu-disable-loop-alignment",
60 cl::desc("Do not align and prefetch loops"),
61 cl::init(false));
62
64 "amdgpu-use-divergent-register-indexing", cl::Hidden,
65 cl::desc("Use indirect register addressing for divergent indexes"),
66 cl::init(false));
67
70 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
71}
72
75 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
76}
77
78static unsigned findFirstFreeSGPR(CCState &CCInfo) {
79 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
80 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
81 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
82 return AMDGPU::SGPR0 + Reg;
83 }
84 }
85 llvm_unreachable("Cannot allocate sgpr");
86}
87
89 const GCNSubtarget &STI)
90 : AMDGPUTargetLowering(TM, STI, STI), Subtarget(&STI) {
91 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
92 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
93
94 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
95
96 const SIRegisterInfo *TRI = STI.getRegisterInfo();
97 const TargetRegisterClass *V32RegClass =
98 TRI->getDefaultVectorSuperClassForBitWidth(32);
99 addRegisterClass(MVT::f32, V32RegClass);
100
101 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
102
103 const TargetRegisterClass *V64RegClass =
104 TRI->getDefaultVectorSuperClassForBitWidth(64);
105
106 addRegisterClass(MVT::f64, V64RegClass);
107 addRegisterClass(MVT::v2f32, V64RegClass);
108 addRegisterClass(MVT::Untyped, V64RegClass);
109
110 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
111 addRegisterClass(MVT::v3f32, TRI->getDefaultVectorSuperClassForBitWidth(96));
112
113 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
114 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
115
116 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
117 addRegisterClass(MVT::v4f32, TRI->getDefaultVectorSuperClassForBitWidth(128));
118
119 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
120 addRegisterClass(MVT::v5f32, TRI->getDefaultVectorSuperClassForBitWidth(160));
121
122 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
123 addRegisterClass(MVT::v6f32, TRI->getDefaultVectorSuperClassForBitWidth(192));
124
125 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
126 addRegisterClass(MVT::v3f64, TRI->getDefaultVectorSuperClassForBitWidth(192));
127
128 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
129 addRegisterClass(MVT::v7f32, TRI->getDefaultVectorSuperClassForBitWidth(224));
130
131 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
132 addRegisterClass(MVT::v8f32, TRI->getDefaultVectorSuperClassForBitWidth(256));
133
134 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
135 addRegisterClass(MVT::v4f64, TRI->getDefaultVectorSuperClassForBitWidth(256));
136
137 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
138 addRegisterClass(MVT::v9f32, TRI->getDefaultVectorSuperClassForBitWidth(288));
139
140 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
141 addRegisterClass(MVT::v10f32,
142 TRI->getDefaultVectorSuperClassForBitWidth(320));
143
144 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
145 addRegisterClass(MVT::v11f32,
146 TRI->getDefaultVectorSuperClassForBitWidth(352));
147
148 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
149 addRegisterClass(MVT::v12f32,
150 TRI->getDefaultVectorSuperClassForBitWidth(384));
151
152 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
153 addRegisterClass(MVT::v16f32,
154 TRI->getDefaultVectorSuperClassForBitWidth(512));
155
156 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
157 addRegisterClass(MVT::v8f64, TRI->getDefaultVectorSuperClassForBitWidth(512));
158
159 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
160 addRegisterClass(MVT::v16f64,
161 TRI->getDefaultVectorSuperClassForBitWidth(1024));
162
163 if (Subtarget->has16BitInsts()) {
164 if (Subtarget->useRealTrue16Insts()) {
165 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
166 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
167 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
168 } else {
169 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
170 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
171 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
172 }
173
174 // Unless there are also VOP3P operations, not operations are really legal.
175 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
176 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
177 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
178 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
179 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
180 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
181 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
182 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
183 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
184 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
185 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
186 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
187 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
188 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
189 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
190 }
191
192 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
193 addRegisterClass(MVT::v32f32,
194 TRI->getDefaultVectorSuperClassForBitWidth(1024));
195
196 computeRegisterProperties(Subtarget->getRegisterInfo());
197
198 // The boolean content concept here is too inflexible. Compares only ever
199 // really produce a 1-bit result. Any copy/extend from these will turn into a
200 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
201 // it's what most targets use.
204
205 // We need to custom lower vector stores from local memory
206 setOperationAction(ISD::LOAD,
207 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
208 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
209 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
210 MVT::i1, MVT::v32i32},
211 Custom);
212
213 setOperationAction(ISD::STORE,
214 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
215 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
216 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
217 MVT::i1, MVT::v32i32},
218 Custom);
219
220 if (isTypeLegal(MVT::bf16)) {
221 for (unsigned Opc :
223 ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
224 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
225 ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
226 ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
227 ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
228 ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
229 ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE,
230 ISD::SETCC}) {
231 // FIXME: The promoted to type shouldn't need to be explicit
232 setOperationAction(Opc, MVT::bf16, Promote);
233 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
234 }
235
237
239 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
240
241 setOperationAction(ISD::FABS, MVT::bf16, Legal);
242 setOperationAction(ISD::FNEG, MVT::bf16, Legal);
244
245 // We only need to custom lower because we can't specify an action for bf16
246 // sources.
249 }
250
251 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
252 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
253 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
254 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
255 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
256 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
257 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
258 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
259 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
260 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
261 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
262 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
263 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
264 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
265 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
266 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
267
268 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
269 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
270 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
271 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
272 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
273 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
274 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
275
276 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
277
281 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
282
283 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
284
286 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
287
289 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
290 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
291
293 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
294 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
295 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
296 Expand);
298 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
299 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
300 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
301 Expand);
302
304 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
305 MVT::v3i16, MVT::v4i16, MVT::Other},
306 Custom);
307
308 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
309 setOperationAction(ISD::BR_CC,
310 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
311
313
315
317 Expand);
318
319#if 0
321#endif
322
323 // We only support LOAD/STORE and vector manipulation ops for vectors
324 // with > 4 elements.
325 for (MVT VT :
326 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
327 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
328 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
329 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
330 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
331 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
332 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
333 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
334 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
335 switch (Op) {
336 case ISD::LOAD:
337 case ISD::STORE:
339 case ISD::BITCAST:
340 case ISD::UNDEF:
344 case ISD::IS_FPCLASS:
345 break;
350 break;
351 default:
353 break;
354 }
355 }
356 }
357
358 setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
359
360 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
361 // is expanded to avoid having two separate loops in case the index is a VGPR.
362
363 // Most operations are naturally 32-bit vector operations. We only support
364 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
365 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
367 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
368
370 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
371
373 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
374
376 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
377 }
378
379 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
381 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
382
384 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
385
387 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
388
390 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
391 }
392
393 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
395 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
396
398 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
399
401 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
402
404 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
405 }
406
407 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
409 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
410
412 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
413
415 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
416
418 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
419 }
420
421 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
423 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
424
426 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
427
429 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
430
432 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
433 }
434
436 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
437 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
438 Custom);
439
440 if (Subtarget->hasPkMovB32()) {
441 // TODO: 16-bit element vectors should be legal with even aligned elements.
442 // TODO: Can be legal with wider source types than the result with
443 // subregister extracts.
444 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
445 }
446
448 // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
449 // instead lower to cndmask in SITargetLowering::LowerSELECT().
451 // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
452 // alignbit.
453 setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
454
455 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
456 Custom);
457
458 // Avoid stack access for these.
459 // TODO: Generalize to more vector types.
461 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
462 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
463 Custom);
464
465 // Deal with vec3 vector operations when widened to vec4.
467 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
468
469 // Deal with vec5/6/7 vector operations when widened to vec8.
471 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
472 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
473 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
474 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
475 Custom);
476
477 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
478 // and output demarshalling
479 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
480
481 // We can't return success/failure, only the old value,
482 // let LLVM add the comparison
483 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64},
484 Expand);
485
486 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
487
488 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
489
490 // FIXME: This should be narrowed to i32, but that only happens if i64 is
491 // illegal.
492 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
493 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
494
495 // On SI this is s_memtime and s_memrealtime on VI.
496 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
497
498 if (Subtarget->hasSMemRealTime() ||
499 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
500 setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
501 setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom);
502
503 if (Subtarget->has16BitInsts()) {
504 setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
505 setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
506 } else {
507 setOperationAction(ISD::FSQRT, MVT::f16, Custom);
508 }
509
510 if (Subtarget->hasMadMacF32Insts())
512
513 if (!Subtarget->hasBFI())
514 // fcopysign can be done in a single instruction with BFI.
515 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
516
517 if (!Subtarget->hasBCNT(32))
519
520 if (!Subtarget->hasBCNT(64))
522
523 if (Subtarget->hasFFBH())
525
526 if (Subtarget->hasFFBL())
528
529 // We only really have 32-bit BFE instructions (and 16-bit on VI).
530 //
531 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
532 // effort to match them now. We want this to be false for i64 cases when the
533 // extraction isn't restricted to the upper or lower half. Ideally we would
534 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
535 // span the midpoint are probably relatively rare, so don't worry about them
536 // for now.
537 if (Subtarget->hasBFE())
539
540 // Clamp modifier on add/sub
541 if (Subtarget->hasIntClamp())
543
544 if (Subtarget->hasAddNoCarry())
545 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
546 Legal);
547
549 {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
550 {MVT::f32, MVT::f64}, Custom);
551
552 // These are really only legal for ieee_mode functions. We should be avoiding
553 // them for functions that don't have ieee_mode enabled, so just say they are
554 // legal.
555 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
556 {MVT::f32, MVT::f64}, Legal);
557
558 if (Subtarget->haveRoundOpsF64())
559 setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64,
560 Legal);
561 else
562 setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
563 MVT::f64, Custom);
564
565 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
566 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
567 Legal);
568 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
569
570 setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
572
573 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
574 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
575
576 // Custom lower these because we can't specify a rule based on an illegal
577 // source bf16.
578 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom);
579 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom);
580
581 if (Subtarget->has16BitInsts()) {
584 MVT::i16, Legal);
585
586 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
587
589 MVT::i16, Expand);
590
594 ISD::CTPOP},
595 MVT::i16, Promote);
596
597 setOperationAction(ISD::LOAD, MVT::i16, Custom);
598
599 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
600
601 setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
602 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
603 setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
604 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
605
609
611
612 // F16 - Constant Actions.
615
616 // F16 - Load/Store Actions.
617 setOperationAction(ISD::LOAD, MVT::f16, Promote);
618 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
619 setOperationAction(ISD::STORE, MVT::f16, Promote);
620 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
621
622 // BF16 - Load/Store Actions.
623 setOperationAction(ISD::LOAD, MVT::bf16, Promote);
624 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
625 setOperationAction(ISD::STORE, MVT::bf16, Promote);
626 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
627
628 // F16 - VOP1 Actions.
630 ISD::FSIN, ISD::FROUND},
631 MVT::f16, Custom);
632
633 // BF16 - VOP1 Actions.
634 if (Subtarget->hasBF16TransInsts())
635 setOperationAction({ISD::FCOS, ISD::FSIN, ISD::FDIV}, MVT::bf16, Custom);
636
639
640 // F16 - VOP2 Actions.
641 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
642 Expand);
643 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);
644 setOperationAction(ISD::FFREXP, MVT::f16, Custom);
646
647 // F16 - VOP3 Actions.
649 if (STI.hasMadF16())
651
652 for (MVT VT :
653 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
654 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
655 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
656 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
657 switch (Op) {
658 case ISD::LOAD:
659 case ISD::STORE:
661 case ISD::BITCAST:
662 case ISD::UNDEF:
667 case ISD::IS_FPCLASS:
668 break;
672 break;
673 default:
675 break;
676 }
677 }
678 }
679
680 // v_perm_b32 can handle either of these.
681 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
683
684 // XXX - Do these do anything? Vector constants turn into build_vector.
685 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
686
687 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
688 Legal);
689
690 setOperationAction(ISD::STORE, MVT::v2i16, Promote);
691 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
692 setOperationAction(ISD::STORE, MVT::v2f16, Promote);
693 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
694
695 setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
696 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
697 setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
698 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
699
700 setOperationAction(ISD::AND, MVT::v2i16, Promote);
701 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
702 setOperationAction(ISD::OR, MVT::v2i16, Promote);
703 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
704 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
705 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
706
707 setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
708 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
709 setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
710 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
711 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
712 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
713
714 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
715 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
716 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
717 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
718 setOperationAction(ISD::STORE, MVT::v4bf16, Promote);
719 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
720
721 setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
722 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
723 setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
724 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
725 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
726 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
727
728 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
729 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
730 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
731 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
732
733 setOperationAction(ISD::STORE, MVT::v8i16, Promote);
734 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
735 setOperationAction(ISD::STORE, MVT::v8f16, Promote);
736 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
737 setOperationAction(ISD::STORE, MVT::v8bf16, Promote);
738 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
739
740 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
741 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
742 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
743 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
744 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
745 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
746
747 setOperationAction(ISD::STORE, MVT::v16i16, Promote);
748 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
749 setOperationAction(ISD::STORE, MVT::v16f16, Promote);
750 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
751 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
752 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
753
754 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
755 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
756 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
757 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
758 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
759 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
760
761 setOperationAction(ISD::STORE, MVT::v32i16, Promote);
762 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
763 setOperationAction(ISD::STORE, MVT::v32f16, Promote);
764 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
765 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
766 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
767
769 MVT::v2i32, Expand);
770 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
771
773 MVT::v4i32, Expand);
774
776 MVT::v8i32, Expand);
777
778 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
779 Subtarget->hasVOP3PInsts() ? Legal : Custom);
780
781 setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
782 // This isn't really legal, but this avoids the legalizer unrolling it (and
783 // allows matching fneg (fabs x) patterns)
784 setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);
785
786 // Can do this in one BFI plus a constant materialize.
788 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
789 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
790 MVT::v32f16, MVT::v32bf16},
791 Custom);
792
794 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
795 MVT::f16, Custom);
796 setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
797
798 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM,
799 ISD::FMAXIMUMNUM},
800 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
801 Custom);
802
803 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
804 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
805 Expand);
806
807 for (MVT Vec16 :
808 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
809 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
812 Vec16, Custom);
814 }
815 }
816
817 if (Subtarget->hasVOP3PInsts()) {
821 MVT::v2i16, Legal);
822
823 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
824 ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
825 MVT::v2f16, Legal);
826
828 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
829
831 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
832 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
833 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
834 Custom);
835
836 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
837 // Split vector operations.
842 VT, Custom);
843
844 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
845 // Split vector operations.
847 VT, Custom);
848
850 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
851 {MVT::v2f16, MVT::v4f16}, Custom);
852
853 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
854 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
855 Custom);
856
857 if (Subtarget->hasBF16PackedInsts()) {
858 for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
859 // Split vector operations.
861 VT, Custom);
862 }
863
864 if (Subtarget->hasPackedFP32Ops()) {
866 MVT::v2f32, Legal);
868 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
869 Custom);
870 }
871 }
872
873 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom);
874
875 if (Subtarget->has16BitInsts()) {
877 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
879 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
880 } else {
881 // Legalization hack.
882 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
883
884 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
885 }
886
888 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
889 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
890 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
891 MVT::v32f16, MVT::v32bf16},
892 Custom);
893
895
896 if (Subtarget->hasVectorMulU64())
898 else if (Subtarget->hasScalarSMulU64())
900
901 if (Subtarget->hasMad64_32())
903
904 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
905 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
906
907 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
908 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
909 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
910 } else {
911 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
912 if (Subtarget->hasMinimum3Maximum3F32())
913 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal);
914
915 if (Subtarget->hasMinimum3Maximum3PKF16()) {
916 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal);
917
918 // If only the vector form is available, we need to widen to a vector.
919 if (!Subtarget->hasMinimum3Maximum3F16())
920 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f16, Custom);
921 }
922 }
923
924 if (Subtarget->hasVOP3PInsts()) {
925 // We want to break these into v2f16 pieces, not scalarize.
926 setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
927 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
928 Custom);
929 }
930
931 if (Subtarget->hasIntMinMax64())
933 Legal);
934
936 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
937 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
938 MVT::i8},
939 Custom);
940
942 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
943 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
944 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
945 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
946 Custom);
947
949 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
950 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
951 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
952 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
953 Custom);
954
955 setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
957 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
958 setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
959 setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);
960
961 // TODO: Could move this to custom lowering, could benefit from combines on
962 // extract of relevant bits.
963 setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
964
966
967 if (Subtarget->hasBF16ConversionInsts()) {
968 setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
970 }
971
972 if (Subtarget->hasBF16PackedInsts()) {
974 {ISD::FADD, ISD::FMUL, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMA},
975 MVT::v2bf16, Legal);
976 }
977
978 if (Subtarget->hasBF16TransInsts()) {
979 setOperationAction({ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, MVT::bf16, Legal);
980 }
981
982 if (Subtarget->hasCvtPkF16F32Inst()) {
984 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
985 Custom);
986 }
987
989 ISD::PTRADD,
991 ISD::SUB,
993 ISD::MUL,
994 ISD::FADD,
995 ISD::FSUB,
996 ISD::FDIV,
997 ISD::FMUL,
998 ISD::FMINNUM,
999 ISD::FMAXNUM,
1000 ISD::FMINNUM_IEEE,
1001 ISD::FMAXNUM_IEEE,
1002 ISD::FMINIMUM,
1003 ISD::FMAXIMUM,
1004 ISD::FMINIMUMNUM,
1005 ISD::FMAXIMUMNUM,
1006 ISD::FMA,
1007 ISD::SMIN,
1008 ISD::SMAX,
1009 ISD::UMIN,
1010 ISD::UMAX,
1011 ISD::SETCC,
1013 ISD::SMIN,
1014 ISD::SMAX,
1015 ISD::UMIN,
1016 ISD::UMAX,
1017 ISD::AND,
1018 ISD::OR,
1019 ISD::XOR,
1020 ISD::SHL,
1021 ISD::SRL,
1022 ISD::SRA,
1023 ISD::FSHR,
1033
1034 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1036
1037 // All memory operations. Some folding on the pointer operand is done to help
1038 // matching the constant offsets in the addressing modes.
1039 setTargetDAGCombine({ISD::LOAD,
1040 ISD::STORE,
1041 ISD::ATOMIC_LOAD,
1042 ISD::ATOMIC_STORE,
1043 ISD::ATOMIC_CMP_SWAP,
1044 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
1045 ISD::ATOMIC_SWAP,
1046 ISD::ATOMIC_LOAD_ADD,
1047 ISD::ATOMIC_LOAD_SUB,
1048 ISD::ATOMIC_LOAD_AND,
1049 ISD::ATOMIC_LOAD_OR,
1050 ISD::ATOMIC_LOAD_XOR,
1051 ISD::ATOMIC_LOAD_NAND,
1052 ISD::ATOMIC_LOAD_MIN,
1053 ISD::ATOMIC_LOAD_MAX,
1054 ISD::ATOMIC_LOAD_UMIN,
1055 ISD::ATOMIC_LOAD_UMAX,
1056 ISD::ATOMIC_LOAD_FADD,
1057 ISD::ATOMIC_LOAD_FMIN,
1058 ISD::ATOMIC_LOAD_FMAX,
1059 ISD::ATOMIC_LOAD_UINC_WRAP,
1060 ISD::ATOMIC_LOAD_UDEC_WRAP,
1063
1064 // FIXME: In other contexts we pretend this is a per-function property.
1066
1068}
1069
1070const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1071
1073 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1074 return RCRegs;
1075}
1076
1077//===----------------------------------------------------------------------===//
1078// TargetLowering queries
1079//===----------------------------------------------------------------------===//
1080
1081// v_mad_mix* support a conversion from f16 to f32.
1082//
1083// There is only one special case when denormals are enabled we don't currently,
1084// where this is OK to use.
1085bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1086 EVT DestVT, EVT SrcVT) const {
1087 return DestVT.getScalarType() == MVT::f32 &&
1088 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1089 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1090 SrcVT.getScalarType() == MVT::f16) ||
1091 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1092 SrcVT.getScalarType() == MVT::bf16)) &&
1093 // TODO: This probably only requires no input flushing?
1095}
1096
1098 LLT DestTy, LLT SrcTy) const {
1099 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1100 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1101 DestTy.getScalarSizeInBits() == 32 &&
1102 SrcTy.getScalarSizeInBits() == 16 &&
1103 // TODO: This probably only requires no input flushing?
1104 denormalModeIsFlushAllF32(*MI.getMF());
1105}
1106
1108 // SI has some legal vector types, but no legal vector operations. Say no
1109 // shuffles are legal in order to prefer scalarizing some vector operations.
1110 return false;
1111}
1112
1114 CallingConv::ID CC,
1115 EVT VT) const {
1117 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1118
1119 if (VT.isVector()) {
1120 EVT ScalarVT = VT.getScalarType();
1121 unsigned Size = ScalarVT.getSizeInBits();
1122 if (Size == 16) {
1123 if (Subtarget->has16BitInsts()) {
1124 if (VT.isInteger())
1125 return MVT::v2i16;
1126 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1127 }
1128 return VT.isInteger() ? MVT::i32 : MVT::f32;
1129 }
1130
1131 if (Size < 16)
1132 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1133 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1134 }
1135
1136 if (VT.getSizeInBits() > 32)
1137 return MVT::i32;
1138
1139 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1140}
1141
1143 CallingConv::ID CC,
1144 EVT VT) const {
1146 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1147
1148 if (VT.isVector()) {
1149 unsigned NumElts = VT.getVectorNumElements();
1150 EVT ScalarVT = VT.getScalarType();
1151 unsigned Size = ScalarVT.getSizeInBits();
1152
1153 // FIXME: Should probably promote 8-bit vectors to i16.
1154 if (Size == 16 && Subtarget->has16BitInsts())
1155 return (NumElts + 1) / 2;
1156
1157 if (Size <= 32)
1158 return NumElts;
1159
1160 if (Size > 32)
1161 return NumElts * ((Size + 31) / 32);
1162 } else if (VT.getSizeInBits() > 32)
1163 return (VT.getSizeInBits() + 31) / 32;
1164
1165 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1166}
1167
1169 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1170 unsigned &NumIntermediates, MVT &RegisterVT) const {
1171 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1172 unsigned NumElts = VT.getVectorNumElements();
1173 EVT ScalarVT = VT.getScalarType();
1174 unsigned Size = ScalarVT.getSizeInBits();
1175 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1176 // support, but unless we can properly handle 3-vectors, it will be still be
1177 // inconsistent.
1178 if (Size == 16 && Subtarget->has16BitInsts()) {
1179 if (ScalarVT == MVT::bf16) {
1180 RegisterVT = MVT::i32;
1181 IntermediateVT = MVT::v2bf16;
1182 } else {
1183 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1184 IntermediateVT = RegisterVT;
1185 }
1186 NumIntermediates = (NumElts + 1) / 2;
1187 return NumIntermediates;
1188 }
1189
1190 if (Size == 32) {
1191 RegisterVT = ScalarVT.getSimpleVT();
1192 IntermediateVT = RegisterVT;
1193 NumIntermediates = NumElts;
1194 return NumIntermediates;
1195 }
1196
1197 if (Size < 16 && Subtarget->has16BitInsts()) {
1198 // FIXME: Should probably form v2i16 pieces
1199 RegisterVT = MVT::i16;
1200 IntermediateVT = ScalarVT;
1201 NumIntermediates = NumElts;
1202 return NumIntermediates;
1203 }
1204
1205 if (Size != 16 && Size <= 32) {
1206 RegisterVT = MVT::i32;
1207 IntermediateVT = ScalarVT;
1208 NumIntermediates = NumElts;
1209 return NumIntermediates;
1210 }
1211
1212 if (Size > 32) {
1213 RegisterVT = MVT::i32;
1214 IntermediateVT = RegisterVT;
1215 NumIntermediates = NumElts * ((Size + 31) / 32);
1216 return NumIntermediates;
1217 }
1218 }
1219
1221 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1222}
1223
1225 const DataLayout &DL, Type *Ty,
1226 unsigned MaxNumLanes) {
1227 assert(MaxNumLanes != 0);
1228
1229 LLVMContext &Ctx = Ty->getContext();
1230 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1231 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1232 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1233 NumElts);
1234 }
1235
1236 return TLI.getValueType(DL, Ty);
1237}
1238
1239// Peek through TFE struct returns to only use the data size.
1241 const DataLayout &DL, Type *Ty,
1242 unsigned MaxNumLanes) {
1243 auto *ST = dyn_cast<StructType>(Ty);
1244 if (!ST)
1245 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1246
1247 // TFE intrinsics return an aggregate type.
1248 assert(ST->getNumContainedTypes() == 2 &&
1249 ST->getContainedType(1)->isIntegerTy(32));
1250 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1251}
1252
1253/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1254/// in-memory representation. This return value is a custom type because there
1255/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1256/// could cause issues during codegen, these address space 7 pointers will be
1257/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1258/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1259/// for cost modeling, to work. (This also sets us up decently for doing the
1260/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
1262 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1263 return MVT::amdgpuBufferFatPointer;
1265 DL.getPointerSizeInBits(AS) == 192)
1266 return MVT::amdgpuBufferStridedPointer;
1268}
1269/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1270/// v8i32 when padding is added.
1271/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1272/// also v8i32 with padding.
1274 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1275 DL.getPointerSizeInBits(AS) == 160) ||
1277 DL.getPointerSizeInBits(AS) == 192))
1278 return MVT::v8i32;
1280}
1281
1282static unsigned getIntrMemWidth(unsigned IntrID) {
1283 switch (IntrID) {
1284 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1285 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1286 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1287 return 8;
1288 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1289 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1290 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1291 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1292 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1293 return 32;
1294 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1295 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1296 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1297 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1298 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1299 return 64;
1300 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1301 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1302 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1303 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1304 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1305 return 128;
1306 default:
1307 llvm_unreachable("Unknown width");
1308 }
1309}
1310
1311static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad,
1313 Value *OrderingArg = CI.getArgOperand(IsLoad ? 1 : 2);
1314 unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
1315 switch (AtomicOrderingCABI(Ord)) {
1318 break;
1321 break;
1324 break;
1325 default:
1327 break;
1328 }
1329
1330 Info.flags =
1332 Info.flags |= MOCooperative;
1333
1334 MDNode *ScopeMD = cast<MDNode>(
1335 cast<MetadataAsValue>(CI.getArgOperand(IsLoad ? 2 : 3))->getMetadata());
1336 StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
1337 Info.ssid = CI.getContext().getOrInsertSyncScopeID(Scope);
1338}
1339
1341 const CallInst &CI,
1342 MachineFunction &MF,
1343 unsigned IntrID) const {
1344 Info.flags = MachineMemOperand::MONone;
1345 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1346 Info.flags |= MachineMemOperand::MOInvariant;
1347 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1349 Info.flags |= getTargetMMOFlags(CI);
1350
1351 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1353 AttributeSet Attr =
1355 MemoryEffects ME = Attr.getMemoryEffects();
1356 if (ME.doesNotAccessMemory())
1357 return false;
1358
1359 // TODO: Should images get their own address space?
1360 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1361
1362 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1363 if (RsrcIntr->IsImage) {
1364 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1366 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1367 Info.align.reset();
1368 }
1369
1370 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1371 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1372 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1373 // We conservatively set the memory operand of a buffer intrinsic to the
1374 // base resource pointer, so that we can access alias information about
1375 // those pointers. Cases like "this points at the same value
1376 // but with a different offset" are handled in
1377 // areMemAccessesTriviallyDisjoint.
1378 Info.ptrVal = RsrcArg;
1379 }
1380
1381 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1382 if (!IsSPrefetch) {
1383 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1384 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1385 Info.flags |= MachineMemOperand::MOVolatile;
1386 }
1387
1389 if (ME.onlyReadsMemory()) {
1390 if (RsrcIntr->IsImage) {
1391 unsigned MaxNumLanes = 4;
1392
1393 if (!BaseOpcode->Gather4) {
1394 // If this isn't a gather, we may have excess loaded elements in the
1395 // IR type. Check the dmask for the real number of elements loaded.
1396 unsigned DMask =
1397 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1398 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1399 }
1400
1401 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1402 CI.getType(), MaxNumLanes);
1403 } else {
1404 Info.memVT =
1406 std::numeric_limits<unsigned>::max());
1407 }
1408
1409 // FIXME: What does alignment mean for an image?
1410 Info.opc = ISD::INTRINSIC_W_CHAIN;
1411 Info.flags |= MachineMemOperand::MOLoad;
1412 } else if (ME.onlyWritesMemory()) {
1413 Info.opc = ISD::INTRINSIC_VOID;
1414
1415 Type *DataTy = CI.getArgOperand(0)->getType();
1416 if (RsrcIntr->IsImage) {
1417 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1418 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1419 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1420 DMaskLanes);
1421 } else
1422 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1423
1424 Info.flags |= MachineMemOperand::MOStore;
1425 } else {
1426 // Atomic, NoReturn Sampler or prefetch
1427 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1429 Info.flags |=
1431
1432 if (!IsSPrefetch)
1433 Info.flags |= MachineMemOperand::MOStore;
1434
1435 switch (IntrID) {
1436 default:
1437 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1438 // Fake memory access type for no return sampler intrinsics
1439 Info.memVT = MVT::i32;
1440 } else {
1441 // XXX - Should this be volatile without known ordering?
1442 Info.flags |= MachineMemOperand::MOVolatile;
1443 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1444 }
1445 break;
1446 case Intrinsic::amdgcn_raw_buffer_load_lds:
1447 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1448 case Intrinsic::amdgcn_struct_buffer_load_lds:
1449 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1450 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1451 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1452 Info.ptrVal = CI.getArgOperand(1);
1453 return true;
1454 }
1455 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1456 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1457 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1458 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1459 Info.memVT =
1461 std::numeric_limits<unsigned>::max());
1462 Info.flags &= ~MachineMemOperand::MOStore;
1463 return true;
1464 }
1465 }
1466 }
1467 return true;
1468 }
1469
1470 switch (IntrID) {
1471 case Intrinsic::amdgcn_ds_ordered_add:
1472 case Intrinsic::amdgcn_ds_ordered_swap: {
1473 Info.opc = ISD::INTRINSIC_W_CHAIN;
1474 Info.memVT = MVT::getVT(CI.getType());
1475 Info.ptrVal = CI.getOperand(0);
1476 Info.align.reset();
1478
1479 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1480 if (!Vol->isZero())
1481 Info.flags |= MachineMemOperand::MOVolatile;
1482
1483 return true;
1484 }
1485 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1486 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1487 Info.opc = ISD::INTRINSIC_W_CHAIN;
1488 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1489 Info.ptrVal = nullptr;
1490 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1492 return true;
1493 }
1494 case Intrinsic::amdgcn_ds_append:
1495 case Intrinsic::amdgcn_ds_consume: {
1496 Info.opc = ISD::INTRINSIC_W_CHAIN;
1497 Info.memVT = MVT::getVT(CI.getType());
1498 Info.ptrVal = CI.getOperand(0);
1499 Info.align.reset();
1501
1502 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1503 if (!Vol->isZero())
1504 Info.flags |= MachineMemOperand::MOVolatile;
1505
1506 return true;
1507 }
1508 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1509 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1510 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1513 Info.memVT = MVT::getVT(CI.getType());
1514 Info.ptrVal = CI.getOperand(0);
1515 Info.memVT = MVT::i64;
1516 Info.size = 8;
1517 Info.align.reset();
1519 return true;
1520 }
1521 case Intrinsic::amdgcn_global_atomic_csub: {
1522 Info.opc = ISD::INTRINSIC_W_CHAIN;
1523 Info.memVT = MVT::getVT(CI.getType());
1524 Info.ptrVal = CI.getOperand(0);
1525 Info.align.reset();
1528 return true;
1529 }
1530 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1531 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1532 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1533 Info.opc = ISD::INTRINSIC_W_CHAIN;
1534 Info.memVT =
1535 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1536 ? CI.getType()
1538 ->getElementType(0)); // XXX: what is correct VT?
1539
1540 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1541 Info.align.reset();
1542 Info.flags |=
1544 return true;
1545 }
1546 case Intrinsic::amdgcn_global_atomic_fmin_num:
1547 case Intrinsic::amdgcn_global_atomic_fmax_num:
1548 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1549 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1550 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1551 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1552 Info.opc = ISD::INTRINSIC_W_CHAIN;
1553 Info.memVT = MVT::getVT(CI.getType());
1554 Info.ptrVal = CI.getOperand(0);
1555 Info.align.reset();
1559 return true;
1560 }
1561 case Intrinsic::amdgcn_flat_load_monitor_b32:
1562 case Intrinsic::amdgcn_flat_load_monitor_b64:
1563 case Intrinsic::amdgcn_flat_load_monitor_b128:
1564 case Intrinsic::amdgcn_global_load_monitor_b32:
1565 case Intrinsic::amdgcn_global_load_monitor_b64:
1566 case Intrinsic::amdgcn_global_load_monitor_b128:
1567 case Intrinsic::amdgcn_cluster_load_b32:
1568 case Intrinsic::amdgcn_cluster_load_b64:
1569 case Intrinsic::amdgcn_cluster_load_b128:
1570 case Intrinsic::amdgcn_ds_load_tr6_b96:
1571 case Intrinsic::amdgcn_ds_load_tr4_b64:
1572 case Intrinsic::amdgcn_ds_load_tr8_b64:
1573 case Intrinsic::amdgcn_ds_load_tr16_b128:
1574 case Intrinsic::amdgcn_global_load_tr6_b96:
1575 case Intrinsic::amdgcn_global_load_tr4_b64:
1576 case Intrinsic::amdgcn_global_load_tr_b64:
1577 case Intrinsic::amdgcn_global_load_tr_b128:
1578 case Intrinsic::amdgcn_ds_read_tr4_b64:
1579 case Intrinsic::amdgcn_ds_read_tr6_b96:
1580 case Intrinsic::amdgcn_ds_read_tr8_b64:
1581 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1582 Info.opc = ISD::INTRINSIC_W_CHAIN;
1583 Info.memVT = MVT::getVT(CI.getType());
1584 Info.ptrVal = CI.getOperand(0);
1585 Info.align.reset();
1586 Info.flags |= MachineMemOperand::MOLoad;
1587 return true;
1588 }
1589 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1590 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1591 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1592 Info.opc = ISD::INTRINSIC_W_CHAIN;
1593 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1594 Info.ptrVal = CI.getOperand(0);
1595 Info.align.reset();
1596 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/true, Info);
1597 return true;
1598 }
1599 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1600 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1601 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1602 Info.opc = ISD::INTRINSIC_VOID;
1603 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1604 Info.ptrVal = CI.getArgOperand(0);
1605 Info.align.reset();
1606 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/false, Info);
1607 return true;
1608 }
1609 case Intrinsic::amdgcn_ds_gws_init:
1610 case Intrinsic::amdgcn_ds_gws_barrier:
1611 case Intrinsic::amdgcn_ds_gws_sema_v:
1612 case Intrinsic::amdgcn_ds_gws_sema_br:
1613 case Intrinsic::amdgcn_ds_gws_sema_p:
1614 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1615 Info.opc = ISD::INTRINSIC_VOID;
1616
1617 const GCNTargetMachine &TM =
1618 static_cast<const GCNTargetMachine &>(getTargetMachine());
1619
1621 Info.ptrVal = MFI->getGWSPSV(TM);
1622
1623 // This is an abstract access, but we need to specify a type and size.
1624 Info.memVT = MVT::i32;
1625 Info.size = 4;
1626 Info.align = Align(4);
1627
1628 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1629 Info.flags |= MachineMemOperand::MOLoad;
1630 else
1631 Info.flags |= MachineMemOperand::MOStore;
1632 return true;
1633 }
1634 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1635 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1636 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1637 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1638 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1639 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1640 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1641 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1642 Info.opc = ISD::INTRINSIC_VOID;
1643 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1644 Info.ptrVal = CI.getArgOperand(1);
1646 return true;
1647 }
1648 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1649 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1650 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1651 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1652 Info.opc = ISD::INTRINSIC_VOID;
1653 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1654 Info.ptrVal = CI.getArgOperand(0);
1656 return true;
1657 }
1658 case Intrinsic::amdgcn_load_to_lds:
1659 case Intrinsic::amdgcn_global_load_lds: {
1660 Info.opc = ISD::INTRINSIC_VOID;
1661 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1662 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1663 Info.ptrVal = CI.getArgOperand(1);
1665 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1666 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1667 Info.flags |= MachineMemOperand::MOVolatile;
1668 return true;
1669 }
1670 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1671 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1672 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1673 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1674 Info.opc = ISD::INTRINSIC_W_CHAIN;
1675
1676 const GCNTargetMachine &TM =
1677 static_cast<const GCNTargetMachine &>(getTargetMachine());
1678
1680 Info.ptrVal = MFI->getGWSPSV(TM);
1681
1682 // This is an abstract access, but we need to specify a type and size.
1683 Info.memVT = MVT::i32;
1684 Info.size = 4;
1685 Info.align = Align(4);
1686
1688 return true;
1689 }
1690 case Intrinsic::amdgcn_s_prefetch_data:
1691 case Intrinsic::amdgcn_flat_prefetch:
1692 case Intrinsic::amdgcn_global_prefetch: {
1693 Info.opc = ISD::INTRINSIC_VOID;
1694 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1695 Info.ptrVal = CI.getArgOperand(0);
1696 Info.flags |= MachineMemOperand::MOLoad;
1697 return true;
1698 }
1699 default:
1700 return false;
1701 }
1702}
1703
1705 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1707 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1708 // The DAG's ValueType loses the addrspaces.
1709 // Add them as 2 extra Constant operands "from" and "to".
1710 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1711 unsigned DstAS = I.getType()->getPointerAddressSpace();
1712 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1713 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1714 break;
1715 }
1716 default:
1717 break;
1718 }
1719}
1720
1723 Type *&AccessTy) const {
1724 Value *Ptr = nullptr;
1725 switch (II->getIntrinsicID()) {
1726 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1727 case Intrinsic::amdgcn_cluster_load_b128:
1728 case Intrinsic::amdgcn_cluster_load_b64:
1729 case Intrinsic::amdgcn_cluster_load_b32:
1730 case Intrinsic::amdgcn_ds_append:
1731 case Intrinsic::amdgcn_ds_consume:
1732 case Intrinsic::amdgcn_ds_load_tr8_b64:
1733 case Intrinsic::amdgcn_ds_load_tr16_b128:
1734 case Intrinsic::amdgcn_ds_load_tr4_b64:
1735 case Intrinsic::amdgcn_ds_load_tr6_b96:
1736 case Intrinsic::amdgcn_ds_read_tr4_b64:
1737 case Intrinsic::amdgcn_ds_read_tr6_b96:
1738 case Intrinsic::amdgcn_ds_read_tr8_b64:
1739 case Intrinsic::amdgcn_ds_read_tr16_b64:
1740 case Intrinsic::amdgcn_ds_ordered_add:
1741 case Intrinsic::amdgcn_ds_ordered_swap:
1742 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1743 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1744 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1745 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1746 case Intrinsic::amdgcn_flat_load_monitor_b128:
1747 case Intrinsic::amdgcn_flat_load_monitor_b32:
1748 case Intrinsic::amdgcn_flat_load_monitor_b64:
1749 case Intrinsic::amdgcn_global_atomic_csub:
1750 case Intrinsic::amdgcn_global_atomic_fmax_num:
1751 case Intrinsic::amdgcn_global_atomic_fmin_num:
1752 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1753 case Intrinsic::amdgcn_global_load_monitor_b128:
1754 case Intrinsic::amdgcn_global_load_monitor_b32:
1755 case Intrinsic::amdgcn_global_load_monitor_b64:
1756 case Intrinsic::amdgcn_global_load_tr_b64:
1757 case Intrinsic::amdgcn_global_load_tr_b128:
1758 case Intrinsic::amdgcn_global_load_tr4_b64:
1759 case Intrinsic::amdgcn_global_load_tr6_b96:
1760 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1761 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1762 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1763 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1764 Ptr = II->getArgOperand(0);
1765 break;
1766 case Intrinsic::amdgcn_load_to_lds:
1767 case Intrinsic::amdgcn_global_load_lds:
1768 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1769 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1770 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1771 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1772 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1773 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1774 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1775 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1776 Ptr = II->getArgOperand(1);
1777 break;
1778 default:
1779 return false;
1780 }
1781 AccessTy = II->getType();
1782 Ops.push_back(Ptr);
1783 return true;
1784}
1785
1787 unsigned AddrSpace) const {
1788 if (!Subtarget->hasFlatInstOffsets()) {
1789 // Flat instructions do not have offsets, and only have the register
1790 // address.
1791 return AM.BaseOffs == 0 && AM.Scale == 0;
1792 }
1793
1794 decltype(SIInstrFlags::FLAT) FlatVariant =
1798
1799 return AM.Scale == 0 &&
1800 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1801 AM.BaseOffs, AddrSpace, FlatVariant));
1802}
1803
1805 if (Subtarget->hasFlatGlobalInsts())
1807
1808 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1809 // Assume the we will use FLAT for all global memory accesses
1810 // on VI.
1811 // FIXME: This assumption is currently wrong. On VI we still use
1812 // MUBUF instructions for the r + i addressing mode. As currently
1813 // implemented, the MUBUF instructions only work on buffer < 4GB.
1814 // It may be possible to support > 4GB buffers with MUBUF instructions,
1815 // by setting the stride value in the resource descriptor which would
1816 // increase the size limit to (stride * 4GB). However, this is risky,
1817 // because it has never been validated.
1819 }
1820
1821 return isLegalMUBUFAddressingMode(AM);
1822}
1823
1824bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1825 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1826 // additionally can do r + r + i with addr64. 32-bit has more addressing
1827 // mode options. Depending on the resource constant, it can also do
1828 // (i64 r0) + (i32 r1) * (i14 i).
1829 //
1830 // Private arrays end up using a scratch buffer most of the time, so also
1831 // assume those use MUBUF instructions. Scratch loads / stores are currently
1832 // implemented as mubuf instructions with offen bit set, so slightly
1833 // different than the normal addr64.
1834 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1835 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1836 return false;
1837
1838 // FIXME: Since we can split immediate into soffset and immediate offset,
1839 // would it make sense to allow any immediate?
1840
1841 switch (AM.Scale) {
1842 case 0: // r + i or just i, depending on HasBaseReg.
1843 return true;
1844 case 1:
1845 return true; // We have r + r or r + i.
1846 case 2:
1847 if (AM.HasBaseReg) {
1848 // Reject 2 * r + r.
1849 return false;
1850 }
1851
1852 // Allow 2 * r as r + r
1853 // Or 2 * r + i is allowed as r + r + i.
1854 return true;
1855 default: // Don't allow n * r
1856 return false;
1857 }
1858}
1859
1861 const AddrMode &AM, Type *Ty,
1862 unsigned AS,
1863 Instruction *I) const {
1864 // No global is ever allowed as a base.
1865 if (AM.BaseGV)
1866 return false;
1867
1868 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1869 return isLegalGlobalAddressingMode(AM);
1870
1871 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1875 // If the offset isn't a multiple of 4, it probably isn't going to be
1876 // correctly aligned.
1877 // FIXME: Can we get the real alignment here?
1878 if (AM.BaseOffs % 4 != 0)
1879 return isLegalMUBUFAddressingMode(AM);
1880
1881 if (!Subtarget->hasScalarSubwordLoads()) {
1882 // There are no SMRD extloads, so if we have to do a small type access we
1883 // will use a MUBUF load.
1884 // FIXME?: We also need to do this if unaligned, but we don't know the
1885 // alignment here.
1886 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1887 return isLegalGlobalAddressingMode(AM);
1888 }
1889
1890 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1891 // SMRD instructions have an 8-bit, dword offset on SI.
1892 if (!isUInt<8>(AM.BaseOffs / 4))
1893 return false;
1894 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1895 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1896 // in 8-bits, it can use a smaller encoding.
1897 if (!isUInt<32>(AM.BaseOffs / 4))
1898 return false;
1899 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1900 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1901 if (!isUInt<20>(AM.BaseOffs))
1902 return false;
1903 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1904 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1905 // for S_BUFFER_* instructions).
1906 if (!isInt<21>(AM.BaseOffs))
1907 return false;
1908 } else {
1909 // On GFX12, all offsets are signed 24-bit in bytes.
1910 if (!isInt<24>(AM.BaseOffs))
1911 return false;
1912 }
1913
1914 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1916 AM.BaseOffs < 0) {
1917 // Scalar (non-buffer) loads can only use a negative offset if
1918 // soffset+offset is non-negative. Since the compiler can only prove that
1919 // in a few special cases, it is safer to claim that negative offsets are
1920 // not supported.
1921 return false;
1922 }
1923
1924 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1925 return true;
1926
1927 if (AM.Scale == 1 && AM.HasBaseReg)
1928 return true;
1929
1930 return false;
1931 }
1932
1933 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1934 return Subtarget->enableFlatScratch()
1936 : isLegalMUBUFAddressingMode(AM);
1937
1938 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1939 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1940 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1941 // field.
1942 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1943 // an 8-bit dword offset but we don't know the alignment here.
1944 if (!isUInt<16>(AM.BaseOffs))
1945 return false;
1946
1947 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1948 return true;
1949
1950 if (AM.Scale == 1 && AM.HasBaseReg)
1951 return true;
1952
1953 return false;
1954 }
1955
1957 // For an unknown address space, this usually means that this is for some
1958 // reason being used for pure arithmetic, and not based on some addressing
1959 // computation. We don't have instructions that compute pointers with any
1960 // addressing modes, so treat them as having no offset like flat
1961 // instructions.
1963 }
1964
1965 // Assume a user alias of global for unknown address spaces.
1966 return isLegalGlobalAddressingMode(AM);
1967}
1968
// SITargetLowering::canMergeStoresTo (the opening signature line is missing
// from this excerpt): caps how wide a merged store may be, per address space.
1970 const MachineFunction &MF) const {
// NOTE(review): the address-space guard on the (missing) preceding line is
// not visible here — presumably the global/flat case; confirm upstream.
// Widest merged store for that case is 4 dwords (128 bits).
1972 return (MemVT.getSizeInBits() <= 4 * 32);
1973 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
// Scratch is limited by the subtarget's maximum private element size,
// reported in bytes (hence the * 8 to get bits).
1974 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1975 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1976 }
// NOTE(review): the guard on the missing line above is not visible — the
// 2-dword (64-bit) cap presumably covers LDS/region stores; confirm.
1978 return (MemVT.getSizeInBits() <= 2 * 32);
// Any other address space: no extra restriction on merged store width.
1979 return true;
1980}
1981
// SITargetLowering::allowsMisalignedMemoryAccessesImpl (the opening signature
// line is missing from this excerpt). Decides whether a misaligned access of
// `Size` bits in `AddrSpace` with `Alignment` is legal, and optionally reports
// a relative "speed rank" through *IsFast (see the long comment below for the
// semantics of the value; 0 means slowest/not-fast).
1983 unsigned Size, unsigned AddrSpace, Align Alignment,
1984 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
// Default to "not fast" until a case below proves otherwise.
1985 if (IsFast)
1986 *IsFast = 0;
1987
// LDS / GDS (DS instructions) have their own alignment rules.
1988 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1989 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1990 // Check if alignment requirements for ds_read/write instructions are
1991 // disabled.
1992 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1993 return false;
1994
1995 Align RequiredAlignment(
1996 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
1997 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1998 Alignment < RequiredAlignment)
1999 return false;
2000
2001 // Either, the alignment requirements are "enabled", or there is an
2002 // unaligned LDS access related hardware bug though alignment requirements
2003 // are "disabled". In either case, we need to check for proper alignment
2004 // requirements.
2005 //
2006 switch (Size) {
2007 case 64:
2008 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
2009 // address is negative, then the instruction is incorrectly treated as
2010 // out-of-bounds even if base + offsets is in bounds. Split vectorized
2011 // loads here to avoid emitting ds_read2_b32. We may re-combine the
2012 // load later in the SILoadStoreOptimizer.
2013 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
2014 return false;
2015
2016 // 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we
2017 // can do a 4 byte aligned, 8 byte access in a single operation using
2018 // ds_read2/write2_b32 with adjacent offsets.
2019 RequiredAlignment = Align(4);
2020
2021 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2022 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
2023 // ds_write2_b32 depending on the alignment. In either case with either
2024 // alignment there is no faster way of doing this.
2025
2026 // The numbers returned here and below are not additive, it is a 'speed
2027 // rank'. They are just meant to be compared to decide if a certain way
2028 // of lowering an operation is faster than another. For that purpose
2029 // naturally aligned operation gets it bitsize to indicate that "it
2030 // operates with a speed comparable to N-bit wide load". With the full
2031 // alignment ds128 is slower than ds96 for example. If underaligned it
2032 // is comparable to a speed of a single dword access, which would then
2033 // mean 32 < 128 and it is faster to issue a wide load regardless.
2034 // 1 is simply "slow, don't do it". I.e. comparing an aligned load to a
2035 // wider load which will not be aligned anymore the latter is slower.
2036 if (IsFast)
2037 *IsFast = (Alignment >= RequiredAlignment) ? 64
2038 : (Alignment < Align(4)) ? 32
2039 : 1;
2040 return true;
2041 }
2042
2043 break;
2044 case 96:
2045 if (!Subtarget->hasDS96AndDS128())
2046 return false;
2047
2048 // 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
2049 // gfx8 and older.
2050
2051 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2052 // Naturally aligned access is fastest. However, also report it is Fast
2053 // if memory is aligned less than DWORD. A narrow load or store will be
2054 // be equally slow as a single ds_read_b96/ds_write_b96, but there will
2055 // be more of them, so overall we will pay less penalty issuing a single
2056 // instruction.
2057
2058 // See comment on the values above.
2059 if (IsFast)
2060 *IsFast = (Alignment >= RequiredAlignment) ? 96
2061 : (Alignment < Align(4)) ? 32
2062 : 1;
2063 return true;
2064 }
2065
2066 break;
2067 case 128:
2068 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2069 return false;
2070
2071 // 16 byte accessing via ds_read/write_b128 require 16-byte alignment on
2072 // gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a
2073 // single operation using ds_read2/write2_b64.
2074 RequiredAlignment = Align(8);
2075
2076 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2077 // Naturally aligned access is fastest. However, also report it is Fast
2078 // if memory is aligned less than DWORD. A narrow load or store will be
2079 // be equally slow as a single ds_read_b128/ds_write_b128, but there
2080 // will be more of them, so overall we will pay less penalty issuing a
2081 // single instruction.
2082
2083 // See comment on the values above.
2084 if (IsFast)
2085 *IsFast = (Alignment >= RequiredAlignment) ? 128
2086 : (Alignment < Align(4)) ? 32
2087 : 1;
2088 return true;
2089 }
2090
2091 break;
2092 default:
2093 if (Size > 32)
2094 return false;
2095
2096 break;
2097 }
2098
2099 // See comment on the values above.
2100 // Note that we have a single-dword or sub-dword here, so if underaligned
2101 // it is a slowest possible access, hence returned value is 0.
2102 if (IsFast)
2103 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2104
2105 return Alignment >= RequiredAlignment ||
2106 Subtarget->hasUnalignedDSAccessEnabled();
2107 }
2108
2109 // FIXME: We have to be conservative here and assume that flat operations
2110 // will access scratch. If we had access to the IR function, then we
2111 // could determine if any private memory was used in the function.
2112 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
2113 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2114 bool AlignedBy4 = Alignment >= Align(4);
2115 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2116 if (IsFast)
2117 *IsFast = AlignedBy4 ? Size : 1;
2118 return true;
2119 }
2120
2121 if (IsFast)
2122 *IsFast = AlignedBy4;
2123
2124 return AlignedBy4;
2125 }
2126
2127 // So long as they are correct, wide global memory operations perform better
2128 // than multiple smaller memory ops -- even when misaligned
2129 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
2130 if (IsFast)
2131 *IsFast = Size;
2132
2133 return Alignment >= Align(4) ||
2134 Subtarget->hasUnalignedBufferAccessEnabled();
2135 }
2136
2137 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
2138 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
2139 // out-of-bounds behavior, but in the edge case where an access starts
2140 // out-of-bounds and then enter in-bounds, the entire access would be treated
2141 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
2142 // natural alignment of buffer accesses.
2143 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
2144 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
2145 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2146 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2147 Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
2148 return false;
2149 }
2150
2151 // Smaller than dword value must be aligned.
2152 if (Size < 32)
2153 return false;
2154
2155 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2156 // byte-address are ignored, thus forcing Dword alignment.
2157 // This applies to private, global, and constant memory.
2158 if (IsFast)
2159 *IsFast = 1;
2160
2161 return Size >= 32 && Alignment >= Align(4);
2162}
2163
// SITargetLowering::allowsMisalignedMemoryAccesses (opening signature line is
// missing from this excerpt): thin EVT-based wrapper over the bit-size
// implementation above.
2165 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2166 unsigned *IsFast) const {
// NOTE(review): the line starting this return (presumably forwarding
// VT.getSizeInBits() to allowsMisalignedMemoryAccessesImpl) is not visible
// in this excerpt — confirm against the full source.
2168 Alignment, Flags, IsFast);
2169}
2170
// SITargetLowering::getOptimalMemOpType (opening signature line is missing
// from this excerpt): picks the widest convenient type for expanding a memory
// operation (MemOp) such as memcpy/memset.
2172 LLVMContext &Context, const MemOp &Op,
2173 const AttributeList &FuncAttributes) const {
2174 // FIXME: Should account for address space here.
2175
2176 // The default fallback uses the private pointer size as a guess for a type to
2177 // use. Make sure we switch these to 64-bit accesses.
2178
// >= 16 bytes and dword-aligned destination: use 4 dwords per access.
2179 if (Op.size() >= 16 &&
2180 Op.isDstAligned(Align(4))) // XXX: Should only do for global
2181 return MVT::v4i32;
2182
// >= 8 bytes and dword-aligned: use 2 dwords per access.
2183 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
2184 return MVT::v2i32;
2185
2186 // Use the default.
2187 return MVT::Other;
2188}
2189
// (Opening signature line is missing from this excerpt.) Returns true when
// the memory node's operand carries the target-specific MONoClobber flag,
// i.e. the access is known not to be clobbered.
2191 const MemSDNode *MemNode = cast<MemSDNode>(N);
2192 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2193}
2194
2199
// SITargetLowering::isFreeAddrSpaceCast (opening signature line is missing
// from this excerpt): reports whether a SrcAS -> DestAS addrspacecast needs
// no code.
2201 unsigned DestAS) const {
2202 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2203 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2204 Subtarget->hasGloballyAddressableScratch()) {
2205 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2206 return false;
2207 }
2208
2209 // Flat -> private/local is a simple truncate.
2210 // Flat -> global is no-op
2211 return true;
2212 }
2213
// Everything else defers to the target machine's generic answer.
2214 const GCNTargetMachine &TM =
2215 static_cast<const GCNTargetMachine &>(getTargetMachine());
2216 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2217}
2218
2226
// (Opening signature line is missing from this excerpt.) Always prefers
// materializing a constant as an immediate over loading it from memory.
2228 Type *Ty) const {
2229 // FIXME: Could be smarter if called for vector constants.
2230 return true;
2231}
2232
// (Opening signature line is missing from this excerpt; presumably
// SITargetLowering::isExtractSubvectorCheap.)
2234 unsigned Index) const {
// NOTE(review): the condition guarding this `return false` (the missing line
// above it) is not visible in this excerpt — confirm against the full source.
2236 return false;
2237
2238 // TODO: Add more cases that are cheap.
// Only extracting the leading subvector (index 0) is considered cheap.
2239 return Index == 0;
2240}
2241
2242bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2243 // TODO: This should be more aggressive, particular for 16-bit element
2244 // vectors. However there are some mixed improvements and regressions.
2245 EVT EltTy = VT.getVectorElementType();
2246 return EltTy.getSizeInBits() % 32 == 0;
2247}
2248
// (Opening signature line is missing from this excerpt; presumably
// SITargetLowering::isTypeDesirableForOp.) Filters which operations are
// worth performing in the given type.
2250 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
// With 16-bit instructions, only loads/stores are desirable directly in i16.
2251 switch (Op) {
2252 case ISD::LOAD:
2253 case ISD::STORE:
2254 return true;
2255 default:
2256 return false;
2257 }
2258 }
2259
2260 // SimplifySetCC uses this function to determine whether or not it should
2261 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2262 if (VT == MVT::i1 && Op == ISD::SETCC)
2263 return false;
2264
// NOTE(review): the fall-through return on the missing line above (presumably
// delegating to the TargetLowering base implementation) is not visible here.
2266 }
2267
// Builds a pointer `Offset` bytes into the kernel argument segment.
2268SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2269 const SDLoc &SL,
2270 SDValue Chain,
2271 uint64_t Offset) const {
2272 const DataLayout &DL = DAG.getDataLayout();
// NOTE(review): the setup lines preceding this point (presumably fetching the
// MachineFunction, SIMachineFunctionInfo *Info, and the constant-address
// pointer type PtrVT used below) are missing from this excerpt — confirm.
2276
2277 auto [InputPtrReg, RC, ArgTy] =
2278 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2279
2280 // We may not have the kernarg segment argument if we have no kernel
2281 // arguments.
2282 if (!InputPtrReg)
2283 return DAG.getConstant(Offset, SL, PtrVT);
2284
// NOTE(review): the missing line above presumably declares the
// MachineRegisterInfo &MRI referenced below — confirm.
2286 SDValue BasePtr = DAG.getCopyFromReg(
2287 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2288
// The offset is an in-bounds object offset from the segment base pointer.
2289 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2290}
2291
// Returns a pointer to the implicit-argument area of the kernarg segment.
2292SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2293 const SDLoc &SL) const {
// NOTE(review): the missing lines above presumably compute `Offset` (the
// byte offset of the implicit arguments within the kernarg segment) — confirm.
2296 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2297}
2298
// Materializes the LDS kernel id as an i32 constant when its value is known,
// otherwise returns an empty SDValue.
2299SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2300 const SDLoc &SL) const {
2301
// NOTE(review): the missing lines here presumably fetch the IR Function and
// the metadata-derived id used to initialize KnownSize — confirm upstream.
2303 std::optional<uint32_t> KnownSize =
2305 if (KnownSize.has_value())
2306 return DAG.getConstant(*KnownSize, SL, MVT::i32);
// Unknown at compile time: signal "no value" to the caller.
2307 return SDValue();
2308}
2309
// Converts a loaded argument value of memory type MemVT into the expected
// value type VT, honoring sext/zext flags from the incoming argument info.
2310SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2311 const SDLoc &SL, SDValue Val,
2312 bool Signed,
2313 const ISD::InputArg *Arg) const {
2314 // First, if it is a widened vector, narrow it.
// NOTE(review): the rest of this condition and the construction of
// NarrowedVT are on lines missing from this excerpt (presumably comparing
// element counts and building the narrower vector type) — confirm.
2315 if (VT.isVector() &&
2317 EVT NarrowedVT =
2320 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2321 DAG.getConstant(0, SL, MVT::i32));
2322 }
2323
2324 // Then convert the vector elements or scalar value.
2325 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2326 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2327 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2328 }
2329
// Floating point uses fpext/fpround; integers use sext/zext/trunc.
2330 if (MemVT.isFloatingPoint())
2331 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2332 else if (Signed)
2333 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2334 else
2335 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2336
2337 return Val;
2338}
2339
// Loads a kernel argument from the constant kernarg segment at `Offset`, and
// converts it from its memory type MemVT to the expected value type VT.
// Returns a merge of {converted value, load chain}.
2340SDValue SITargetLowering::lowerKernargMemParameter(
2341 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2342 uint64_t Offset, Align Alignment, bool Signed,
2343 const ISD::InputArg *Arg) const {
2344 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2345
2346 // Try to avoid using an extload by loading earlier than the argument address,
2347 // and extracting the relevant bits. The load should hopefully be merged with
2348 // the previous argument.
2349 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2350 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2351 int64_t AlignDownOffset = alignDown(Offset, 4);
2352 int64_t OffsetDiff = Offset - AlignDownOffset;
2353
2354 EVT IntVT = MemVT.changeTypeToInteger();
2355
2356 // TODO: If we passed in the base kernel offset we could have a better
2357 // alignment than 4, but we don't really need it.
2358 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
// NOTE(review): the trailing MachineMemOperand flag arguments to this
// getLoad call are on lines missing from this excerpt — confirm.
2359 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2362
// Shift the desired bits down to bit 0 and truncate to the memory type.
2363 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2364 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2365
2366 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2367 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2368 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2369
2370 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2371 }
2372
// Common path: direct load at the argument's own offset/alignment.
2373 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
// NOTE(review): as above, the MachineMemOperand flag arguments are on
// missing lines — confirm.
2374 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2377
2378 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2379 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2380}
2381
2382/// Coerce an argument which was passed in a different ABI type to the original
2383/// expected value type.
2384SDValue SITargetLowering::convertABITypeToValueType(SelectionDAG &DAG,
2385 SDValue Val,
2386 CCValAssign &VA,
2387 const SDLoc &SL) const {
2388 EVT ValVT = VA.getValVT();
2389
2390 // If this is an 8 or 16-bit value, it is really passed promoted
2391 // to 32 bits. Insert an assert[sz]ext to capture this, then
2392 // truncate to the right size.
2393 switch (VA.getLocInfo()) {
2394 case CCValAssign::Full:
2395 return Val;
2396 case CCValAssign::BCvt:
2397 return DAG.getNode(ISD::BITCAST, SL, ValVT, Val);
2398 case CCValAssign::SExt:
2399 Val = DAG.getNode(ISD::AssertSext, SL, VA.getLocVT(), Val,
2400 DAG.getValueType(ValVT));
2401 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2402 case CCValAssign::ZExt:
2403 Val = DAG.getNode(ISD::AssertZext, SL, VA.getLocVT(), Val,
2404 DAG.getValueType(ValVT));
2405 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2406 case CCValAssign::AExt:
2407 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2408 default:
2409 llvm_unreachable("Unknown loc info!");
2410 }
2411}
2412
// Lowers a formal argument that was assigned a stack slot: creates a fixed
// frame object and a (possibly extending) load from it, then coerces the
// loaded value back to the expected value type.
2413SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2414 CCValAssign &VA, const SDLoc &SL,
2415 SDValue Chain,
2416 const ISD::InputArg &Arg) const {
2417 MachineFunction &MF = DAG.getMachineFunction();
2418 MachineFrameInfo &MFI = MF.getFrameInfo();
2419
// byval arguments are referenced by address, not loaded.
2420 if (Arg.Flags.isByVal()) {
2421 unsigned Size = Arg.Flags.getByValSize();
2422 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2423 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2424 }
2425
2426 unsigned ArgOffset = VA.getLocMemOffset();
2427 unsigned ArgSize = VA.getValVT().getStoreSize();
2428
2429 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2430
2431 // Create load nodes to retrieve arguments from the stack.
2432 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2433
2434 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
// NOTE(review): the declaration of ExtType (presumably defaulted to
// ISD::NON_EXTLOAD) is on a line missing from this excerpt — confirm.
2436 MVT MemVT = VA.getValVT();
2437
// Pick the load extension kind matching how the value was passed.
2438 switch (VA.getLocInfo()) {
2439 default:
2440 break;
2441 case CCValAssign::BCvt:
2442 MemVT = VA.getLocVT();
2443 break;
2444 case CCValAssign::SExt:
2445 ExtType = ISD::SEXTLOAD;
2446 break;
2447 case CCValAssign::ZExt:
2448 ExtType = ISD::ZEXTLOAD;
2449 break;
2450 case CCValAssign::AExt:
2451 ExtType = ISD::EXTLOAD;
2452 break;
2453 }
2454
// NOTE(review): the final arguments to getExtLoad (presumably the fixed-stack
// MachinePointerInfo and MemVT) are on a missing line — confirm.
2455 SDValue ArgValue = DAG.getExtLoad(
2456 ExtType, SL, VA.getLocVT(), Chain, FIN,
2458
2459 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
// If no conversion happened, the load (with its chain) is returned directly.
2460 if (ConvertedVal == ArgValue)
2461 return ConvertedVal;
2462
2463 return DAG.getMergeValues({ConvertedVal, ArgValue.getValue(1)}, SL);
2464}
2465
// Computes the global workgroup id for one dimension, accounting for
// workgroup clusters when the subtarget supports them.
// NOTE(review): two of this function's parameter lines (presumably
// WorkGroupIdPV and ClusterMaxIdPV) are missing from this excerpt.
2466SDValue SITargetLowering::lowerWorkGroupId(
2467 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2470 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
// Without cluster support the preloaded value already is the workgroup id.
2471 if (!Subtarget->hasClusters())
2472 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2473
2474 // Clusters are supported. Return the global position in the grid. If clusters
2475 // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.
2476
2477 // WorkGroupIdXYZ = ClusterId == 0 ?
2478 // ClusterIdXYZ :
2479 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
2480 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2481 SDLoc SL(ClusterIdXYZ);
2482 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2483 SDValue One = DAG.getConstant(1, SL, VT);
2484 SDValue ClusterSizeXYZ = DAG.getNode(ISD::ADD, SL, VT, ClusterMaxIdXYZ, One);
2485 SDValue ClusterWorkGroupIdXYZ =
2486 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2487 SDValue GlobalIdXYZ =
2488 DAG.getNode(ISD::ADD, SL, VT, ClusterWorkGroupIdXYZ,
2489 DAG.getNode(ISD::MUL, SL, VT, ClusterIdXYZ, ClusterSizeXYZ));
2490
// NOTE(review): the case labels of this switch (the ClusterDims kind
// enumerators) are on lines missing from this excerpt — confirm which kind
// maps to which return.
2491 switch (MFI.getClusterDims().getKind()) {
2494 return GlobalIdXYZ;
2496 return ClusterIdXYZ;
2498 using namespace AMDGPU::Hwreg;
// Read the cluster id field from the IB_STS2 hardware register at runtime
// and select between the cluster-relative and global id.
2499 SDValue ClusterIdField =
2500 DAG.getTargetConstant(HwregEncoding::encode(ID_IB_STS2, 6, 4), SL, VT);
2501 SDNode *GetReg =
2502 DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2503 SDValue ClusterId(GetReg, 0);
2504 SDValue Zero = DAG.getConstant(0, SL, VT);
2505 return DAG.getNode(ISD::SELECT_CC, SL, VT, ClusterId, Zero, ClusterIdXYZ,
2506 GlobalIdXYZ, DAG.getCondCode(ISD::SETEQ));
2507 }
2508 }
2509
2510 llvm_unreachable("nothing should reach here");
2511}
2512
// Returns the DAG value for a preloaded function input (workgroup ids,
// cluster ids, etc.), either from an architected TTMP register layout or
// from the function's preloaded-argument bookkeeping.
// NOTE(review): several lines of this function (the PVID case labels, the
// parameter line declaring PVID/CC, and some guards) are missing from this
// excerpt; the structure below should be confirmed against the full source.
2513SDValue SITargetLowering::getPreloadedValue(
2514 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2516 const ArgDescriptor *Reg = nullptr;
2517 const TargetRegisterClass *RC;
2518 LLT Ty;
2519
// Architected-SGPR layout: ids live in fixed TTMP registers, packed as
// bitfields (masks below select the relevant field).
2521 const ArgDescriptor WorkGroupIDX =
2522 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2523 // If GridZ is not programmed in an entry function then the hardware will set
2524 // it to all zeros, so there is no need to mask the GridY value in the low
2525 // order bits.
2526 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2527 AMDGPU::TTMP7,
2528 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2529 const ArgDescriptor WorkGroupIDZ =
2530 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
// Cluster-relative ids and max-ids are 4-bit fields packed into TTMP6.
2531 const ArgDescriptor ClusterWorkGroupIDX =
2532 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
2533 const ArgDescriptor ClusterWorkGroupIDY =
2534 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
2535 const ArgDescriptor ClusterWorkGroupIDZ =
2536 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
2537 const ArgDescriptor ClusterWorkGroupMaxIDX =
2538 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
2539 const ArgDescriptor ClusterWorkGroupMaxIDY =
2540 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
2541 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2542 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
2543 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2544 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
2545
2546 auto LoadConstant = [&](unsigned N) {
2547 return DAG.getConstant(N, SDLoc(), VT);
2548 };
2549
// NOTE(review): the second half of this condition is on a missing line.
2550 if (Subtarget->hasArchitectedSGPRs() &&
2552 AMDGPU::ClusterDimsAttr ClusterDims = MFI.getClusterDims();
2553 bool HasFixedDims = ClusterDims.isFixedDims();
2554
// NOTE(review): the case labels (PVID enumerators) in this switch are on
// missing lines; each arm binds Reg to the matching descriptor above. When
// cluster dims are fixed at compile time, ids/max-ids fold to constants.
2555 switch (PVID) {
2557 Reg = &WorkGroupIDX;
2558 RC = &AMDGPU::SReg_32RegClass;
2559 Ty = LLT::scalar(32);
2560 break;
2562 Reg = &WorkGroupIDY;
2563 RC = &AMDGPU::SReg_32RegClass;
2564 Ty = LLT::scalar(32);
2565 break;
2567 Reg = &WorkGroupIDZ;
2568 RC = &AMDGPU::SReg_32RegClass;
2569 Ty = LLT::scalar(32);
2570 break;
2572 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
2573 return LoadConstant(0);
2574 Reg = &ClusterWorkGroupIDX;
2575 RC = &AMDGPU::SReg_32RegClass;
2576 Ty = LLT::scalar(32);
2577 break;
2579 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
2580 return LoadConstant(0);
2581 Reg = &ClusterWorkGroupIDY;
2582 RC = &AMDGPU::SReg_32RegClass;
2583 Ty = LLT::scalar(32);
2584 break;
2586 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
2587 return LoadConstant(0);
2588 Reg = &ClusterWorkGroupIDZ;
2589 RC = &AMDGPU::SReg_32RegClass;
2590 Ty = LLT::scalar(32);
2591 break;
2593 if (HasFixedDims)
2594 return LoadConstant(ClusterDims.getDims()[0] - 1);
2595 Reg = &ClusterWorkGroupMaxIDX;
2596 RC = &AMDGPU::SReg_32RegClass;
2597 Ty = LLT::scalar(32);
2598 break;
2600 if (HasFixedDims)
2601 return LoadConstant(ClusterDims.getDims()[1] - 1);
2602 Reg = &ClusterWorkGroupMaxIDY;
2603 RC = &AMDGPU::SReg_32RegClass;
2604 Ty = LLT::scalar(32);
2605 break;
2607 if (HasFixedDims)
2608 return LoadConstant(ClusterDims.getDims()[2] - 1);
2609 Reg = &ClusterWorkGroupMaxIDZ;
2610 RC = &AMDGPU::SReg_32RegClass;
2611 Ty = LLT::scalar(32);
2612 break;
2614 Reg = &ClusterWorkGroupMaxFlatID;
2615 RC = &AMDGPU::SReg_32RegClass;
2616 Ty = LLT::scalar(32);
2617 break;
2618 default:
2619 break;
2620 }
2621 }
2622
// Fall back to the function's recorded preloaded-value bookkeeping.
2623 if (!Reg)
2624 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2625 if (!Reg) {
// NOTE(review): the guard on the missing line above (presumably testing for
// the kernarg-segment-ptr case) is not visible here — confirm.
2627 // It's possible for a kernarg intrinsic call to appear in a kernel with
2628 // no allocated segment, in which case we do not add the user sgpr
2629 // argument, so just return null.
2630 return DAG.getConstant(0, SDLoc(), VT);
2631 }
2632
2633 // It's undefined behavior if a function marked with the amdgpu-no-*
2634 // attributes uses the corresponding intrinsic.
2635 return DAG.getPOISON(VT);
2636 }
2637
2638 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2639}
2640
// (Opening signature line is missing from this excerpt; presumably a static
// helper taking the Splits output vector as its first parameter.) Walks the
// incoming formal arguments, tracking AMDGPU_PS input slots: unused PS inputs
// are skipped entirely, used ones are marked allocated/enabled in Info, and
// all surviving arguments are appended to Splits.
2642 CallingConv::ID CallConv,
2643 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2644 FunctionType *FType,
// NOTE(review): the last parameter line (presumably SIMachineFunctionInfo
// *Info, used below) is missing from this excerpt — confirm.
2646 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2647 const ISD::InputArg *Arg = &Ins[I];
2648
2649 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2650 "vector type argument should have been split");
2651
2652 // First check if it's a PS input addr.
2653 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2654 PSInputNum <= 15) {
2655 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2656
2657 // Inconveniently only the first part of the split is marked as isSplit,
2658 // so skip to the end. We only want to increment PSInputNum once for the
2659 // entire split argument.
2660 if (Arg->Flags.isSplit()) {
2661 while (!Arg->Flags.isSplitEnd()) {
2662 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2663 "unexpected vector split in ps argument type");
2664 if (!SkipArg)
2665 Splits.push_back(*Arg);
2666 Arg = &Ins[++I];
2667 }
2668 }
2669
2670 if (SkipArg) {
2671 // We can safely skip PS inputs.
2672 Skipped.set(Arg->getOrigArgIndex());
2673 ++PSInputNum;
2674 continue;
2675 }
2676
2677 Info->markPSInputAllocated(PSInputNum);
2678 if (Arg->Used)
2679 Info->markPSInputEnabled(PSInputNum);
2680
2681 ++PSInputNum;
2682 }
2683
2684 Splits.push_back(*Arg);
2685 }
2686}
2687
2688// Allocate special inputs passed in VGPRs.
// (The opening signature line is missing from this excerpt.) For entry
// functions, claims VGPR0..2 for the workitem id X/Y/Z inputs. When the
// subtarget packs the TID, Y and Z share VGPR0 as 10-bit bitfields.
2690 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2691 SIMachineFunctionInfo &Info) const {
2692 const LLT S32 = LLT::scalar(32);
// NOTE(review): the missing line above presumably declares the
// MachineRegisterInfo &MRI used below — confirm.
2694
2695 if (Info.hasWorkItemIDX()) {
2696 Register Reg = AMDGPU::VGPR0;
2697 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2698
2699 CCInfo.AllocateReg(Reg);
// With packed TID and a Y id present, X occupies only the low 10 bits.
2700 unsigned Mask =
2701 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2702 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2703 }
2704
2705 if (Info.hasWorkItemIDY()) {
2706 assert(Info.hasWorkItemIDX());
2707 if (Subtarget->hasPackedTID()) {
2708 Info.setWorkItemIDY(
2709 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2710 } else {
2711 unsigned Reg = AMDGPU::VGPR1;
2712 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2713
2714 CCInfo.AllocateReg(Reg);
2715 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2716 }
2717 }
2718
2719 if (Info.hasWorkItemIDZ()) {
2720 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2721 if (Subtarget->hasPackedTID()) {
2722 Info.setWorkItemIDZ(
2723 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2724 } else {
2725 unsigned Reg = AMDGPU::VGPR2;
2726 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2727
2728 CCInfo.AllocateReg(Reg);
2729 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2730 }
2731 }
2732}
2733
2734// Try to allocate a VGPR at the end of the argument list, or if no argument
2735// VGPRs are left allocating a stack slot.
2736// If \p Mask is given it indicates bitfield position in the register.
2737// If \p Arg is given use it with new \p Mask instead of allocating new.
2738static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2739 ArgDescriptor Arg = ArgDescriptor()) {
2740 if (Arg.isSet())
2741 return ArgDescriptor::createArg(Arg, Mask);
2742
2743 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2744 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2745 if (RegIdx == ArgVGPRs.size()) {
2746 // Spill to stack required.
2747 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2748
2749 return ArgDescriptor::createStack(Offset, Mask);
2750 }
2751
2752 unsigned Reg = ArgVGPRs[RegIdx];
2753 Reg = CCInfo.AllocateReg(Reg);
2754 assert(Reg != AMDGPU::NoRegister);
2755
2756 MachineFunction &MF = CCInfo.getMachineFunction();
2757 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2758 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2759 return ArgDescriptor::createRegister(Reg, Mask);
2760}
2761
// (Opening signature line is missing from this excerpt; presumably
// `static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,`.)
// Allocates the next free register from the first 32 entries of \p RC and
// registers it as a live-in; aborts if the pool is exhausted.
2763 const TargetRegisterClass *RC,
2764 unsigned NumArgRegs) {
// NOTE(review): the pool size is hard-coded to 32 here; the NumArgRegs
// parameter appears unused in the visible body — confirm whether that is
// intentional upstream.
2765 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2766 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2767 if (RegIdx == ArgSGPRs.size())
2768 report_fatal_error("ran out of SGPRs for arguments");
2769
2770 unsigned Reg = ArgSGPRs[RegIdx];
2771 Reg = CCInfo.AllocateReg(Reg);
2772 assert(Reg != AMDGPU::NoRegister);
2773
2774 MachineFunction &MF = CCInfo.getMachineFunction();
2775 MF.addLiveIn(Reg, RC);
// NOTE(review): the return statement (presumably building an ArgDescriptor
// from Reg) is on a line missing from this excerpt — confirm.
2777}
2778
2779// If this has a fixed position, we still should allocate the register in the
2780// CCInfo state. Technically we could get away with this for values passed
2781// outside of the normal argument range.
// (Opening signature line is missing from this excerpt; presumably
// `static void allocateFixedSGPRInputImpl(CCState &CCInfo,`.) Marks a
// fixed-position SGPR as allocated in CCInfo and registers it as a live-in.
2783 const TargetRegisterClass *RC,
2784 MCRegister Reg) {
2785 Reg = CCInfo.AllocateReg(Reg);
2786 assert(Reg != AMDGPU::NoRegister);
2787 MachineFunction &MF = CCInfo.getMachineFunction();
2788 MF.addLiveIn(Reg, RC);
2789}
2790
2791static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2792 if (Arg) {
2793 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2794 Arg.getRegister());
2795 } else
2796 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2797}
2798
2799static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2800 if (Arg) {
2801 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2802 Arg.getRegister());
2803 } else
2804 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2805}
2806
2807/// Allocate implicit function VGPR arguments at the end of allocated user
2808/// arguments.
// (The opening signature line is missing from this excerpt.) Each present
// workitem id dimension gets a 10-bit field; passing the previous Arg lets
// Y/Z share the same register as bitfields when possible.
2810 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2811 SIMachineFunctionInfo &Info) const {
2812 const unsigned Mask = 0x3ff;
2813 ArgDescriptor Arg;
2814
2815 if (Info.hasWorkItemIDX()) {
2816 Arg = allocateVGPR32Input(CCInfo, Mask);
2817 Info.setWorkItemIDX(Arg);
2818 }
2819
2820 if (Info.hasWorkItemIDY()) {
2821 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2822 Info.setWorkItemIDY(Arg);
2823 }
2824
2825 if (Info.hasWorkItemIDZ())
2826 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2827}
2828
2829/// Allocate implicit function VGPR arguments in fixed registers.
// (The opening signature line is missing from this excerpt.) All three
// workitem ids are packed as 10-bit fields of VGPR31.
2831 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2832 SIMachineFunctionInfo &Info) const {
2833 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2834 if (!Reg)
2835 report_fatal_error("failed to allocate VGPR for implicit arguments");
2836
2837 const unsigned Mask = 0x3ff;
2838 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2839 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2840 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2841}
2842
// (The opening signature line is missing from this excerpt.) Allocates the
// implicit SGPR inputs (dispatch ptr, queue ptr, implicit arg ptr, dispatch
// id, workgroup ids, LDS kernel id) for a non-entry function, in order.
2844 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2845 SIMachineFunctionInfo &Info) const {
2846 auto &ArgInfo = Info.getArgInfo();
2847 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2848
2849 // TODO: Unify handling with private memory pointers.
2850 if (UserSGPRInfo.hasDispatchPtr())
2851 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2852
2853 if (UserSGPRInfo.hasQueuePtr())
2854 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2855
2856 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2857 // constant offset from the kernarg segment.
2858 if (Info.hasImplicitArgPtr())
2859 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2860
2861 if (UserSGPRInfo.hasDispatchID())
2862 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2863
2864 // flat_scratch_init is not applicable for non-kernel functions.
2865
2866 if (Info.hasWorkGroupIDX())
2867 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2868
2869 if (Info.hasWorkGroupIDY())
2870 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2871
2872 if (Info.hasWorkGroupIDZ())
2873 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2874
2875 if (Info.hasLDSKernelId())
2876 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2877}
2878
2879// Allocate special inputs passed in user SGPRs.
// (The opening signature line is missing from this excerpt.) Claims each
// user SGPR input declared by the kernel (implicit buffer ptr, private
// segment buffer, dispatch/queue/kernarg/dispatch-id, flat scratch init,
// private segment size) as live-ins, in ABI order.
2881 MachineFunction &MF,
2882 const SIRegisterInfo &TRI,
2883 SIMachineFunctionInfo &Info) const {
2884 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2885 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2886 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2887 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2888 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2889 }
2890
2891 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2892 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2893 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2894 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2895 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2896 }
2897
2898 if (UserSGPRInfo.hasDispatchPtr()) {
2899 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2900 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2901 CCInfo.AllocateReg(DispatchPtrReg);
2902 }
2903
2904 if (UserSGPRInfo.hasQueuePtr()) {
2905 Register QueuePtrReg = Info.addQueuePtr(TRI);
2906 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2907 CCInfo.AllocateReg(QueuePtrReg);
2908 }
2909
2910 if (UserSGPRInfo.hasKernargSegmentPtr()) {
// NOTE(review): the missing line above presumably declares the
// MachineRegisterInfo &MRI used below — confirm.
2912 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2913 CCInfo.AllocateReg(InputPtrReg);
2914
// The kernarg pointer is typed as a 64-bit constant-address pointer.
2915 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2916 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2917 }
2918
2919 if (UserSGPRInfo.hasDispatchID()) {
2920 Register DispatchIDReg = Info.addDispatchID(TRI);
2921 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2922 CCInfo.AllocateReg(DispatchIDReg);
2923 }
2924
2925 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2926 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2927 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2928 CCInfo.AllocateReg(FlatScratchInitReg);
2929 }
2930
2931 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2932 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2933 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2934 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2935 }
2936
2937 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2938 // these from the dispatch pointer.
2939}
2940
2941// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2942// sequential starting from the first argument.
// Walks the IR arguments in order for as long as they carry the "inreg"
// attribute and there are free user SGPRs; for each argument, iterates the
// corresponding lowered parts in Ins and assigns them consecutive SGPRs
// (recording them in Info's PreloadKernArgs). The walk stops — ending the
// preload sequence — at the first argument that is not inreg, is not an
// original argument, or does not fit in the remaining user SGPRs.
2944 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2946 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2947 Function &F = MF.getFunction();
// Running end offset (in bytes) of the previously allocated argument; used
// to compute inter-argument padding in SGPR units.
2948 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2949 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2950 bool InPreloadSequence = true;
2951 unsigned InIdx = 0;
2952 bool AlignedForImplictArgs = false;
2953 unsigned ImplicitArgOffset = 0;
2954 for (auto &Arg : F.args()) {
2955 if (!InPreloadSequence || !Arg.hasInRegAttr())
2956 break;
2957
2958 unsigned ArgIdx = Arg.getArgNo();
2959 // Don't preload non-original args or parts not in the current preload
2960 // sequence.
2961 if (InIdx < Ins.size() &&
2962 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2963 break;
2964
// Inner loop: all Ins entries (split parts) belonging to this IR argument.
2965 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2966 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2967 InIdx++) {
2968 assert(ArgLocs[ArgIdx].isMemLoc());
2969 auto &ArgLoc = ArgLocs[InIdx];
2970 const Align KernelArgBaseAlign = Align(16);
2971 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2972 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
// SGPRs are 32-bit; round the part's bit width up to whole registers.
2973 unsigned NumAllocSGPRs =
2974 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2975
2976 // Fix alignment for hidden arguments.
// Hidden ("amdgpu-hidden-argument") args start at the implicit-arg base,
// so the first one adds the padding needed to reach that alignment; the
// same adjustment is then applied to every later hidden arg offset.
2977 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2978 if (!AlignedForImplictArgs) {
2979 ImplicitArgOffset =
2980 alignTo(LastExplicitArgOffset,
2981 Subtarget->getAlignmentForImplicitArgPtr()) -
2982 LastExplicitArgOffset;
2983 AlignedForImplictArgs = true;
2984 }
2985 ArgOffset += ImplicitArgOffset;
2986 }
2987
2988 // Arg is preloaded into the previous SGPR.
// Sub-dword, sub-dword-aligned parts share the previous part's register
// rather than consuming a fresh SGPR.
2989 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2990 assert(InIdx >= 1 && "No previous SGPR");
2991 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2992 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2993 continue;
2994 }
2995
2996 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2997 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2998 // Check for free user SGPRs for preloading.
2999 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
3000 InPreloadSequence = false;
3001 break;
3002 }
3003
3004 // Preload this argument.
3005 const TargetRegisterClass *RC =
3006 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
3007 SmallVectorImpl<MCRegister> *PreloadRegs =
3008 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
3009
// When the argument could not be placed in a single aligned tuple, it is
// recorded as multiple individual 32-bit SGPRs.
3010 if (PreloadRegs->size() > 1)
3011 RC = &AMDGPU::SGPR_32RegClass;
3012 for (auto &Reg : *PreloadRegs) {
3013 assert(Reg);
3014 MF.addLiveIn(Reg, RC);
3015 CCInfo.AllocateReg(Reg);
3016 }
3017
3018 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3019 }
3020 }
3021}
3022
// Allocate the SGPR that carries the LDS kernel id, if this function needs
// one. It is claimed from Info, marked live-in, and reserved in CCInfo like
// the other user SGPR inputs. (The signature's first line is dropped from
// this listing.)
3024 const SIRegisterInfo &TRI,
3025 SIMachineFunctionInfo &Info) const {
3026 // Always allocate this last since it is a synthetic preload.
3027 if (Info.hasLDSKernelId()) {
3028 Register Reg = Info.addLDSKernelId();
3029 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3030 CCInfo.AllocateReg(Reg);
3031 }
3032}
3033
3034// Allocate special input registers that are initialized per-wave.
// Allocates the system SGPRs: workgroup IDs X/Y/Z (unless provided by
// architected SGPRs), the workgroup info register, and the private segment
// wave byte offset. On subtargets with the user-SGPR-init-16 bug (and for
// non-shaders), first pads the user SGPR count with dead reserved inputs so
// that at least 16 SGPRs are initialized.
3037 CallingConv::ID CallConv,
3038 bool IsShader) const {
3039 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3040 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
3041 // Note: user SGPRs are handled by the front-end for graphics shaders
3042 // Pad up the used user SGPRs with dead inputs.
3043
3044 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
3045 // before enabling architected SGPRs for workgroup IDs.
3046 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
3047
3048 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3049 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
3050 // rely on it to reach 16 since if we end up having no stack usage, it will
3051 // not really be added.
3052 unsigned NumRequiredSystemSGPRs =
3053 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3054 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3055 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3056 Register Reg = Info.addReservedUserSGPR();
3057 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3058 CCInfo.AllocateReg(Reg);
3059 }
3060 }
3061
// Workgroup IDs only consume system SGPRs when they are not supplied by
// architected SGPRs.
3062 if (!HasArchitectedSGPRs) {
3063 if (Info.hasWorkGroupIDX()) {
3064 Register Reg = Info.addWorkGroupIDX();
3065 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3066 CCInfo.AllocateReg(Reg);
3067 }
3068
3069 if (Info.hasWorkGroupIDY()) {
3070 Register Reg = Info.addWorkGroupIDY();
3071 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3072 CCInfo.AllocateReg(Reg);
3073 }
3074
3075 if (Info.hasWorkGroupIDZ()) {
3076 Register Reg = Info.addWorkGroupIDZ();
3077 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3078 CCInfo.AllocateReg(Reg);
3079 }
3080 }
3081
3082 if (Info.hasWorkGroupInfo()) {
3083 Register Reg = Info.addWorkGroupInfo();
3084 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3085 CCInfo.AllocateReg(Reg);
3086 }
3087
3088 if (Info.hasPrivateSegmentWaveByteOffset()) {
3089 // Scratch wave offset passed in system SGPR.
3090 unsigned PrivateSegmentWaveByteOffsetReg;
3091
3092 if (IsShader) {
3093 PrivateSegmentWaveByteOffsetReg =
3094 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3095
3096 // This is true if the scratch wave byte offset doesn't have a fixed
3097 // location.
3098 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3099 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
3100 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3101 }
3102 } else
3103 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3104
3105 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3106 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
3107 }
3108
// Sanity check for the init16 workaround above.
3109 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
3110 Info.getNumPreloadedSGPRs() >= 16);
3111}
3112
// Decide which physical registers hold the scratch resource descriptor, the
// stack pointer, and the frame pointer for this function, and record them in
// Info. (The signature's first line is dropped from this listing; this
// matches the shape of SITargetLowering::reservePrivateMemoryRegs —
// TODO confirm against the full source.)
3114 MachineFunction &MF,
3115 const SIRegisterInfo &TRI,
3117 // Now that we've figured out where the scratch register inputs are, see if
3118 // should reserve the arguments and use them directly.
3119 MachineFrameInfo &MFI = MF.getFrameInfo();
3120 bool HasStackObjects = MFI.hasStackObjects();
3121 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3122
3123 // Record that we know we have non-spill stack objects so we don't need to
3124 // check all stack objects later.
3125 if (HasStackObjects)
3126 Info.setHasNonSpillStackObjects(true);
3127
3128 // Everything live out of a block is spilled with fast regalloc, so it's
3129 // almost certain that spilling will be required.
3131 HasStackObjects = true;
3132
3133 // For now assume stack access is needed in any callee functions, so we need
3134 // the scratch registers to pass in.
3135 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
3136
// The scratch rsrc register is only needed when not using flat scratch.
3137 if (!ST.enableFlatScratch()) {
3138 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
3139 // If we have stack objects, we unquestionably need the private buffer
3140 // resource. For the Code Object V2 ABI, this will be the first 4 user
3141 // SGPR inputs. We can reserve those and use them directly.
3142
3143 Register PrivateSegmentBufferReg =
3145 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3146 } else {
3147 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
3148 // We tentatively reserve the last registers (skipping the last registers
3149 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
3150 // we'll replace these with the ones immediately after those which were
3151 // really allocated. In the prologue copies will be inserted from the
3152 // argument to these reserved registers.
3153
3154 // Without HSA, relocations are used for the scratch pointer and the
3155 // buffer resource setup is always inserted in the prologue. Scratch wave
3156 // offset is still in an input SGPR.
3157 Info.setScratchRSrcReg(ReservedBufferReg);
3158 }
3159 }
3160
3162
3163 // For entry functions we have to set up the stack pointer if we use it,
3164 // whereas non-entry functions get this "for free". This means there is no
3165 // intrinsic advantage to using S32 over S34 in cases where we do not have
3166 // calls but do need a frame pointer (i.e. if we are requested to have one
3167 // because frame pointer elimination is disabled). To keep things simple we
3168 // only ever use S32 as the call ABI stack pointer, and so using it does not
3169 // imply we need a separate frame pointer.
3170 //
3171 // Try to use s32 as the SP, but move it if it would interfere with input
3172 // arguments. This won't work with calls though.
3173 //
3174 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
3175 // registers.
3176 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
3177 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3178 } else {
3180
3181 if (MFI.hasCalls())
3182 report_fatal_error("call in graphics shader with too many input SGPRs");
3183
// Fall back to the first SGPR that is not already a live-in argument.
3184 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3185 if (!MRI.isLiveIn(Reg)) {
3186 Info.setStackPtrOffsetReg(Reg);
3187 break;
3188 }
3189 }
3190
3191 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3192 report_fatal_error("failed to find register for SP");
3193 }
3194
3195 // hasFP should be accurate for entry functions even before the frame is
3196 // finalized, because it does not rely on the known stack size, only
3197 // properties like whether variable sized objects are present.
3198 if (ST.getFrameLowering()->hasFP(MF)) {
3199 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3200 }
3201}
3202
// Split CSR handling (saving callee-saved registers via copies into virtual
// registers) is supported only for non-entry functions; entry points have no
// callers whose registers must be preserved.
3205 return !Info->isEntryFunction();
3206}
3207
3209
// For each callee-saved register handled via copy, insert a COPY from the
// physical CSR into a fresh virtual register at the start of the entry block,
// and a COPY back into the physical register immediately before the
// terminator of every exit block. (The signature's first line and the TRI
// declaration are dropped from this listing.)
3211 MachineBasicBlock *Entry,
3212 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
3214
3215 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3216 if (!IStart)
3217 return;
3218
3219 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3220 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
3221 MachineBasicBlock::iterator MBBI = Entry->begin();
// The CSR-via-copy list is null-terminated.
3222 for (const MCPhysReg *I = IStart; *I; ++I) {
3223 const TargetRegisterClass *RC = nullptr;
3224 if (AMDGPU::SReg_64RegClass.contains(*I))
3225 RC = &AMDGPU::SGPR_64RegClass;
3226 else if (AMDGPU::SReg_32RegClass.contains(*I))
3227 RC = &AMDGPU::SGPR_32RegClass;
3228 else
3229 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3230
3231 Register NewVR = MRI->createVirtualRegister(RC);
3232 // Create copy from CSR to a virtual register.
3233 Entry->addLiveIn(*I);
3234 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
3235 .addReg(*I);
3236
3237 // Insert the copy-back instructions right before the terminator.
3238 for (auto *Exit : Exits)
3239 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
3240 TII->get(TargetOpcode::COPY), *I)
3241 .addReg(NewVR);
3242 }
3243}
3244
// Lower the incoming formal arguments of this function into InVals.
// Kernels read arguments from the kernarg segment (or from preloaded user
// SGPRs when kernarg preloading is enabled); shaders and callable functions
// receive arguments via the calling-convention register/stack assignment.
// Also allocates the special input VGPRs/SGPRs and system SGPRs for entry
// functions, and records wave-dispatch register counts and the stack
// argument area size in the SIMachineFunctionInfo.
// NOTE(review): several declaration lines (MF, Info, TRI, FType, Splits,
// Chains, RVLocs, ...) are dropped from this listing.
3246 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3247 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3248 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3250
3252 const Function &Fn = MF.getFunction();
3255 bool IsError = false;
3256
// Graphics calling conventions are not supported under the HSA OS; emit a
// diagnostic and lower all arguments to poison below.
3257 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
3259 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3260 IsError = true;
3261 }
3262
3265 BitVector Skipped(Ins.size());
3266 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3267 *DAG.getContext());
3268
3269 bool IsGraphics = AMDGPU::isGraphics(CallConv);
3270 bool IsKernel = AMDGPU::isKernel(CallConv);
3271 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
3272
// Sanity-check that graphics functions never request compute-only inputs.
3273 if (IsGraphics) {
3274 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3275 assert(!UserSGPRInfo.hasDispatchPtr() &&
3276 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3277 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3278 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3279 (void)UserSGPRInfo;
3280 if (!Subtarget->enableFlatScratch())
3281 assert(!UserSGPRInfo.hasFlatScratchInit());
3282 if ((CallConv != CallingConv::AMDGPU_CS &&
3283 CallConv != CallingConv::AMDGPU_Gfx &&
3284 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
3285 !Subtarget->hasArchitectedSGPRs())
3286 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3287 !Info->hasWorkGroupIDZ());
3288 }
3289
3290 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3291
3292 if (CallConv == CallingConv::AMDGPU_PS) {
3293 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3294
3295 // At least one interpolation mode must be enabled or else the GPU will
3296 // hang.
3297 //
3298 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3299 // set PSInputAddr, the user wants to enable some bits after the compilation
3300 // based on run-time states. Since we can't know what the final PSInputEna
3301 // will look like, so we shouldn't do anything here and the user should take
3302 // responsibility for the correct programming.
3303 //
3304 // Otherwise, the following restrictions apply:
3305 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
3306 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
3307 // enabled too.
3308 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3309 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3310 CCInfo.AllocateReg(AMDGPU::VGPR0);
3311 CCInfo.AllocateReg(AMDGPU::VGPR1);
3312 Info->markPSInputAllocated(0);
3313 Info->markPSInputEnabled(0);
3314 }
3315 if (Subtarget->isAmdPalOS()) {
3316 // For isAmdPalOS, the user does not enable some bits after compilation
3317 // based on run-time states; the register values being generated here are
3318 // the final ones set in hardware. Therefore we need to apply the
3319 // workaround to PSInputAddr and PSInputEnable together. (The case where
3320 // a bit is set in PSInputAddr but not PSInputEnable is where the
3321 // frontend set up an input arg for a particular interpolation mode, but
3322 // nothing uses that input arg. Really we should have an earlier pass
3323 // that removes such an arg.)
3324 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3325 if ((PsInputBits & 0x7F) == 0 ||
3326 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3327 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
3328 }
3329 } else if (IsKernel) {
3330 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3331 } else {
// Whole-wave functions skip Ins[0], which is the synthetic i1 "original
// exec" input produced by WHOLE_WAVE_SETUP below.
3332 Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3333 Ins.end());
3334 }
3335
3336 if (IsKernel)
3337 analyzeFormalArgumentsCompute(CCInfo, Ins);
3338
// Allocate the fixed special inputs before assigning explicit arguments.
3339 if (IsEntryFunc) {
3340 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
3341 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
3342 if (IsKernel && Subtarget->hasKernargPreload())
3343 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
3344
3345 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
3346 } else if (!IsGraphics) {
3347 // For the fixed ABI, pass workitem IDs in the last argument register.
3348 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
3349
3350 // FIXME: Sink this into allocateSpecialInputSGPRs
3351 if (!Subtarget->enableFlatScratch())
3352 CCInfo.AllocateReg(Info->getScratchRSrcReg());
3353
3354 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3355 }
3356
3357 if (!IsKernel) {
3358 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
3359 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3360
3361 // This assumes the registers are allocated by CCInfo in ascending order
3362 // with no gaps.
3363 Info->setNumWaveDispatchSGPRs(
3364 CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
3365 Info->setNumWaveDispatchVGPRs(
3366 CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
3367 } else if (Info->getNumKernargPreloadedSGPRs()) {
3368 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3369 }
3370
3372
3373 if (IsWholeWaveFunc) {
3374 SDValue Setup = DAG.getNode(AMDGPUISD::WHOLE_WAVE_SETUP, DL,
3375 {MVT::i1, MVT::Other}, Chain);
3376 InVals.push_back(Setup.getValue(0));
3377 Chains.push_back(Setup.getValue(1));
3378 }
3379
3380 // FIXME: This is the minimum kernel argument alignment. We should improve
3381 // this to the maximum alignment of the arguments.
3382 //
3383 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3384 // kern arg offset.
3385 const Align KernelArgBaseAlign = Align(16);
3386
// Main loop: materialize one SDValue per incoming argument part.
3387 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3388 ++i) {
3389 const ISD::InputArg &Arg = Ins[i];
3390 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3391 InVals.push_back(DAG.getPOISON(Arg.VT));
3392 continue;
3393 }
3394
3395 CCValAssign &VA = ArgLocs[ArgIdx++];
3396 MVT VT = VA.getLocVT();
3397
// Kernel arguments: loaded from the kernarg segment or taken from
// preloaded SGPRs.
3398 if (IsEntryFunc && VA.isMemLoc()) {
3399 VT = Ins[i].VT;
3400 EVT MemVT = VA.getLocVT();
3401
3402 const uint64_t Offset = VA.getLocMemOffset();
3403 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
3404
// byref arguments yield a pointer into the kernarg segment rather than a
// loaded value; a cast is inserted when the target address space differs.
3405 if (Arg.Flags.isByRef()) {
3406 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
3407
3408 const GCNTargetMachine &TM =
3409 static_cast<const GCNTargetMachine &>(getTargetMachine());
3410 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
3411 Arg.Flags.getPointerAddrSpace())) {
3414 }
3415
3416 InVals.push_back(Ptr);
3417 continue;
3418 }
3419
3420 SDValue NewArg;
3421 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3422 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3423 // In this case the argument is packed into the previous preload SGPR.
// Extract the sub-dword value by shifting the 32-bit register copy
// right by the byte offset within the dword.
3424 int64_t AlignDownOffset = alignDown(Offset, 4);
3425 int64_t OffsetDiff = Offset - AlignDownOffset;
3426 EVT IntVT = MemVT.changeTypeToInteger();
3427
3428 const SIMachineFunctionInfo *Info =
3431 Register Reg =
3432 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3433
3434 assert(Reg);
3435 Register VReg = MRI.getLiveInVirtReg(Reg);
3436 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3437
3438 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
3439 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
3440
3441 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
3442 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3443 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3444 Ins[i].Flags.isSExt(), &Ins[i]);
3445
3446 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
3447 } else {
3448 const SIMachineFunctionInfo *Info =
3451 const SmallVectorImpl<MCRegister> &PreloadRegs =
3452 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3453
3454 SDValue Copy;
3455 if (PreloadRegs.size() == 1) {
3456 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3457 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3458 NewArg = DAG.getCopyFromReg(
3459 Chain, DL, VReg,
3461 TRI->getRegSizeInBits(*RC)));
3462
3463 } else {
3464 // If the kernarg alignment does not match the alignment of the SGPR
3465 // tuple RC that can accommodate this argument, it will be built up
3466 // via copies from from the individual SGPRs that the argument was
3467 // preloaded to.
3469 for (auto Reg : PreloadRegs) {
3470 Register VReg = MRI.getLiveInVirtReg(Reg);
3471 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3472 Elts.push_back(Copy);
3473 }
3474 NewArg =
3475 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3476 PreloadRegs.size()),
3477 DL, Elts);
3478 }
3479
3480 // If the argument was preloaded to multiple consecutive 32-bit
3481 // registers because of misalignment between addressable SGPR tuples
3482 // and the argument size, we can still assume that because of kernarg
3483 // segment alignment restrictions that NewArg's size is the same as
3484 // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
3485 // truncate since we cannot preload to less than a single SGPR and the
3486 // MemVT may be smaller.
3487 EVT MemVTInt =
3489 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3490 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3491
3492 NewArg = DAG.getBitcast(MemVT, NewArg);
3493 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3494 Ins[i].Flags.isSExt(), &Ins[i]);
3495 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3496 }
3497 } else {
3498 // Hidden arguments that are in the kernel signature must be preloaded
3499 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3500 // the argument list and is not preloaded.
3501 if (Arg.isOrigArg()) {
3502 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3503 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3505 *OrigArg->getParent(),
3506 "hidden argument in kernel signature was not preloaded",
3507 DL.getDebugLoc()));
3508 }
3509 }
3510
// Default path: load the argument from the kernarg segment.
3511 NewArg =
3512 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3513 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3514 }
3515 Chains.push_back(NewArg.getValue(1));
3516
3517 auto *ParamTy =
3518 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3519 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3520 ParamTy &&
3521 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3522 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3523 // On SI local pointers are just offsets into LDS, so they are always
3524 // less than 16-bits. On CI and newer they could potentially be
3525 // real pointers, so we can't guarantee their size.
3526 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3527 DAG.getValueType(MVT::i16));
3528 }
3529
3530 InVals.push_back(NewArg);
3531 continue;
3532 }
// Non-kernel stack arguments.
3533 if (!IsEntryFunc && VA.isMemLoc()) {
3534 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3535 InVals.push_back(Val);
3536 if (!Arg.Flags.isByVal())
3537 Chains.push_back(Val.getValue(1));
3538 continue;
3539 }
3540
// Register arguments: copy out of the live-in SGPR/VGPR.
3541 assert(VA.isRegLoc() && "Parameter must be in a register!");
3542
3543 Register Reg = VA.getLocReg();
3544 const TargetRegisterClass *RC = nullptr;
3545 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3546 RC = &AMDGPU::VGPR_32RegClass;
3547 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3548 RC = &AMDGPU::SGPR_32RegClass;
3549 else
3550 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3551
3552 Reg = MF.addLiveIn(Reg, RC);
3553 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3554
3555 if (Arg.Flags.isSRet()) {
3556 // The return object should be reasonably addressable.
3557
3558 // FIXME: This helps when the return is a real sret. If it is a
3559 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3560 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3561 unsigned NumBits =
3563 Val = DAG.getNode(
3564 ISD::AssertZext, DL, VT, Val,
3565 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3566 }
3567
3568 Val = convertABITypeToValueType(DAG, Val, VA, DL);
3569 InVals.push_back(Val);
3570 }
3571
3572 // Start adding system SGPRs.
3573 if (IsEntryFunc)
3574 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3575
3576 // DAG.getPass() returns nullptr when using new pass manager.
3577 // TODO: Use DAG.getMFAM() to access analysis result.
3578 if (DAG.getPass()) {
3579 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3580 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3581 }
3582
3583 unsigned StackArgSize = CCInfo.getStackSize();
3584 Info->setBytesInStackArgArea(StackArgSize);
3585
// Merge the chains of all loads/copies emitted above.
3586 return Chains.empty() ? Chain
3587 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3588}
3589
3590// TODO: If return values can't fit in registers, we should return as many as
3591// possible in registers before passing on stack.
// Returns true if this function's return values can be lowered directly in
// registers; returning false makes the frontend/SelectionDAG fall back to an
// sret-style return. Entry functions always return true.
3593 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3594 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3595 const Type *RetTy) const {
3596 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3597 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3598 // for shaders. Vector types should be explicitly handled by CC.
3599 if (AMDGPU::isEntryFunctionCC(CallConv))
3600 return true;
3601
3603 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3604 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3605 return false;
3606
3607 // We must use the stack if return would require unavailable registers.
// Reject the register return if the CC assignment used any VGPR above this
// function's addressable maximum.
3608 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3609 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3610 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3611 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3612 return false;
3613
3614 return true;
3615}
3616
// Lower an IR return: kernels delegate to the AMDGPU base implementation;
// otherwise the return values are assigned by the return calling convention,
// copied into their physical registers (SGPR results are first made uniform
// via amdgcn_readfirstlane), and the appropriate terminator node is emitted
// (ENDPGM for void shader returns, WHOLE_WAVE_RETURN / RETURN_TO_EPILOG /
// RET_GLUE otherwise).
3617SDValue
3619 bool isVarArg,
3621 const SmallVectorImpl<SDValue> &OutVals,
3622 const SDLoc &DL, SelectionDAG &DAG) const {
3626
3627 if (AMDGPU::isKernel(CallConv)) {
3628 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3629 OutVals, DL, DAG);
3630 }
3631
3632 bool IsShader = AMDGPU::isShader(CallConv);
3633
3634 Info->setIfReturnsVoid(Outs.empty());
3635 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3636
3637 // CCValAssign - represent the assignment of the return value to a location.
3639
3640 // CCState - Info about the registers and stack slots.
3641 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3642 *DAG.getContext());
3643
3644 // Analyze outgoing return values.
3645 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3646
3647 SDValue Glue;
3649 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3650
3651 SDValue ReadFirstLane =
3652 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3653 // Copy the result values into the output registers.
3654 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3655 ++I, ++RealRVLocIdx) {
3656 CCValAssign &VA = RVLocs[I];
3657 assert(VA.isRegLoc() && "Can only return in registers!");
3658 // TODO: Partially return in registers if return values don't fit.
3659 SDValue Arg = OutVals[RealRVLocIdx];
3660
3661 // Copied from other backends.
// Adjust the value to the location's type per the CC's extension kind.
3662 switch (VA.getLocInfo()) {
3663 case CCValAssign::Full:
3664 break;
3665 case CCValAssign::BCvt:
3666 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3667 break;
3668 case CCValAssign::SExt:
3669 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3670 break;
3671 case CCValAssign::ZExt:
3672 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3673 break;
3674 case CCValAssign::AExt:
3675 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3676 break;
3677 default:
3678 llvm_unreachable("Unknown loc info!");
3679 }
// Values returned in SGPRs are wrapped in readfirstlane to guarantee a
// uniform value. (The node-construction line is dropped from this listing.)
3680 if (TRI->isSGPRPhysReg(VA.getLocReg()))
3682 ReadFirstLane, Arg);
3683 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3684 Glue = Chain.getValue(1);
3685 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3686 }
3687
3688 // FIXME: Does sret work properly?
// Append the CSR-via-copy registers so they are kept live to the return.
3689 if (!Info->isEntryFunction()) {
3690 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3691 const MCPhysReg *I =
3692 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3693 if (I) {
3694 for (; *I; ++I) {
3695 if (AMDGPU::SReg_64RegClass.contains(*I))
3696 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3697 else if (AMDGPU::SReg_32RegClass.contains(*I))
3698 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3699 else
3700 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3701 }
3702 }
3703 }
3704
3705 // Update chain and glue.
3706 RetOps[0] = Chain;
3707 if (Glue.getNode())
3708 RetOps.push_back(Glue);
3709
3710 unsigned Opc = AMDGPUISD::ENDPGM;
3711 if (!IsWaveEnd)
3712 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3713 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3714 : AMDGPUISD::RET_GLUE;
3715 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3716}
3717
// Lower the result values of a call into InVals: assign locations with the
// return calling convention, copy each result out of its physical register
// (threading chain and glue through the copies), undo any CC-mandated
// extension/bitcast, and return the updated chain. Memory-returned values
// are not implemented. (The signature's first line is dropped from this
// listing.)
3719 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3720 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3721 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3722 SDValue ThisVal) const {
3723 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3724
3725 // Assign locations to each value returned by this call.
3727 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3728 *DAG.getContext());
3729 CCInfo.AnalyzeCallResult(Ins, RetCC);
3730
3731 // Copy all of the result registers out of their specified physreg.
3732 for (CCValAssign VA : RVLocs) {
3733 SDValue Val;
3734
3735 if (VA.isRegLoc()) {
3736 Val =
3737 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3738 Chain = Val.getValue(1);
3739 InGlue = Val.getValue(2);
3740 } else if (VA.isMemLoc()) {
3741 report_fatal_error("TODO: return values in memory");
3742 } else
3743 llvm_unreachable("unknown argument location type");
3744
// Convert from the location type back to the value type, asserting the
// known extension bits where applicable.
3745 switch (VA.getLocInfo()) {
3746 case CCValAssign::Full:
3747 break;
3748 case CCValAssign::BCvt:
3749 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3750 break;
3751 case CCValAssign::ZExt:
3752 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3753 DAG.getValueType(VA.getValVT()));
3754 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3755 break;
3756 case CCValAssign::SExt:
3757 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3758 DAG.getValueType(VA.getValVT()));
3759 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3760 break;
3761 case CCValAssign::AExt:
3762 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3763 break;
3764 default:
3765 llvm_unreachable("Unknown loc info!");
3766 }
3767
3768 InVals.push_back(Val);
3769 }
3770
3771 return Chain;
3772}
3773
3774 // Add code to pass special inputs required depending on used features separate
3775 // from the explicit user arguments present in the IR.
// NOTE(review): this listing is incomplete -- the baked-in numbering jumps
// (3775->3777, 3792->3794, 3798->3800, 3848->3850, 3877->3879, 3887->3890,
// ...), so the function's signature line (presumably
// SITargetLowering::passSpecialInputs) and several statements were dropped
// by the extraction. Verify against upstream SIISelLowering.cpp before
// relying on the code below.
//
// Purpose (from the visible body): for each ABI-special input the callee may
// read (dispatch ptr, queue ptr, workgroup IDs, workitem IDs, ...), copy the
// caller's incoming value into the callee's expected register or stack slot,
// skipping any input the call site's "amdgpu-no-*" attributes prove unused.
3777 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3778 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3779 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3780 // If we don't have a call site, this was a call inserted by
3781 // legalization. These can never use special inputs.
3782 if (!CLI.CB)
3783 return;
3784
3785 SelectionDAG &DAG = CLI.DAG;
3786 const SDLoc &DL = CLI.DL;
3787 const Function &F = DAG.getMachineFunction().getFunction();
3788
3789 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3790 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3791
// The initializer on the next declaration was elided by extraction (line
// 3793 is missing) -- presumably a fixed-ABI default, refined below when the
// callee's precise argument usage is known.
3792 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3794 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3795 // DAG.getPass() returns nullptr when using new pass manager.
3796 // TODO: Use DAG.getMFAM() to access analysis result.
3797 if (DAG.getPass()) {
3798 auto &ArgUsageInfo =
3800 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3801 }
3802 }
3803
3804 // TODO: Unify with private memory register handling. This is complicated by
3805 // the fact that at least in kernels, the input argument is not necessarily
3806 // in the same location as the input.
3807 // clang-format off
// Table mapping each special input to the call-site attribute(s) that prove
// the callee does not use it. An empty second attribute means only one
// attribute is relevant for that input.
3808 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3809 std::array<StringLiteral, 2>> ImplicitAttrs[] = {
3810 {AMDGPUFunctionArgInfo::DISPATCH_PTR, {"amdgpu-no-dispatch-ptr", ""}},
3811 {AMDGPUFunctionArgInfo::QUEUE_PTR, {"amdgpu-no-queue-ptr", ""}},
3812 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, {"amdgpu-no-implicitarg-ptr", ""}},
3813 {AMDGPUFunctionArgInfo::DISPATCH_ID, {"amdgpu-no-dispatch-id", ""}},
3814 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, {"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"}},
3815 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, {"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"}},
3816 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, {"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"}},
3817 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, {"amdgpu-no-lds-kernel-id", ""}},
3818 };
3819 // clang-format on
3820
3821 for (auto [InputID, Attrs] : ImplicitAttrs) {
3822 // If the callee does not use the attribute value, skip copying the value.
3823 if (all_of(Attrs, [&](StringRef Attr) {
3824 return Attr.empty() || CLI.CB->hasFnAttr(Attr);
3825 }))
3826 continue;
3827
3828 const auto [OutgoingArg, ArgRC, ArgTy] =
3829 CalleeArgInfo->getPreloadedValue(InputID);
3830 if (!OutgoingArg)
3831 continue;
3832
3833 const auto [IncomingArg, IncomingArgRC, Ty] =
3834 CallerArgInfo.getPreloadedValue(InputID);
3835 assert(IncomingArgRC == ArgRC);
3836
3837 // All special arguments are ints for now.
3838 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3839 SDValue InputReg;
3840
3841 if (IncomingArg) {
3842 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3843 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3844 // The implicit arg ptr is special because it doesn't have a corresponding
3845 // input for kernels, and is computed from the kernarg segment pointer.
3846 InputReg = getImplicitArgPtr(DAG, DL);
3847 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
// The call producing Id was elided by extraction (line 3849 missing).
3848 std::optional<uint32_t> Id =
3850 if (Id.has_value()) {
3851 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3852 } else {
3853 InputReg = DAG.getPOISON(ArgVT);
3854 }
3855 } else {
3856 // We may have proven the input wasn't needed, although the ABI is
3857 // requiring it. We just need to allocate the register appropriately.
3858 InputReg = DAG.getPOISON(ArgVT);
3859 }
3860
3861 if (OutgoingArg->isRegister()) {
3862 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3863 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3864 report_fatal_error("failed to allocate implicit input argument");
3865 } else {
3866 unsigned SpecialArgOffset =
3867 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3868 SDValue ArgStore =
3869 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3870 MemOpChains.push_back(ArgStore);
3871 }
3872 }
3873
3874 // Pack workitem IDs into a single register or pass it as is if already
3875 // packed.
3876
// The getPreloadedValue(...) arguments on the next three lookups were elided
// by extraction (lines 3878/3881/3884 missing); from the surrounding logic
// these query the workitem-ID X/Y/Z inputs in turn.
3877 auto [OutgoingArg, ArgRC, Ty] =
3879 if (!OutgoingArg)
3880 std::tie(OutgoingArg, ArgRC, Ty) =
3882 if (!OutgoingArg)
3883 std::tie(OutgoingArg, ArgRC, Ty) =
3885 if (!OutgoingArg)
3886 return;
3887
3888 const ArgDescriptor *IncomingArgX = std::get<0>(
3890 const ArgDescriptor *IncomingArgY = std::get<0>(
3892 const ArgDescriptor *IncomingArgZ = std::get<0>(
3894
3895 SDValue InputReg;
3896 SDLoc SL;
3897
3898 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3899 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3900 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3901
3902 // If incoming ids are not packed we need to pack them.
// Packed layout (from the shift amounts below): X in bits [9:0], Y in
// [19:10], Z in [29:20].
3903 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3904 NeedWorkItemIDX) {
3905 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3906 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3907 } else {
3908 InputReg = DAG.getConstant(0, DL, MVT::i32);
3909 }
3910 }
3911
3912 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3913 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3914 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3915 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3916 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3917 InputReg = InputReg.getNode()
3918 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3919 : Y;
3920 }
3921
3922 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3923 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3924 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3925 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3926 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3927 InputReg = InputReg.getNode()
3928 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3929 : Z;
3930 }
3931
3932 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3933 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3934 // We're in a situation where the outgoing function requires the workitem
3935 // ID, but the calling function does not have it (e.g a graphics function
3936 // calling a C calling convention function). This is illegal, but we need
3937 // to produce something.
3938 InputReg = DAG.getPOISON(MVT::i32);
3939 } else {
3940 // Workitem ids are already packed, any of present incoming arguments
3941 // will carry all required fields.
3942 ArgDescriptor IncomingArg =
3943 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3944 : IncomingArgY ? *IncomingArgY
3945 : *IncomingArgZ,
3946 ~0u);
3947 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3948 }
3949 }
3950
3951 if (OutgoingArg->isRegister()) {
3952 if (InputReg)
3953 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3954
3955 CCInfo.AllocateReg(OutgoingArg->getRegister());
3956 } else {
3957 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3958 if (InputReg) {
3959 SDValue ArgStore =
3960 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3961 MemOpChains.push_back(ArgStore);
3962 }
3963 }
3964 }
3965
3967 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3969 const SmallVectorImpl<SDValue> &OutVals,
3970 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3971 if (AMDGPU::isChainCC(CalleeCC))
3972 return true;
3973
3974 if (!AMDGPU::mayTailCallThisCC(CalleeCC))
3975 return false;
3976
3977 // For a divergent call target, we need to do a waterfall loop over the
3978 // possible callees which precludes us from using a simple jump.
3979 if (Callee->isDivergent())
3980 return false;
3981
3983 const Function &CallerF = MF.getFunction();
3984 CallingConv::ID CallerCC = CallerF.getCallingConv();
3986 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3987
3988 // Kernels aren't callable, and don't have a live in return address so it
3989 // doesn't make sense to do a tail call with entry functions.
3990 if (!CallerPreserved)
3991 return false;
3992
3993 bool CCMatch = CallerCC == CalleeCC;
3994
3996 if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch)
3997 return true;
3998 return false;
3999 }
4000
4001 // TODO: Can we handle var args?
4002 if (IsVarArg)
4003 return false;
4004
4005 for (const Argument &Arg : CallerF.args()) {
4006 if (Arg.hasByValAttr())
4007 return false;
4008 }
4009
4010 LLVMContext &Ctx = *DAG.getContext();
4011
4012 // Check that the call results are passed in the same way.
4013 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
4014 CCAssignFnForCall(CalleeCC, IsVarArg),
4015 CCAssignFnForCall(CallerCC, IsVarArg)))
4016 return false;
4017
4018 // The callee has to preserve all registers the caller needs to preserve.
4019 if (!CCMatch) {
4020 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4021 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4022 return false;
4023 }
4024
4025 // Nothing more to check if the callee is taking no arguments.
4026 if (Outs.empty())
4027 return true;
4028
4030 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4031
4032 // FIXME: We are not allocating special input registers, so we will be
4033 // deciding based on incorrect register assignments.
4034 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
4035
4036 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4037 // If the stack arguments for this call do not fit into our own save area then
4038 // the call cannot be made tail.
4039 // TODO: Is this really necessary?
4040 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
4041 return false;
4042
4043 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
4044 // FIXME: What about inreg arguments that end up passed in memory?
4045 if (!CCVA.isRegLoc())
4046 continue;
4047
4048 // If we are passing an argument in an SGPR, and the value is divergent,
4049 // this call requires a waterfall loop.
4050 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4051 LLVM_DEBUG(
4052 dbgs() << "Cannot tail call due to divergent outgoing argument in "
4053 << printReg(CCVA.getLocReg(), TRI) << '\n');
4054 return false;
4055 }
4056 }
4057
4058 const MachineRegisterInfo &MRI = MF.getRegInfo();
4059 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
4060}
4061
4063 if (!CI->isTailCall())
4064 return false;
4065
4066 const Function *ParentFn = CI->getFunction();
4068 return false;
4069 return true;
4070}
4071
namespace {
// Positions of the special trailing operands of an llvm.amdgcn.cs.chain
// call. They come after the SGPR argument list (index 0) and the VGPR
// argument list (index 1), so the first special operand sits at index 2.
// The enumerator values are spelled out explicitly to make the layout
// obvious; they are identical to the implicitly-assigned ones.
enum ChainCallArgIdx {
  Exec = 2,           // value to install in EXEC for the chained callee
  Flags = 3,          // flags selecting which further special args follow
  NumVGPRs = 4,       // present when Flags bit 0 is set (dynamic VGPRs)
  FallbackExec = 5,   // present when Flags bit 0 is set
  FallbackCallee = 6, // present when Flags bit 0 is set
};
} // anonymous namespace
4084
4085 // The wave scratch offset register is used as the global base pointer.
// NOTE(review): this listing is incomplete -- the baked-in numbering jumps
// (4085->4087, 4158->4160, 4207->4211, 4353->4355, 4397->4399, 4418->4420,
// 4462->4464, 4481->4483, ...), so the signature line (presumably
// SITargetLowering::LowerCall(CallLoweringInfo &CLI, ...)) and a number of
// statements were dropped by extraction. Verify against upstream
// SIISelLowering.cpp before relying on the code below.
//
// Purpose (from the visible body): lower a call (regular, tail, sibling, or
// amdgpu_cs_chain): peel off the chain-call special operands, pass the
// ABI-special inputs, assign argument locations, emit register copies and
// stack stores, then build either a TC_RETURN-style node (tail calls) or an
// AMDGPUISD::CALL with result copies.
4087 SmallVectorImpl<SDValue> &InVals) const {
4088 CallingConv::ID CallConv = CLI.CallConv;
4089 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
4090
4091 SelectionDAG &DAG = CLI.DAG;
4092
4093 const SDLoc &DL = CLI.DL;
4094 SDValue Chain = CLI.Chain;
4095 SDValue Callee = CLI.Callee;
4096
4097 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
4098 bool UsesDynamicVGPRs = false;
4099 if (IsChainCallConv) {
4100 // The last arguments should be the value that we need to put in EXEC,
4101 // followed by the flags and any other arguments with special meanings.
4102 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
4103 // we don't treat them like the "real" arguments.
4104 auto RequestedExecIt =
4105 llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) {
4106 return Arg.OrigArgIndex == 2;
4107 });
4108 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
4109
4110 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
4111 CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
4112 CLI.OutVals.end());
4113 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
4114
4115 assert(CLI.Outs.back().OrigArgIndex < 2 &&
4116 "Haven't popped all the special args");
4117
4118 TargetLowering::ArgListEntry RequestedExecArg =
4119 CLI.Args[ChainCallArgIdx::Exec];
4120 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
4121 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
4122
4123 // Convert constants into TargetConstants, so they become immediate operands
4124 // instead of being selected into S_MOV.
4125 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
4126 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
4127 ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
4128 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
4129 } else
4130 ChainCallSpecialArgs.push_back(Arg.Node);
4131 };
4132
4133 PushNodeOrTargetConstant(RequestedExecArg);
4134
4135 // Process any other special arguments depending on the value of the flags.
4136 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
4137
4138 const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
4139 if (FlagsValue.isZero()) {
4140 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
4141 return lowerUnhandledCall(CLI, InVals,
4142 "no additional args allowed if flags == 0");
4143 } else if (FlagsValue.isOneBitSet(0)) {
4144 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4145 return lowerUnhandledCall(CLI, InVals, "expected 3 additional args");
4146 }
4147
4148 if (!Subtarget->isWave32()) {
4149 return lowerUnhandledCall(
4150 CLI, InVals, "dynamic VGPR mode is only supported for wave32");
4151 }
4152
4153 UsesDynamicVGPRs = true;
4154 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
4155 CLI.Args.end(), PushNodeOrTargetConstant);
4156 }
4157 }
4158
// Declarations at elided lines 4159/4161/4165 presumably bind Outs, Ins and
// MF (their uses below reference all three) -- confirm upstream.
4160 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
4162 bool &IsTailCall = CLI.IsTailCall;
4163 bool IsVarArg = CLI.IsVarArg;
4164 bool IsSibCall = false;
4166
4167 if (Callee.isUndef() || isNullConstant(Callee)) {
4168 if (!CLI.IsTailCall) {
4169 for (ISD::InputArg &Arg : CLI.Ins)
4170 InVals.push_back(DAG.getPOISON(Arg.VT));
4171 }
4172
4173 return Chain;
4174 }
4175
4176 if (IsVarArg) {
4177 return lowerUnhandledCall(CLI, InVals,
4178 "unsupported call to variadic function ");
4179 }
4180
4181 if (!CLI.CB)
4182 return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");
4183
4184 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
4185 return lowerUnhandledCall(CLI, InVals,
4186 "unsupported required tail call to function ");
4187 }
4188
4189 if (IsTailCall) {
4190 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
4191 Outs, OutVals, Ins, DAG);
4192 if (!IsTailCall &&
4193 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
4194 report_fatal_error("failed to perform tail call elimination on a call "
4195 "site marked musttail or on llvm.amdgcn.cs.chain");
4196 }
4197
4198 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4199
4200 // A sibling call is one where we're under the usual C ABI and not planning
4201 // to change that but can still do a tail call:
4202 if (!TailCallOpt && IsTailCall)
4203 IsSibCall = true;
4204
4205 if (IsTailCall)
4206 ++NumTailCalls;
4207 }
4208
4211 SmallVector<SDValue, 8> MemOpChains;
4212
4213 // Analyze operands of the call, assigning locations to each operand.
4215 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4216 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
4217
4218 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
4220 // With a fixed ABI, allocate fixed registers before user arguments.
4221 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
4222 }
4223
4224 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
4225
4226 // Get a count of how many bytes are to be pushed on the stack.
4227 unsigned NumBytes = CCInfo.getStackSize();
4228
4229 if (IsSibCall) {
4230 // Since we're not changing the ABI to make this a tail call, the memory
4231 // operands are already available in the caller's incoming argument space.
4232 NumBytes = 0;
4233 }
4234
4235 // FPDiff is the byte offset of the call's argument area from the callee's.
4236 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4237 // by this amount for a tail call. In a sibling call it must be 0 because the
4238 // caller will deallocate the entire stack and the callee still expects its
4239 // arguments to begin at SP+0. Completely unused for non-tail calls.
4240 int32_t FPDiff = 0;
4241 MachineFrameInfo &MFI = MF.getFrameInfo();
4242 auto *TRI = Subtarget->getRegisterInfo();
4243
4244 // Adjust the stack pointer for the new arguments...
4245 // These operations are automatically eliminated by the prolog/epilog pass
4246 if (!IsSibCall)
4247 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
4248
4249 if (!IsSibCall || IsChainCallConv) {
4250 if (!Subtarget->enableFlatScratch()) {
4251 SmallVector<SDValue, 4> CopyFromChains;
4252
4253 // In the HSA case, this should be an identity copy.
4254 SDValue ScratchRSrcReg =
4255 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
4256 RegsToPass.emplace_back(IsChainCallConv
4257 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4258 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4259 ScratchRSrcReg);
4260 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
4261 Chain = DAG.getTokenFactor(DL, CopyFromChains);
4262 }
4263 }
4264
4265 const unsigned NumSpecialInputs = RegsToPass.size();
4266
4267 MVT PtrVT = MVT::i32;
4268
4269 // Walk the register/memloc assignments, inserting copies/loads.
4270 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4271 CCValAssign &VA = ArgLocs[i];
4272 SDValue Arg = OutVals[i];
4273
4274 // Promote the value if needed.
4275 switch (VA.getLocInfo()) {
4276 case CCValAssign::Full:
4277 break;
4278 case CCValAssign::BCvt:
4279 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
4280 break;
4281 case CCValAssign::ZExt:
4282 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4283 break;
4284 case CCValAssign::SExt:
4285 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
4286 break;
4287 case CCValAssign::AExt:
4288 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
4289 break;
4290 case CCValAssign::FPExt:
4291 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
4292 break;
4293 default:
4294 llvm_unreachable("Unknown loc info!");
4295 }
4296
4297 if (VA.isRegLoc()) {
4298 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
4299 } else {
4300 assert(VA.isMemLoc());
4301
4302 SDValue DstAddr;
4303 MachinePointerInfo DstInfo;
4304
4305 unsigned LocMemOffset = VA.getLocMemOffset();
4306 int32_t Offset = LocMemOffset;
4307
4308 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
4309 MaybeAlign Alignment;
4310
4311 if (IsTailCall) {
4312 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4313 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4314 : VA.getValVT().getStoreSize();
4315
4316 // FIXME: We can have better than the minimum byval required alignment.
4317 Alignment =
4318 Flags.isByVal()
4319 ? Flags.getNonZeroByValAlign()
4320 : commonAlignment(Subtarget->getStackAlignment(), Offset);
4321
4322 Offset = Offset + FPDiff;
4323 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
4324
4325 DstAddr = DAG.getFrameIndex(FI, PtrVT);
4326 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4327
4328 // Make sure any stack arguments overlapping with where we're storing
4329 // are loaded before this eventual operation. Otherwise they'll be
4330 // clobbered.
4331
4332 // FIXME: Why is this really necessary? This seems to just result in a
4333 // lot of code to copy the stack and write them back to the same
4334 // locations, which are supposed to be immutable?
4335 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
4336 } else {
4337 // Stores to the argument stack area are relative to the stack pointer.
4338 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
4339 MVT::i32);
4340 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
4341 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
4342 Alignment =
4343 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
4344 }
4345
4346 if (Outs[i].Flags.isByVal()) {
4347 SDValue SizeNode =
4348 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4349 SDValue Cpy =
4350 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
4351 Outs[i].Flags.getNonZeroByValAlign(),
4352 /*isVol = */ false, /*AlwaysInline = */ true,
4353 /*CI=*/nullptr, std::nullopt, DstInfo,
4355
4356 MemOpChains.push_back(Cpy);
4357 } else {
4358 SDValue Store =
4359 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4360 MemOpChains.push_back(Store);
4361 }
4362 }
4363 }
4364
4365 if (!MemOpChains.empty())
4366 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4367
4368 SDValue ReadFirstLaneID =
4369 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4370
4371 SDValue TokenGlue;
4372 if (CLI.ConvergenceControlToken) {
4373 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
4375 }
4376
4377 // Build a sequence of copy-to-reg nodes chained together with token chain
4378 // and flag operands which copy the outgoing args into the appropriate regs.
4379 SDValue InGlue;
4380
4381 unsigned ArgIdx = 0;
4382 for (auto [Reg, Val] : RegsToPass) {
4383 if (ArgIdx++ >= NumSpecialInputs &&
4384 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4385 // For chain calls, the inreg arguments are required to be
4386 // uniform. Speculatively Insert a readfirstlane in case we cannot prove
4387 // they are uniform.
4388 //
4389 // For other calls, if an inreg arguments is known to be uniform,
4390 // speculatively insert a readfirstlane in case it is in a VGPR.
4391 //
4392 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4393 // value, so let that continue to produce invalid code.
4394
4395 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4396 if (TokenGlue)
4397 ReadfirstlaneArgs.push_back(TokenGlue);
// The assignment consuming ReadfirstlaneArgs was elided (line 4398 missing);
// presumably Val = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, ...) as in the
// analogous tail-call path below -- confirm upstream.
4399 ReadfirstlaneArgs);
4400 }
4401
4402 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
4403 InGlue = Chain.getValue(1);
4404 }
4405
4406 // We don't usually want to end the call-sequence here because we would tidy
4407 // the frame up *after* the call, however in the ABI-changing tail-call case
4408 // we've carefully laid out the parameters so that when sp is reset they'll be
4409 // in the correct location.
4410 if (IsTailCall && !IsSibCall) {
4411 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
4412 InGlue = Chain.getValue(1);
4413 }
4414
4415 std::vector<SDValue> Ops({Chain});
4416
4417 // Add a redundant copy of the callee global which will not be legalized, as
4418 // we need direct access to the callee later.
4420 const GlobalValue *GV = GSD->getGlobal();
4421 Ops.push_back(Callee);
4422 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
4423 } else {
4424 if (IsTailCall) {
4425 // isEligibleForTailCallOptimization considered whether the call target is
4426 // divergent, but we may still end up with a uniform value in a VGPR.
4427 // Insert a readfirstlane just in case.
4428 SDValue ReadFirstLaneID =
4429 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4430
4431 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4432 if (TokenGlue)
4433 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
4434 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
4435 ReadfirstlaneArgs);
4436 }
4437
4438 Ops.push_back(Callee);
4439 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
4440 }
4441
4442 if (IsTailCall) {
4443 // Each tail call may have to adjust the stack by a different amount, so
4444 // this information must travel along with the operation for eventual
4445 // consumption by emitEpilogue.
4446 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4447 }
4448
4449 if (IsChainCallConv)
4450 llvm::append_range(Ops, ChainCallSpecialArgs);
4451
4452 // Add argument registers to the end of the list so that they are known live
4453 // into the call.
4454 for (auto &[Reg, Val] : RegsToPass)
4455 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
4456
4457 // Add a register mask operand representing the call-preserved registers.
4458 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4459 assert(Mask && "Missing call preserved mask for calling convention");
4460 Ops.push_back(DAG.getRegisterMask(Mask));
4461
4462 if (SDValue Token = CLI.ConvergenceControlToken) {
4464 GlueOps.push_back(Token);
4465 if (InGlue)
4466 GlueOps.push_back(InGlue);
4467
4468 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
4469 MVT::Glue, GlueOps),
4470 0);
4471 }
4472
4473 if (InGlue)
4474 Ops.push_back(InGlue);
4475
4476 // If we're doing a tail call, use a TC_RETURN here rather than an
4477 // actual call instruction.
4478 if (IsTailCall) {
4479 MFI.setHasTailCall();
4480 unsigned OPC = AMDGPUISD::TC_RETURN;
// Case labels were elided here (lines 4482 and 4485/4486 missing);
// presumably CallingConv::AMDGPU_Gfx and the chain calling conventions
// respectively -- confirm upstream.
4481 switch (CallConv) {
4483 OPC = AMDGPUISD::TC_RETURN_GFX;
4484 break;
4487 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4488 : AMDGPUISD::TC_RETURN_CHAIN;
4489 break;
4490 }
4491
4492 // If the caller is a whole wave function, we need to use a special opcode
4493 // so we can patch up EXEC.
4494 if (Info->isWholeWaveFunction())
4495 OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;
4496
4497 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4498 }
4499
4500 // Returns a chain and a flag for retval copy to use.
4501 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4502 Chain = Call.getValue(0);
4503 InGlue = Call.getValue(1);
4504
4505 uint64_t CalleePopBytes = NumBytes;
4506 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4507 if (!Ins.empty())
4508 InGlue = Chain.getValue(1);
4509
4510 // Handle result values, copying them out of physregs into vregs that we
4511 // return.
4512 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4513 InVals, /*IsThisReturn=*/false, SDValue());
4514 }
4515
4516 // This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4517 // except for:
4518 // 1. Stack growth direction(default: downwards, AMDGPU: upwards), and
4519 // 2. Scale size where, scale = wave-reduction(alloca-size) * wave-size
// NOTE(review): the signature line (4520) and lines 4523/4539/4556 were
// dropped by extraction -- the missing pieces are presumably the
// lowerDYNAMIC_STACKALLOC signature, the SIMachineFunctionInfo binding for
// Info, the stack-growth-direction assert condition, and the constant-size
// test guarding the first branch. Verify against upstream SIISelLowering.cpp.
4521 SelectionDAG &DAG) const {
4522 const MachineFunction &MF = DAG.getMachineFunction();
4524
4525 SDLoc dl(Op);
4526 EVT VT = Op.getValueType();
4527 SDValue Chain = Op.getOperand(0);
4528 Register SPReg = Info->getStackPtrOffsetReg();
4529
4530 // Chain the dynamic stack allocation so that it doesn't modify the stack
4531 // pointer when other instructions are using the stack.
4532 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4533
4534 SDValue Size = Op.getOperand(1);
4535 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4536 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4537
4538 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4540 "Stack grows upwards for AMDGPU");
4541
4542 Chain = BaseAddr.getValue(1);
4543 Align StackAlign = TFL->getStackAlign();
4544 if (Alignment > StackAlign) {
// Over-align the base: the requested alignment is scaled by the wave size
// because the SP here is a per-wave byte offset.
4545 uint64_t ScaledAlignment = Alignment.value()
4546 << Subtarget->getWavefrontSizeLog2();
4547 uint64_t StackAlignMask = ScaledAlignment - 1;
4548 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4549 DAG.getConstant(StackAlignMask, dl, VT));
4550 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4551 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4552 }
4553
4554 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4555 SDValue NewSP;
4557 // For constant sized alloca, scale alloca size by wave-size
4558 SDValue ScaledSize = DAG.getNode(
4559 ISD::SHL, dl, VT, Size,
4560 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4561 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4562 } else {
4563 // For dynamic sized alloca, perform wave-wide reduction to get max of
4564 // alloca size(divergent) and then scale it by wave-size
4565 SDValue WaveReduction =
4566 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4567 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4568 Size, DAG.getConstant(0, dl, MVT::i32));
4569 SDValue ScaledSize = DAG.getNode(
4570 ISD::SHL, dl, VT, Size,
4571 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4572 NewSP =
4573 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4574 SDValue ReadFirstLaneID =
4575 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4576 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4577 NewSP);
4578 }
4579
4580 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4581 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4582
// Results: the (aligned) old SP is the allocated address; the chain closes
// the call sequence opened above.
4583 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4584 }
4585
// NOTE(review): the signature line (4586) and the declaration at line 4590
// (presumably the stack-pointer Register named SP, read below) were dropped
// by extraction. From the body -- copy SP and convert the wave-uniform
// offset into a swizzled per-lane vector address -- this looks like the
// STACKSAVE lowering; confirm the name against upstream SIISelLowering.cpp.
4587 if (Op.getValueType() != MVT::i32)
4588 return Op; // Defer to cannot select error.
4589
4591 SDLoc SL(Op);
4592
4593 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4594
4595 // Convert from wave uniform to swizzled vector address. This should protect
4596 // from any edge cases where the stacksave result isn't directly used with
4597 // stackrestore.
4598 SDValue VectorAddress =
4599 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4600 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4601 }
4602
// NOTE(review): the signature line (4603) and lines 4609/4646 were dropped by
// extraction -- presumably the lowerGET_ROUNDING signature, the hardware
// register encoding used to read both MODE.fp_round fields, and the 64-bit
// conversion-table constant. Verify against upstream SIISelLowering.cpp.
//
// Purpose (from the visible body): read the 4-bit MODE.fp_round field pair
// via s_getreg and translate it to the FLT_ROUNDS enumeration using a 64-bit
// lookup table indexed by (raw mode value * 4).
4604 SelectionDAG &DAG) const {
4605 SDLoc SL(Op);
4606 assert(Op.getValueType() == MVT::i32);
4607
4608 uint32_t BothRoundHwReg =
4610 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4611
4612 SDValue IntrinID =
4613 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4614 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4615 Op.getOperand(0), IntrinID, GetRoundBothImm);
4616
4617 // There are two rounding modes, one for f32 and one for f64/f16. We only
4618 // report in the standard value range if both are the same.
4619 //
4620 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4621 // ties away from zero is not supported, and the other values are rotated by
4622 // 1.
4623 //
4624 // If the two rounding modes are not the same, report a target defined value.
4625
4626 // Mode register rounding mode fields:
4627 //
4628 // [1:0] Single-precision round mode.
4629 // [3:2] Double/Half-precision round mode.
4630 //
4631 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
4632 //
4633 // Hardware Spec
4634 // Toward-0 3 0
4635 // Nearest Even 0 1
4636 // +Inf 1 2
4637 // -Inf 2 3
4638 // NearestAway0 N/A 4
4639 //
4640 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4641 // table we can index by the raw hardware mode.
4642 //
4643 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
4644
4645 SDValue BitTable =
4647 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4648 SDValue RoundModeTimesNumBits =
4649 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4650
4651 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4652 // knew only one mode was demanded.
4653 SDValue TableValue =
4654 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4655 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4656
4657 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4658 SDValue TableEntry =
4659 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4660
4661 // There's a gap in the 4-bit encoded table and actual enum values, so offset
4662 // if it's an extended value.
4663 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4664 SDValue IsStandardValue =
4665 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4666 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4667 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4668 TableEntry, EnumOffset);
4669
4670 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4671 }
4673
// NOTE(review): the declaration line above this parameter was lost in the HTML
// extraction. From the body (amdgcn_s_setreg of the fp_round field, driven by
// the FLT_ROUNDS->hardware conversion table) this is the custom lowering of
// ISD::SET_ROUNDING: it translates a C FLT_ROUNDS-style rounding-mode value
// into the 4-bit MODE.fp_round field and writes it with s_setreg.
4675 SelectionDAG &DAG) const {
4676 SDLoc SL(Op);
4677
// Operand 1 is the requested rounding-mode value; operand 0 is the chain.
4678 SDValue NewMode = Op.getOperand(1);
4679 assert(NewMode.getValueType() == MVT::i32);
4680
4681 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4682 // hardware MODE.fp_round values.
4683 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
// Constant input: do the table lookup at compile time.
4684 uint32_t ClampedVal = std::min(
4685 static_cast<uint32_t>(ConstMode->getZExtValue()),
// NOTE(review): the clamp's upper-bound operand (one source line) was lost in
// extraction — presumably the largest valid table index.
4687 NewMode = DAG.getConstant(
4688 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4689 } else {
4690 // If we know the input can only be one of the supported standard modes in
4691 // the range 0-3, we can use a simplified mapping to hardware values.
4692 KnownBits KB = DAG.computeKnownBits(NewMode);
4693 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4694 // The supported standard values are 0-3. The extended values start at 8. We
4695 // need to offset by 4 if the value is in the extended range.
4696
4697 if (UseReducedTable) {
4698 // Truncate to the low 32-bits.
4699 SDValue BitTable = DAG.getConstant(
4700 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4701
// Each table entry is 4 bits wide, so the shift amount is index * 4
// (i.e. index << 2).
4702 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4703 SDValue RoundModeTimesNumBits =
4704 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4705
4706 NewMode =
4707 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4708
4709 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4710 // the table extracted bits into inline immediates.
4711 } else {
4712 // table_index = umin(value, value - 4)
4713 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
4714 SDValue BitTable =
// NOTE(review): the 64-bit conversion-table constant (one source line) was
// lost in extraction.
4716
4717 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4718 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4719 SDValue IndexVal =
4720 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4721
4722 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4723 SDValue RoundModeTimesNumBits =
4724 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4725
4726 SDValue TableValue =
4727 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4728 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4729
4730 // No need to mask out the high bits since the setreg will ignore them
4731 // anyway.
4732 NewMode = TruncTable;
4733 }
4734
4735 // Insert a readfirstlane in case the value is a VGPR. We could do this
4736 // earlier and keep more operations scalar, but that interferes with
4737 // combining the source.
4738 SDValue ReadFirstLaneID =
4739 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4740 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4741 ReadFirstLaneID, NewMode);
4742 }
4743
4744 // N.B. The setreg will be later folded into s_round_mode on supported
4745 // targets.
4746 SDValue IntrinID =
4747 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4748 uint32_t BothRoundHwReg =
// NOTE(review): the HwregEncoding::encode(...) call producing the combined
// fp_round hwreg descriptor (one source line) was lost in extraction.
4750 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4751
4752 SDValue SetReg =
4753 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4754 IntrinID, RoundBothImm, NewMode);
4755
4756 return SetReg;
4757}
4758
// NOTE(review): the function's signature line was lost in extraction; from the
// body (MemSDNode address-space switch, operand 4 = instruction/data selector)
// this lowers a prefetch node. Returning SDValue() drops the prefetch;
// returning Op keeps it as-is.
4760 if (Op->isDivergent() &&
4761 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4762 // Cannot do I$ prefetch with divergent pointer.
4763 return SDValue();
4764
4765 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
// NOTE(review): the case labels of this switch (address-space enumerators,
// several source lines) were lost in extraction; only the bodies remain.
4769 break;
4771 if (Subtarget->hasSafeSmemPrefetch())
4772 break;
4773 [[fallthrough]];
4774 default:
// Unsupported address space: discard the prefetch.
4775 return SDValue();
4776 }
4777
4778 // I$ prefetch
4779 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4780 return SDValue();
4781
4782 return Op;
4783}
4784
4785 // Work around DAG legality rules only based on the result type.
// NOTE(review): the signature line was lost in extraction; this handles
// FP_EXTEND/STRICT_FP_EXTEND, rewriting extensions from bf16 sources into
// BF16_TO_FP on the integer bit pattern. Non-bf16 sources pass through.
4787 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
// Strict nodes carry the chain in operand 0, so the value is operand 1.
4788 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4789 EVT SrcVT = Src.getValueType();
4790
4791 if (SrcVT.getScalarType() != MVT::bf16)
4792 return Op;
4793
4794 SDLoc SL(Op);
// Reinterpret the bf16 value(s) as i16 so BF16_TO_FP can consume them.
4795 SDValue BitCast =
4796 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4797
4798 EVT DstVT = Op.getValueType();
4799 if (IsStrict)
// Strict variant is not implemented for bf16 sources yet.
4800 llvm_unreachable("Need STRICT_BF16_TO_FP");
4801
4802 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4803}
4804
// NOTE(review): the signature line was lost in extraction; this reads the FP
// environment as an i64 by concatenating two s_getreg reads (mode register and
// trap register halves) into a v2i32 and bitcasting to i64.
4806 SDLoc SL(Op);
4807 if (Op.getValueType() != MVT::i64)
4808 return Op;
4809
4810 uint32_t ModeHwReg =
// NOTE(review): the hwreg encoding expression (one source line) was lost in
// extraction.
4812 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4813 uint32_t TrapHwReg =
// NOTE(review): the hwreg encoding expression (one source line) was lost in
// extraction.
4815 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4816
4817 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4818 SDValue IntrinID =
4819 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
// Both reads are chained off the incoming chain (operand 0) and joined with a
// TokenFactor below so neither ordering is implied between them.
4820 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4821 Op.getOperand(0), IntrinID, ModeHwRegImm);
4822 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4823 Op.getOperand(0), IntrinID, TrapHwRegImm);
4824 SDValue TokenReg =
4825 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4826 GetTrapReg.getValue(1));
4827
// Pack (mode, trap) as elements 0 and 1, then reinterpret as a single i64.
4828 SDValue CvtPtr =
4829 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4830 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4831
4832 return DAG.getMergeValues({Result, TokenReg}, SL);
4833}
4834
// NOTE(review): the signature line was lost in extraction; this is the inverse
// of the GET_FPENV lowering above: it splits an i64 FP-environment value into
// two i32 halves and writes them with s_setreg (mode half, then trap half).
4836 SDLoc SL(Op);
4837 if (Op.getOperand(1).getValueType() != MVT::i64)
4838 return Op;
4839
// Split the i64 payload into its low (mode) and high (trap) 32-bit halves.
4840 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4841 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4842 DAG.getConstant(0, SL, MVT::i32));
4843 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4844 DAG.getConstant(1, SL, MVT::i32));
4845
// s_setreg needs a scalar source; force laneshared values through
// readfirstlane in case they live in VGPRs.
4846 SDValue ReadFirstLaneID =
4847 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4848 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4849 ReadFirstLaneID, NewModeReg);
4850 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4851 ReadFirstLaneID, NewTrapReg);
4852
4853 unsigned ModeHwReg =
// NOTE(review): the hwreg encoding expression (one source line) was lost in
// extraction.
4855 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4856 unsigned TrapHwReg =
// NOTE(review): the hwreg encoding expression (one source line) was lost in
// extraction.
4858 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4859
4860 SDValue IntrinID =
4861 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4862 SDValue SetModeReg =
4863 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4864 IntrinID, ModeHwRegImm, NewModeReg);
4865 SDValue SetTrapReg =
4866 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4867 IntrinID, TrapHwRegImm, NewTrapReg);
// Both setregs chain off the same input token; merge their output chains.
4868 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4869}
4870
// NOTE(review): the declaration line(s) above were lost in extraction; this is
// the named-register lookup hook (TargetLowering::getRegisterByName): it maps
// the string names usable in inline asm / read_register to physical registers
// and validates that the requested type width matches the register.
4872 const MachineFunction &MF) const {
4873 const Function &Fn = MF.getFunction();
4874
// NOTE(review): the StringSwitch initialization line ("Register Reg = ...")
// was lost in extraction; the .Case chain below belongs to it.
4876 .Case("m0", AMDGPU::M0)
4877 .Case("exec", AMDGPU::EXEC)
4878 .Case("exec_lo", AMDGPU::EXEC_LO)
4879 .Case("exec_hi", AMDGPU::EXEC_HI)
4880 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4881 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4882 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4883 .Default(Register());
4884 if (!Reg)
4885 return Reg;
4886
// flat_scratch registers only exist on subtargets that have the register;
// otherwise asking for them is a hard error.
4887 if (!Subtarget->hasFlatScrRegister() &&
4888 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4889 Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
4890 "\" for subtarget."));
4891 }
4892
// Validate the requested value type against the register width: the _LO/_HI
// halves and m0 are 32-bit, the full exec/flat_scratch pairs are 64-bit.
4893 switch (Reg) {
4894 case AMDGPU::M0:
4895 case AMDGPU::EXEC_LO:
4896 case AMDGPU::EXEC_HI:
4897 case AMDGPU::FLAT_SCR_LO:
4898 case AMDGPU::FLAT_SCR_HI:
4899 if (VT.getSizeInBits() == 32)
4900 return Reg;
4901 break;
4902 case AMDGPU::EXEC:
4903 case AMDGPU::FLAT_SCR:
4904 if (VT.getSizeInBits() == 64)
4905 return Reg;
4906 break;
4907 default:
4908 llvm_unreachable("missing register type checking");
4909 }
4910
// NOTE(review): the report_fatal_error/emitError call line was lost in
// extraction; this trailing argument is its message.
4912 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4913}
4914
4915 // If kill is not the last instruction, split the block so kill is always a
4916 // proper terminator.
// NOTE(review): the function name line was lost in extraction; this splits the
// block at \p MI, retags MI as the corresponding kill *terminator* opcode, and
// returns the newly created successor block.
4919 MachineBasicBlock *BB) const {
4920 MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
// NOTE(review): the line obtaining TII was lost in extraction.
4922 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4923 return SplitBB;
4924}
4925
4926 // Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
4927 // \p MI will be the only instruction in the loop body block. Otherwise, it will
4928 // be the first instruction in the remainder block.
4929 //
4930 /// \returns { LoopBody, Remainder }
4931 static std::pair<MachineBasicBlock *, MachineBasicBlock *>
// NOTE(review): the line with the function name and parameter list
// (splitBlockForLoop) was lost in extraction.
4933 MachineFunction *MF = MBB.getParent();
// NOTE(review): the iterator-initialization line ("I = MI.getIterator()" or
// similar) was lost in extraction.
4935
4936 // To insert the loop we need to split the block. Move everything after this
4937 // point to a new block, and insert a new empty block between the two.
// NOTE(review): the LoopBB CreateMachineBasicBlock line was lost in
// extraction.
4939 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
// Insert both new blocks immediately after MBB in function layout order.
4941 ++MBBI;
4942
4943 MF->insert(MBBI, LoopBB);
4944 MF->insert(MBBI, RemainderBB);
4945
// The loop block branches back to itself (the loop) or falls through to the
// remainder when done.
4946 LoopBB->addSuccessor(LoopBB);
4947 LoopBB->addSuccessor(RemainderBB);
4948
4949 // Move the rest of the block into a new block.
4950 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4951
4952 if (InstInLoop) {
4953 auto Next = std::next(I);
4954
4955 // Move instruction to loop body.
4956 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4957
4958 // Move the rest of the block.
4959 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4960 } else {
4961 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4962 }
4963
4964 MBB.addSuccessor(LoopBB);
4965
4966 return std::pair(LoopBB, RemainderBB);
4967}
4968
4969 /// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
// NOTE(review): the signature line was lost in extraction.
4971 MachineBasicBlock *MBB = MI.getParent();
// NOTE(review): the line obtaining TII was lost in extraction.
4973 auto I = MI.getIterator();
4974 auto E = std::next(I);
4975
// Emit "s_waitcnt 0" directly after MI, then bundle [MI, s_waitcnt] so later
// passes cannot separate the wait from the instruction it guards.
4976 // clang-format off
4977 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4978 .addImm(0);
4979 // clang-format on
4980
4981 MIBundleBuilder Bundler(*MBB, I, E);
4982 finalizeBundle(*MBB, Bundler.begin());
4983}
4984
// NOTE(review): the declaration line(s) above were lost in extraction; from
// the body (clear/poll of TRAP_STS.MEM_VIOL around a split-off loop) this
// wraps \p MI in a retry loop that repeats it until no GWS memory violation is
// reported, returning the remainder block to continue emission in.
4987 MachineBasicBlock *BB) const {
4988 const DebugLoc &DL = MI.getDebugLoc();
4989
// NOTE(review): two setup lines (likely obtaining TII and bundling MI with an
// s_waitcnt) were lost in extraction here.
4991
4993
4994 // Apparently kill flags are only valid if the def is in the same block?
4995 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4996 Src->setIsKill(false);
4997
// MI becomes the sole instruction of the loop body block.
4998 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
4999
5000 MachineBasicBlock::iterator I = LoopBB->end();
5001
5002 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
// NOTE(review): the encode() argument line (TRAP_STS register/field selector)
// was lost in extraction.
5004
5005 // Clear TRAP_STS.MEM_VIOL
5006 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
5007 .addImm(0)
5008 .addImm(EncodedReg);
5009
// NOTE(review): one line (likely obtaining MRI) was lost in extraction.
5011
5012 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5013
5014 // Load and check TRAP_STS.MEM_VIOL
5015 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
5016 .addImm(EncodedReg);
5017
5018 // FIXME: Do we need to use an isel pseudo that may clobber scc?
5019 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5020 .addReg(Reg, RegState::Kill)
5021 .addImm(0);
// Retry the loop while TRAP_STS.MEM_VIOL is nonzero.
5022 // clang-format off
5023 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5024 .addMBB(LoopBB);
5025 // clang-format on
5026
5027 return RemainderBB;
5028}
5029
5030 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
5031 // wavefront. If the value is uniform and just happens to be in a VGPR, this
5032 // will only do one iteration. In the worst case, this will loop 64 times.
5033 //
5034 // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
// NOTE(review): the first signature line (static function name, presumably
// emitLoadM0FromVGPRLoop, with its leading parameters) was lost in extraction.
// This emits the waterfall-loop body: readfirstlane an index, mask EXEC to the
// lanes sharing that index, and repeat until all lanes are covered. Returns an
// iterator after the exec-xor terminator, where per-iteration work can be
// inserted.
5037 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
5038 const DebugLoc &DL, const MachineOperand &Idx,
5039 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
5040 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
5041 Register &SGPRIdxReg) {
5042
5043 MachineFunction *MF = OrigBB.getParent();
5044 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5045 const SIRegisterInfo *TRI = ST.getRegisterInfo();
// NOTE(review): two lines (likely the insertion iterator and the lane-mask
// constants "LMC" used below) were lost in extraction.
5048
5049 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5050 Register PhiExec = MRI.createVirtualRegister(BoolRC);
5051 Register NewExec = MRI.createVirtualRegister(BoolRC);
5052 Register CurrentIdxReg =
5053 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5054 Register CondReg = MRI.createVirtualRegister(BoolRC);
5055
// PHI for the accumulated result value across loop iterations.
5056 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
5057 .addReg(InitReg)
5058 .addMBB(&OrigBB)
5059 .addReg(ResultReg)
5060 .addMBB(&LoopBB);
5061
// PHI for the remaining-lanes exec mask across loop iterations.
5062 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
5063 .addReg(InitSaveExecReg)
5064 .addMBB(&OrigBB)
5065 .addReg(NewExec)
5066 .addMBB(&LoopBB);
5067
5068 // Read the next variant <- also loop target.
5069 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5070 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
5071
5072 // Compare the just read M0 value to all possible Idx values.
5073 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5074 .addReg(CurrentIdxReg)
5075 .addReg(Idx.getReg(), 0, Idx.getSubReg());
5076
5077 // Update EXEC, save the original EXEC value to VCC.
5078 BuildMI(LoopBB, I, DL, TII->get(LMC.AndSaveExecOpc), NewExec)
5079 .addReg(CondReg, RegState::Kill);
5080
5081 MRI.setSimpleHint(NewExec, CondReg);
5082
5083 if (UseGPRIdxMode) {
// GPR-index mode: hand the (possibly offset) scalar index back to the caller.
5084 if (Offset == 0) {
5085 SGPRIdxReg = CurrentIdxReg;
5086 } else {
5087 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5088 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5089 .addReg(CurrentIdxReg, RegState::Kill)
5090 .addImm(Offset);
5091 }
5092 } else {
5093 // Move index from VCC into M0
5094 if (Offset == 0) {
5095 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5096 .addReg(CurrentIdxReg, RegState::Kill);
5097 } else {
5098 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5099 .addReg(CurrentIdxReg, RegState::Kill)
5100 .addImm(Offset);
5101 }
5102 }
5103
5104 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
5105 MachineInstr *InsertPt =
5106 BuildMI(LoopBB, I, DL, TII->get(LMC.XorTermOpc), LMC.ExecReg)
5107 .addReg(LMC.ExecReg)
5108 .addReg(NewExec);
5109
5110 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
5111 // s_cbranch_scc0?
5112
5113 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
5114 // clang-format off
5115 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5116 .addMBB(&LoopBB);
5117 // clang-format on
5118
5119 return InsertPt->getIterator();
5120}
5121
5122 // This has slightly sub-optimal regalloc when the source vector is killed by
5123 // the read. The register allocator does not understand that the kill is
5124 // per-workitem, so is kept alive for the whole loop so we end up not re-using a
5125 // subregister from it, using 1 more VGPR than necessary. This was saved when
5126 // this was expanded after register allocation.
// NOTE(review): the first signature line (static function name, presumably
// loadM0FromVGPR, with its leading parameters) was lost in extraction. This
// saves EXEC, splits the block, emits the waterfall loop via
// emitLoadM0FromVGPRLoop, then restores EXEC in a landing-pad block; returns
// the in-loop insertion point for the indexed operation.
5129 unsigned InitResultReg, unsigned PhiReg, int Offset,
5130 bool UseGPRIdxMode, Register &SGPRIdxReg) {
5131 MachineFunction *MF = MBB.getParent();
5132 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5133 const SIRegisterInfo *TRI = ST.getRegisterInfo();
// NOTE(review): the line obtaining MRI was lost in extraction.
5135 const DebugLoc &DL = MI.getDebugLoc();
// NOTE(review): one line (likely the insertion iterator or lane-mask constants
// "LMC") was lost in extraction.
5137
5138 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
5139 Register DstReg = MI.getOperand(0).getReg();
5140 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
5141 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
// NOTE(review): one line was lost in extraction here.
5143
5144 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
5145
5146 // Save the EXEC mask
5147 // clang-format off
5148 BuildMI(MBB, I, DL, TII->get(LMC.MovOpc), SaveExec)
5149 .addReg(LMC.ExecReg);
5150 // clang-format on
5151
// MI stays as the first instruction of the remainder block (InstInLoop=false).
5152 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
5153
5154 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5155
5156 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
5157 InitResultReg, DstReg, PhiReg, TmpExec,
5158 Offset, UseGPRIdxMode, SGPRIdxReg);
5159
// Insert a dedicated landing-pad block between the loop and the remainder so
// the EXEC restore is executed exactly once on loop exit.
5160 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
// NOTE(review): the iterator line positioning LandingPad was lost in
// extraction.
5162 ++MBBI;
5163 MF->insert(MBBI, LandingPad);
5164 LoopBB->removeSuccessor(RemainderBB);
5165 LandingPad->addSuccessor(RemainderBB);
5166 LoopBB->addSuccessor(LandingPad);
5167 MachineBasicBlock::iterator First = LandingPad->begin();
5168 // clang-format off
5169 BuildMI(*LandingPad, First, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
5170 .addReg(SaveExec);
5171 // clang-format on
5172
5173 return InsPt;
5174}
5175
5176 // Returns subreg index, offset
5177 static std::pair<unsigned, int>
// NOTE(review): the line with the function name (computeIndirectRegAndOffset)
// and first parameter was lost in extraction.
5179 const TargetRegisterClass *SuperRC, unsigned VecReg,
5180 int Offset) {
// Number of 32-bit lanes in the super-register.
5181 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
5182
5183 // Skip out of bounds offsets, or else we would end up using an undefined
5184 // register.
5185 if (Offset >= NumElts || Offset < 0)
5186 return std::pair(AMDGPU::sub0, Offset);
5187
// In-bounds constant offset: fold it entirely into the subregister index so
// no runtime add is needed.
5188 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
5189}
5190
// NOTE(review): the leading signature lines were lost in extraction; this
// copies the SGPR index operand of \p MI into M0, adding a constant \p Offset
// with s_add_i32 when it is nonzero.
5193 int Offset) {
5194 MachineBasicBlock *MBB = MI.getParent();
5195 const DebugLoc &DL = MI.getDebugLoc();
// NOTE(review): one line (likely obtaining TII) was lost in extraction.
5197
5198 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5199
5200 assert(Idx->getReg() != AMDGPU::NoRegister);
5201
5202 if (Offset == 0) {
5203 // clang-format off
5204 BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5205 .add(*Idx);
5206 // clang-format on
5207 } else {
5208 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5209 .add(*Idx)
5210 .addImm(Offset);
5211 }
5212}
5213
// NOTE(review): the leading signature lines were lost in extraction; this
// returns the SGPR index register of \p MI, materializing index+Offset into a
// fresh SGPR when \p Offset is nonzero (used for GPR-index addressing mode).
5216 int Offset) {
5217 MachineBasicBlock *MBB = MI.getParent();
5218 const DebugLoc &DL = MI.getDebugLoc();
// NOTE(review): one line (likely obtaining TII/MRI) was lost in extraction.
5220
5221 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5222
5223 if (Offset == 0)
// No adjustment needed; use the index register directly.
5224 return Idx->getReg();
5225
5226 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5227 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
5228 .add(*Idx)
5229 .addImm(Offset);
5230 return Tmp;
5231}
5232
// NOTE(review): the leading signature lines were lost in extraction; this
// expands a dynamic vector-element *read* pseudo: v_movrels (or the GPR-index
// pseudo) when the index is an SGPR, otherwise a waterfall loop over the
// unique VGPR index values via loadM0FromVGPR.
5235 const GCNSubtarget &ST) {
5236 const SIInstrInfo *TII = ST.getInstrInfo();
5237 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5238 MachineFunction *MF = MBB.getParent();
// NOTE(review): the line obtaining MRI was lost in extraction.
5240
5241 Register Dst = MI.getOperand(0).getReg();
5242 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5243 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
5244 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5245
5246 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
5247 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5248
// Fold the constant offset into a subregister index when it is in bounds.
5249 unsigned SubReg;
5250 std::tie(SubReg, Offset) =
5251 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
5252
5253 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5254
5255 // Check for a SGPR index.
5256 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
// NOTE(review): one line (likely the insertion iterator) was lost in
// extraction.
5258 const DebugLoc &DL = MI.getDebugLoc();
5259
5260 if (UseGPRIdxMode) {
5261 // TODO: Look at the uses to avoid the copy. This may require rescheduling
5262 // to avoid interfering with other uses, so probably requires a new
5263 // optimization pass.
// NOTE(review): one line (likely computing the Idx register via
// getIndirectSGPRIdx) was lost in extraction.
5265
5266 const MCInstrDesc &GPRIDXDesc =
5267 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5268 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5269 .addReg(SrcReg)
5270 .addReg(Idx)
5271 .addImm(SubReg);
5272 } else {
// NOTE(review): one line (likely setM0ToIndexFromSGPR) was lost in
// extraction.
5274
5275 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5276 .addReg(SrcReg, 0, SubReg)
5277 .addReg(SrcReg, RegState::Implicit);
5278 }
5279
5280 MI.eraseFromParent();
5281
5282 return &MBB;
5283 }
5284
5285 // Control flow needs to be inserted if indexing with a VGPR.
5286 const DebugLoc &DL = MI.getDebugLoc();
// NOTE(review): one line (likely the insertion iterator) was lost in
// extraction.
5288
5289 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5290 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5291
5292 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
5293
5294 Register SGPRIdxReg;
5295 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
5296 UseGPRIdxMode, SGPRIdxReg);
5297
5298 MachineBasicBlock *LoopBB = InsPt->getParent();
5299
// Emit the per-iteration indexed read inside the waterfall loop.
5300 if (UseGPRIdxMode) {
5301 const MCInstrDesc &GPRIDXDesc =
5302 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5303
5304 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5305 .addReg(SrcReg)
5306 .addReg(SGPRIdxReg)
5307 .addImm(SubReg);
5308 } else {
5309 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5310 .addReg(SrcReg, 0, SubReg)
5311 .addReg(SrcReg, RegState::Implicit);
5312 }
5313
5314 MI.eraseFromParent();
5315
5316 return LoopBB;
5317}
5318
// NOTE(review): the leading signature lines were lost in extraction; this
// expands a dynamic vector-element *write* pseudo, the store-side counterpart
// of the read expansion above: INSERT_SUBREG for a known (no-register) index,
// movrel/GPR-index write for an SGPR index, otherwise a waterfall loop.
5321 const GCNSubtarget &ST) {
5322 const SIInstrInfo *TII = ST.getInstrInfo();
5323 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5324 MachineFunction *MF = MBB.getParent();
// NOTE(review): the line obtaining MRI was lost in extraction.
5326
5327 Register Dst = MI.getOperand(0).getReg();
5328 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
5329 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5330 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
5331 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5332 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
5333 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5334
5335 // This can be an immediate, but will be folded later.
5336 assert(Val->getReg());
5337
5338 unsigned SubReg;
5339 std::tie(SubReg, Offset) =
5340 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
5341 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5342
// No index register at all: the element is statically known, so a plain
// INSERT_SUBREG suffices.
5343 if (Idx->getReg() == AMDGPU::NoRegister) {
// NOTE(review): one line (likely the insertion iterator) was lost in
// extraction.
5345 const DebugLoc &DL = MI.getDebugLoc();
5346
5347 assert(Offset == 0);
5348
5349 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
5350 .add(*SrcVec)
5351 .add(*Val)
5352 .addImm(SubReg);
5353
5354 MI.eraseFromParent();
5355 return &MBB;
5356 }
5357
5358 // Check for a SGPR index.
5359 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
// NOTE(review): one line (likely the insertion iterator) was lost in
// extraction.
5361 const DebugLoc &DL = MI.getDebugLoc();
5362
5363 if (UseGPRIdxMode) {
// NOTE(review): one line (likely computing the Idx register via
// getIndirectSGPRIdx) was lost in extraction.
5365
5366 const MCInstrDesc &GPRIDXDesc =
5367 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5368 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5369 .addReg(SrcVec->getReg())
5370 .add(*Val)
5371 .addReg(Idx)
5372 .addImm(SubReg);
5373 } else {
// NOTE(review): one line (likely setM0ToIndexFromSGPR) was lost in
// extraction.
5375
5376 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5377 TRI.getRegSizeInBits(*VecRC), 32, false);
5378 BuildMI(MBB, I, DL, MovRelDesc, Dst)
5379 .addReg(SrcVec->getReg())
5380 .add(*Val)
5381 .addImm(SubReg);
5382 }
5383 MI.eraseFromParent();
5384 return &MBB;
5385 }
5386
5387 // Control flow needs to be inserted if indexing with a VGPR.
// The written value is re-read on every loop iteration, so its kill flag is
// no longer accurate.
5388 if (Val->isReg())
5389 MRI.clearKillFlags(Val->getReg());
5390
5391 const DebugLoc &DL = MI.getDebugLoc();
5392
5393 Register PhiReg = MRI.createVirtualRegister(VecRC);
5394
5395 Register SGPRIdxReg;
5396 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
5397 UseGPRIdxMode, SGPRIdxReg);
5398 MachineBasicBlock *LoopBB = InsPt->getParent();
5399
// Emit the per-iteration indexed write inside the waterfall loop; PhiReg
// carries the partially-updated vector between iterations.
5400 if (UseGPRIdxMode) {
5401 const MCInstrDesc &GPRIDXDesc =
5402 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5403
5404 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5405 .addReg(PhiReg)
5406 .add(*Val)
5407 .addReg(SGPRIdxReg)
5408 .addImm(SubReg);
5409 } else {
5410 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5411 TRI.getRegSizeInBits(*VecRC), 32, false);
5412 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5413 .addReg(PhiReg)
5414 .add(*Val)
5415 .addImm(SubReg);
5416 }
5417
5418 MI.eraseFromParent();
5419 return LoopBB;
5420}
5421
// NOTE(review): the leading signature line was lost in extraction; this
// expands S_ADD_U64_PSEUDO / S_SUB_U64_PSEUDO: a single s_add_u64/s_sub_u64 on
// subtargets that have 64-bit scalar add/sub, otherwise an add/addc (or
// sub/subb) pair over the 32-bit halves stitched with REG_SEQUENCE.
5423 MachineBasicBlock *BB) {
5424 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5425 // For GFX12, we emit s_add_u64 and s_sub_u64.
5426 MachineFunction *MF = BB->getParent();
5427 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5428 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
// NOTE(review): one line (likely obtaining MRI) was lost in extraction.
5430 const DebugLoc &DL = MI.getDebugLoc();
5431 MachineOperand &Dest = MI.getOperand(0);
5432 MachineOperand &Src0 = MI.getOperand(1);
5433 MachineOperand &Src1 = MI.getOperand(2);
5434 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5435 if (ST.hasScalarAddSub64()) {
5436 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5437 // clang-format off
5438 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5439 .add(Src0)
5440 .add(Src1);
5441 // clang-format on
5442 } else {
5443 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5444 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5445
5446 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5447 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5448
// Split both 64-bit sources into their sub0/sub1 halves (registers or
// immediates).
5449 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5450 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5451 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5452 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5453
5454 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5455 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5456 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5457 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5458
// Low half sets the carry (scc); high half consumes it.
5459 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5460 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5461 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
5462 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
5463 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5464 .addReg(DestSub0)
5465 .addImm(AMDGPU::sub0)
5466 .addReg(DestSub1)
5467 .addImm(AMDGPU::sub1);
5468 }
5469 MI.eraseFromParent();
5470 return BB;
5471}
5472
// NOTE(review): the signature line was lost in extraction; per the
// llvm_unreachable message below this is
// getIdentityValueFor32BitWaveReduction: the identity element (as a raw 32-bit
// pattern) for each 32-bit wave-reduction opcode.
5474 switch (Opc) {
5475 case AMDGPU::S_MIN_U32:
5476 return std::numeric_limits<uint32_t>::max();
5477 case AMDGPU::S_MIN_I32:
5478 return std::numeric_limits<int32_t>::max();
5479 case AMDGPU::S_MAX_U32:
5480 return std::numeric_limits<uint32_t>::min();
5481 case AMDGPU::S_MAX_I32:
5482 return std::numeric_limits<int32_t>::min();
5483 case AMDGPU::V_ADD_F32_e64: // -0.0
5484 return 0x80000000;
5485 case AMDGPU::V_SUB_F32_e64: // +0.0
5486 return 0x0;
5487 case AMDGPU::S_ADD_I32:
5488 case AMDGPU::S_SUB_I32:
5489 case AMDGPU::S_OR_B32:
5490 case AMDGPU::S_XOR_B32:
// Identity for add/sub/or/xor is 0.
5491 return std::numeric_limits<uint32_t>::min();
5492 case AMDGPU::S_AND_B32:
// Identity for and is all-ones.
5493 return std::numeric_limits<uint32_t>::max();
5494 case AMDGPU::V_MIN_F32_e64:
5495 case AMDGPU::V_MAX_F32_e64:
5496 return 0x7fc00000; // qNAN
5497 default:
// NOTE(review): the llvm_unreachable( call line was lost in extraction; this
// is its message argument.
5499 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5500 }
5501}
5502
// NOTE(review): the signature line was lost in extraction; per the
// llvm_unreachable message below this is
// getIdentityValueFor64BitWaveReduction: the identity element for each 64-bit
// wave-reduction opcode (min/max are expressed via their VALU compare
// opcodes).
5504 switch (Opc) {
5505 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5506 return std::numeric_limits<uint64_t>::max();
5507 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5508 return std::numeric_limits<int64_t>::max();
5509 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5510 return std::numeric_limits<uint64_t>::min();
5511 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5512 return std::numeric_limits<int64_t>::min();
5513 case AMDGPU::S_ADD_U64_PSEUDO:
5514 case AMDGPU::S_SUB_U64_PSEUDO:
5515 case AMDGPU::S_OR_B64:
5516 case AMDGPU::S_XOR_B64:
// Identity for add/sub/or/xor is 0.
5517 return std::numeric_limits<uint64_t>::min();
5518 case AMDGPU::S_AND_B64:
// Identity for and is all-ones.
5519 return std::numeric_limits<uint64_t>::max();
5520 default:
// NOTE(review): the llvm_unreachable( call line was lost in extraction; this
// is its message argument.
5522 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5523 }
5524}
5525
5526static bool is32bitWaveReduceOperation(unsigned Opc) {
5527 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5528 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5529 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5530 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5531 Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
5532 Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
5533 Opc == AMDGPU::V_SUB_F32_e64;
5534}
5535
// NOTE(review): the signature line was lost in extraction; by analogy with the
// predicate above this returns true for the f32 wave-reduction opcodes
// (min/max/add/sub).
5537 return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
5538 Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64;
5539}
5540
5543 const GCNSubtarget &ST,
5544 unsigned Opc) {
5546 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5547 const DebugLoc &DL = MI.getDebugLoc();
5548 const SIInstrInfo *TII = ST.getInstrInfo();
5549
5550 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5551 Register SrcReg = MI.getOperand(1).getReg();
5552 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5553 Register DstReg = MI.getOperand(0).getReg();
5554 MachineBasicBlock *RetBB = nullptr;
5555 if (isSGPR) {
5556 switch (Opc) {
5557 case AMDGPU::S_MIN_U32:
5558 case AMDGPU::S_MIN_I32:
5559 case AMDGPU::V_MIN_F32_e64:
5560 case AMDGPU::S_MAX_U32:
5561 case AMDGPU::S_MAX_I32:
5562 case AMDGPU::V_MAX_F32_e64:
5563 case AMDGPU::S_AND_B32:
5564 case AMDGPU::S_OR_B32: {
5565 // Idempotent operations.
5566 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5567 RetBB = &BB;
5568 break;
5569 }
5570 case AMDGPU::V_CMP_LT_U64_e64: // umin
5571 case AMDGPU::V_CMP_LT_I64_e64: // min
5572 case AMDGPU::V_CMP_GT_U64_e64: // umax
5573 case AMDGPU::V_CMP_GT_I64_e64: // max
5574 case AMDGPU::S_AND_B64:
5575 case AMDGPU::S_OR_B64: {
5576 // Idempotent operations.
5577 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
5578 RetBB = &BB;
5579 break;
5580 }
5581 case AMDGPU::S_XOR_B32:
5582 case AMDGPU::S_XOR_B64:
5583 case AMDGPU::S_ADD_I32:
5584 case AMDGPU::S_ADD_U64_PSEUDO:
5585 case AMDGPU::V_ADD_F32_e64:
5586 case AMDGPU::S_SUB_I32:
5587 case AMDGPU::S_SUB_U64_PSEUDO:
5588 case AMDGPU::V_SUB_F32_e64: {
5589 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5590 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5591 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5592 Register NumActiveLanes =
5593 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5594
5595 bool IsWave32 = ST.isWave32();
5596 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5597 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5598 unsigned BitCountOpc =
5599 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5600
5601 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5602
5603 auto NewAccumulator =
5604 BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
5605 .addReg(ExecMask);
5606
5607 switch (Opc) {
5608 case AMDGPU::S_XOR_B32:
5609 case AMDGPU::S_XOR_B64: {
5610 // Performing an XOR operation on a uniform value
5611 // depends on the parity of the number of active lanes.
5612 // For even parity, the result will be 0, for odd
5613 // parity the result will be the same as the input value.
5614 Register ParityRegister =
5615 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5616
5617 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5618 .addReg(NewAccumulator->getOperand(0).getReg())
5619 .addImm(1)
5620 .setOperandDead(3); // Dead scc
5621 if (Opc == AMDGPU::S_XOR_B32) {
5622 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5623 .addReg(SrcReg)
5624 .addReg(ParityRegister);
5625 } else {
5626 Register DestSub0 =
5627 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5628 Register DestSub1 =
5629 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5630
5631 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5632 const TargetRegisterClass *SrcSubRC =
5633 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5634
5635 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5636 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5637 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5638 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5639
5640 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5641 .add(Op1L)
5642 .addReg(ParityRegister);
5643
5644 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5645 .add(Op1H)
5646 .addReg(ParityRegister);
5647
5648 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5649 .addReg(DestSub0)
5650 .addImm(AMDGPU::sub0)
5651 .addReg(DestSub1)
5652 .addImm(AMDGPU::sub1);
5653 }
5654 break;
5655 }
5656 case AMDGPU::S_SUB_I32: {
5657 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5658
5659 // Take the negation of the source operand.
5660 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5661 .addImm(0)
5662 .addReg(SrcReg);
5663 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5664 .addReg(NegatedVal)
5665 .addReg(NewAccumulator->getOperand(0).getReg());
5666 break;
5667 }
5668 case AMDGPU::S_ADD_I32: {
5669 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5670 .addReg(SrcReg)
5671 .addReg(NewAccumulator->getOperand(0).getReg());
5672 break;
5673 }
5674 case AMDGPU::S_ADD_U64_PSEUDO:
5675 case AMDGPU::S_SUB_U64_PSEUDO: {
5676 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5677 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5678 Register Op1H_Op0L_Reg =
5679 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5680 Register Op1L_Op0H_Reg =
5681 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5682 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5683 Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5684 Register NegatedValLo =
5685 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5686 Register NegatedValHi =
5687 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5688
5689 const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
5690 const TargetRegisterClass *Src1SubRC =
5691 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5692
5693 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5694 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5695 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5696 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5697
5698 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5699 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5700 .addImm(0)
5701 .addReg(NewAccumulator->getOperand(0).getReg())
5702 .setOperandDead(3); // Dead scc
5703 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5704 .addReg(NegatedValLo)
5705 .addImm(31)
5706 .setOperandDead(3); // Dead scc
5707 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5708 .add(Op1L)
5709 .addReg(NegatedValHi);
5710 }
5711 Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5712 ? NegatedValLo
5713 : NewAccumulator->getOperand(0).getReg();
5714 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5715 .add(Op1L)
5716 .addReg(LowOpcode);
5717 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5718 .add(Op1L)
5719 .addReg(LowOpcode);
5720 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5721 .add(Op1H)
5722 .addReg(LowOpcode);
5723
5724 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5725 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5726 .addReg(CarryReg)
5727 .addReg(Op1H_Op0L_Reg)
5728 .setOperandDead(3); // Dead scc
5729
5730 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5731 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5732 .addReg(HiVal)
5733 .addReg(Op1L_Op0H_Reg)
5734 .setOperandDead(3); // Dead scc
5735 }
5736 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5737 .addReg(DestSub0)
5738 .addImm(AMDGPU::sub0)
5739 .addReg(DestSub1)
5740 .addImm(AMDGPU::sub1);
5741 break;
5742 }
5743 case AMDGPU::V_ADD_F32_e64:
5744 case AMDGPU::V_SUB_F32_e64: {
5745 Register ActiveLanesVreg =
5746 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5747 Register DstVreg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5748 // Get number of active lanes as a float val.
5749 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_CVT_F32_I32_e64),
5750 ActiveLanesVreg)
5751 .addReg(NewAccumulator->getOperand(0).getReg())
5752 .addImm(0) // clamp
5753 .addImm(0); // output-modifier
5754
5755 // Take negation of input for SUB reduction
5756 unsigned srcMod = Opc == AMDGPU::V_SUB_F32_e64 ? 1 : 0;
5757 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MUL_F32_e64), DstVreg)
5758 .addImm(srcMod) // src0 modifier
5759 .addReg(SrcReg)
5760 .addImm(0) // src1 modifier
5761 .addReg(ActiveLanesVreg)
5762 .addImm(0) // clamp
5763 .addImm(0); // output-mod
5764 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5765 .addReg(DstVreg);
5766 }
5767 }
5768 RetBB = &BB;
5769 }
5770 }
5771 } else {
5772 // TODO: Implement DPP Strategy and switch based on immediate strategy
5773 // operand. For now, for all the cases (default, Iterative and DPP we use
5774 // iterative approach by default.)
5775
5776 // To reduce the VGPR using iterative approach, we need to iterate
5777 // over all the active lanes. Lowering consists of ComputeLoop,
5778 // which iterate over only active lanes. We use copy of EXEC register
5779 // as induction variable and every active lane modifies it using bitset0
5780 // so that we will get the next active lane for next iteration.
5782 Register SrcReg = MI.getOperand(1).getReg();
5783 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5785
5786 // Create Control flow for loop
5787 // Split MI's Machine Basic block into For loop
5788 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
5789
5790 // Create virtual registers required for lowering.
5791 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5792 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5793 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5794 Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
5795 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5796 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5797 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5798 Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5799 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
5800
5801 bool IsWave32 = ST.isWave32();
5802 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5803 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5804
5805 // Create initial values of induction variable from Exec, Accumulator and
5806 // insert branch instr to newly created ComputeBlock
5807 BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
5808 if (is32BitOpc) {
5810 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5811 .addImm(IdentityValue);
5812 } else {
5814 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
5815 .addImm(IdentityValue);
5816 }
5817 // clang-format off
5818 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5819 .addMBB(ComputeLoop);
5820 // clang-format on
5821
5822 // Start constructing ComputeLoop
5823 I = ComputeLoop->begin();
5824 auto Accumulator =
5825 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5826 .addReg(IdentityValReg)
5827 .addMBB(&BB);
5828 auto ActiveBits =
5829 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5830 .addReg(LoopIterator)
5831 .addMBB(&BB);
5832
5833 I = ComputeLoop->end();
5834 MachineInstr *NewAccumulator;
5835 // Perform the computations
5836 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5837 BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5838 .addReg(ActiveBitsReg);
5839 if (is32BitOpc) {
5840 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5841 LaneValueReg)
5842 .addReg(SrcReg)
5843 .addReg(FF1Reg);
5844 if (isFPOp) {
5845 Register LaneValVreg =
5846 MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
5847 Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
5848 // Get the Lane Value in VGPR to avoid the Constant Bus Restriction
5849 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32),
5850 LaneValVreg)
5851 .addReg(LaneValueReg);
5852 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
5853 .addImm(0) // src0 modifier
5854 .addReg(Accumulator->getOperand(0).getReg())
5855 .addImm(0) // src1 modifier
5856 .addReg(LaneValVreg)
5857 .addImm(0) // clamp
5858 .addImm(0); // omod
5859 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5860 TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5861 .addReg(DstVreg);
5862 } else {
5863 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5864 .addReg(Accumulator->getOperand(0).getReg())
5865 .addReg(LaneValueReg);
5866 }
5867 } else {
5868 Register LaneValueLoReg =
5869 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5870 Register LaneValueHiReg =
5871 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5872 Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5873 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5874 const TargetRegisterClass *SrcSubRC =
5875 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5876 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5877 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5878 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5879 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5880 // lane value input should be in an sgpr
5881 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5882 LaneValueLoReg)
5883 .add(Op1L)
5884 .addReg(FF1Reg);
5885 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5886 LaneValueHiReg)
5887 .add(Op1H)
5888 .addReg(FF1Reg);
5889 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5890 TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
5891 .addReg(LaneValueLoReg)
5892 .addImm(AMDGPU::sub0)
5893 .addReg(LaneValueHiReg)
5894 .addImm(AMDGPU::sub1);
5895 switch (Opc) {
5896 case AMDGPU::S_OR_B64:
5897 case AMDGPU::S_AND_B64:
5898 case AMDGPU::S_XOR_B64: {
5899 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5900 .addReg(Accumulator->getOperand(0).getReg())
5901 .addReg(LaneValue->getOperand(0).getReg())
5902 .setOperandDead(3); // Dead scc
5903 break;
5904 }
5905 case AMDGPU::V_CMP_GT_I64_e64:
5906 case AMDGPU::V_CMP_GT_U64_e64:
5907 case AMDGPU::V_CMP_LT_I64_e64:
5908 case AMDGPU::V_CMP_LT_U64_e64: {
5909 Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
5910 Register ComparisonResultReg =
5911 MRI.createVirtualRegister(WaveMaskRegClass);
5912 const TargetRegisterClass *VregClass = TRI->getVGPR64Class();
5913 const TargetRegisterClass *VSubRegClass =
5914 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
5915 Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
5916 MachineOperand SrcReg0Sub0 =
5917 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5918 VregClass, AMDGPU::sub0, VSubRegClass);
5919 MachineOperand SrcReg0Sub1 =
5920 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5921 VregClass, AMDGPU::sub1, VSubRegClass);
5922 BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
5923 AccumulatorVReg)
5924 .add(SrcReg0Sub0)
5925 .addImm(AMDGPU::sub0)
5926 .add(SrcReg0Sub1)
5927 .addImm(AMDGPU::sub1);
5928 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
5929 .addReg(LaneValue->getOperand(0).getReg())
5930 .addReg(AccumulatorVReg);
5931
5932 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
5933 BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
5934 .addReg(LaneMaskReg)
5935 .addReg(ActiveBitsReg);
5936
5937 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5938 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
5939 .addReg(LaneValue->getOperand(0).getReg())
5940 .addReg(Accumulator->getOperand(0).getReg());
5941 break;
5942 }
5943 case AMDGPU::S_ADD_U64_PSEUDO:
5944 case AMDGPU::S_SUB_U64_PSEUDO: {
5945 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5946 .addReg(Accumulator->getOperand(0).getReg())
5947 .addReg(LaneValue->getOperand(0).getReg());
5948 ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
5949 break;
5950 }
5951 }
5952 }
5953 // Manipulate the iterator to get the next active lane
5954 unsigned BITSETOpc =
5955 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5956 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5957 .addReg(FF1Reg)
5958 .addReg(ActiveBitsReg);
5959
5960 // Add phi nodes
5961 Accumulator.addReg(DstReg).addMBB(ComputeLoop);
5962 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
5963
5964 // Creating branching
5965 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5966 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5967 .addReg(NewActiveBitsReg)
5968 .addImm(0);
5969 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5970 .addMBB(ComputeLoop);
5971
5972 RetBB = ComputeEnd;
5973 }
5974 MI.eraseFromParent();
5975 return RetBB;
5976}
5977
5980 MachineBasicBlock *BB) const {
5981 MachineFunction *MF = BB->getParent();
5983 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5985 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
5987 const DebugLoc &DL = MI.getDebugLoc();
5988
5989 switch (MI.getOpcode()) {
5990 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5991 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5992 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
5993 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
5994 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5995 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
5996 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
5997 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
5998 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
5999 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MIN_F32_e64);
6000 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
6001 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
6002 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
6003 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
6004 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
6005 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
6006 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
6007 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
6008 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
6009 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MAX_F32_e64);
6010 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
6011 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
6012 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
6013 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
6014 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
6015 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64);
6016 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
6017 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
6018 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
6019 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
6020 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
6021 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64);
6022 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
6023 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
6024 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
6025 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
6026 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
6027 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
6028 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
6029 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
6030 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
6031 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
6032 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
6033 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
6034 case AMDGPU::S_UADDO_PSEUDO:
6035 case AMDGPU::S_USUBO_PSEUDO: {
6036 MachineOperand &Dest0 = MI.getOperand(0);
6037 MachineOperand &Dest1 = MI.getOperand(1);
6038 MachineOperand &Src0 = MI.getOperand(2);
6039 MachineOperand &Src1 = MI.getOperand(3);
6040
6041 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
6042 ? AMDGPU::S_ADD_U32
6043 : AMDGPU::S_SUB_U32;
6044 // clang-format off
6045 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
6046 .add(Src0)
6047 .add(Src1);
6048 // clang-format on
6049
6050 unsigned SelOpc =
6051 Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6052 BuildMI(*BB, MI, DL, TII->get(SelOpc), Dest1.getReg()).addImm(-1).addImm(0);
6053
6054 MI.eraseFromParent();
6055 return BB;
6056 }
6057 case AMDGPU::S_ADD_U64_PSEUDO:
6058 case AMDGPU::S_SUB_U64_PSEUDO: {
6059 return Expand64BitScalarArithmetic(MI, BB);
6060 }
6061 case AMDGPU::V_ADD_U64_PSEUDO:
6062 case AMDGPU::V_SUB_U64_PSEUDO: {
6063 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
6064
6065 MachineOperand &Dest = MI.getOperand(0);
6066 MachineOperand &Src0 = MI.getOperand(1);
6067 MachineOperand &Src1 = MI.getOperand(2);
6068
6069 if (ST.hasAddSubU64Insts()) {
6070 auto I = BuildMI(*BB, MI, DL,
6071 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
6072 : AMDGPU::V_SUB_U64_e64),
6073 Dest.getReg())
6074 .add(Src0)
6075 .add(Src1)
6076 .addImm(0); // clamp
6077 TII->legalizeOperands(*I);
6078 MI.eraseFromParent();
6079 return BB;
6080 }
6081
6082 if (IsAdd && ST.hasLshlAddU64Inst()) {
6083 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
6084 Dest.getReg())
6085 .add(Src0)
6086 .addImm(0)
6087 .add(Src1);
6088 TII->legalizeOperands(*Add);
6089 MI.eraseFromParent();
6090 return BB;
6091 }
6092
6093 const auto *CarryRC = TRI->getWaveMaskRegClass();
6094
6095 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6096 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6097
6098 Register CarryReg = MRI.createVirtualRegister(CarryRC);
6099 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6100
6101 const TargetRegisterClass *Src0RC = Src0.isReg()
6102 ? MRI.getRegClass(Src0.getReg())
6103 : &AMDGPU::VReg_64RegClass;
6104 const TargetRegisterClass *Src1RC = Src1.isReg()
6105 ? MRI.getRegClass(Src1.getReg())
6106 : &AMDGPU::VReg_64RegClass;
6107
6108 const TargetRegisterClass *Src0SubRC =
6109 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6110 const TargetRegisterClass *Src1SubRC =
6111 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6112
6113 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
6114 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6115 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
6116 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6117
6118 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
6119 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6120 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
6121 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6122
6123 unsigned LoOpc =
6124 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6125 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
6126 .addReg(CarryReg, RegState::Define)
6127 .add(SrcReg0Sub0)
6128 .add(SrcReg1Sub0)
6129 .addImm(0); // clamp bit
6130
6131 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6132 MachineInstr *HiHalf =
6133 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
6134 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
6135 .add(SrcReg0Sub1)
6136 .add(SrcReg1Sub1)
6137 .addReg(CarryReg, RegState::Kill)
6138 .addImm(0); // clamp bit
6139
6140 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
6141 .addReg(DestSub0)
6142 .addImm(AMDGPU::sub0)
6143 .addReg(DestSub1)
6144 .addImm(AMDGPU::sub1);
6145 TII->legalizeOperands(*LoHalf);
6146 TII->legalizeOperands(*HiHalf);
6147 MI.eraseFromParent();
6148 return BB;
6149 }
6150 case AMDGPU::S_ADD_CO_PSEUDO:
6151 case AMDGPU::S_SUB_CO_PSEUDO: {
6152 // This pseudo has a chance to be selected
6153 // only from uniform add/subcarry node. All the VGPR operands
6154 // therefore assumed to be splat vectors.
6156 MachineOperand &Dest = MI.getOperand(0);
6157 MachineOperand &CarryDest = MI.getOperand(1);
6158 MachineOperand &Src0 = MI.getOperand(2);
6159 MachineOperand &Src1 = MI.getOperand(3);
6160 MachineOperand &Src2 = MI.getOperand(4);
6161 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
6162 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6163 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6164 .addReg(Src0.getReg());
6165 Src0.setReg(RegOp0);
6166 }
6167 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
6168 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6169 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6170 .addReg(Src1.getReg());
6171 Src1.setReg(RegOp1);
6172 }
6173 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6174 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
6175 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6176 .addReg(Src2.getReg());
6177 Src2.setReg(RegOp2);
6178 }
6179
6180 if (ST.isWave64()) {
6181 if (ST.hasScalarCompareEq64()) {
6182 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
6183 .addReg(Src2.getReg())
6184 .addImm(0);
6185 } else {
6186 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
6187 const TargetRegisterClass *SubRC =
6188 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6189 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
6190 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6191 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
6192 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6193 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6194
6195 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
6196 .add(Src2Sub0)
6197 .add(Src2Sub1);
6198
6199 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6200 .addReg(Src2_32, RegState::Kill)
6201 .addImm(0);
6202 }
6203 } else {
6204 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6205 .addReg(Src2.getReg())
6206 .addImm(0);
6207 }
6208
6209 unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
6210 ? AMDGPU::S_ADDC_U32
6211 : AMDGPU::S_SUBB_U32;
6212
6213 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
6214
6215 unsigned SelOpc =
6216 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6217
6218 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
6219 .addImm(-1)
6220 .addImm(0);
6221
6222 MI.eraseFromParent();
6223 return BB;
6224 }
6225 case AMDGPU::SI_INIT_M0: {
6226 MachineOperand &M0Init = MI.getOperand(0);
6227 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6228 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6229 AMDGPU::M0)
6230 .add(M0Init);
6231 MI.eraseFromParent();
6232 return BB;
6233 }
6234 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6235 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6236 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6237 TII->get(AMDGPU::S_CMP_EQ_U32))
6238 .addImm(0)
6239 .addImm(0);
6240 return BB;
6241 }
6242 case AMDGPU::GET_GROUPSTATICSIZE: {
6243 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
6244 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
6245 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
6246 .add(MI.getOperand(0))
6247 .addImm(MFI->getLDSSize());
6248 MI.eraseFromParent();
6249 return BB;
6250 }
6251 case AMDGPU::GET_SHADERCYCLESHILO: {
6253 // The algorithm is:
6254 //
6255 // hi1 = getreg(SHADER_CYCLES_HI)
6256 // lo1 = getreg(SHADER_CYCLES_LO)
6257 // hi2 = getreg(SHADER_CYCLES_HI)
6258 //
6259 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
6260 // Otherwise there was overflow and the result is hi2:0. In both cases the
6261 // result should represent the actual time at some point during the sequence
6262 // of three getregs.
6263 using namespace AMDGPU::Hwreg;
6264 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6265 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
6266 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6267 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6268 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
6269 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6270 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6271 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
6272 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6273 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
6274 .addReg(RegHi1)
6275 .addReg(RegHi2);
6276 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6277 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
6278 .addReg(RegLo1)
6279 .addImm(0);
6280 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
6281 .add(MI.getOperand(0))
6282 .addReg(RegLo)
6283 .addImm(AMDGPU::sub0)
6284 .addReg(RegHi2)
6285 .addImm(AMDGPU::sub1);
6286 MI.eraseFromParent();
6287 return BB;
6288 }
6289 case AMDGPU::SI_INDIRECT_SRC_V1:
6290 case AMDGPU::SI_INDIRECT_SRC_V2:
6291 case AMDGPU::SI_INDIRECT_SRC_V4:
6292 case AMDGPU::SI_INDIRECT_SRC_V8:
6293 case AMDGPU::SI_INDIRECT_SRC_V9:
6294 case AMDGPU::SI_INDIRECT_SRC_V10:
6295 case AMDGPU::SI_INDIRECT_SRC_V11:
6296 case AMDGPU::SI_INDIRECT_SRC_V12:
6297 case AMDGPU::SI_INDIRECT_SRC_V16:
6298 case AMDGPU::SI_INDIRECT_SRC_V32:
6299 return emitIndirectSrc(MI, *BB, *getSubtarget());
6300 case AMDGPU::SI_INDIRECT_DST_V1:
6301 case AMDGPU::SI_INDIRECT_DST_V2:
6302 case AMDGPU::SI_INDIRECT_DST_V4:
6303 case AMDGPU::SI_INDIRECT_DST_V8:
6304 case AMDGPU::SI_INDIRECT_DST_V9:
6305 case AMDGPU::SI_INDIRECT_DST_V10:
6306 case AMDGPU::SI_INDIRECT_DST_V11:
6307 case AMDGPU::SI_INDIRECT_DST_V12:
6308 case AMDGPU::SI_INDIRECT_DST_V16:
6309 case AMDGPU::SI_INDIRECT_DST_V32:
6310 return emitIndirectDst(MI, *BB, *getSubtarget());
6311 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6312 case AMDGPU::SI_KILL_I1_PSEUDO:
6313 return splitKillBlock(MI, BB);
6314 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6315 Register Dst = MI.getOperand(0).getReg();
6316 const MachineOperand &Src0 = MI.getOperand(1);
6317 const MachineOperand &Src1 = MI.getOperand(2);
6318 Register SrcCond = MI.getOperand(3).getReg();
6319
6320 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6321 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6322 const auto *CondRC = TRI->getWaveMaskRegClass();
6323 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
6324
6325 const TargetRegisterClass *Src0RC = Src0.isReg()
6326 ? MRI.getRegClass(Src0.getReg())
6327 : &AMDGPU::VReg_64RegClass;
6328 const TargetRegisterClass *Src1RC = Src1.isReg()
6329 ? MRI.getRegClass(Src1.getReg())
6330 : &AMDGPU::VReg_64RegClass;
6331
6332 const TargetRegisterClass *Src0SubRC =
6333 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6334 const TargetRegisterClass *Src1SubRC =
6335 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6336
6337 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
6338 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6339 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
6340 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6341
6342 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
6343 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6344 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
6345 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6346
6347 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
6348 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
6349 .addImm(0)
6350 .add(Src0Sub0)
6351 .addImm(0)
6352 .add(Src1Sub0)
6353 .addReg(SrcCondCopy);
6354 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
6355 .addImm(0)
6356 .add(Src0Sub1)
6357 .addImm(0)
6358 .add(Src1Sub1)
6359 .addReg(SrcCondCopy);
6360
6361 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
6362 .addReg(DstLo)
6363 .addImm(AMDGPU::sub0)
6364 .addReg(DstHi)
6365 .addImm(AMDGPU::sub1);
6366 MI.eraseFromParent();
6367 return BB;
6368 }
6369 case AMDGPU::SI_BR_UNDEF: {
6370 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
6371 .add(MI.getOperand(0));
6372 Br->getOperand(1).setIsUndef(); // read undef SCC
6373 MI.eraseFromParent();
6374 return BB;
6375 }
6376 case AMDGPU::ADJCALLSTACKUP:
6377 case AMDGPU::ADJCALLSTACKDOWN: {
6379 MachineInstrBuilder MIB(*MF, &MI);
6380 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
6381 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
6382 return BB;
6383 }
6384 case AMDGPU::SI_CALL_ISEL: {
6385 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
6386
6388 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6389
6390 for (const MachineOperand &MO : MI.operands())
6391 MIB.add(MO);
6392
6393 MIB.cloneMemRefs(MI);
6394 MI.eraseFromParent();
6395 return BB;
6396 }
6397 case AMDGPU::V_ADD_CO_U32_e32:
6398 case AMDGPU::V_SUB_CO_U32_e32:
6399 case AMDGPU::V_SUBREV_CO_U32_e32: {
6400 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
6401 unsigned Opc = MI.getOpcode();
6402
6403 bool NeedClampOperand = false;
6404 if (TII->pseudoToMCOpcode(Opc) == -1) {
6406 NeedClampOperand = true;
6407 }
6408
6409 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
6410 if (TII->isVOP3(*I)) {
6411 I.addReg(TRI->getVCC(), RegState::Define);
6412 }
6413 I.add(MI.getOperand(1)).add(MI.getOperand(2));
6414 if (NeedClampOperand)
6415 I.addImm(0); // clamp bit for e64 encoding
6416
6417 TII->legalizeOperands(*I);
6418
6419 MI.eraseFromParent();
6420 return BB;
6421 }
6422 case AMDGPU::V_ADDC_U32_e32:
6423 case AMDGPU::V_SUBB_U32_e32:
6424 case AMDGPU::V_SUBBREV_U32_e32:
6425 // These instructions have an implicit use of vcc which counts towards the
6426 // constant bus limit.
6427 TII->legalizeOperands(MI);
6428 return BB;
6429 case AMDGPU::DS_GWS_INIT:
6430 case AMDGPU::DS_GWS_SEMA_BR:
6431 case AMDGPU::DS_GWS_BARRIER:
6432 case AMDGPU::DS_GWS_SEMA_V:
6433 case AMDGPU::DS_GWS_SEMA_P:
6434 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6435 // A s_waitcnt 0 is required to be the instruction immediately following.
6436 if (getSubtarget()->hasGWSAutoReplay()) {
6438 return BB;
6439 }
6440
6441 return emitGWSMemViolTestLoop(MI, BB);
6442 case AMDGPU::S_SETREG_B32: {
6443 // Try to optimize cases that only set the denormal mode or rounding mode.
6444 //
6445 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
6446 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
6447 // instead.
6448 //
6449 // FIXME: This could be predicates on the immediate, but tablegen doesn't
6450 // allow you to have a no side effect instruction in the output of a
6451 // sideeffecting pattern.
6452 auto [ID, Offset, Width] =
6453 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
6455 return BB;
6456
6457 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
6458 const unsigned SetMask = WidthMask << Offset;
6459
6460 if (getSubtarget()->hasDenormModeInst()) {
6461 unsigned SetDenormOp = 0;
6462 unsigned SetRoundOp = 0;
6463
6464 // The dedicated instructions can only set the whole denorm or round mode
6465 // at once, not a subset of bits in either.
6466 if (SetMask ==
6468 // If this fully sets both the round and denorm mode, emit the two
6469 // dedicated instructions for these.
6470 SetRoundOp = AMDGPU::S_ROUND_MODE;
6471 SetDenormOp = AMDGPU::S_DENORM_MODE;
6472 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
6473 SetRoundOp = AMDGPU::S_ROUND_MODE;
6474 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
6475 SetDenormOp = AMDGPU::S_DENORM_MODE;
6476 }
6477
6478 if (SetRoundOp || SetDenormOp) {
6479 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
6480 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6481 unsigned ImmVal = Def->getOperand(1).getImm();
6482 if (SetRoundOp) {
6483 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
6484 .addImm(ImmVal & 0xf);
6485
6486 // If we also have the denorm mode, get just the denorm mode bits.
6487 ImmVal >>= 4;
6488 }
6489
6490 if (SetDenormOp) {
6491 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
6492 .addImm(ImmVal & 0xf);
6493 }
6494
6495 MI.eraseFromParent();
6496 return BB;
6497 }
6498 }
6499 }
6500
6501 // If only FP bits are touched, used the no side effects pseudo.
6502 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
6503 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
6504 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
6505
6506 return BB;
6507 }
6508 case AMDGPU::S_INVERSE_BALLOT_U32:
6509 case AMDGPU::S_INVERSE_BALLOT_U64:
6510 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
6511 // necessary. After that they are equivalent to a COPY.
6512 MI.setDesc(TII->get(AMDGPU::COPY));
6513 return BB;
6514 case AMDGPU::ENDPGM_TRAP: {
6515 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
6516 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
6517 MI.addOperand(MachineOperand::CreateImm(0));
6518 return BB;
6519 }
6520
6521 // We need a block split to make the real endpgm a terminator. We also don't
6522 // want to break phis in successor blocks, so we can't just delete to the
6523 // end of the block.
6524
6525 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
6527 MF->push_back(TrapBB);
6528 // clang-format off
6529 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
6530 .addImm(0);
6531 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
6532 .addMBB(TrapBB);
6533 // clang-format on
6534
6535 BB->addSuccessor(TrapBB);
6536 MI.eraseFromParent();
6537 return SplitBB;
6538 }
6539 case AMDGPU::SIMULATED_TRAP: {
6540 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6541 MachineBasicBlock *SplitBB =
6542 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
6543 MI.eraseFromParent();
6544 return SplitBB;
6545 }
6546 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6547 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6549
6550 // During ISel, it's difficult to propagate the original EXEC mask to use as
6551 // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
6552 MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
6553 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6554 Register OriginalExec = Setup->getOperand(0).getReg();
6555 MF->getRegInfo().clearKillFlags(OriginalExec);
6556 MI.getOperand(0).setReg(OriginalExec);
6557 return BB;
6558 }
6559 default:
6560 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
6561 if (!MI.mayStore())
6563 return BB;
6564 }
6566 }
6567}
6568
6570 // This currently forces unfolding various combinations of fsub into fma with
6571 // free fneg'd operands. As long as we have fast FMA (controlled by
6572 // isFMAFasterThanFMulAndFAdd), we should perform these.
6573
6574 // When fma is quarter rate, for f64 where add / sub are at best half rate,
6575 // most of these combines appear to be cycle neutral but save on instruction
6576 // count / code size.
6577 return true;
6578}
6579
6581
6583 EVT VT) const {
6584 if (!VT.isVector()) {
6585 return MVT::i1;
6586 }
6587 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
6588}
6589
6591 // TODO: Should i16 be used always if legal? For now it would force VALU
6592 // shifts.
6593 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6594}
6595
6597 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6598 ? Ty.changeElementSize(16)
6599 : Ty.changeElementSize(32);
6600}
6601
6602// Answering this is somewhat tricky and depends on the specific device which
6603// have different rates for fma or all f64 operations.
6604//
6605// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
6606// regardless of which device (although the number of cycles differs between
6607// devices), so it is always profitable for f64.
6608//
6609// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
6610// only on full rate devices. Normally, we should prefer selecting v_mad_f32
6611// which we can always do even without fused FP ops since it returns the same
6612// result as the separate operations and since it is always full
6613// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
6614// however does not support denormals, so we do report fma as faster if we have
6615// a fast fma device and require denormals.
6616//
6618 EVT VT) const {
6619 VT = VT.getScalarType();
6620
6621 switch (VT.getSimpleVT().SimpleTy) {
6622 case MVT::f32: {
6623 // If mad is not available this depends only on if f32 fma is full rate.
6624 if (!Subtarget->hasMadMacF32Insts())
6625 return Subtarget->hasFastFMAF32();
6626
6627 // Otherwise f32 mad is always full rate and returns the same result as
6628 // the separate operations so should be preferred over fma.
6629 // However does not support denormals.
6631 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6632
6633 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
6634 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6635 }
6636 case MVT::f64:
6637 return true;
6638 case MVT::f16:
6639 case MVT::bf16:
6640 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
6641 default:
6642 break;
6643 }
6644
6645 return false;
6646}
6647
6649 LLT Ty) const {
6650 switch (Ty.getScalarSizeInBits()) {
6651 case 16:
6652 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
6653 case 32:
6654 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
6655 case 64:
6656 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
6657 default:
6658 break;
6659 }
6660
6661 return false;
6662}
6663
6665 if (!Ty.isScalar())
6666 return false;
6667
6668 if (Ty.getScalarSizeInBits() == 16)
6669 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
6670 if (Ty.getScalarSizeInBits() == 32)
6671 return Subtarget->hasMadMacF32Insts() &&
6672 denormalModeIsFlushAllF32(*MI.getMF());
6673
6674 return false;
6675}
6676
6678 const SDNode *N) const {
6679 // TODO: Check future ftz flag
6680 // v_mad_f32/v_mac_f32 do not support denormals.
6681 EVT VT = N->getValueType(0);
6682 if (VT == MVT::f32)
6683 return Subtarget->hasMadMacF32Insts() &&
6685 if (VT == MVT::f16) {
6686 return Subtarget->hasMadF16() &&
6688 }
6689
6690 return false;
6691}
6692
6693//===----------------------------------------------------------------------===//
6694// Custom DAG Lowering Operations
6695//===----------------------------------------------------------------------===//
6696
6697// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6698// wider vector type is legal.
6700 SelectionDAG &DAG) const {
6701 unsigned Opc = Op.getOpcode();
6702 EVT VT = Op.getValueType();
6703 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6704 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6705 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6706 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6707 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6708 VT == MVT::v32bf16);
6709
6710 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
6711
6712 SDLoc SL(Op);
6713 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
6714 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
6715
6716 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6717}
6718
6719// Enable lowering of ROTR for vxi32 types. This is a workaround for a
6720// regression whereby extra unnecessary instructions were added to codegen
6721// for rotr operations, casued by legalising v2i32 or. This resulted in extra
6722// instructions to extract the result from the vector.
6724 [[maybe_unused]] EVT VT = Op.getValueType();
6725
6726 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6727 VT == MVT::v16i32) &&
6728 "Unexpected ValueType.");
6729
6730 return DAG.UnrollVectorOp(Op.getNode());
6731}
6732
6733// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6734// wider vector type is legal.
6736 SelectionDAG &DAG) const {
6737 unsigned Opc = Op.getOpcode();
6738 EVT VT = Op.getValueType();
6739 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6740 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6741 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6742 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6743 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6744 VT == MVT::v32bf16);
6745
6746 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
6747 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6748
6749 SDLoc SL(Op);
6750
6751 SDValue OpLo =
6752 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
6753 SDValue OpHi =
6754 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
6755
6756 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6757}
6758
6760 SelectionDAG &DAG) const {
6761 unsigned Opc = Op.getOpcode();
6762 EVT VT = Op.getValueType();
6763 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6764 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6765 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6766 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6767 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6768 VT == MVT::v32bf16);
6769
6770 SDValue Op0 = Op.getOperand(0);
6771 auto [Lo0, Hi0] = Op0.getValueType().isVector()
6772 ? DAG.SplitVectorOperand(Op.getNode(), 0)
6773 : std::pair(Op0, Op0);
6774
6775 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6776 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
6777
6778 SDLoc SL(Op);
6779 auto ResVT = DAG.GetSplitDestVTs(VT);
6780
6781 SDValue OpLo =
6782 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
6783 SDValue OpHi =
6784 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
6785
6786 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6787}
6788
6790 switch (Op.getOpcode()) {
6791 default:
6793 case ISD::BRCOND:
6794 return LowerBRCOND(Op, DAG);
6795 case ISD::RETURNADDR:
6796 return LowerRETURNADDR(Op, DAG);
6797 case ISD::LOAD: {
6798 SDValue Result = LowerLOAD(Op, DAG);
6799 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6800 "Load should return a value and a chain");
6801 return Result;
6802 }
6803 case ISD::FSQRT: {
6804 EVT VT = Op.getValueType();
6805 if (VT == MVT::f32)
6806 return lowerFSQRTF32(Op, DAG);
6807 if (VT == MVT::f64)
6808 return lowerFSQRTF64(Op, DAG);
6809 return SDValue();
6810 }
6811 case ISD::FSIN:
6812 case ISD::FCOS:
6813 return LowerTrig(Op, DAG);
6814 case ISD::SELECT:
6815 return LowerSELECT(Op, DAG);
6816 case ISD::FDIV:
6817 return LowerFDIV(Op, DAG);
6818 case ISD::FFREXP:
6819 return LowerFFREXP(Op, DAG);
6820 case ISD::ATOMIC_CMP_SWAP:
6821 return LowerATOMIC_CMP_SWAP(Op, DAG);
6822 case ISD::STORE:
6823 return LowerSTORE(Op, DAG);
6824 case ISD::GlobalAddress: {
6827 return LowerGlobalAddress(MFI, Op, DAG);
6828 }
6830 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6832 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6834 return LowerINTRINSIC_VOID(Op, DAG);
6835 case ISD::ADDRSPACECAST:
6836 return lowerADDRSPACECAST(Op, DAG);
6838 return lowerINSERT_SUBVECTOR(Op, DAG);
6840 return lowerINSERT_VECTOR_ELT(Op, DAG);
6842 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
6844 return lowerVECTOR_SHUFFLE(Op, DAG);
6846 return lowerSCALAR_TO_VECTOR(Op, DAG);
6847 case ISD::BUILD_VECTOR:
6848 return lowerBUILD_VECTOR(Op, DAG);
6849 case ISD::FP_ROUND:
6851 return lowerFP_ROUND(Op, DAG);
6852 case ISD::TRAP:
6853 return lowerTRAP(Op, DAG);
6854 case ISD::DEBUGTRAP:
6855 return lowerDEBUGTRAP(Op, DAG);
6856 case ISD::ABS:
6857 case ISD::FABS:
6858 case ISD::FNEG:
6859 case ISD::FCANONICALIZE:
6860 case ISD::BSWAP:
6861 return splitUnaryVectorOp(Op, DAG);
6862 case ISD::FMINNUM:
6863 case ISD::FMAXNUM:
6864 return lowerFMINNUM_FMAXNUM(Op, DAG);
6865 case ISD::FMINIMUMNUM:
6866 case ISD::FMAXIMUMNUM:
6867 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
6868 case ISD::FMINIMUM:
6869 case ISD::FMAXIMUM:
6870 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
6871 case ISD::FLDEXP:
6872 case ISD::STRICT_FLDEXP:
6873 return lowerFLDEXP(Op, DAG);
6874 case ISD::FMA:
6875 return splitTernaryVectorOp(Op, DAG);
6876 case ISD::FP_TO_SINT:
6877 case ISD::FP_TO_UINT:
6878 return LowerFP_TO_INT(Op, DAG);
6879 case ISD::SHL:
6880 case ISD::SRA:
6881 case ISD::SRL:
6882 case ISD::ADD:
6883 case ISD::SUB:
6884 case ISD::SMIN:
6885 case ISD::SMAX:
6886 case ISD::UMIN:
6887 case ISD::UMAX:
6888 case ISD::FADD:
6889 case ISD::FMUL:
6890 case ISD::FMINNUM_IEEE:
6891 case ISD::FMAXNUM_IEEE:
6892 case ISD::UADDSAT:
6893 case ISD::USUBSAT:
6894 case ISD::SADDSAT:
6895 case ISD::SSUBSAT:
6896 return splitBinaryVectorOp(Op, DAG);
6897 case ISD::FCOPYSIGN:
6898 return lowerFCOPYSIGN(Op, DAG);
6899 case ISD::MUL:
6900 return lowerMUL(Op, DAG);
6901 case ISD::SMULO:
6902 case ISD::UMULO:
6903 return lowerXMULO(Op, DAG);
6904 case ISD::SMUL_LOHI:
6905 case ISD::UMUL_LOHI:
6906 return lowerXMUL_LOHI(Op, DAG);
6907 case ISD::DYNAMIC_STACKALLOC:
6908 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6909 case ISD::STACKSAVE:
6910 return LowerSTACKSAVE(Op, DAG);
6911 case ISD::GET_ROUNDING:
6912 return lowerGET_ROUNDING(Op, DAG);
6913 case ISD::SET_ROUNDING:
6914 return lowerSET_ROUNDING(Op, DAG);
6915 case ISD::PREFETCH:
6916 return lowerPREFETCH(Op, DAG);
6917 case ISD::FP_EXTEND:
6919 return lowerFP_EXTEND(Op, DAG);
6920 case ISD::GET_FPENV:
6921 return lowerGET_FPENV(Op, DAG);
6922 case ISD::SET_FPENV:
6923 return lowerSET_FPENV(Op, DAG);
6924 case ISD::ROTR:
6925 return lowerROTR(Op, DAG);
6926 }
6927 return SDValue();
6928}
6929
6930// Used for D16: Casts the result of an instruction into the right vector,
6931// packs values if loads return unpacked values.
6933 const SDLoc &DL, SelectionDAG &DAG,
6934 bool Unpacked) {
6935 if (!LoadVT.isVector())
6936 return Result;
6937
6938 // Cast back to the original packed type or to a larger type that is a
6939 // multiple of 32 bit for D16. Widening the return type is a required for
6940 // legalization.
6941 EVT FittingLoadVT = LoadVT;
6942 if ((LoadVT.getVectorNumElements() % 2) == 1) {
6943 FittingLoadVT =
6945 LoadVT.getVectorNumElements() + 1);
6946 }
6947
6948 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6949 // Truncate to v2i16/v4i16.
6950 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6951
6952 // Workaround legalizer not scalarizing truncate after vector op
6953 // legalization but not creating intermediate vector trunc.
6955 DAG.ExtractVectorElements(Result, Elts);
6956 for (SDValue &Elt : Elts)
6957 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6958
6959 // Pad illegal v1i16/v3fi6 to v4i16
6960 if ((LoadVT.getVectorNumElements() % 2) == 1)
6961 Elts.push_back(DAG.getPOISON(MVT::i16));
6962
6963 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6964
6965 // Bitcast to original type (v2f16/v4f16).
6966 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6967 }
6968
6969 // Cast back to the original packed type.
6970 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6971}
6972
6973SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6974 SelectionDAG &DAG,
6976 bool IsIntrinsic) const {
6977 SDLoc DL(M);
6978
6979 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6980 EVT LoadVT = M->getValueType(0);
6981
6982 EVT EquivLoadVT = LoadVT;
6983 if (LoadVT.isVector()) {
6984 if (Unpacked) {
6985 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6986 LoadVT.getVectorNumElements());
6987 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
6988 // Widen v3f16 to legal type
6989 EquivLoadVT =
6991 LoadVT.getVectorNumElements() + 1);
6992 }
6993 }
6994
6995 // Change from v4f16/v2f16 to EquivLoadVT.
6996 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
6997
6999 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
7000 M->getMemoryVT(), M->getMemOperand());
7001
7002 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
7003
7004 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
7005}
7006
7007SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
7008 SelectionDAG &DAG,
7009 ArrayRef<SDValue> Ops) const {
7010 SDLoc DL(M);
7011 EVT LoadVT = M->getValueType(0);
7012 EVT EltType = LoadVT.getScalarType();
7013 EVT IntVT = LoadVT.changeTypeToInteger();
7014
7015 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
7016
7017 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
7018 bool IsTFE = M->getNumValues() == 3;
7019
7020 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
7021 : AMDGPUISD::BUFFER_LOAD_FORMAT)
7022 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
7023 : AMDGPUISD::BUFFER_LOAD;
7024
7025 if (IsD16) {
7026 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
7027 }
7028
7029 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
7030 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
7031 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
7032 IsTFE);
7033
7034 if (isTypeLegal(LoadVT)) {
7035 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
7036 M->getMemOperand(), DAG);
7037 }
7038
7039 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
7040 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
7041 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
7042 M->getMemOperand(), DAG);
7043 return DAG.getMergeValues(
7044 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
7045 DL);
7046}
7047
7049 SelectionDAG &DAG) {
7050 EVT VT = N->getValueType(0);
7051 unsigned CondCode = N->getConstantOperandVal(3);
7052 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
7053 return DAG.getPOISON(VT);
7054
7055 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
7056
7057 SDValue LHS = N->getOperand(1);
7058 SDValue RHS = N->getOperand(2);
7059
7060 SDLoc DL(N);
7061
7062 EVT CmpVT = LHS.getValueType();
7063 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
7064 unsigned PromoteOp =
7066 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
7067 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
7068 }
7069
7070 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
7071
7072 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7073 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7074
7075 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
7076 DAG.getCondCode(CCOpcode));
7077 if (VT.bitsEq(CCVT))
7078 return SetCC;
7079 return DAG.getZExtOrTrunc(SetCC, DL, VT);
7080}
7081
7083 SelectionDAG &DAG) {
7084 EVT VT = N->getValueType(0);
7085
7086 unsigned CondCode = N->getConstantOperandVal(3);
7087 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
7088 return DAG.getPOISON(VT);
7089
7090 SDValue Src0 = N->getOperand(1);
7091 SDValue Src1 = N->getOperand(2);
7092 EVT CmpVT = Src0.getValueType();
7093 SDLoc SL(N);
7094
7095 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
7096 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7097 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7098 }
7099
7100 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
7101 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
7102 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7103 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7104 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7105 DAG.getCondCode(CCOpcode));
7106 if (VT.bitsEq(CCVT))
7107 return SetCC;
7108 return DAG.getZExtOrTrunc(SetCC, SL, VT);
7109}
7110
7112 SelectionDAG &DAG) {
7113 EVT VT = N->getValueType(0);
7114 SDValue Src = N->getOperand(1);
7115 SDLoc SL(N);
7116
7117 if (Src.getOpcode() == ISD::SETCC) {
7118 SDValue Op0 = Src.getOperand(0);
7119 SDValue Op1 = Src.getOperand(1);
7120 // Need to expand bfloat to float for comparison (setcc).
7121 if (Op0.getValueType() == MVT::bf16) {
7122 Op0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0);
7123 Op1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1);
7124 }
7125 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
7126 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Op0, Op1, Src.getOperand(2));
7127 }
7128 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
7129 // (ballot 0) -> 0
7130 if (Arg->isZero())
7131 return DAG.getConstant(0, SL, VT);
7132
7133 // (ballot 1) -> EXEC/EXEC_LO
7134 if (Arg->isOne()) {
7135 Register Exec;
7136 if (VT.getScalarSizeInBits() == 32)
7137 Exec = AMDGPU::EXEC_LO;
7138 else if (VT.getScalarSizeInBits() == 64)
7139 Exec = AMDGPU::EXEC;
7140 else
7141 return SDValue();
7142
7143 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
7144 }
7145 }
7146
7147 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
7148 // ISD::SETNE)
7149 return DAG.getNode(
7150 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
7151 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
7152}
7153
7155 SelectionDAG &DAG) {
7156 EVT VT = N->getValueType(0);
7157 unsigned ValSize = VT.getSizeInBits();
7158 unsigned IID = N->getConstantOperandVal(0);
7159 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7160 IID == Intrinsic::amdgcn_permlanex16;
7161 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7162 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7163 SDLoc SL(N);
7164 MVT IntVT = MVT::getIntegerVT(ValSize);
7165 const GCNSubtarget *ST = TLI.getSubtarget();
7166 unsigned SplitSize = 32;
7167 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7168 ST->hasDPALU_DPP() &&
7169 AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
7170 SplitSize = 64;
7171
7172 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
7173 SDValue Src2, MVT ValT) -> SDValue {
7174 SmallVector<SDValue, 8> Operands;
7175 switch (IID) {
7176 case Intrinsic::amdgcn_permlane16:
7177 case Intrinsic::amdgcn_permlanex16:
7178 case Intrinsic::amdgcn_update_dpp:
7179 Operands.push_back(N->getOperand(6));
7180 Operands.push_back(N->getOperand(5));
7181 Operands.push_back(N->getOperand(4));
7182 [[fallthrough]];
7183 case Intrinsic::amdgcn_writelane:
7184 Operands.push_back(Src2);
7185 [[fallthrough]];
7186 case Intrinsic::amdgcn_readlane:
7187 case Intrinsic::amdgcn_set_inactive:
7188 case Intrinsic::amdgcn_set_inactive_chain_arg:
7189 case Intrinsic::amdgcn_mov_dpp8:
7190 Operands.push_back(Src1);
7191 [[fallthrough]];
7192 case Intrinsic::amdgcn_readfirstlane:
7193 case Intrinsic::amdgcn_permlane64:
7194 Operands.push_back(Src0);
7195 break;
7196 default:
7197 llvm_unreachable("unhandled lane op");
7198 }
7199
7200 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
7201 std::reverse(Operands.begin(), Operands.end());
7202
7203 if (SDNode *GL = N->getGluedNode()) {
7204 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7205 GL = GL->getOperand(0).getNode();
7206 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7207 SDValue(GL, 0)));
7208 }
7209
7210 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
7211 };
7212
7213 SDValue Src0 = N->getOperand(1);
7214 SDValue Src1, Src2;
7215 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7216 IID == Intrinsic::amdgcn_mov_dpp8 ||
7217 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7218 Src1 = N->getOperand(2);
7219 if (IID == Intrinsic::amdgcn_writelane ||
7220 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7221 Src2 = N->getOperand(3);
7222 }
7223
7224 if (ValSize == SplitSize) {
7225 // Already legal
7226 return SDValue();
7227 }
7228
7229 if (ValSize < 32) {
7230 bool IsFloat = VT.isFloatingPoint();
7231 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
7232 SL, MVT::i32);
7233
7234 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7235 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
7236 SL, MVT::i32);
7237 }
7238
7239 if (IID == Intrinsic::amdgcn_writelane) {
7240 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
7241 SL, MVT::i32);
7242 }
7243
7244 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7245 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
7246 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
7247 }
7248
7249 if (ValSize % SplitSize != 0)
7250 return SDValue();
7251
7252 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
7253 EVT VT = N->getValueType(0);
7254 unsigned NE = VT.getVectorNumElements();
7255 EVT EltVT = VT.getVectorElementType();
7257 unsigned NumOperands = N->getNumOperands();
7258 SmallVector<SDValue, 4> Operands(NumOperands);
7259 SDNode *GL = N->getGluedNode();
7260
7261 // only handle convergencectrl_glue
7262 assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7263
7264 for (unsigned i = 0; i != NE; ++i) {
7265 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7266 ++j) {
7267 SDValue Operand = N->getOperand(j);
7268 EVT OperandVT = Operand.getValueType();
7269 if (OperandVT.isVector()) {
7270 // A vector operand; extract a single element.
7271 EVT OperandEltVT = OperandVT.getVectorElementType();
7272 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
7273 Operand, DAG.getVectorIdxConstant(i, SL));
7274 } else {
7275 // A scalar operand; just use it as is.
7276 Operands[j] = Operand;
7277 }
7278 }
7279
7280 if (GL)
7281 Operands[NumOperands - 1] =
7282 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7283 SDValue(GL->getOperand(0).getNode(), 0));
7284
7285 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
7286 }
7287
7288 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
7289 return DAG.getBuildVector(VecVT, SL, Scalars);
7290 };
7291
7292 if (VT.isVector()) {
7293 switch (MVT::SimpleValueType EltTy =
7295 case MVT::i32:
7296 case MVT::f32:
7297 if (SplitSize == 32) {
7298 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
7299 return unrollLaneOp(LaneOp.getNode());
7300 }
7301 [[fallthrough]];
7302 case MVT::i16:
7303 case MVT::f16:
7304 case MVT::bf16: {
7305 unsigned SubVecNumElt =
7306 SplitSize / VT.getVectorElementType().getSizeInBits();
7307 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
7309 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7310 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7311 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
7312 DAG.getConstant(EltIdx, SL, MVT::i32));
7313
7314 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7315 IsPermLane16)
7316 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
7317 DAG.getConstant(EltIdx, SL, MVT::i32));
7318
7319 if (IID == Intrinsic::amdgcn_writelane)
7320 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
7321 DAG.getConstant(EltIdx, SL, MVT::i32));
7322
7323 Pieces.push_back(
7324 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7325 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7326 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7327 EltIdx += SubVecNumElt;
7328 }
7329 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
7330 }
7331 default:
7332 // Handle all other cases by bitcasting to i32 vectors
7333 break;
7334 }
7335 }
7336
7337 MVT VecVT =
7338 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
7339 Src0 = DAG.getBitcast(VecVT, Src0);
7340
7341 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7342 Src1 = DAG.getBitcast(VecVT, Src1);
7343
7344 if (IID == Intrinsic::amdgcn_writelane)
7345 Src2 = DAG.getBitcast(VecVT, Src2);
7346
7347 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7348 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
7349 return DAG.getBitcast(VT, UnrolledLaneOp);
7350}
7351
7354 SelectionDAG &DAG) const {
7355 switch (N->getOpcode()) {
7357 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
7358 Results.push_back(Res);
7359 return;
7360 }
7362 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
7363 Results.push_back(Res);
7364 return;
7365 }
7367 unsigned IID = N->getConstantOperandVal(0);
7368 switch (IID) {
7369 case Intrinsic::amdgcn_make_buffer_rsrc:
7370 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
7371 return;
7372 case Intrinsic::amdgcn_cvt_pkrtz: {
7373 SDValue Src0 = N->getOperand(1);
7374 SDValue Src1 = N->getOperand(2);
7375 SDLoc SL(N);
7376 SDValue Cvt =
7377 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
7378 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
7379 return;
7380 }
7381 case Intrinsic::amdgcn_cvt_pknorm_i16:
7382 case Intrinsic::amdgcn_cvt_pknorm_u16:
7383 case Intrinsic::amdgcn_cvt_pk_i16:
7384 case Intrinsic::amdgcn_cvt_pk_u16: {
7385 SDValue Src0 = N->getOperand(1);
7386 SDValue Src1 = N->getOperand(2);
7387 SDLoc SL(N);
7388 unsigned Opcode;
7389
7390 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7391 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
7392 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7393 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
7394 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7395 Opcode = AMDGPUISD::CVT_PK_I16_I32;
7396 else
7397 Opcode = AMDGPUISD::CVT_PK_U16_U32;
7398
7399 EVT VT = N->getValueType(0);
7400 if (isTypeLegal(VT))
7401 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
7402 else {
7403 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
7404 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
7405 }
7406 return;
7407 }
7408 case Intrinsic::amdgcn_s_buffer_load: {
7409 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
7410 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
7411 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
7412 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
7413 // s_buffer_load_i8.
7414 if (!Subtarget->hasScalarSubwordLoads())
7415 return;
7416 SDValue Op = SDValue(N, 0);
7417 SDValue Rsrc = Op.getOperand(1);
7418 SDValue Offset = Op.getOperand(2);
7419 SDValue CachePolicy = Op.getOperand(3);
7420 EVT VT = Op.getValueType();
7421 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
7422 SDLoc DL(Op);
7424 const DataLayout &DataLayout = DAG.getDataLayout();
7425 Align Alignment =
7431 VT.getStoreSize(), Alignment);
7432 SDValue LoadVal;
7433 if (!Offset->isDivergent()) {
7434 SDValue Ops[] = {Rsrc, // source register
7435 Offset, CachePolicy};
7436 SDValue BufferLoad =
7437 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
7438 DAG.getVTList(MVT::i32), Ops, VT, MMO);
7439 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
7440 } else {
7441 SDValue Ops[] = {
7442 DAG.getEntryNode(), // Chain
7443 Rsrc, // rsrc
7444 DAG.getConstant(0, DL, MVT::i32), // vindex
7445 {}, // voffset
7446 {}, // soffset
7447 {}, // offset
7448 CachePolicy, // cachepolicy
7449 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7450 };
7451 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
7452 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
7453 }
7454 Results.push_back(LoadVal);
7455 return;
7456 }
7457 case Intrinsic::amdgcn_dead: {
7458 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
7459 Results.push_back(DAG.getPOISON(N->getValueType(I)));
7460 return;
7461 }
7462 }
7463 break;
7464 }
7466 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
7467 if (Res.getOpcode() == ISD::MERGE_VALUES) {
7468 // FIXME: Hacky
7469 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
7470 Results.push_back(Res.getOperand(I));
7471 }
7472 } else {
7473 Results.push_back(Res);
7474 Results.push_back(Res.getValue(1));
7475 }
7476 return;
7477 }
7478
7479 break;
7480 }
7481 case ISD::SELECT: {
7482 SDLoc SL(N);
7483 EVT VT = N->getValueType(0);
7484 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
7485 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
7486 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
7487
7488 EVT SelectVT = NewVT;
7489 if (NewVT.bitsLT(MVT::i32)) {
7490 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
7491 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
7492 SelectVT = MVT::i32;
7493 }
7494
7495 SDValue NewSelect =
7496 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
7497
7498 if (NewVT != SelectVT)
7499 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
7500 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
7501 return;
7502 }
7503 case ISD::FNEG: {
7504 if (N->getValueType(0) != MVT::v2f16)
7505 break;
7506
7507 SDLoc SL(N);
7508 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7509
7510 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
7511 DAG.getConstant(0x80008000, SL, MVT::i32));
7512 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7513 return;
7514 }
7515 case ISD::FABS: {
7516 if (N->getValueType(0) != MVT::v2f16)
7517 break;
7518
7519 SDLoc SL(N);
7520 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7521
7522 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
7523 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
7524 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7525 return;
7526 }
7527 case ISD::FSQRT: {
7528 if (N->getValueType(0) != MVT::f16)
7529 break;
7530 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
7531 break;
7532 }
7533 default:
7535 break;
7536 }
7537}
7538
7539/// Helper function for LowerBRCOND
7540static SDNode *findUser(SDValue Value, unsigned Opcode) {
7541
7542 for (SDUse &U : Value->uses()) {
7543 if (U.get() != Value)
7544 continue;
7545
7546 if (U.getUser()->getOpcode() == Opcode)
7547 return U.getUser();
7548 }
7549 return nullptr;
7550}
7551
7552unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7553 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
7554 switch (Intr->getConstantOperandVal(1)) {
7555 case Intrinsic::amdgcn_if:
7556 return AMDGPUISD::IF;
7557 case Intrinsic::amdgcn_else:
7558 return AMDGPUISD::ELSE;
7559 case Intrinsic::amdgcn_loop:
7560 return AMDGPUISD::LOOP;
7561 case Intrinsic::amdgcn_end_cf:
7562 llvm_unreachable("should not occur");
7563 default:
7564 return 0;
7565 }
7566 }
7567
7568 // break, if_break, else_break are all only used as inputs to loop, not
7569 // directly as branch conditions.
7570 return 0;
7571}
7572
7579
7581 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7582 return false;
7583
7584 // FIXME: Either avoid relying on address space here or change the default
7585 // address space for functions to avoid the explicit check.
7586 return (GV->getValueType()->isFunctionTy() ||
7589}
7590
7592 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
7593}
7594
7596 if (!GV->hasExternalLinkage())
7597 return true;
7598
7599 const auto OS = getTargetMachine().getTargetTriple().getOS();
7600 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
7601}
7602
/// This transforms the control flow intrinsics to get the branch destination as
/// last parameter, also switches branch target with BR if the need arise
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
  SDLoc DL(BRCOND);

  // Operand 1 is the branch condition; operand 2 is the destination block.
  SDNode *Intr = BRCOND.getOperand(1).getNode();
  SDValue Target = BRCOND.getOperand(2);
  SDNode *BR = nullptr;
  SDNode *SetCC = nullptr;

  // Peel off a possible negation wrapper (setcc or xor-with-true) so Intr
  // points at the underlying intrinsic node.
  switch (Intr->getOpcode()) {
  case ISD::SETCC: {
    // As long as we negate the condition everything is fine
    SetCC = Intr;
    Intr = SetCC->getOperand(0).getNode();
    break;
  }
  case ISD::XOR: {
    // Similar to SETCC, if we have (xor c, -1), we will be fine.
    SDValue LHS = Intr->getOperand(0);
    SDValue RHS = Intr->getOperand(1);
    if (auto *C = dyn_cast<ConstantSDNode>(RHS); C && C->getZExtValue()) {
      Intr = LHS.getNode();
      break;
    }
    [[fallthrough]];
  }
  default: {
    // Get the target from BR if we don't negate the condition
    BR = findUser(BRCOND, ISD::BR);
    assert(BR && "brcond missing unconditional branch user");
    Target = BR->getOperand(1);
  }
  }

  unsigned CFNode = isCFIntrinsic(Intr);
  if (CFNode == 0) {
    // This is a uniform branch so we don't need to legalize.
    return BRCOND;
  }

  bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||

  // If a setcc wrapped the intrinsic, it must be of the (setne x, 1) form —
  // anything else would change the branch polarity.
  assert(!SetCC ||
         (SetCC->getConstantOperandVal(1) == 1 &&
          cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
              ISD::SETNE));

  // operands of the new intrinsic call
  if (HaveChain)
    Ops.push_back(BRCOND.getOperand(0));

  // Copy over the intrinsic's own operands (skipping chain/ID) and append
  // the branch destination as the new last parameter.
  Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
  Ops.push_back(Target);

  ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());

  // build the new intrinsic call
  SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();

  if (!HaveChain) {
    SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};

  }

  if (BR) {
    // Give the branch instruction our target
    SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
    SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
    DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
  }

  // The chain is the last result value of the new node.
  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);

  // Copy the intrinsic results to registers
  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
    SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
    if (!CopyToReg)
      continue;

    Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
                             SDValue(Result, i - 1), SDValue());

    DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
  }

  // Remove the old intrinsic from the chain
  DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
                                Intr->getOperand(0));

  return Chain;
}
7698
7699SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
7700 MVT VT = Op.getSimpleValueType();
7701 SDLoc DL(Op);
7702 // Checking the depth
7703 if (Op.getConstantOperandVal(0) != 0)
7704 return DAG.getConstant(0, DL, VT);
7705
7706 MachineFunction &MF = DAG.getMachineFunction();
7707 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7708 // Check for kernel and shader functions
7709 if (Info->isEntryFunction())
7710 return DAG.getConstant(0, DL, VT);
7711
7712 MachineFrameInfo &MFI = MF.getFrameInfo();
7713 // There is a call to @llvm.returnaddress in this function
7714 MFI.setReturnAddressIsTaken(true);
7715
7716 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
7717 // Get the return address reg and mark it as an implicit live-in
7718 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
7719 getRegClassFor(VT, Op.getNode()->isDivergent()));
7720
7721 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
7722}
7723
7724SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
7725 const SDLoc &DL, EVT VT) const {
7726 return Op.getValueType().bitsLE(VT)
7727 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
7728 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
7729 DAG.getTargetConstant(0, DL, MVT::i32));
7730}
7731
7732SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
7733 SelectionDAG &DAG) const {
7734 EVT DstVT = Op.getValueType();
7735 unsigned NumElts = DstVT.getVectorNumElements();
7736 assert(NumElts > 2 && isPowerOf2_32(NumElts));
7737
7738 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
7739
7740 SDLoc DL(Op);
7741 unsigned Opc = Op.getOpcode();
7742 SDValue Flags = Op.getOperand(1);
7743 EVT HalfDstVT =
7744 EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
7745 SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
7746 SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
7747
7748 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
7749}
7750
7751SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
7752 SDValue Src = Op.getOperand(0);
7753 EVT SrcVT = Src.getValueType();
7754 EVT DstVT = Op.getValueType();
7755
7756 if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
7757 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
7758 if (SrcVT.getScalarType() != MVT::f32)
7759 return SDValue();
7760 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
7761 }
7762
7763 if (SrcVT.getScalarType() != MVT::f64)
7764 return Op;
7765
7766 SDLoc DL(Op);
7767 if (DstVT == MVT::f16) {
7768 // TODO: Handle strictfp
7769 if (Op.getOpcode() != ISD::FP_ROUND)
7770 return Op;
7771
7772 if (!Subtarget->has16BitInsts()) {
7773 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
7774 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7775 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7776 }
7777 if (Op->getFlags().hasApproximateFuncs()) {
7778 SDValue Flags = Op.getOperand(1);
7779 SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
7780 return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
7781 }
7782 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
7783 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7784 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7785 }
7786
7787 assert(DstVT.getScalarType() == MVT::bf16 &&
7788 "custom lower FP_ROUND for f16 or bf16");
7789 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
7790
7791 // Round-inexact-to-odd f64 to f32, then do the final rounding using the
7792 // hardware f32 -> bf16 instruction.
7793 EVT F32VT = SrcVT.isVector() ? SrcVT.changeVectorElementType(MVT::f32) :
7794 MVT::f32;
7795 SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
7796 return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
7797 DAG.getTargetConstant(0, DL, MVT::i32));
7798}
7799
7800SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
7801 SelectionDAG &DAG) const {
7802 EVT VT = Op.getValueType();
7803 const MachineFunction &MF = DAG.getMachineFunction();
7804 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7805 bool IsIEEEMode = Info->getMode().IEEE;
7806
7807 // FIXME: Assert during selection that this is only selected for
7808 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
7809 // mode functions, but this happens to be OK since it's only done in cases
7810 // where there is known no sNaN.
7811 if (IsIEEEMode)
7812 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
7813
7814 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7815 VT == MVT::v16bf16)
7816 return splitBinaryVectorOp(Op, DAG);
7817 return Op;
7818}
7819
7820SDValue
7821SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
7822 SelectionDAG &DAG) const {
7823 EVT VT = Op.getValueType();
7824 const MachineFunction &MF = DAG.getMachineFunction();
7825 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7826 bool IsIEEEMode = Info->getMode().IEEE;
7827
7828 if (IsIEEEMode)
7829 return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
7830
7831 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7832 VT == MVT::v16bf16)
7833 return splitBinaryVectorOp(Op, DAG);
7834 return Op;
7835}
7836
7837SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
7838 SelectionDAG &DAG) const {
7839 EVT VT = Op.getValueType();
7840 if (VT.isVector())
7841 return splitBinaryVectorOp(Op, DAG);
7842
7843 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7844 !Subtarget->hasMinimum3Maximum3F16() &&
7845 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7846 "should not need to widen f16 minimum/maximum to v2f16");
7847
7848 // Widen f16 operation to v2f16
7849
7850 // fminimum f16:x, f16:y ->
7851 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
7852 // (v2f16 (scalar_to_vector y))), 0
7853 SDLoc SL(Op);
7854 SDValue WideSrc0 =
7855 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
7856 SDValue WideSrc1 =
7857 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
7858
7859 SDValue Widened =
7860 DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7861
7862 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
7863 DAG.getConstant(0, SL, MVT::i32));
7864}
7865
7866SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
7867 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
7868 EVT VT = Op.getValueType();
7869 assert(VT == MVT::f16);
7870
7871 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
7872 EVT ExpVT = Exp.getValueType();
7873 if (ExpVT == MVT::i16)
7874 return Op;
7875
7876 SDLoc DL(Op);
7877
7878 // Correct the exponent type for f16 to i16.
7879 // Clamp the range of the exponent to the instruction's range.
7880
7881 // TODO: This should be a generic narrowing legalization, and can easily be
7882 // for GlobalISel.
7883
7884 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
7885 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
7886
7887 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
7888 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
7889
7890 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
7891
7892 if (IsStrict) {
7893 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
7894 {Op.getOperand(0), Op.getOperand(1), TruncExp});
7895 }
7896
7897 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
7898}
7899
  // Select the operand-extension opcode to use when this narrow operation is
  // promoted to i32 (see promoteUniformOpToI32): sign-sensitive ops
  // sign-extend, unsigned ops zero-extend, and ops whose low result bits are
  // unaffected by garbage high bits may any-extend.
  switch (Op->getOpcode()) {
  case ISD::SRA:
  case ISD::SMIN:
  case ISD::SMAX:
    return ISD::SIGN_EXTEND;
  case ISD::SRL:
  case ISD::UMIN:
  case ISD::UMAX:
    return ISD::ZERO_EXTEND;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::SHL:
  case ISD::SELECT:
  case ISD::MUL:
    // operation result won't be influenced by garbage high bits.
    // TODO: are all of those cases correct, and are there more?
    return ISD::ANY_EXTEND;
  case ISD::SETCC: {
    // NOTE(review): presumably the extension follows the condition code's
    // signedness — confirm against the full source.
    ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
  }
  default:
    llvm_unreachable("unexpected opcode!");
  }
}
7929
7930SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
7931 DAGCombinerInfo &DCI) const {
7932 const unsigned Opc = Op.getOpcode();
7933 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
7934 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
7935 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
7936 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
7937 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
7938
7939 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
7940 : Op->getOperand(0).getValueType();
7941 auto ExtTy = OpTy.changeElementType(MVT::i32);
7942
7943 if (DCI.isBeforeLegalizeOps() ||
7944 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
7945 return SDValue();
7946
7947 auto &DAG = DCI.DAG;
7948
7949 SDLoc DL(Op);
7950 SDValue LHS;
7951 SDValue RHS;
7952 if (Opc == ISD::SELECT) {
7953 LHS = Op->getOperand(1);
7954 RHS = Op->getOperand(2);
7955 } else {
7956 LHS = Op->getOperand(0);
7957 RHS = Op->getOperand(1);
7958 }
7959
7960 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
7961 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
7962
7963 // Special case: for shifts, the RHS always needs a zext.
7964 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
7965 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
7966 else
7967 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
7968
7969 // setcc always return i1/i1 vec so no need to truncate after.
7970 if (Opc == ISD::SETCC) {
7971 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7972 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
7973 }
7974
7975 // For other ops, we extend the operation's return type as well so we need to
7976 // truncate back to the original type.
7977 SDValue NewVal;
7978 if (Opc == ISD::SELECT)
7979 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
7980 else
7981 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
7982
7983 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
7984}
7985
7986SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
7987 SDValue Mag = Op.getOperand(0);
7988 EVT MagVT = Mag.getValueType();
7989
7990 if (MagVT.getVectorNumElements() > 2)
7991 return splitBinaryVectorOp(Op, DAG);
7992
7993 SDValue Sign = Op.getOperand(1);
7994 EVT SignVT = Sign.getValueType();
7995
7996 if (MagVT == SignVT)
7997 return Op;
7998
7999 // fcopysign v2f16:mag, v2f32:sign ->
8000 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
8001
8002 SDLoc SL(Op);
8003 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
8004 SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32);
8005
8006 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
8007
8008 return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
8009}
8010
8011// Custom lowering for vector multiplications and s_mul_u64.
8012SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
8013 EVT VT = Op.getValueType();
8014
8015 // Split vector operands.
8016 if (VT.isVector())
8017 return splitBinaryVectorOp(Op, DAG);
8018
8019 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
8020
8021 // There are four ways to lower s_mul_u64:
8022 //
8023 // 1. If all the operands are uniform, then we lower it as it is.
8024 //
8025 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
8026 // multiplications because there is not a vector equivalent of s_mul_u64.
8027 //
8028 // 3. If the cost model decides that it is more efficient to use vector
8029 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
8030 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
8031 //
8032 // 4. If the cost model decides to use vector registers and both of the
8033 // operands are zero-extended/sign-extended from 32-bits, then we split the
8034 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
8035 // possible to check if the operands are zero-extended or sign-extended in
8036 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
8037 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
8038 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
8039 // If the cost model decides that we have to use vector registers, then
8040 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
8041 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
8042 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
8043 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
8044 // SIInstrInfo.cpp .
8045
8046 if (Op->isDivergent())
8047 return SDValue();
8048
8049 SDValue Op0 = Op.getOperand(0);
8050 SDValue Op1 = Op.getOperand(1);
8051 // If all the operands are zero-enteted to 32-bits, then we replace s_mul_u64
8052 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
8053 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
8054 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
8055 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
8056 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
8057 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
8058 SDLoc SL(Op);
8059 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
8060 return SDValue(
8061 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
8062 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
8063 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
8064 if (Op0SignBits >= 33 && Op1SignBits >= 33)
8065 return SDValue(
8066 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
8067 // If all the operands are uniform, then we lower s_mul_u64 as it is.
8068 return Op;
8069}
8070
8071SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
8072 EVT VT = Op.getValueType();
8073 SDLoc SL(Op);
8074 SDValue LHS = Op.getOperand(0);
8075 SDValue RHS = Op.getOperand(1);
8076 bool isSigned = Op.getOpcode() == ISD::SMULO;
8077
8078 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
8079 const APInt &C = RHSC->getAPIntValue();
8080 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
8081 if (C.isPowerOf2()) {
8082 // smulo(x, signed_min) is same as umulo(x, signed_min).
8083 bool UseArithShift = isSigned && !C.isMinSignedValue();
8084 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
8085 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
8086 SDValue Overflow =
8087 DAG.getSetCC(SL, MVT::i1,
8088 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
8089 Result, ShiftAmt),
8090 LHS, ISD::SETNE);
8091 return DAG.getMergeValues({Result, Overflow}, SL);
8092 }
8093 }
8094
8095 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
8096 SDValue Top =
8097 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
8098
8099 SDValue Sign = isSigned
8100 ? DAG.getNode(ISD::SRA, SL, VT, Result,
8101 DAG.getConstant(VT.getScalarSizeInBits() - 1,
8102 SL, MVT::i32))
8103 : DAG.getConstant(0, SL, VT);
8104 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
8105
8106 return DAG.getMergeValues({Result, Overflow}, SL);
8107}
8108
8109SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
8110 if (Op->isDivergent()) {
8111 // Select to V_MAD_[IU]64_[IU]32.
8112 return Op;
8113 }
8114 if (Subtarget->hasSMulHi()) {
8115 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
8116 return SDValue();
8117 }
8118 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
8119 // calculate the high part, so we might as well do the whole thing with
8120 // V_MAD_[IU]64_[IU]32.
8121 return Op;
8122}
8123
8124SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
8125 if (!Subtarget->isTrapHandlerEnabled() ||
8126 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
8127 return lowerTrapEndpgm(Op, DAG);
8128
8129 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
8130 : lowerTrapHsaQueuePtr(Op, DAG);
8131}
8132
8133SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
8134 SDLoc SL(Op);
8135 SDValue Chain = Op.getOperand(0);
8136 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8137}
8138
/// Load the implicit kernel argument \p Param (as type \p VT with alignment
/// \p Alignment) from its offset within the implicit kernarg segment.
SDValue
SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
                                             const SDLoc &DL, Align Alignment,
                                             ImplicitParameter Param) const {
  MachineFunction &MF = DAG.getMachineFunction();
  // Byte offset of this parameter within the kernarg segment.
  uint64_t Offset = getImplicitParameterOffset(MF, Param);
  SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
  // Kernargs live in the constant address space.
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
}
8151
// Lower a trap on targets where the trap handler expects the queue pointer to
// be passed in SGPR0/SGPR1.
SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Chain = Op.getOperand(0);

  SDValue QueuePtr;
  // For code object version 5, QueuePtr is passed through implicit kernarg.
  const Module *M = DAG.getMachineFunction().getFunction().getParent();
    QueuePtr =
        loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
  } else {
    // Older code object versions receive the queue pointer in a user SGPR.
    MachineFunction &MF = DAG.getMachineFunction();
    SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    Register UserSGPR = Info->getQueuePtrUserSGPR();

    if (UserSGPR == AMDGPU::NoRegister) {
      // We probably are in a function incorrectly marked with
      // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
      // trap, so just use a null pointer.
      QueuePtr = DAG.getConstant(0, SL, MVT::i64);
    } else {
      QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
                                      MVT::i64);
    }
  }

  // The trap handler ABI takes the queue pointer in SGPR0:SGPR1.
  SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
  SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());

  uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
  SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
                   ToReg.getValue(1)};
  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}
8187
8188SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
8189 SDLoc SL(Op);
8190 SDValue Chain = Op.getOperand(0);
8191
8192 // We need to simulate the 's_trap 2' instruction on targets that run in
8193 // PRIV=1 (where it is treated as a nop).
8194 if (Subtarget->hasPrivEnabledTrap2NopBug())
8195 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
8196
8197 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8198 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8199 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8200}
8201
8202SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
8203 SDLoc SL(Op);
8204 SDValue Chain = Op.getOperand(0);
8205 MachineFunction &MF = DAG.getMachineFunction();
8206
8207 if (!Subtarget->isTrapHandlerEnabled() ||
8208 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
8209 LLVMContext &Ctx = MF.getFunction().getContext();
8210 Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
8211 "debugtrap handler not supported",
8212 Op.getDebugLoc(), DS_Warning));
8213 return Chain;
8214 }
8215
8216 uint64_t TrapID =
8217 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
8218 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8219 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8220}
8221
/// Return the high 32 bits of the flat aperture (shared- or private-segment
/// base) for address space \p AS (LOCAL selects shared, otherwise private).
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
                                             SelectionDAG &DAG) const {
  if (Subtarget->hasApertureRegs()) {
    const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
                                       ? AMDGPU::SRC_SHARED_BASE
                                       : AMDGPU::SRC_PRIVATE_BASE;
    assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
            !Subtarget->hasGloballyAddressableScratch()) &&
           "Cannot use src_private_base with globally addressable scratch!");
    // Note: this feature (register) is broken. When used as a 32-bit operand,
    // it returns a wrong value (all zeroes?). The real value is in the upper 32
    // bits.
    //
    // To work around the issue, emit a 64 bit copy from this register
    // then extract the high bits. Note that this shouldn't even result in a
    // shift being emitted and simply become a pair of registers (e.g.):
    // s_mov_b64 s[6:7], src_shared_base
    // v_mov_b32_e32 v1, s7
    SDValue Copy =
        DAG.getCopyFromReg(DAG.getEntryNode(), DL, ApertureRegNo, MVT::v2i32);
    // Element 1 holds the high 32 bits of the aperture.
    return DAG.getExtractVectorElt(DL, MVT::i32, Copy, 1);
  }

  // For code object version 5, private_base and shared_base are passed through
  // implicit kernargs.
  const Module *M = DAG.getMachineFunction().getFunction().getParent();
    return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
  }

  // Otherwise read the aperture from the amd_queue_t pointed to by the queue
  // pointer user SGPR.
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  Register UserSGPR = Info->getQueuePtrUserSGPR();
  if (UserSGPR == AMDGPU::NoRegister) {
    // We probably are in a function incorrectly marked with
    // amdgpu-no-queue-ptr. This is undefined.
    return DAG.getPOISON(MVT::i32);
  }

  SDValue QueuePtr =
      CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  SDValue Ptr =
      DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));

  // TODO: Use custom target PseudoSourceValue.
  // TODO: We should use the value from the IR intrinsic call, but it might not
  // be available and how do we get it?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
                     commonAlignment(Align(64), StructOffset),
}
8282
8283/// Return true if the value is a known valid address, such that a null check is
8284/// not necessary.
8286 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
8288 return true;
8289
8290 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
8291 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
8292
8293 // TODO: Search through arithmetic, handle arguments and loads
8294 // marked nonnull.
8295 return false;
8296}
8297
/// Custom-lower an address-space cast. Handles both ISD::ADDRSPACECAST nodes
/// and the llvm.amdgcn.addrspacecast.nonnull intrinsic (which carries the
/// address spaces as constant operands and promises a non-null source, so the
/// null-compare/select below can be skipped).
SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc SL(Op);

  const AMDGPUTargetMachine &TM =
      static_cast<const AMDGPUTargetMachine &>(getTargetMachine());

  unsigned DestAS, SrcAS;
  SDValue Src;
  bool IsNonNull = false;
  if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
    SrcAS = ASC->getSrcAddressSpace();
    Src = ASC->getOperand(0);
    DestAS = ASC->getDestAddressSpace();
  } else {
    // Must be the nonnull intrinsic form; operands are
    // (intrinsic-id, pointer, src-AS, dest-AS).
    assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
           Op.getConstantOperandVal(0) ==
               Intrinsic::amdgcn_addrspacecast_nonnull);
    Src = Op->getOperand(1);
    SrcAS = Op->getConstantOperandVal(2);
    DestAS = Op->getConstantOperandVal(3);
    IsNonNull = true;
  }

  // Null in the flat address space is the all-zero 64-bit value.
  SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);

  // flat -> local/private
  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
        DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
      // Segment pointers are 32-bit: drop the aperture half.
      SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);

      if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
          Subtarget->hasGloballyAddressableScratch()) {
        // flat -> private with globally addressable scratch: subtract
        // src_flat_scratch_base_lo.
        SDValue FlatScratchBaseLo(
            DAG.getMachineNode(
                AMDGPU::S_MOV_B32, SL, MVT::i32,
                DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
            0);
        Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
      }

      if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
        return Ptr;

      // Flat null must map to the segment's null value, which is not
      // necessarily zero; select between the converted pointer and it.
      unsigned NullVal = TM.getNullPointerValue(DestAS);
      SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
      SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);

      return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
                         SegmentNullPtr);
    }
  }

  // local/private -> flat
  if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
    if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
        SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
      SDValue CvtPtr;
      if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
          Subtarget->hasGloballyAddressableScratch()) {
        // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
        // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
        SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
        // mbcnt_lo (and mbcnt_hi on wave64) of an all-ones mask yields the
        // lane id within the wave.
        SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
        ThreadID = DAG.getNode(
            ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
            DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
            AllOnes, ThreadID);
        if (Subtarget->isWave64())
          ThreadID = DAG.getNode(
              ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
              DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
              AllOnes, ThreadID);
        SDValue ShAmt = DAG.getShiftAmountConstant(
            57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
        SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
        CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
        CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
        // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
        // 64-bit hi:lo value.
        SDValue FlatScratchBase = {
            DAG.getMachineNode(
                AMDGPU::S_MOV_B64, SL, MVT::i64,
                DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
            0};
        CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
      } else {
        // Widen to 64 bits by pairing the 32-bit pointer with the segment
        // aperture base in the high half.
        SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
        CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
        CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
      }

      if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
        return CvtPtr;

      // Segment null must map to the flat null value (zero).
      unsigned NullVal = TM.getNullPointerValue(SrcAS);
      SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);

      SDValue NonNull =
          DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);

      return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
                         FlatNullPtr);
    }
  }

  // 32-bit constant AS -> 64-bit: fill the high half with the per-function
  // high address bits.
  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      Op.getValueType() == MVT::i64) {
    const SIMachineFunctionInfo *Info =
        DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
    SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
    SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
    return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      Src.getValueType() == MVT::i64)
    return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);

  // global <-> flat are no-ops and never emitted.

  // Invalid casts are poison.
  return DAG.getPOISON(Op->getValueType(0));
}
8425
8426// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
8427// the small vector and inserting them into the big vector. That is better than
8428// the default expansion of doing it via a stack slot. Even though the use of
8429// the stack slot would be optimized away afterwards, the stack slot itself
8430// remains.
8431SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
8432 SelectionDAG &DAG) const {
8433 SDValue Vec = Op.getOperand(0);
8434 SDValue Ins = Op.getOperand(1);
8435 SDValue Idx = Op.getOperand(2);
8436 EVT VecVT = Vec.getValueType();
8437 EVT InsVT = Ins.getValueType();
8438 EVT EltVT = VecVT.getVectorElementType();
8439 unsigned InsNumElts = InsVT.getVectorNumElements();
8440 unsigned IdxVal = Idx->getAsZExtVal();
8441 SDLoc SL(Op);
8442
8443 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
8444 // Insert 32-bit registers at a time.
8445 assert(InsNumElts % 2 == 0 && "expect legal vector types");
8446
8447 unsigned VecNumElts = VecVT.getVectorNumElements();
8448 EVT NewVecVT =
8449 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
8450 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8452 MVT::i32, InsNumElts / 2);
8453
8454 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
8455 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
8456
8457 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
8458 SDValue Elt;
8459 if (InsNumElts == 2) {
8460 Elt = Ins;
8461 } else {
8462 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
8463 DAG.getConstant(I, SL, MVT::i32));
8464 }
8465 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
8466 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
8467 }
8468
8469 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
8470 }
8471
8472 for (unsigned I = 0; I != InsNumElts; ++I) {
8473 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
8474 DAG.getConstant(I, SL, MVT::i32));
8475 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
8476 DAG.getConstant(IdxVal + I, SL, MVT::i32));
8477 }
8478 return Vec;
8479}
8480
8481SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
8482 SelectionDAG &DAG) const {
8483 SDValue Vec = Op.getOperand(0);
8484 SDValue InsVal = Op.getOperand(1);
8485 SDValue Idx = Op.getOperand(2);
8486 EVT VecVT = Vec.getValueType();
8487 EVT EltVT = VecVT.getVectorElementType();
8488 unsigned VecSize = VecVT.getSizeInBits();
8489 unsigned EltSize = EltVT.getSizeInBits();
8490 SDLoc SL(Op);
8491
8492 // Specially handle the case of v4i16 with static indexing.
8493 unsigned NumElts = VecVT.getVectorNumElements();
8494 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
8495 if (NumElts == 4 && EltSize == 16 && KIdx) {
8496 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
8497
8498 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8499 DAG.getConstant(0, SL, MVT::i32));
8500 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8501 DAG.getConstant(1, SL, MVT::i32));
8502
8503 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
8504 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
8505
8506 unsigned Idx = KIdx->getZExtValue();
8507 bool InsertLo = Idx < 2;
8508 SDValue InsHalf = DAG.getNode(
8509 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
8510 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
8511 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8512
8513 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
8514
8515 SDValue Concat =
8516 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
8517 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8518
8519 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
8520 }
8521
8522 // Static indexing does not lower to stack access, and hence there is no need
8523 // for special custom lowering to avoid stack access.
8524 if (isa<ConstantSDNode>(Idx))
8525 return SDValue();
8526
8527 // Avoid stack access for dynamic indexing by custom lowering to
8528 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
8529
8530 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
8531
8532 MVT IntVT = MVT::getIntegerVT(VecSize);
8533
8534 // Convert vector index to bit-index and get the required bit mask.
8535 assert(isPowerOf2_32(EltSize));
8536 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
8537 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8538 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8539 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
8540 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
8541
8542 // 1. Create a congruent vector with the target value in each element.
8543 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
8544 DAG.getSplatBuildVector(VecVT, SL, InsVal));
8545
8546 // 2. Mask off all other indices except the required index within (1).
8547 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
8548
8549 // 3. Mask off the required index within the target vector.
8550 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8551 SDValue RHS =
8552 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
8553
8554 // 4. Get (2) and (3) ORed into the target vector.
8555 SDValue BFI =
8556 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
8557
8558 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
8559}
8560
/// Custom-lower EXTRACT_VECTOR_ELT. Wide vectors (128/256/512-bit) are split
/// in half and the extract recurses into the selected half; small vectors
/// (<= 64-bit) are lowered to a shift of the bitcast scalar integer.
SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SDLoc SL(Op);

  EVT ResultVT = Op.getValueType();
  SDValue Vec = Op.getOperand(0);
  SDValue Idx = Op.getOperand(1);
  EVT VecVT = Vec.getValueType();
  unsigned VecSize = VecVT.getSizeInBits();
  EVT EltVT = VecVT.getVectorElementType();

  DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);

  // Make sure we do any optimizations that will make it easier to fold
  // source modifiers before obscuring it with bit operations.

  // XXX - Why doesn't this get called when vector_shuffle is expanded?
  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
    return Combined;

  if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
    SDValue Lo, Hi;
    auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);

    // Split the vector into two halves via i64 pieces, so the halves are
    // built from whole 64-bit extracts rather than per-element ones.
    if (VecSize == 128) {
      SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
      Lo = DAG.getBitcast(LoVT,
                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                                      DAG.getConstant(0, SL, MVT::i32)));
      Hi = DAG.getBitcast(HiVT,
                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                                      DAG.getConstant(1, SL, MVT::i32)));
    } else if (VecSize == 256) {
      SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
      SDValue Parts[4];
      for (unsigned P = 0; P < 4; ++P) {
        Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                               DAG.getConstant(P, SL, MVT::i32));
      }

      Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
                                            Parts[0], Parts[1]));
      Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
                                            Parts[2], Parts[3]));
    } else {
      assert(VecSize == 512);

      SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
      SDValue Parts[8];
      for (unsigned P = 0; P < 8; ++P) {
        Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                               DAG.getConstant(P, SL, MVT::i32));
      }

      Lo = DAG.getBitcast(LoVT,
                          DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
                                      Parts[0], Parts[1], Parts[2], Parts[3]));
      Hi = DAG.getBitcast(HiVT,
                          DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
                                      Parts[4], Parts[5], Parts[6], Parts[7]));
    }

    // Select the half containing the element, then re-extract with the
    // index masked down to that half's range.
    EVT IdxVT = Idx.getValueType();
    unsigned NElem = VecVT.getVectorNumElements();
    assert(isPowerOf2_32(NElem));
    SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
    SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
    SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
  }

  assert(VecSize <= 64);

  MVT IntVT = MVT::getIntegerVT(VecSize);

  // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
  SDValue VecBC = peekThroughBitcasts(Vec);
  if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
    SDValue Src = VecBC.getOperand(0);
    Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
    Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
  }

  unsigned EltSize = EltVT.getSizeInBits();
  assert(isPowerOf2_32(EltSize));

  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);

  // Convert vector index to bit-index (* EltSize)
  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);

  // Shift the desired element down to the low bits and truncate.
  SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
  SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);

  if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
    // FP16 results need a bitcast from the truncated integer, not a trunc.
    SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
    return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
  }

  return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
}
8662
8663static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
8664 assert(Elt % 2 == 0);
8665 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8666}
8667
8668static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
8669 assert(Elt % 2 == 0);
8670 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8671 !(Mask[Elt + 1] & 1);
8672}
8673
8674SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
8675 SelectionDAG &DAG) const {
8676 SDLoc SL(Op);
8677 EVT ResultVT = Op.getValueType();
8678 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
8679 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
8680 const int NewSrcNumElts = 2;
8681 MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
8682 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
8683
8684 // Break up the shuffle into registers sized pieces.
8685 //
8686 // We're trying to form sub-shuffles that the register allocation pipeline
8687 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
8688 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
8689 // pair of copies into a consecutive register copy, so use the ordinary
8690 // extract_vector_elt lowering unless we can use the shuffle.
8691 //
8692 // TODO: This is a bit of hack, and we should probably always use
8693 // extract_subvector for the largest possible subvector we can (or at least
8694 // use it for PackVT aligned pieces). However we have worse support for
8695 // combines on them don't directly treat extract_subvector / insert_subvector
8696 // as legal. The DAG scheduler also ends up doing a worse job with the
8697 // extract_subvectors.
8698 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
8699
8700 // vector_shuffle <0,1,6,7> lhs, rhs
8701 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
8702 //
8703 // vector_shuffle <6,7,2,3> lhs, rhs
8704 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
8705 //
8706 // vector_shuffle <6,7,0,1> lhs, rhs
8707 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
8708
8709 // Avoid scalarizing when both halves are reading from consecutive elements.
8710
8711 // If we're treating 2 element shuffles as legal, also create odd-to-even
8712 // shuffles of neighboring pairs.
8713 //
8714 // vector_shuffle <3,2,7,6> lhs, rhs
8715 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
8716 // vector_shuffle <1, 0> (extract_subvector rhs, 2)
8717
8719 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
8720 if (ShouldUseConsecutiveExtract &&
8722 const int Idx = SVN->getMaskElt(I);
8723 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8724 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8725 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
8726 SVN->getOperand(VecIdx),
8727 DAG.getConstant(EltIdx, SL, MVT::i32));
8728 Pieces.push_back(SubVec);
8729 } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
8731 int Idx0 = SVN->getMaskElt(I);
8732 int Idx1 = SVN->getMaskElt(I + 1);
8733
8734 SDValue SrcOp0 = SVN->getOperand(0);
8735 SDValue SrcOp1 = SrcOp0;
8736 if (Idx0 >= SrcNumElts) {
8737 SrcOp0 = SVN->getOperand(1);
8738 Idx0 -= SrcNumElts;
8739 }
8740
8741 if (Idx1 >= SrcNumElts) {
8742 SrcOp1 = SVN->getOperand(1);
8743 Idx1 -= SrcNumElts;
8744 }
8745
8746 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8747 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8748
8749 // Extract nearest even aligned piece.
8750 SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
8751 DAG.getConstant(AlignedIdx0, SL, MVT::i32));
8752 SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
8753 DAG.getConstant(AlignedIdx1, SL, MVT::i32));
8754
8755 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8756 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8757
8758 SDValue Result0 = SubVec0;
8759 SDValue Result1 = SubVec0;
8760
8761 if (SubVec0 != SubVec1) {
8762 NewMaskIdx1 += NewSrcNumElts;
8763 Result1 = SubVec1;
8764 } else {
8765 Result1 = DAG.getPOISON(PackVT);
8766 }
8767
8768 SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
8769 {NewMaskIdx0, NewMaskIdx1});
8770 Pieces.push_back(Shuf);
8771 } else {
8772 const int Idx0 = SVN->getMaskElt(I);
8773 const int Idx1 = SVN->getMaskElt(I + 1);
8774 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8775 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8776 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8777 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8778
8779 SDValue Vec0 = SVN->getOperand(VecIdx0);
8780 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
8781 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
8782
8783 SDValue Vec1 = SVN->getOperand(VecIdx1);
8784 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
8785 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
8786 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
8787 }
8788 }
8789
8790 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
8791}
8792
8793SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
8794 SelectionDAG &DAG) const {
8795 SDValue SVal = Op.getOperand(0);
8796 EVT ResultVT = Op.getValueType();
8797 EVT SValVT = SVal.getValueType();
8798 SDValue UndefVal = DAG.getPOISON(SValVT);
8799 SDLoc SL(Op);
8800
8802 VElts.push_back(SVal);
8803 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
8804 VElts.push_back(UndefVal);
8805
8806 return DAG.getBuildVector(ResultVT, SL, VElts);
8807}
8808
8809SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
8810 SelectionDAG &DAG) const {
8811 SDLoc SL(Op);
8812 EVT VT = Op.getValueType();
8813
8814 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8815 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
8816
8817 SDValue Lo = Op.getOperand(0);
8818 SDValue Hi = Op.getOperand(1);
8819
8820 // Avoid adding defined bits with the zero_extend.
8821 if (Hi.isUndef()) {
8822 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8823 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
8824 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
8825 }
8826
8827 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
8828 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
8829
8830 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
8831 DAG.getConstant(16, SL, MVT::i32));
8832 if (Lo.isUndef())
8833 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
8834
8835 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8836 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
8837
8838 SDValue Or =
8839 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
8840 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
8841 }
8842
8843 // Split into 2-element chunks.
8844 const unsigned NumParts = VT.getVectorNumElements() / 2;
8845 EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
8846 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
8847
8849 for (unsigned P = 0; P < NumParts; ++P) {
8850 SDValue Vec = DAG.getBuildVector(
8851 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
8852 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
8853 }
8854
8855 SDValue Blend =
8856 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
8857 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
8858}
8859
8861 const GlobalAddressSDNode *GA) const {
8862 // OSes that use ELF REL relocations (instead of RELA) can only store a
8863 // 32-bit addend in the instruction, so it is not safe to allow offset folding
8864 // which can create arbitrary 64-bit addends. (This is only a problem for
8865 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
8866 // the high 32 bits of the addend.)
8867 //
8868 // This should be kept in sync with how HasRelocationAddend is initialized in
8869 // the constructor of ELFAMDGPUAsmBackend.
8870 if (!Subtarget->isAmdHsaOS())
8871 return false;
8872
8873 // We can fold offsets for anything that doesn't require a GOT relocation.
8874 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
8878}
8879
8880static SDValue
8882 const SDLoc &DL, int64_t Offset, EVT PtrVT,
8883 unsigned GAFlags = SIInstrInfo::MO_NONE) {
8884 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
8885 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
8886 // lowered to the following code sequence:
8887 //
8888 // For constant address space:
8889 // s_getpc_b64 s[0:1]
8890 // s_add_u32 s0, s0, $symbol
8891 // s_addc_u32 s1, s1, 0
8892 //
8893 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8894 // a fixup or relocation is emitted to replace $symbol with a literal
8895 // constant, which is a pc-relative offset from the encoding of the $symbol
8896 // operand to the global variable.
8897 //
8898 // For global address space:
8899 // s_getpc_b64 s[0:1]
8900 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
8901 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
8902 //
8903 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8904 // fixups or relocations are emitted to replace $symbol@*@lo and
8905 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
8906 // which is a 64-bit pc-relative offset from the encoding of the $symbol
8907 // operand to the global variable.
8908 if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
8909 assert(GAFlags != SIInstrInfo::MO_NONE);
8910
8911 SDValue Ptr =
8912 DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2);
8913 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
8914 }
8915
8916 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
8917 SDValue PtrHi;
8918 if (GAFlags == SIInstrInfo::MO_NONE)
8919 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
8920 else
8921 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
8922 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
8923}
8924
8925SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
8926 SDValue Op,
8927 SelectionDAG &DAG) const {
8928 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
8929 SDLoc DL(GSD);
8930 EVT PtrVT = Op.getValueType();
8931
8932 const GlobalValue *GV = GSD->getGlobal();
8938 GV->hasExternalLinkage()) {
8939 Type *Ty = GV->getValueType();
8940 // HIP uses an unsized array `extern __shared__ T s[]` or similar
8941 // zero-sized type in other languages to declare the dynamic shared
8942 // memory which size is not known at the compile time. They will be
8943 // allocated by the runtime and placed directly after the static
8944 // allocated ones. They all share the same offset.
8945 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
8946 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
8947 // Adjust alignment for that dynamic shared memory array.
8950 MFI->setUsesDynamicLDS(true);
8951 return SDValue(
8952 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
8953 }
8954 }
8956 }
8957
8959 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
8961 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
8962 }
8963
8964 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
8965 if (Subtarget->has64BitLiterals()) {
8967 GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64);
8968 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr),
8969 0);
8970 }
8971
8972 SDValue AddrLo = DAG.getTargetGlobalAddress(
8973 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
8974 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
8975
8976 SDValue AddrHi = DAG.getTargetGlobalAddress(
8977 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
8978 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
8979
8980 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
8981 }
8982
8983 if (shouldEmitFixup(GV))
8984 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
8985
8986 if (shouldEmitPCReloc(GV))
8987 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
8989
8990 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
8992 PointerType *PtrTy =
8994 const DataLayout &DataLayout = DAG.getDataLayout();
8995 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
8996 MachinePointerInfo PtrInfo =
8998
8999 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
9002}
9003
9005 const SDLoc &DL, SDValue V) const {
9006 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
9007 // the destination register.
9008 //
9009 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
9010 // so we will end up with redundant moves to m0.
9011 //
9012 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
9013
9014 // A Null SDValue creates a glue result.
9015 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
9016 V, Chain);
9017 return SDValue(M0, 0);
9018}
9019
9020SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
9021 MVT VT,
9022 unsigned Offset) const {
9023 SDLoc SL(Op);
9024 SDValue Param = lowerKernargMemParameter(
9025 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
9026 // The local size values will have the hi 16-bits as zero.
9027 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
9028 DAG.getValueType(VT));
9029}
9030
9032 EVT VT) {
9035 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
9036 return DAG.getPOISON(VT);
9037}
9038
9040 EVT VT) {
9043 "intrinsic not supported on subtarget", DL.getDebugLoc()));
9044 return DAG.getPOISON(VT);
9045}
9046
9048 ArrayRef<SDValue> Elts) {
9049 assert(!Elts.empty());
9050 MVT Type;
9051 unsigned NumElts = Elts.size();
9052
9053 if (NumElts <= 12) {
9054 Type = MVT::getVectorVT(MVT::f32, NumElts);
9055 } else {
9056 assert(Elts.size() <= 16);
9057 Type = MVT::v16f32;
9058 NumElts = 16;
9059 }
9060
9061 SmallVector<SDValue, 16> VecElts(NumElts);
9062 for (unsigned i = 0; i < Elts.size(); ++i) {
9063 SDValue Elt = Elts[i];
9064 if (Elt.getValueType() != MVT::f32)
9065 Elt = DAG.getBitcast(MVT::f32, Elt);
9066 VecElts[i] = Elt;
9067 }
9068 for (unsigned i = Elts.size(); i < NumElts; ++i)
9069 VecElts[i] = DAG.getPOISON(MVT::f32);
9070
9071 if (NumElts == 1)
9072 return VecElts[0];
9073 return DAG.getBuildVector(Type, DL, VecElts);
9074}
9075
9076static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
9077 SDValue Src, int ExtraElts) {
9078 EVT SrcVT = Src.getValueType();
9079
9081
9082 if (SrcVT.isVector())
9083 DAG.ExtractVectorElements(Src, Elts);
9084 else
9085 Elts.push_back(Src);
9086
9087 SDValue Undef = DAG.getPOISON(SrcVT.getScalarType());
9088 while (ExtraElts--)
9089 Elts.push_back(Undef);
9090
9091 return DAG.getBuildVector(CastVT, DL, Elts);
9092}
9093
9094// Re-construct the required return value for a image load intrinsic.
9095// This is more complicated due to the optional use TexFailCtrl which means the
9096// required return type is an aggregate
9098 ArrayRef<EVT> ResultTypes, bool IsTexFail,
9099 bool Unpacked, bool IsD16, int DMaskPop,
9100 int NumVDataDwords, bool IsAtomicPacked16Bit,
9101 const SDLoc &DL) {
9102 // Determine the required return type. This is the same regardless of
9103 // IsTexFail flag
9104 EVT ReqRetVT = ResultTypes[0];
9105 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
9106 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9107 ? (ReqRetNumElts + 1) / 2
9108 : ReqRetNumElts;
9109
9110 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9111
9112 MVT DataDwordVT =
9113 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
9114
9115 MVT MaskPopVT =
9116 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
9117
9118 SDValue Data(Result, 0);
9119 SDValue TexFail;
9120
9121 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
9122 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
9123 if (MaskPopVT.isVector()) {
9124 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
9125 SDValue(Result, 0), ZeroIdx);
9126 } else {
9127 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
9128 SDValue(Result, 0), ZeroIdx);
9129 }
9130 }
9131
9132 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
9133 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
9134 NumDataDwords - MaskPopDwords);
9135
9136 if (IsD16)
9137 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
9138
9139 EVT LegalReqRetVT = ReqRetVT;
9140 if (!ReqRetVT.isVector()) {
9141 if (!Data.getValueType().isInteger())
9142 Data = DAG.getNode(ISD::BITCAST, DL,
9143 Data.getValueType().changeTypeToInteger(), Data);
9144 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
9145 } else {
9146 // We need to widen the return vector to a legal type
9147 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
9148 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
9149 LegalReqRetVT =
9151 ReqRetVT.getVectorNumElements() + 1);
9152 }
9153 }
9154 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
9155
9156 if (IsTexFail) {
9157 TexFail =
9158 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
9159 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
9160
9161 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
9162 }
9163
9164 if (Result->getNumValues() == 1)
9165 return Data;
9166
9167 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
9168}
9169
9170static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
9171 SDValue *LWE, bool &IsTexFail) {
9172 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
9173
9174 uint64_t Value = TexFailCtrlConst->getZExtValue();
9175 if (Value) {
9176 IsTexFail = true;
9177 }
9178
9179 SDLoc DL(TexFailCtrlConst);
9180 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
9181 Value &= ~(uint64_t)0x1;
9182 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
9183 Value &= ~(uint64_t)0x2;
9184
9185 return Value == 0;
9186}
9187
/// Pack 16-bit image address/gradient operands (Op operands in the range
/// [DimIdx, EndIdx)) pairwise into dword-sized vectors of type PackVectorVT,
/// bitcast each resulting dword to f32, and append the dwords to PackedAddrs.
/// An element with no partner (the final operand, or the last gradient of an
/// odd-sized gradient half as described in the loop) is any-extended to
/// 32 bits instead of being paired.
9189 MVT PackVectorVT,
9190 SmallVectorImpl<SDValue> &PackedAddrs,
9191 unsigned DimIdx, unsigned EndIdx,
9192 unsigned NumGradients) {
9193 SDLoc DL(Op);
9194 for (unsigned I = DimIdx; I < EndIdx; I++) {
9195 SDValue Addr = Op.getOperand(I);
9196
9197 // Gradients are packed with undef for each coordinate.
9198 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
9199 // 1D: undef,dx/dh; undef,dx/dv
9200 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
9201 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
9202 if (((I + 1) >= EndIdx) ||
9203 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
9204 I == DimIdx + NumGradients - 1))) {
// Unpaired element: widen a lone 16-bit value to a full dword.
9205 if (Addr.getValueType() != MVT::i16)
9206 Addr = DAG.getBitcast(MVT::i16, Addr);
9207 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
9208 } else {
// Paired element: build a two-element 16-bit vector and skip its partner.
9209 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
9210 I++;
9211 }
// Hand every packed dword back uniformly as f32.
9212 Addr = DAG.getBitcast(MVT::f32, Addr);
9213 PackedAddrs.push_back(Addr);
9214 }
9215}
9216
/// Lower an image (MIMG) intrinsic to a target machine node.
///
/// Selects the concrete MIMG opcode for the subtarget generation, packs
/// 16-bit addresses/gradients (A16/G16) into dwords, assembles the operand
/// list (vdata, vaddr(s), rsrc, sampler, dmask, modifier bits, chain), and
/// widens/adjusts the result type for the dmask and for TFE/LWE texture-fail
/// results. Returns the original Op unchanged whenever the intrinsic cannot
/// be lowered for this subtarget.
9217SDValue SITargetLowering::lowerImage(SDValue Op,
9219 SelectionDAG &DAG, bool WithChain) const {
9220 SDLoc DL(Op);
9221 MachineFunction &MF = DAG.getMachineFunction();
9222 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
9223 unsigned IntrOpcode = Intr->BaseOpcode;
9224 // For image atomic: use no-return opcode if result is unused.
9225 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode &&
9226 !Op.getNode()->hasAnyUseOfValue(0))
9227 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
9228 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9230 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
9231 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
9232 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9233 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9234
9235 SmallVector<EVT, 3> ResultTypes(Op->values());
9236 SmallVector<EVT, 3> OrigResultTypes(Op->values());
// No-return atomics produce no data result; drop it from the node's types.
9237 if (BaseOpcode->NoReturn && BaseOpcode->Atomic)
9238 ResultTypes.erase(&ResultTypes[0]);
9239
9240 bool IsD16 = false;
9241 bool IsG16 = false;
9242 bool IsA16 = false;
9243 SDValue VData;
9244 int NumVDataDwords = 0;
9245 bool AdjustRetType = false;
9246 bool IsAtomicPacked16Bit = false;
9247
9248 // Offset of intrinsic arguments
9249 const unsigned ArgOffset = WithChain ? 2 : 1;
9250
9251 unsigned DMask;
9252 unsigned DMaskLanes = 0;
9253
// Atomic forms carry vdata operand(s) rather than reading a dmask operand;
// DMask and the vdata dword count are derived from the data width instead.
9254 if (BaseOpcode->Atomic) {
9255 VData = Op.getOperand(2);
9256
9257 IsAtomicPacked16Bit =
9258 (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9259 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
9260 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
9261 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
9262
9263 bool Is64Bit = VData.getValueSizeInBits() == 64;
9264 if (BaseOpcode->AtomicX2) {
// Compare-and-swap style atomics take two data operands, merged here.
9265 SDValue VData2 = Op.getOperand(3);
9266 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
9267 {VData, VData2});
9268 if (Is64Bit)
9269 VData = DAG.getBitcast(MVT::v4i32, VData);
9270
9271 if (!BaseOpcode->NoReturn)
9272 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9273
9274 DMask = Is64Bit ? 0xf : 0x3;
9275 NumVDataDwords = Is64Bit ? 4 : 2;
9276 } else {
9277 DMask = Is64Bit ? 0x3 : 0x1;
9278 NumVDataDwords = Is64Bit ? 2 : 1;
9279 }
9280 } else {
9281 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
9282 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
9283
9284 if (BaseOpcode->Store) {
9285 VData = Op.getOperand(2);
9286
9287 MVT StoreVT = VData.getSimpleValueType();
9288 if (StoreVT.getScalarType() == MVT::f16) {
9289 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9290 return Op; // D16 is unsupported for this instruction
9291
9292 IsD16 = true;
9293 VData = handleD16VData(VData, DAG, true);
9294 }
9295
9296 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
9297 } else if (!BaseOpcode->NoReturn) {
9298 // Work out the num dwords based on the dmask popcount and underlying type
9299 // and whether packing is supported.
9300 MVT LoadVT = ResultTypes[0].getSimpleVT();
9301 if (LoadVT.getScalarType() == MVT::f16) {
9302 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9303 return Op; // D16 is unsupported for this instruction
9304
9305 IsD16 = true;
9306 }
9307
9308 // Confirm that the return type is large enough for the dmask specified
9309 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
9310 (!LoadVT.isVector() && DMaskLanes > 1))
9311 return Op;
9312
9313 // The sq block of gfx8 and gfx9 do not estimate register use correctly
9314 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
9315 // instructions.
9316 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9317 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9318 NumVDataDwords = (DMaskLanes + 1) / 2;
9319 else
9320 NumVDataDwords = DMaskLanes;
9321
9322 AdjustRetType = true;
9323 }
9324 }
9325
9326 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
9328
9329 // Check for 16 bit addresses or derivatives and pack if true.
9330 MVT VAddrVT =
9331 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
9332 MVT VAddrScalarVT = VAddrVT.getScalarType();
9333 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9334 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9335
9336 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
9337 VAddrScalarVT = VAddrVT.getScalarType();
9338 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9339 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9340
9341 // Push back extra arguments.
9342 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
9343 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
9344 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
9345 // Special handling of bias when A16 is on. Bias is of type half but
9346 // occupies full 32-bit.
9347 SDValue Bias = DAG.getBuildVector(
9348 MVT::v2f16, DL,
9349 {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
9350 VAddrs.push_back(Bias);
9351 } else {
9352 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
9353 "Bias needs to be converted to 16 bit in A16 mode");
9354 VAddrs.push_back(Op.getOperand(ArgOffset + I));
9355 }
9356 }
9357
9358 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
9359 // 16 bit gradients are supported, but are tied to the A16 control
9360 // so both gradients and addresses must be 16 bit
9361 LLVM_DEBUG(
9362 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
9363 "require 16 bit args for both gradients and addresses");
9364 return Op;
9365 }
9366
9367 if (IsA16) {
9368 if (!ST->hasA16()) {
9369 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
9370 "support 16 bit addresses\n");
9371 return Op;
9372 }
9373 }
9374
9375 // We've dealt with incorrect input so we know that if IsA16, IsG16
9376 // are set then we have to compress/pack operands (either address,
9377 // gradient or both)
9378 // In the case where a16 and gradients are tied (no G16 support) then we
9379 // have already verified that both IsA16 and IsG16 are true
9380 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
9381 // Activate g16
9382 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9384 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
9385 }
9386
9387 // Add gradients (packed or unpacked)
9388 if (IsG16) {
9389 // Pack the gradients
9390 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
9391 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
9392 ArgOffset + Intr->GradientStart,
9393 ArgOffset + Intr->CoordStart, Intr->NumGradients);
9394 } else {
9395 for (unsigned I = ArgOffset + Intr->GradientStart;
9396 I < ArgOffset + Intr->CoordStart; I++)
9397 VAddrs.push_back(Op.getOperand(I));
9398 }
9399
9400 // Add addresses (packed or unpacked)
9401 if (IsA16) {
9402 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
9403 ArgOffset + Intr->CoordStart, VAddrEnd,
9404 0 /* No gradients */);
9405 } else {
9406 // Add uncompressed address
9407 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
9408 VAddrs.push_back(Op.getOperand(I));
9409 }
9410
9411 // If the register allocator cannot place the address registers contiguously
9412 // without introducing moves, then using the non-sequential address encoding
9413 // is always preferable, since it saves VALU instructions and is usually a
9414 // wash in terms of code size or even better.
9415 //
9416 // However, we currently have no way of hinting to the register allocator that
9417 // MIMG addresses should be placed contiguously when it is possible to do so,
9418 // so force non-NSA for the common 2-address case as a heuristic.
9419 //
9420 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
9421 // allocation when possible.
9422 //
9423 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
9424 // set of the remaining addresses.
9425 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
9426 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
9427 const bool UseNSA = ST->hasNSAEncoding() &&
9428 VAddrs.size() >= ST->getNSAThreshold(MF) &&
9429 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
9430 const bool UsePartialNSA =
9431 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
9432
// When not using NSA (or for the contiguous tail of partial NSA), merge the
// addresses into one contiguous dword vector.
9433 SDValue VAddr;
9434 if (UsePartialNSA) {
9435 VAddr = getBuildDwordsVector(DAG, DL,
9436 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9437 } else if (!UseNSA) {
9438 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
9439 }
9440
9441 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
9442 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
9443 SDValue Unorm;
9444 if (!BaseOpcode->Sampler) {
9445 Unorm = True;
9446 } else {
9447 uint64_t UnormConst =
9448 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
9449
9450 Unorm = UnormConst ? True : False;
9451 }
9452
9453 SDValue TFE;
9454 SDValue LWE;
9455 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
9456 bool IsTexFail = false;
9457 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9458 return Op;
9459
9460 if (IsTexFail) {
9461 if (!DMaskLanes) {
9462 // Expecting to get an error flag since TFC is on - and dmask is 0
9463 // Force dmask to be at least 1 otherwise the instruction will fail
9464 DMask = 0x1;
9465 DMaskLanes = 1;
9466 NumVDataDwords = 1;
9467 }
// The texture-fail flag occupies one extra result dword.
9468 NumVDataDwords += 1;
9469 AdjustRetType = true;
9470 }
9471
9472 // Has something earlier tagged that the return type needs adjusting
9473 // This happens if the instruction is a load or has set TexFailCtrl flags
9474 if (AdjustRetType) {
9475 // NumVDataDwords reflects the true number of dwords required in the return
9476 // type
9477 if (DMaskLanes == 0 && !BaseOpcode->Store) {
9478 // This is a no-op load. This can be eliminated
9479 SDValue Undef = DAG.getPOISON(Op.getValueType());
9480 if (isa<MemSDNode>(Op))
9481 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
9482 return Undef;
9483 }
9484
9485 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
9486 MVT::i32, NumVDataDwords)
9487 : MVT::i32;
9488
9489 ResultTypes[0] = NewVT;
9490 if (ResultTypes.size() == 3) {
9491 // Original result was aggregate type used for TexFailCtrl results
9492 // The actual instruction returns as a vector type which has now been
9493 // created. Remove the aggregate result.
9494 ResultTypes.erase(&ResultTypes[1]);
9495 }
9496 }
9497
9498 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
9499 // Keep GLC only when the atomic's result is actually used.
9500 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
9502 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
9504 return Op;
9505
// Assemble the machine operand list: vdata, vaddr(s), rsrc, sampler, and the
// immediate modifier operands expected by the MIMG instruction.
9507 if (BaseOpcode->Store || BaseOpcode->Atomic)
9508 Ops.push_back(VData); // vdata
9509 if (UsePartialNSA) {
9510 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
9511 Ops.push_back(VAddr);
9512 } else if (UseNSA)
9513 append_range(Ops, VAddrs);
9514 else
9515 Ops.push_back(VAddr);
9516 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
9517 EVT RsrcVT = Rsrc.getValueType();
9518 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9519 return Op;
9520 Ops.push_back(Rsrc);
9521 if (BaseOpcode->Sampler) {
9522 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
9523 if (Samp.getValueType() != MVT::v4i32)
9524 return Op;
9525 Ops.push_back(Samp);
9526 }
9527 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
9528 if (IsGFX10Plus)
9529 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
9530 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9531 Ops.push_back(Unorm);
9532 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
9533 Ops.push_back(IsA16 && // r128, a16 for gfx9
9534 ST->hasFeature(AMDGPU::FeatureR128A16)
9535 ? True
9536 : False);
9537 if (IsGFX10Plus)
9538 Ops.push_back(IsA16 ? True : False);
9539
9540 if (!Subtarget->hasGFX90AInsts())
9541 Ops.push_back(TFE); // tfe
9542 else if (TFE->getAsZExtVal()) {
9543 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9545 "TFE is not supported on this GPU", DL.getDebugLoc()));
9546 }
9547
9548 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9549 Ops.push_back(LWE); // lwe
9550 if (!IsGFX10Plus)
9551 Ops.push_back(DimInfo->DA ? True : False);
9552 if (BaseOpcode->HasD16)
9553 Ops.push_back(IsD16 ? True : False);
9554 if (isa<MemSDNode>(Op))
9555 Ops.push_back(Op.getOperand(0)); // chain
9556
// Resolve the generation-specific MIMG opcode; -1 means no encoding exists.
9557 int NumVAddrDwords =
9558 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
9559 int Opcode = -1;
9560
9561 if (IsGFX12Plus) {
9562 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
9563 NumVDataDwords, NumVAddrDwords);
9564 } else if (IsGFX11Plus) {
9565 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9566 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9567 : AMDGPU::MIMGEncGfx11Default,
9568 NumVDataDwords, NumVAddrDwords);
9569 } else if (IsGFX10Plus) {
9570 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9571 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9572 : AMDGPU::MIMGEncGfx10Default,
9573 NumVDataDwords, NumVAddrDwords);
9574 } else {
9575 if (Subtarget->hasGFX90AInsts()) {
9576 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
9577 NumVDataDwords, NumVAddrDwords);
9578 if (Opcode == -1) {
9579 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9581 "requested image instruction is not supported on this GPU",
9582 DL.getDebugLoc()));
9583
// Diagnosed as unsupported: return poison results (plus the chain, if any).
9584 unsigned Idx = 0;
9585 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
9586 for (EVT VT : OrigResultTypes) {
9587 if (VT == MVT::Other)
9588 RetValues[Idx++] = Op.getOperand(0); // Chain
9589 else
9590 RetValues[Idx++] = DAG.getPOISON(VT);
9591 }
9592
9593 return DAG.getMergeValues(RetValues, DL);
9594 }
9595 }
9596 if (Opcode == -1 &&
9597 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9598 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
9599 NumVDataDwords, NumVAddrDwords);
9600 if (Opcode == -1)
9601 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
9602 NumVDataDwords, NumVAddrDwords);
9603 }
9604 if (Opcode == -1)
9605 return Op;
9606
// Create the machine node and transfer the memory operand, if any.
9607 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
9608 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
9609 MachineMemOperand *MemRef = MemOp->getMemOperand();
9610 DAG.setNodeMemRefs(NewNode, {MemRef});
9611 }
9612
// No-return forms: atomics still merge a poison data value with the chain so
// the original node's result arity is preserved.
9613 if (BaseOpcode->NoReturn) {
9614 if (BaseOpcode->Atomic)
9615 return DAG.getMergeValues(
9616 {DAG.getPOISON(OrigResultTypes[0]), SDValue(NewNode, 0)}, DL);
9617
9618 return SDValue(NewNode, 0);
9619 }
9620
9621 if (BaseOpcode->AtomicX2) {
9623 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
9624 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
9625 }
9626
9627 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
9628 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9629 NumVDataDwords, IsAtomicPacked16Bit, DL);
9630}
9631
/// Lower an s.buffer.load intrinsic.
///
/// With a uniform (non-divergent) offset this emits a scalar buffer load
/// (SBUFFER_LOAD*), widening vec3 results to vec4 when scalar dwordx3 loads
/// are unavailable. With a divergent offset it falls back to MUBUF vector
/// buffer loads against an assumed-unswizzled buffer, splitting 8/16-element
/// results into multiple dwordx4 loads that are re-concatenated.
9632 SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
9633 SDValue Offset, SDValue CachePolicy,
9634 SelectionDAG &DAG) const {
9635 MachineFunction &MF = DAG.getMachineFunction();
9636
9637 const DataLayout &DataLayout = DAG.getDataLayout();
9638 Align Alignment =
9639 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
9640
9641 MachineMemOperand *MMO = MF.getMachineMemOperand(
9642 MachinePointerInfo(),
9645 VT.getStoreSize(), Alignment);
9646
9647 if (!Offset->isDivergent()) {
9648 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
9649
9650 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
9651 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
9652 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
9653 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
9654 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9655 SDValue BufferLoad =
9656 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
9657 DAG.getVTList(MVT::i32), Ops, VT, MMO);
9658 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
9659 }
9660
9661 // Widen vec3 load to vec4.
9662 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
9663 !Subtarget->hasScalarDwordx3Loads()) {
9664 EVT WidenedVT =
9666 auto WidenedOp = DAG.getMemIntrinsicNode(
9667 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
9668 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
9669 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
9670 DAG.getVectorIdxConstant(0, DL));
9671 return Subvector;
9672 }
9673
9674 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
9675 DAG.getVTList(VT), Ops, VT, MMO);
9676 }
9677
9678 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
9679 // assume that the buffer is unswizzled.
9680 SDValue Ops[] = {
9681 DAG.getEntryNode(), // Chain
9682 Rsrc, // rsrc
9683 DAG.getConstant(0, DL, MVT::i32), // vindex
9684 {}, // voffset
9685 {}, // soffset
9686 {}, // offset
9687 CachePolicy, // cachepolicy
9688 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9689 };
// Sub-dword i16 loads go through the byte/short MUBUF load helper.
9690 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9691 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
9692 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
9693 }
9694
// Wide (8/16-element) results are split into NumLoads dwordx4 pieces.
9696 unsigned NumLoads = 1;
9697 MVT LoadVT = VT.getSimpleVT();
9698 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
9699 assert((LoadVT.getScalarType() == MVT::i32 ||
9700 LoadVT.getScalarType() == MVT::f32));
9701
9702 if (NumElts == 8 || NumElts == 16) {
9703 NumLoads = NumElts / 4;
9704 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
9705 }
9706
9707 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
9708
9709 // Use the alignment to ensure that the required offsets will fit into the
9710 // immediate offsets.
9711 setBufferOffsets(Offset, DAG, &Ops[3],
9712 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
9713
// Ops[5] is the immediate offset; advance it 16 bytes per emitted load.
9714 uint64_t InstOffset = Ops[5]->getAsZExtVal();
9715 for (unsigned i = 0; i < NumLoads; ++i) {
9716 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
9717 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
9718 LoadVT, MMO, DAG));
9719 }
9720
9721 if (NumElts == 8 || NumElts == 16)
9722 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
9723
9724 return Loads[0];
9725}
9726
9727SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
9728 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
9729 if (!Subtarget->hasArchitectedSGPRs())
9730 return {};
9731 SDLoc SL(Op);
9732 MVT VT = MVT::i32;
9733 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
9734 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
9735 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
9736}
9737
9738SDValue SITargetLowering::lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op,
9739 AMDGPU::Hwreg::Id HwReg,
9740 unsigned LowBit,
9741 unsigned Width) const {
9742 SDLoc SL(Op);
9743 using namespace AMDGPU::Hwreg;
9744 return {DAG.getMachineNode(
9745 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
9746 DAG.getTargetConstant(HwregEncoding::encode(HwReg, LowBit, Width),
9747 SL, MVT::i32)),
9748 0};
9749}
9750
9751SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
9752 unsigned Dim,
9753 const ArgDescriptor &Arg) const {
9754 SDLoc SL(Op);
9755 MachineFunction &MF = DAG.getMachineFunction();
9756 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
9757 if (MaxID == 0)
9758 return DAG.getConstant(0, SL, MVT::i32);
9759
9760 // It's undefined behavior if a function marked with the amdgpu-no-*
9761 // attributes uses the corresponding intrinsic.
9762 if (!Arg)
9763 return DAG.getPOISON(Op->getValueType(0));
9764
9765 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
9766 SDLoc(DAG.getEntryNode()), Arg);
9767
9768 // Don't bother inserting AssertZext for packed IDs since we're emitting the
9769 // masking operations anyway.
9770 //
9771 // TODO: We could assert the top bit is 0 for the source copy.
9772 if (Arg.isMasked())
9773 return Val;
9774
9775 // Preserve the known bits after expansion to a copy.
9776 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
9777 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
9778 DAG.getValueType(SmallVT));
9779}
9780
9781SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
9782 SelectionDAG &DAG) const {
9783 MachineFunction &MF = DAG.getMachineFunction();
9784 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
9785
9786 EVT VT = Op.getValueType();
9787 SDLoc DL(Op);
9788 unsigned IntrinsicID = Op.getConstantOperandVal(0);
9789
9790 // TODO: Should this propagate fast-math-flags?
9791
9792 switch (IntrinsicID) {
9793 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9794 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
9795 return emitNonHSAIntrinsicError(DAG, DL, VT);
9796 return getPreloadedValue(DAG, *MFI, VT,
9798 }
9799 case Intrinsic::amdgcn_dispatch_ptr:
9800 case Intrinsic::amdgcn_queue_ptr: {
9801 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
9802 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9803 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
9804 DL.getDebugLoc()));
9805 return DAG.getPOISON(VT);
9806 }
9807
9808 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9811 return getPreloadedValue(DAG, *MFI, VT, RegID);
9812 }
9813 case Intrinsic::amdgcn_implicitarg_ptr: {
9814 if (MFI->isEntryFunction())
9815 return getImplicitArgPtr(DAG, DL);
9816 return getPreloadedValue(DAG, *MFI, VT,
9818 }
9819 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9820 if (!AMDGPU::isKernel(MF.getFunction())) {
9821 // This only makes sense to call in a kernel, so just lower to null.
9822 return DAG.getConstant(0, DL, VT);
9823 }
9824
9825 return getPreloadedValue(DAG, *MFI, VT,
9827 }
9828 case Intrinsic::amdgcn_dispatch_id: {
9829 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
9830 }
9831 case Intrinsic::amdgcn_rcp:
9832 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
9833 case Intrinsic::amdgcn_rsq:
9834 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9835 case Intrinsic::amdgcn_rsq_legacy:
9836 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9837 return emitRemovedIntrinsicError(DAG, DL, VT);
9838 return SDValue();
9839 case Intrinsic::amdgcn_rcp_legacy:
9840 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9841 return emitRemovedIntrinsicError(DAG, DL, VT);
9842 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
9843 case Intrinsic::amdgcn_rsq_clamp: {
9844 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9845 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
9846
9847 Type *Type = VT.getTypeForEVT(*DAG.getContext());
9848 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
9849 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
9850
9851 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9852 SDValue Tmp =
9853 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
9854 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
9855 DAG.getConstantFP(Min, DL, VT));
9856 }
9857 case Intrinsic::r600_read_ngroups_x:
9858 if (Subtarget->isAmdHsaOS())
9859 return emitNonHSAIntrinsicError(DAG, DL, VT);
9860
9861 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9863 false);
9864 case Intrinsic::r600_read_ngroups_y:
9865 if (Subtarget->isAmdHsaOS())
9866 return emitNonHSAIntrinsicError(DAG, DL, VT);
9867
9868 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9870 false);
9871 case Intrinsic::r600_read_ngroups_z:
9872 if (Subtarget->isAmdHsaOS())
9873 return emitNonHSAIntrinsicError(DAG, DL, VT);
9874
9875 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9877 false);
9878 case Intrinsic::r600_read_local_size_x:
9879 if (Subtarget->isAmdHsaOS())
9880 return emitNonHSAIntrinsicError(DAG, DL, VT);
9881
9882 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9884 case Intrinsic::r600_read_local_size_y:
9885 if (Subtarget->isAmdHsaOS())
9886 return emitNonHSAIntrinsicError(DAG, DL, VT);
9887
9888 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9890 case Intrinsic::r600_read_local_size_z:
9891 if (Subtarget->isAmdHsaOS())
9892 return emitNonHSAIntrinsicError(DAG, DL, VT);
9893
9894 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9896 case Intrinsic::amdgcn_workgroup_id_x:
9897 return lowerWorkGroupId(DAG, *MFI, VT,
9901 case Intrinsic::amdgcn_workgroup_id_y:
9902 return lowerWorkGroupId(DAG, *MFI, VT,
9906 case Intrinsic::amdgcn_workgroup_id_z:
9907 return lowerWorkGroupId(DAG, *MFI, VT,
9911 case Intrinsic::amdgcn_cluster_id_x:
9912 return Subtarget->hasClusters()
9913 ? getPreloadedValue(DAG, *MFI, VT,
9915 : DAG.getPOISON(VT);
9916 case Intrinsic::amdgcn_cluster_id_y:
9917 return Subtarget->hasClusters()
9918 ? getPreloadedValue(DAG, *MFI, VT,
9920 : DAG.getPOISON(VT);
9921 case Intrinsic::amdgcn_cluster_id_z:
9922 return Subtarget->hasClusters()
9923 ? getPreloadedValue(DAG, *MFI, VT,
9925 : DAG.getPOISON(VT);
9926 case Intrinsic::amdgcn_cluster_workgroup_id_x:
9927 return Subtarget->hasClusters()
9928 ? getPreloadedValue(
9929 DAG, *MFI, VT,
9931 : DAG.getPOISON(VT);
9932 case Intrinsic::amdgcn_cluster_workgroup_id_y:
9933 return Subtarget->hasClusters()
9934 ? getPreloadedValue(
9935 DAG, *MFI, VT,
9937 : DAG.getPOISON(VT);
9938 case Intrinsic::amdgcn_cluster_workgroup_id_z:
9939 return Subtarget->hasClusters()
9940 ? getPreloadedValue(
9941 DAG, *MFI, VT,
9943 : DAG.getPOISON(VT);
9944 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
9945 return Subtarget->hasClusters()
9946 ? lowerConstHwRegRead(DAG, Op, AMDGPU::Hwreg::ID_IB_STS2, 21, 4)
9947 : SDValue();
9948 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
9949 return Subtarget->hasClusters()
9950 ? getPreloadedValue(
9951 DAG, *MFI, VT,
9953 : DAG.getPOISON(VT);
9954 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
9955 return Subtarget->hasClusters()
9956 ? getPreloadedValue(
9957 DAG, *MFI, VT,
9959 : DAG.getPOISON(VT);
9960 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
9961 return Subtarget->hasClusters()
9962 ? getPreloadedValue(
9963 DAG, *MFI, VT,
9965 : DAG.getPOISON(VT);
9966 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
9967 return Subtarget->hasClusters()
9968 ? getPreloadedValue(
9969 DAG, *MFI, VT,
9971 : DAG.getPOISON(VT);
9972 case Intrinsic::amdgcn_wave_id:
9973 return lowerWaveID(DAG, Op);
9974 case Intrinsic::amdgcn_lds_kernel_id: {
9975 if (MFI->isEntryFunction())
9976 return getLDSKernelId(DAG, DL);
9977 return getPreloadedValue(DAG, *MFI, VT,
9979 }
9980 case Intrinsic::amdgcn_workitem_id_x:
9981 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
9982 case Intrinsic::amdgcn_workitem_id_y:
9983 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
9984 case Intrinsic::amdgcn_workitem_id_z:
9985 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
9986 case Intrinsic::amdgcn_wavefrontsize:
9987 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
9988 SDLoc(Op), MVT::i32);
9989 case Intrinsic::amdgcn_s_buffer_load: {
9990 unsigned CPol = Op.getConstantOperandVal(3);
9991 // s_buffer_load, because of how it's optimized, can't be volatile
9992 // so reject ones with the volatile bit set.
9993 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
9996 return Op;
9997 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
9998 Op.getOperand(3), DAG);
9999 }
10000 case Intrinsic::amdgcn_fdiv_fast:
10001 return lowerFDIV_FAST(Op, DAG);
10002 case Intrinsic::amdgcn_sin:
10003 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
10004
10005 case Intrinsic::amdgcn_cos:
10006 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
10007
10008 case Intrinsic::amdgcn_mul_u24:
10009 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
10010 Op.getOperand(2));
10011 case Intrinsic::amdgcn_mul_i24:
10012 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
10013 Op.getOperand(2));
10014
10015 case Intrinsic::amdgcn_log_clamp: {
10016 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
10017 return SDValue();
10018
10019 return emitRemovedIntrinsicError(DAG, DL, VT);
10020 }
10021 case Intrinsic::amdgcn_fract:
10022 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
10023
10024 case Intrinsic::amdgcn_class:
10025 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
10026 Op.getOperand(2));
10027 case Intrinsic::amdgcn_div_fmas:
10028 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
10029 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
10030
10031 case Intrinsic::amdgcn_div_fixup:
10032 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
10033 Op.getOperand(2), Op.getOperand(3));
10034
10035 case Intrinsic::amdgcn_div_scale: {
10036 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
10037
10038 // Translate to the operands expected by the machine instruction. The
10039 // first parameter must be the same as the first instruction.
10040 SDValue Numerator = Op.getOperand(1);
10041 SDValue Denominator = Op.getOperand(2);
10042
10043 // Note this order is opposite of the machine instruction's operations,
10044 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
10045 // intrinsic has the numerator as the first operand to match a normal
10046 // division operation.
10047
10048 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
10049
10050 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
10051 Denominator, Numerator);
10052 }
10053 case Intrinsic::amdgcn_icmp: {
10054 // There is a Pat that handles this variant, so return it as-is.
10055 if (Op.getOperand(1).getValueType() == MVT::i1 &&
10056 Op.getConstantOperandVal(2) == 0 &&
10057 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
10058 return Op;
10059 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
10060 }
10061 case Intrinsic::amdgcn_fcmp: {
10062 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
10063 }
10064 case Intrinsic::amdgcn_ballot:
10065 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
10066 case Intrinsic::amdgcn_fmed3:
10067 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
10068 Op.getOperand(2), Op.getOperand(3));
10069 case Intrinsic::amdgcn_fdot2:
10070 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
10071 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
10072 case Intrinsic::amdgcn_fmul_legacy:
10073 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
10074 Op.getOperand(2));
10075 case Intrinsic::amdgcn_sffbh:
10076 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
10077 case Intrinsic::amdgcn_sbfe:
10078 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
10079 Op.getOperand(2), Op.getOperand(3));
10080 case Intrinsic::amdgcn_ubfe:
10081 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
10082 Op.getOperand(2), Op.getOperand(3));
10083 case Intrinsic::amdgcn_cvt_pkrtz:
10084 case Intrinsic::amdgcn_cvt_pknorm_i16:
10085 case Intrinsic::amdgcn_cvt_pknorm_u16:
10086 case Intrinsic::amdgcn_cvt_pk_i16:
10087 case Intrinsic::amdgcn_cvt_pk_u16: {
10088 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
10089 EVT VT = Op.getValueType();
10090 unsigned Opcode;
10091
10092 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10093 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
10094 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
10095 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
10096 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10097 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
10098 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10099 Opcode = AMDGPUISD::CVT_PK_I16_I32;
10100 else
10101 Opcode = AMDGPUISD::CVT_PK_U16_U32;
10102
10103 if (isTypeLegal(VT))
10104 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
10105
10106 SDValue Node =
10107 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
10108 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
10109 }
10110 case Intrinsic::amdgcn_fmad_ftz:
10111 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
10112 Op.getOperand(2), Op.getOperand(3));
10113
10114 case Intrinsic::amdgcn_if_break:
10115 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
10116 Op->getOperand(1), Op->getOperand(2)),
10117 0);
10118
10119 case Intrinsic::amdgcn_groupstaticsize: {
10121 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
10122 return Op;
10123
10124 const Module *M = MF.getFunction().getParent();
10125 const GlobalValue *GV =
10126 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
10127 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
10129 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10130 }
10131 case Intrinsic::amdgcn_is_shared:
10132 case Intrinsic::amdgcn_is_private: {
10133 SDLoc SL(Op);
10134 SDValue SrcVec =
10135 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10136 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
10137 DAG.getConstant(1, SL, MVT::i32));
10138
10139 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10141 : AMDGPUAS::PRIVATE_ADDRESS;
10142 if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
10143 Subtarget->hasGloballyAddressableScratch()) {
10144 SDValue FlatScratchBaseHi(
10145 DAG.getMachineNode(
10146 AMDGPU::S_MOV_B32, DL, MVT::i32,
10147 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10148 0);
10149 // Test bits 63..58 against the aperture address.
10150 return DAG.getSetCC(
10151 SL, MVT::i1,
10152 DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
10153 DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
10154 }
10155
10156 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10157 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
10158 }
10159 case Intrinsic::amdgcn_perm:
10160 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
10161 Op.getOperand(2), Op.getOperand(3));
10162 case Intrinsic::amdgcn_reloc_constant: {
10163 Module *M = MF.getFunction().getParent();
10164 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
10165 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
10166 auto *RelocSymbol = cast<GlobalVariable>(
10167 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
10168 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
10170 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10171 }
10172 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10173 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10174 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10175 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10176 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10177 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10178 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10179 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10180 if (Op.getOperand(4).getValueType() == MVT::i32)
10181 return SDValue();
10182
10183 SDLoc SL(Op);
10184 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
10185 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10186 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10187 Op.getOperand(3), IndexKeyi32);
10188 }
10189 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10190 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10191 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10192 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10193 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10194 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10195 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10196 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10197 if (Op.getOperand(4).getValueType() == MVT::i64)
10198 return SDValue();
10199
10200 SDLoc SL(Op);
10201 auto IndexKeyi64 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64);
10202 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10203 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10204 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10205 Op.getOperand(6)});
10206 }
10207 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10208 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10209 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10210 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10211 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10212 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10213 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10214 ? MVT::i64
10215 : MVT::i32;
10216 if (Op.getOperand(6).getValueType() == IndexKeyTy)
10217 return SDValue();
10218
10219 SDLoc SL(Op);
10220 auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
10221 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10222 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10223 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10224 IndexKey, Op.getOperand(7),
10225 Op.getOperand(8)}); // No clamp operand
10226 }
10227 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10228 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10229 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10230 if (Op.getOperand(6).getValueType() == MVT::i32)
10231 return SDValue();
10232
10233 SDLoc SL(Op);
10234 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
10235 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10236 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10237 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10238 IndexKeyi32, Op.getOperand(7)});
10239 }
10240 case Intrinsic::amdgcn_addrspacecast_nonnull:
10241 return lowerADDRSPACECAST(Op, DAG);
10242 case Intrinsic::amdgcn_readlane:
10243 case Intrinsic::amdgcn_readfirstlane:
10244 case Intrinsic::amdgcn_writelane:
10245 case Intrinsic::amdgcn_permlane16:
10246 case Intrinsic::amdgcn_permlanex16:
10247 case Intrinsic::amdgcn_permlane64:
10248 case Intrinsic::amdgcn_set_inactive:
10249 case Intrinsic::amdgcn_set_inactive_chain_arg:
10250 case Intrinsic::amdgcn_mov_dpp8:
10251 case Intrinsic::amdgcn_update_dpp:
10252 return lowerLaneOp(*this, Op.getNode(), DAG);
10253 case Intrinsic::amdgcn_dead: {
10255 for (const EVT ValTy : Op.getNode()->values())
10256 Poisons.push_back(DAG.getPOISON(ValTy));
10257 return DAG.getMergeValues(Poisons, SDLoc(Op));
10258 }
10259 default:
10260 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10262 return lowerImage(Op, ImageDimIntr, DAG, false);
10263
10264 return Op;
10265 }
10266}
10267
10268// On targets not supporting constant in soffset field, turn zero to
10269// SGPR_NULL to avoid generating an extra s_mov with zero.
10271 const GCNSubtarget *Subtarget) {
10272 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
10273 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10274 return SOffset;
10275}
10276
10277SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
10278 SelectionDAG &DAG,
10279 unsigned NewOpcode) const {
10280 SDLoc DL(Op);
10281
10282 SDValue VData = Op.getOperand(2);
10283 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10284 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10285 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10286 SDValue Ops[] = {
10287 Op.getOperand(0), // Chain
10288 VData, // vdata
10289 Rsrc, // rsrc
10290 DAG.getConstant(0, DL, MVT::i32), // vindex
10291 VOffset, // voffset
10292 SOffset, // soffset
10293 Offset, // offset
10294 Op.getOperand(6), // cachepolicy
10295 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10296 };
10297
10298 auto *M = cast<MemSDNode>(Op);
10299
10300 EVT MemVT = VData.getValueType();
10301 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10302 M->getMemOperand());
10303}
10304
10305SDValue
10306SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
10307 unsigned NewOpcode) const {
10308 SDLoc DL(Op);
10309
10310 SDValue VData = Op.getOperand(2);
10311 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10312 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10313 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10314 SDValue Ops[] = {
10315 Op.getOperand(0), // Chain
10316 VData, // vdata
10317 Rsrc, // rsrc
10318 Op.getOperand(4), // vindex
10319 VOffset, // voffset
10320 SOffset, // soffset
10321 Offset, // offset
10322 Op.getOperand(7), // cachepolicy
10323 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10324 };
10325
10326 auto *M = cast<MemSDNode>(Op);
10327
10328 EVT MemVT = VData.getValueType();
10329 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10330 M->getMemOperand());
10331}
10332
// Custom lowering for chain-carrying (side-effecting) target intrinsics:
// ds_ordered_count, buffer/tbuffer loads, buffer atomics, BVH intersect-ray,
// barrier-state queries and cooperative atomic loads. Intrinsic IDs not
// handled here fall through to image-dimension lowering or default handling.
10333 SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
10334                                                  SelectionDAG &DAG) const {
10335   unsigned IntrID = Op.getConstantOperandVal(1);
10336   SDLoc DL(Op);
10337 
10338   switch (IntrID) {
10339   case Intrinsic::amdgcn_ds_ordered_add:
10340   case Intrinsic::amdgcn_ds_ordered_swap: {
10341     MemSDNode *M = cast<MemSDNode>(Op);
10342     SDValue Chain = M->getOperand(0);
10343     SDValue M0 = M->getOperand(2);
10344     SDValue Value = M->getOperand(3);
10345     unsigned IndexOperand = M->getConstantOperandVal(7);
10346     unsigned WaveRelease = M->getConstantOperandVal(8);
10347     unsigned WaveDone = M->getConstantOperandVal(9);
10348 
      // The low 6 bits of the index operand select the ordered-count slot.
10349     unsigned OrderedCountIndex = IndexOperand & 0x3f;
10350     IndexOperand &= ~0x3f;
10351     unsigned CountDw = 0;
10352 
      // On GFX10+, bits 27..24 of the index operand hold the dword count
      // (must be 1-4).
10353     if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
10354       CountDw = (IndexOperand >> 24) & 0xf;
10355       IndexOperand &= ~(0xf << 24);
10356 
10357       if (CountDw < 1 || CountDw > 4) {
10358         const Function &Fn = DAG.getMachineFunction().getFunction();
10359         DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10360             Fn, "ds_ordered_count: dword count must be between 1 and 4",
10361             DL.getDebugLoc()));
10362         CountDw = 1;
10363       }
10364     }
10365 
      // Any bits left over after the slot/count fields are invalid.
10366     if (IndexOperand) {
10367       const Function &Fn = DAG.getMachineFunction().getFunction();
10368       DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10369           Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
10370     }
10371 
10372     if (WaveDone && !WaveRelease) {
10373       // TODO: Move this to IR verifier
10374       const Function &Fn = DAG.getMachineFunction().getFunction();
10375       DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10376           Fn, "ds_ordered_count: wave_done requires wave_release",
10377           DL.getDebugLoc()));
10378     }
10379 
10380     unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10381     unsigned ShaderType =
10383     unsigned Offset0 = OrderedCountIndex << 2;
10384     unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10385 
10386     if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
10387       Offset1 |= (CountDw - 1) << 6;
10388 
10389     if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
10390       Offset1 |= ShaderType << 2;
10391 
      // Pack both fields into the 16-bit DS offset immediate.
10392     unsigned Offset = Offset0 | (Offset1 << 8);
10393 
10394     SDValue Ops[] = {
10395         Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
10396         copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
10397     };
10398     return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
10399                                    M->getVTList(), Ops, M->getMemoryVT(),
10400                                    M->getMemOperand());
10401   }
10402   case Intrinsic::amdgcn_raw_buffer_load:
10403   case Intrinsic::amdgcn_raw_ptr_buffer_load:
10404   case Intrinsic::amdgcn_raw_atomic_buffer_load:
10405   case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10406   case Intrinsic::amdgcn_raw_buffer_load_format:
10407   case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10408     const bool IsFormat =
10409         IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10410         IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10411 
10412     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10413     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10414     auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10415     SDValue Ops[] = {
10416         Op.getOperand(0),                      // Chain
10417         Rsrc,                                  // rsrc
10418         DAG.getConstant(0, DL, MVT::i32),      // vindex
10419         VOffset,                               // voffset
10420         SOffset,                               // soffset
10421         Offset,                                // offset
10422         Op.getOperand(5),                      // cachepolicy, swizzled buffer
10423         DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10424     };
10425 
10426     auto *M = cast<MemSDNode>(Op);
10427     return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
10428   }
10429   case Intrinsic::amdgcn_struct_buffer_load:
10430   case Intrinsic::amdgcn_struct_ptr_buffer_load:
10431   case Intrinsic::amdgcn_struct_buffer_load_format:
10432   case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10433   case Intrinsic::amdgcn_struct_atomic_buffer_load:
10434   case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10435     const bool IsFormat =
10436         IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10437         IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10438 
10439     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10440     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10441     auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10442     SDValue Ops[] = {
10443         Op.getOperand(0),                      // Chain
10444         Rsrc,                                  // rsrc
10445         Op.getOperand(3),                      // vindex
10446         VOffset,                               // voffset
10447         SOffset,                               // soffset
10448         Offset,                                // offset
10449         Op.getOperand(6),                      // cachepolicy, swizzled buffer
10450         DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10451     };
10452 
10453     return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
10454   }
10455   case Intrinsic::amdgcn_raw_tbuffer_load:
10456   case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10457     MemSDNode *M = cast<MemSDNode>(Op);
10458     EVT LoadVT = Op.getValueType();
10459     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10460     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10461     auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10462 
10463     SDValue Ops[] = {
10464         Op.getOperand(0),                      // Chain
10465         Rsrc,                                  // rsrc
10466         DAG.getConstant(0, DL, MVT::i32),      // vindex
10467         VOffset,                               // voffset
10468         SOffset,                               // soffset
10469         Offset,                                // offset
10470         Op.getOperand(5),                      // format
10471         Op.getOperand(6),                      // cachepolicy, swizzled buffer
10472         DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10473     };
10474 
      // f16 results use the D16 variant and need the loaded value adjusted.
10475     if (LoadVT.getScalarType() == MVT::f16)
10476       return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10477                                  Ops);
10478     return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10479                                Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10480                                DAG);
10481   }
10482   case Intrinsic::amdgcn_struct_tbuffer_load:
10483   case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10484     MemSDNode *M = cast<MemSDNode>(Op);
10485     EVT LoadVT = Op.getValueType();
10486     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10487     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10488     auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10489 
10490     SDValue Ops[] = {
10491         Op.getOperand(0),                      // Chain
10492         Rsrc,                                  // rsrc
10493         Op.getOperand(3),                      // vindex
10494         VOffset,                               // voffset
10495         SOffset,                               // soffset
10496         Offset,                                // offset
10497         Op.getOperand(6),                      // format
10498         Op.getOperand(7),                      // cachepolicy, swizzled buffer
10499         DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10500     };
10501 
      // f16 results use the D16 variant and need the loaded value adjusted.
10502     if (LoadVT.getScalarType() == MVT::f16)
10503       return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10504                                  Ops);
10505     return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10506                                Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10507                                DAG);
10508   }
      // Buffer atomics: dispatch to the shared raw/struct lowering helpers
      // with the matching AMDGPUISD buffer atomic opcode.
10509   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10510   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10511     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
10512   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10513   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10514     return lowerStructBufferAtomicIntrin(Op, DAG,
10515                                          AMDGPUISD::BUFFER_ATOMIC_FADD);
10516   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10517   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10518     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
10519   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10520   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10521     return lowerStructBufferAtomicIntrin(Op, DAG,
10522                                          AMDGPUISD::BUFFER_ATOMIC_FMIN);
10523   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10524   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10525     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
10526   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10527   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10528     return lowerStructBufferAtomicIntrin(Op, DAG,
10529                                          AMDGPUISD::BUFFER_ATOMIC_FMAX);
10530   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10531   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10532     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
10533   case Intrinsic::amdgcn_raw_buffer_atomic_add:
10534   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10535     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10536   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10537   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10538     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10539   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10540   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10541     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
10542   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10543   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10544     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
10545   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10546   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10547     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
10548   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10549   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10550     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
10551   case Intrinsic::amdgcn_raw_buffer_atomic_and:
10552   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10553     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10554   case Intrinsic::amdgcn_raw_buffer_atomic_or:
10555   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10556     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10557   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10558   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10559     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10560   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10561   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10562     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10563   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10564   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10565     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10566   case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10567     return lowerRawBufferAtomicIntrin(Op, DAG,
10568                                       AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10569   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10570   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10571     return lowerStructBufferAtomicIntrin(Op, DAG,
10572                                          AMDGPUISD::BUFFER_ATOMIC_SWAP);
10573   case Intrinsic::amdgcn_struct_buffer_atomic_add:
10574   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10575     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10576   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10577   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10578     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10579   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10580   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10581     return lowerStructBufferAtomicIntrin(Op, DAG,
10582                                          AMDGPUISD::BUFFER_ATOMIC_SMIN);
10583   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10584   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10585     return lowerStructBufferAtomicIntrin(Op, DAG,
10586                                          AMDGPUISD::BUFFER_ATOMIC_UMIN);
10587   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10588   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10589     return lowerStructBufferAtomicIntrin(Op, DAG,
10590                                          AMDGPUISD::BUFFER_ATOMIC_SMAX);
10591   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10592   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10593     return lowerStructBufferAtomicIntrin(Op, DAG,
10594                                          AMDGPUISD::BUFFER_ATOMIC_UMAX);
10595   case Intrinsic::amdgcn_struct_buffer_atomic_and:
10596   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10597     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10598   case Intrinsic::amdgcn_struct_buffer_atomic_or:
10599   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10600     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10601   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10602   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10603     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10604   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10605   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10606     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10607   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10608   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10609     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10610   case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10611     return lowerStructBufferAtomicIntrin(Op, DAG,
10612                                          AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10613 
10614   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10615   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10616     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
10617     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10618     auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10619     SDValue Ops[] = {
10620         Op.getOperand(0),                      // Chain
10621         Op.getOperand(2),                      // src
10622         Op.getOperand(3),                      // cmp
10623         Rsrc,                                  // rsrc
10624         DAG.getConstant(0, DL, MVT::i32),      // vindex
10625         VOffset,                               // voffset
10626         SOffset,                               // soffset
10627         Offset,                                // offset
10628         Op.getOperand(7),                      // cachepolicy
10629         DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10630     };
10631     EVT VT = Op.getValueType();
10632     auto *M = cast<MemSDNode>(Op);
10633 
10634     return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10635                                    Op->getVTList(), Ops, VT,
10636                                    M->getMemOperand());
10637   }
10638   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10639   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10640     SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
10641     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
10642     auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
10643     SDValue Ops[] = {
10644         Op.getOperand(0),                      // Chain
10645         Op.getOperand(2),                      // src
10646         Op.getOperand(3),                      // cmp
10647         Rsrc,                                  // rsrc
10648         Op.getOperand(5),                      // vindex
10649         VOffset,                               // voffset
10650         SOffset,                               // soffset
10651         Offset,                                // offset
10652         Op.getOperand(8),                      // cachepolicy
10653         DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10654     };
10655     EVT VT = Op.getValueType();
10656     auto *M = cast<MemSDNode>(Op);
10657 
10658     return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10659                                    Op->getVTList(), Ops, VT,
10660                                    M->getMemOperand());
10661   }
10662   case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10663   case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10664     MemSDNode *M = cast<MemSDNode>(Op);
10665     SDValue NodePtr = M->getOperand(2);
10666     SDValue RayExtent = M->getOperand(3);
10667     SDValue InstanceMask = M->getOperand(4);
10668     SDValue RayOrigin = M->getOperand(5);
10669     SDValue RayDir = M->getOperand(6);
10670     SDValue Offsets = M->getOperand(7);
10671     SDValue TDescr = M->getOperand(8);
10672 
10673     assert(NodePtr.getValueType() == MVT::i64);
10674     assert(RayDir.getValueType() == MVT::v3f32);
10675 
10676     if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10677       emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10678       return SDValue();
10679     }
10680 
10681     bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
      // Both variants produce 10 result dwords; BVH8 takes 11 address dwords,
      // the dual-ray form takes 12.
10682     const unsigned NumVDataDwords = 10;
10683     const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10684     int Opcode = AMDGPU::getMIMGOpcode(
10685         IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10686                : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10687         AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10688     assert(Opcode != -1);
10689 
      // RayExtent and InstanceMask share one VADDR dword pair: extent as a
      // bitcast i32 plus the mask any-extended to i32.
10691     Ops.push_back(NodePtr);
10692     Ops.push_back(DAG.getBuildVector(
10693         MVT::v2i32, DL,
10694         {DAG.getBitcast(MVT::i32, RayExtent),
10695          DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10696     Ops.push_back(RayOrigin);
10697     Ops.push_back(RayDir);
10698     Ops.push_back(Offsets);
10699     Ops.push_back(TDescr);
10700     Ops.push_back(M->getChain());
10701 
10702     auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10703     MachineMemOperand *MemRef = M->getMemOperand();
10704     DAG.setNodeMemRefs(NewNode, {MemRef});
10705     return SDValue(NewNode, 0);
10706   }
10707   case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10708     MemSDNode *M = cast<MemSDNode>(Op);
10709     SDValue NodePtr = M->getOperand(2);
10710     SDValue RayExtent = M->getOperand(3);
10711     SDValue RayOrigin = M->getOperand(4);
10712     SDValue RayDir = M->getOperand(5);
10713     SDValue RayInvDir = M->getOperand(6);
10714     SDValue TDescr = M->getOperand(7);
10715 
10716     assert(NodePtr.getValueType() == MVT::i32 ||
10717            NodePtr.getValueType() == MVT::i64);
10718     assert(RayDir.getValueType() == MVT::v3f16 ||
10719            RayDir.getValueType() == MVT::v3f32);
10720 
10721     if (!Subtarget->hasGFX10_AEncoding()) {
10722       emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10723       return SDValue();
10724     }
10725 
10726     const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
10727     const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
10728     const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10729     const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
10730     const bool Is64 = NodePtr.getValueType() == MVT::i64;
10731     const unsigned NumVDataDwords = 4;
10732     const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10733     const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10734     const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10735                          NumVAddrs <= Subtarget->getNSAMaxSize()) ||
10736                         IsGFX12Plus;
      // Indexed by [Is64][IsA16].
10737     const unsigned BaseOpcodes[2][2] = {
10738         {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10739         {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10740          AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10741     int Opcode;
10742     if (UseNSA) {
10743       Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10744                                      IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10745                                      : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
10746                                                  : AMDGPU::MIMGEncGfx10NSA,
10747                                      NumVDataDwords, NumVAddrDwords);
10748     } else {
10749       assert(!IsGFX12Plus);
10750       Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10751                                      IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10752                                              : AMDGPU::MIMGEncGfx10Default,
10753                                      NumVDataDwords, NumVAddrDwords);
10754     }
10755     assert(Opcode != -1);
10756 
10758 
      // Append a 3-lane operand to Ops as VADDR dwords. 32-bit lanes pass
      // through bitcast; f16 lanes are packed two per dword, and with
      // IsAligned == false the first lane is merged into the previous,
      // half-filled dword popped from Ops.
10759     auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
10761       DAG.ExtractVectorElements(Op, Lanes, 0, 3);
10762       if (Lanes[0].getValueSizeInBits() == 32) {
10763         for (unsigned I = 0; I < 3; ++I)
10764           Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
10765       } else {
10766         if (IsAligned) {
10767           Ops.push_back(DAG.getBitcast(
10768               MVT::i32,
10769               DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
10770           Ops.push_back(Lanes[2]);
10771         } else {
10772           SDValue Elt0 = Ops.pop_back_val();
10773           Ops.push_back(DAG.getBitcast(
10774               MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
10775           Ops.push_back(DAG.getBitcast(
10776               MVT::i32,
10777               DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
10778         }
10779       }
10780     };
10781 
10782     if (UseNSA && IsGFX11Plus) {
10783       Ops.push_back(NodePtr);
10784       Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10785       Ops.push_back(RayOrigin);
10786       if (IsA16) {
          // Interleave dir/inv-dir lanes pairwise into packed v2f16 dwords.
10787         SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
10788         DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
10789         DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
10790         for (unsigned I = 0; I < 3; ++I) {
10791           MergedLanes.push_back(DAG.getBitcast(
10792               MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
10793                                            {DirLanes[I], InvDirLanes[I]})));
10794         }
10795         Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
10796       } else {
10797         Ops.push_back(RayDir);
10798         Ops.push_back(RayInvDir);
10799       }
10800     } else {
10801       if (Is64)
10802         DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
10803                                   2);
10804       else
10805         Ops.push_back(NodePtr);
10806 
10807       Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10808       packLanes(RayOrigin, true);
10809       packLanes(RayDir, true);
10810       packLanes(RayInvDir, false);
10811     }
10812 
10813     if (!UseNSA) {
10814       // Build a single vector containing all the operands so far prepared.
10815       if (NumVAddrDwords > 12) {
10816         SDValue Undef = DAG.getPOISON(MVT::i32);
10817         Ops.append(16 - Ops.size(), Undef);
10818       }
10819       assert(Ops.size() >= 8 && Ops.size() <= 12);
10820       SDValue MergedOps =
10821           DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
10822       Ops.clear();
10823       Ops.push_back(MergedOps);
10824     }
10825 
10826     Ops.push_back(TDescr);
10827     Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
10828     Ops.push_back(M->getChain());
10829 
10830     auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10831     MachineMemOperand *MemRef = M->getMemOperand();
10832     DAG.setNodeMemRefs(NewNode, {MemRef});
10833     return SDValue(NewNode, 0);
10834   }
10835   case Intrinsic::amdgcn_global_atomic_fmin_num:
10836   case Intrinsic::amdgcn_global_atomic_fmax_num:
10837   case Intrinsic::amdgcn_flat_atomic_fmin_num:
10838   case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10839     MemSDNode *M = cast<MemSDNode>(Op);
10840     SDValue Ops[] = {
10841         M->getOperand(0), // Chain
10842         M->getOperand(2), // Ptr
10843         M->getOperand(3)  // Value
10844     };
10845     unsigned Opcode = 0;
10846     switch (IntrID) {
10847     case Intrinsic::amdgcn_global_atomic_fmin_num:
10848     case Intrinsic::amdgcn_flat_atomic_fmin_num: {
10849       Opcode = ISD::ATOMIC_LOAD_FMIN;
10850       break;
10851     }
10852     case Intrinsic::amdgcn_global_atomic_fmax_num:
10853     case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10854       Opcode = ISD::ATOMIC_LOAD_FMAX;
10855       break;
10856     }
10857     default:
10858       llvm_unreachable("unhandled atomic opcode");
10859     }
10860     return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
10861                          Ops, M->getMemOperand());
10862   }
10863   case Intrinsic::amdgcn_s_get_barrier_state:
10864   case Intrinsic::amdgcn_s_get_named_barrier_state: {
10865     SDValue Chain = Op->getOperand(0);
10867     unsigned Opc;
10868 
      // An immediate barrier ID selects the IMM form of the instruction;
      // otherwise the ID must first be moved into M0.
10869     if (isa<ConstantSDNode>(Op->getOperand(2))) {
10870       uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
        // Named barriers carry the barrier ID in bits 9..4 of the operand.
10871       if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
10872         BarID = (BarID >> 4) & 0x3F;
10873       Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
10874       SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10875       Ops.push_back(K);
10876       Ops.push_back(Chain);
10877     } else {
10878       Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
10879       if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
10880         SDValue M0Val;
10881         M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
10882                             DAG.getShiftAmountConstant(4, MVT::i32, DL));
10883         M0Val = SDValue(
10884             DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10885                                DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10886             0);
10887         Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10888       } else
10889         Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
10890     }
10891 
10892     auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10893     return SDValue(NewMI, 0);
10894   }
10895   case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
10896   case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
10897   case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
10898     MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
10899     SDValue Chain = Op->getOperand(0);
10900     SDValue Ptr = Op->getOperand(2);
10901     EVT VT = Op->getValueType(0);
      // Lowers to an ordinary (non-extending) atomic load that carries the
      // intrinsic's memory operand.
10902     return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
10903                              Chain, Ptr, MII->getMemOperand());
10904   }
10905   default:
10906 
10907     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10909       return lowerImage(Op, ImageDimIntr, DAG, true);
10910 
10911     return SDValue();
10912   }
10913 }
10914
10915// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
10916// dwordx4 if on SI and handle TFE loads.
10917SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
10918 SDVTList VTList,
10919 ArrayRef<SDValue> Ops, EVT MemVT,
10920 MachineMemOperand *MMO,
10921 SelectionDAG &DAG) const {
10922 LLVMContext &C = *DAG.getContext();
10923 MachineFunction &MF = DAG.getMachineFunction();
10924 EVT VT = VTList.VTs[0];
10925
10926 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
10927 bool IsTFE = VTList.NumVTs == 3;
10928 if (IsTFE) {
10929 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
10930 unsigned NumOpDWords = NumValueDWords + 1;
10931 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
10932 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
10933 MachineMemOperand *OpDWordsMMO =
10934 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
10935 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
10936 OpDWordsVT, OpDWordsMMO, DAG);
10937 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10938 DAG.getVectorIdxConstant(NumValueDWords, DL));
10939 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
10940 SDValue ValueDWords =
10941 NumValueDWords == 1
10942 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
10944 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
10945 ZeroIdx);
10946 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
10947 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10948 }
10949
10950 if (!Subtarget->hasDwordx3LoadStores() &&
10951 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
10952 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
10953 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
10954 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
10955 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
10956 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
10957 WidenedMemVT, WidenedMMO);
10959 DAG.getVectorIdxConstant(0, DL));
10960 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
10961 }
10962
10963 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
10964}
10965
10966SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
10967 bool ImageStore) const {
10968 EVT StoreVT = VData.getValueType();
10969
10970 // No change for f16 and legal vector D16 types.
10971 if (!StoreVT.isVector())
10972 return VData;
10973
10974 SDLoc DL(VData);
10975 unsigned NumElements = StoreVT.getVectorNumElements();
10976
10977 if (Subtarget->hasUnpackedD16VMem()) {
10978 // We need to unpack the packed data to store.
10979 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10980 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10981
10982 EVT EquivStoreVT =
10983 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
10984 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
10985 return DAG.UnrollVectorOp(ZExt.getNode());
10986 }
10987
10988 // The sq block of gfx8.1 does not estimate register use correctly for d16
10989 // image store instructions. The data operand is computed as if it were not a
10990 // d16 image instruction.
10991 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
10992 // Bitcast to i16
10993 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10994 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10995
10996 // Decompose into scalars
10998 DAG.ExtractVectorElements(IntVData, Elts);
10999
11000 // Group pairs of i16 into v2i16 and bitcast to i32
11001 SmallVector<SDValue, 4> PackedElts;
11002 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
11003 SDValue Pair =
11004 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
11005 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
11006 PackedElts.push_back(IntPair);
11007 }
11008 if ((NumElements % 2) == 1) {
11009 // Handle v3i16
11010 unsigned I = Elts.size() / 2;
11011 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
11012 {Elts[I * 2], DAG.getPOISON(MVT::i16)});
11013 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
11014 PackedElts.push_back(IntPair);
11015 }
11016
11017 // Pad using UNDEF
11018 PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));
11019
11020 // Build final vector
11021 EVT VecVT =
11022 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
11023 return DAG.getBuildVector(VecVT, DL, PackedElts);
11024 }
11025
11026 if (NumElements == 3) {
11027 EVT IntStoreVT =
11029 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11030
11031 EVT WidenedStoreVT = EVT::getVectorVT(
11032 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
11033 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
11034 WidenedStoreVT.getStoreSizeInBits());
11035 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
11036 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
11037 }
11038
11039 assert(isTypeLegal(StoreVT));
11040 return VData;
11041}
11042
// Lower a chain-only (void-returning) AMDGPU intrinsic: exports,
// buffer/tbuffer stores, VMEM-to-LDS loads, named barriers, prefetches and
// cooperative atomic stores. Returns the lowered chain, SDValue() to fall
// back to default handling, or Op unchanged to keep the node as-is.
SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  unsigned IntrinsicID = Op.getConstantOperandVal(1);
  MachineFunction &MF = DAG.getMachineFunction();

  switch (IntrinsicID) {
  case Intrinsic::amdgcn_exp_compr: {
    if (!Subtarget->hasCompressedExport()) {
      DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
          // NOTE(review): the first DiagnosticInfoUnsupported argument (the
          // enclosing Function) appears to have been lost in extraction —
          // restore it before compiling.
          "intrinsic not supported on subtarget", DL.getDebugLoc()));
    }
    SDValue Src0 = Op.getOperand(4);
    SDValue Src1 = Op.getOperand(5);
    // Hack around illegal type on SI by directly selecting it.
    if (isTypeLegal(Src0.getValueType()))
      return SDValue();

    const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
    SDValue Undef = DAG.getPOISON(MVT::f32);
    const SDValue Ops[] = {
        Op.getOperand(2),                              // tgt
        DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
        DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
        Undef,                                         // src2
        Undef,                                         // src3
        Op.getOperand(7),                              // vm
        DAG.getTargetConstant(1, DL, MVT::i1),         // compr
        Op.getOperand(3),                              // en
        Op.getOperand(0)                               // Chain
    };

    unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
    return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
  }

  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
    SDValue VData = Op.getOperand(2);
    // f16 element data must be repacked for the target's D16 format.
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
    auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
    SDValue Ops[] = {
        Chain,
        VData,                                 // vdata
        Rsrc,                                  // rsrc
        Op.getOperand(4),                      // vindex
        VOffset,                               // voffset
        SOffset,                               // soffset
        Offset,                                // offset
        Op.getOperand(7),                      // format
        Op.getOperand(8),                      // cachepolicy, swizzled buffer
        DAG.getTargetConstant(1, DL, MVT::i1), // idxen
    };
    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
                         : AMDGPUISD::TBUFFER_STORE_FORMAT;
    MemSDNode *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }

  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
    auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
    SDValue Ops[] = {
        Chain,
        VData,                                 // vdata
        Rsrc,                                  // rsrc
        DAG.getConstant(0, DL, MVT::i32),      // vindex
        VOffset,                               // voffset
        SOffset,                               // soffset
        Offset,                                // offset
        Op.getOperand(6),                      // format
        Op.getOperand(7),                      // cachepolicy, swizzled buffer
        DAG.getTargetConstant(0, DL, MVT::i1), // idxen
    };
    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
                         : AMDGPUISD::TBUFFER_STORE_FORMAT;
    MemSDNode *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }

  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
        IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;

    SDValue VData = Op.getOperand(2);
    EVT VDataVT = VData.getValueType();
    EVT EltType = VDataVT.getScalarType();
    bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
    if (IsD16) {
      VData = handleD16VData(VData, DAG);
      VDataVT = VData.getValueType();
    }

    // Bitcast illegal data types to an equivalently-sized legal integer type.
    if (!isTypeLegal(VDataVT)) {
      VData =
          DAG.getNode(ISD::BITCAST, DL,
                      getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
    }

    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
    auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
    SDValue Ops[] = {
        Chain,
        VData,
        Rsrc,
        DAG.getConstant(0, DL, MVT::i32),      // vindex
        VOffset,                               // voffset
        SOffset,                               // soffset
        Offset,                                // offset
        Op.getOperand(6),                      // cachepolicy, swizzled buffer
        DAG.getTargetConstant(0, DL, MVT::i1), // idxen
    };
    unsigned Opc =
        IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
    MemSDNode *M = cast<MemSDNode>(Op);

    // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
    if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
      return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);

    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }

  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
        IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;

    SDValue VData = Op.getOperand(2);
    EVT VDataVT = VData.getValueType();
    EVT EltType = VDataVT.getScalarType();
    bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);

    if (IsD16) {
      VData = handleD16VData(VData, DAG);
      VDataVT = VData.getValueType();
    }

    if (!isTypeLegal(VDataVT)) {
      VData =
          DAG.getNode(ISD::BITCAST, DL,
                      getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
    }

    auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
    auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
    SDValue Ops[] = {
        Chain,
        VData,
        Rsrc,
        Op.getOperand(4),                      // vindex
        VOffset,                               // voffset
        SOffset,                               // soffset
        Offset,                                // offset
        Op.getOperand(7),                      // cachepolicy, swizzled buffer
        DAG.getTargetConstant(1, DL, MVT::i1), // idxen
    };
    unsigned Opc =
        !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
    MemSDNode *M = cast<MemSDNode>(Op);

    // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
    EVT VDataType = VData.getValueType().getScalarType();
    if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);

    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
    if (!Subtarget->hasVMemToLDSLoad())
      return SDValue();
    unsigned Opc;
    bool HasVIndex =
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
        IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
    unsigned OpOffset = HasVIndex ? 1 : 0;
    SDValue VOffset = Op.getOperand(5 + OpOffset);
    bool HasVOffset = !isNullConstant(VOffset);
    unsigned Size = Op->getConstantOperandVal(4);

    // Pick the MUBUF opcode by transfer size and by which of vindex/voffset
    // are present (BOTHEN/IDXEN/OFFEN/OFFSET addressing variants).
    switch (Size) {
    default:
      return SDValue();
    case 1:
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
            : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
                         : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
      break;
    case 2:
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
            : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
                         : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
      break;
    case 4:
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
            : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
                         : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
      break;
    case 12:
      if (!Subtarget->hasLDSLoadB96_B128())
        return SDValue();
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
            : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
                         : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
      break;
    case 16:
      if (!Subtarget->hasLDSLoadB96_B128())
        return SDValue();
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
            : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
                         : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
      break;
    }

    // M0 carries the LDS destination base address (operand 3).
    SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));

    // NOTE(review): the declaration of the operand list (likely
    // `SmallVector<SDValue, 6> Ops;`) appears to have been lost in
    // extraction here — restore it before compiling.

    if (HasVIndex && HasVOffset)
      Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
                                       {Op.getOperand(5), // VIndex
                                        VOffset}));
    else if (HasVIndex)
      Ops.push_back(Op.getOperand(5));
    else if (HasVOffset)
      Ops.push_back(VOffset);

    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    Ops.push_back(Rsrc);
    Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
    Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
    bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
    unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
    Ops.push_back(DAG.getTargetConstant(
        Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
        DL, MVT::i8)); // cpol
    Ops.push_back(DAG.getTargetConstant(
        Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
            ? 1
            : 0,
        DL, MVT::i8)); // swz
    Ops.push_back(M0Val.getValue(0)); // Chain
    Ops.push_back(M0Val.getValue(1)); // Glue

    auto *M = cast<MemSDNode>(Op);
    MachineMemOperand *LoadMMO = M->getMemOperand();
    // Don't set the offset value here because the pointer points to the base of
    // the buffer.
    MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();

    MachinePointerInfo StorePtrI = LoadPtrI;
    LoadPtrI.V = PoisonValue::get(
        // NOTE(review): several lines lost in extraction here — presumably
        // the pointer-type argument plus the LoadPtrI.Offset / StorePtrI
        // (nullptr value, LDS address space) setup. Restore before compiling.

    auto F = LoadMMO->getFlags() &
        // NOTE(review): the flag-mask continuation and the start of the
        // MF.getMachineMemOperand(LoadPtrI, ...) call were lost in
        // extraction.
    LoadMMO =
            LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());

    MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
        StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
        LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());

    auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
    DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});

    return SDValue(Load, 0);
  }
  // Buffers are handled by LowerBufferFatPointers, and we're going to go
  // for "trust me" that the remaining cases are global pointers until
  // such time as we can put two mem operands on an intrinsic.
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_global_load_lds: {
    if (!Subtarget->hasVMemToLDSLoad())
      return SDValue();

    unsigned Opc;
    unsigned Size = Op->getConstantOperandVal(4);
    switch (Size) {
    default:
      return SDValue();
    case 1:
      Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
      break;
    case 2:
      Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
      break;
    case 4:
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
      break;
    case 12:
      if (!Subtarget->hasLDSLoadB96_B128())
        return SDValue();
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
      break;
    case 16:
      if (!Subtarget->hasLDSLoadB96_B128())
        return SDValue();
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
      break;
    }

    SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));

    // NOTE(review): the declaration of the operand list (likely
    // `SmallVector<SDValue, 6> Ops;`) appears to have been lost in
    // extraction here — restore it before compiling.

    SDValue Addr = Op.getOperand(2); // Global ptr
    SDValue VOffset;
    // Try to split SAddr and VOffset. Global and LDS pointers share the same
    // immediate offset, so we cannot use a regular SelectGlobalSAddr().
    if (Addr->isDivergent() && Addr->isAnyAdd()) {
      SDValue LHS = Addr.getOperand(0);
      SDValue RHS = Addr.getOperand(1);

      if (LHS->isDivergent())
        std::swap(LHS, RHS);

      if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
          RHS.getOperand(0).getValueType() == MVT::i32) {
        // add (i64 sgpr), (zero_extend (i32 vgpr))
        Addr = LHS;
        VOffset = RHS.getOperand(0);
      }
    }

    Ops.push_back(Addr);
    if (!Addr->isDivergent()) {
      // NOTE(review): a line appears lost in extraction here — presumably
      // the switch to the SADDR opcode form (`Opc =
      // AMDGPU::getGlobalSaddrOp(Opc);` or similar). Confirm and restore.
      if (!VOffset)
        VOffset =
            SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
                                       DAG.getTargetConstant(0, DL, MVT::i32)),
                    0);
      Ops.push_back(VOffset);
    }

    Ops.push_back(Op.getOperand(5)); // Offset

    unsigned Aux = Op.getConstantOperandVal(6);
    Ops.push_back(DAG.getTargetConstant(Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL,
                                        MVT::i32)); // CPol

    Ops.push_back(M0Val.getValue(0)); // Chain
    Ops.push_back(M0Val.getValue(1)); // Glue

    auto *M = cast<MemSDNode>(Op);
    MachineMemOperand *LoadMMO = M->getMemOperand();
    MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
    LoadPtrI.Offset = Op->getConstantOperandVal(5);
    MachinePointerInfo StorePtrI = LoadPtrI;
    LoadPtrI.V = PoisonValue::get(
        // NOTE(review): several lines lost in extraction here — presumably
        // the pointer-type argument plus the StorePtrI (nullptr value, LDS
        // address space) setup. Restore before compiling.
    auto F = LoadMMO->getFlags() &
        // NOTE(review): the flag-mask continuation and the start of the
        // MF.getMachineMemOperand(LoadPtrI, ...) call were lost in
        // extraction.
    LoadMMO =
            LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
    MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
        StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
        LoadMMO->getAAInfo());

    auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
    DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});

    return SDValue(Load, 0);
  }
  case Intrinsic::amdgcn_end_cf:
    return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
                                      Op->getOperand(2), Chain),
                   0);
  case Intrinsic::amdgcn_s_barrier_init:
  case Intrinsic::amdgcn_s_barrier_signal_var: {
    // these two intrinsics have two operands: barrier pointer and member count
    SDValue Chain = Op->getOperand(0);
    // NOTE(review): the declaration of the operand list (likely a
    // `SmallVector<SDValue, ...> Ops;`) appears lost in extraction here.
    SDValue BarOp = Op->getOperand(2);
    SDValue CntOp = Op->getOperand(3);
    SDValue M0Val;
    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
                       ? AMDGPU::S_BARRIER_INIT_M0
                       : AMDGPU::S_BARRIER_SIGNAL_M0;
    // extract the BarrierID from bits 4-9 of BarOp
    SDValue BarID;
    BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
                        DAG.getShiftAmountConstant(4, MVT::i32, DL));
    BarID =
        SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
                                   DAG.getTargetConstant(0x3F, DL, MVT::i32)),
                0);
    // Member count should be put into M0[ShAmt:+6]
    // Barrier ID should be put into M0[5:0]
    // NOTE(review): the S_AND_B32 result below is immediately overwritten by
    // the SHL of the *raw* CntOp, so the 0x3F masking of the member count is
    // dead code — confirm whether the shift was meant to consume the masked
    // value instead.
    M0Val =
        SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
                                   DAG.getTargetConstant(0x3F, DL, MVT::i32)),
                0);
    constexpr unsigned ShAmt = 16;
    M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
                        DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));

    M0Val = SDValue(
        DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);

    Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));

    auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
    return SDValue(NewMI, 0);
  }
  case Intrinsic::amdgcn_s_barrier_join: {
    // these three intrinsics have one operand: barrier pointer
    SDValue Chain = Op->getOperand(0);
    // NOTE(review): the declaration of the operand list (likely a
    // `SmallVector<SDValue, ...> Ops;`) appears lost in extraction here.
    SDValue BarOp = Op->getOperand(2);
    unsigned Opc;

    if (isa<ConstantSDNode>(BarOp)) {
      uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
      Opc = AMDGPU::S_BARRIER_JOIN_IMM;

      // extract the BarrierID from bits 4-9 of the immediate
      unsigned BarID = (BarVal >> 4) & 0x3F;
      SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
      Ops.push_back(K);
      Ops.push_back(Chain);
    } else {
      Opc = AMDGPU::S_BARRIER_JOIN_M0;

      // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
      SDValue M0Val;
      M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
                          DAG.getShiftAmountConstant(4, MVT::i32, DL));
      M0Val =
          SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
                                     DAG.getTargetConstant(0x3F, DL, MVT::i32)),
                  0);
      Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
    }

    auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
    return SDValue(NewMI, 0);
  }
  case Intrinsic::amdgcn_s_prefetch_data: {
    // For non-global address space preserve the chain and remove the call.
    // NOTE(review): the guarding `if (...)` condition for the early return
    // below (presumably an address-space check on the MemSDNode) was lost in
    // extraction — restore it before compiling.
      return Op.getOperand(0);
    return Op;
  }
  case Intrinsic::amdgcn_s_buffer_prefetch_data: {
    SDValue Ops[] = {
        Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
        Op.getOperand(3), // offset
        Op.getOperand(4), // length
    };

    MemSDNode *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
                                   Op->getVTList(), Ops, M->getMemoryVT(),
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
    MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
    SDValue Chain = Op->getOperand(0);
    SDValue Ptr = Op->getOperand(2);
    SDValue Val = Op->getOperand(3);
    return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
                         Ptr, MII->getMemOperand());
  }
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
        // NOTE(review): the initializer of ImageDimIntr (presumably
        // `AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))`) was lost in
        // extraction — restore it before compiling.
      return lowerImage(Op, ImageDimIntr, DAG, true);

    return Op;
  }
  }
}
11560
11561// Return whether the operation has NoUnsignedWrap property.
11562static bool isNoUnsignedWrap(SDValue Addr) {
11563 return (Addr.getOpcode() == ISD::ADD &&
11564 Addr->getFlags().hasNoUnsignedWrap()) ||
11565 Addr->getOpcode() == ISD::OR;
11566}
11567
                                             EVT PtrVT) const {
  // NOTE(review): the opening line of this definition (return type, class and
  // function name) was lost in extraction — restore it before compiling.
  // The visible predicate holds only for 64-bit pointer types.
  return PtrVT == MVT::i64;
}
11572
                                             EVT PtrVT) const {
  // NOTE(review): the opening line of this definition (return type, class and
  // function name) was lost in extraction — restore it before compiling.
  // The visible predicate is unconditionally true for every pointer type.
  return true;
}
11577
11578// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
11579// offset (the offset that is included in bounds checking and swizzling, to be
11580// split between the instruction's voffset and immoffset fields) and soffset
11581// (the offset that is excluded from bounds checking and swizzling, to go in
11582// the instruction's soffset field). This function takes the first kind of
11583// offset and figures out how to split it between voffset and immoffset.
11584std::pair<SDValue, SDValue>
11585SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
11586 SDLoc DL(Offset);
11587 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
11588 SDValue N0 = Offset;
11589 ConstantSDNode *C1 = nullptr;
11590
11591 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
11592 N0 = SDValue();
11593 else if (DAG.isBaseWithConstantOffset(N0)) {
11594 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
11595 // being added, so we can only safely match a 32-bit addition with no
11596 // unsigned overflow.
11597 bool CheckNUW = AMDGPU::isGFX1250(*Subtarget);
11598 if (!CheckNUW || isNoUnsignedWrap(N0)) {
11599 C1 = cast<ConstantSDNode>(N0.getOperand(1));
11600 N0 = N0.getOperand(0);
11601 }
11602 }
11603
11604 if (C1) {
11605 unsigned ImmOffset = C1->getZExtValue();
11606 // If the immediate value is too big for the immoffset field, put only bits
11607 // that would normally fit in the immoffset field. The remaining value that
11608 // is copied/added for the voffset field is a large power of 2, and it
11609 // stands more chance of being CSEd with the copy/add for another similar
11610 // load/store.
11611 // However, do not do that rounding down if that is a negative
11612 // number, as it appears to be illegal to have a negative offset in the
11613 // vgpr, even if adding the immediate offset makes it positive.
11614 unsigned Overflow = ImmOffset & ~MaxImm;
11615 ImmOffset -= Overflow;
11616 if ((int32_t)Overflow < 0) {
11617 Overflow += ImmOffset;
11618 ImmOffset = 0;
11619 }
11620 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
11621 if (Overflow) {
11622 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
11623 if (!N0)
11624 N0 = OverflowVal;
11625 else {
11626 SDValue Ops[] = {N0, OverflowVal};
11627 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
11628 }
11629 }
11630 }
11631 if (!N0)
11632 N0 = DAG.getConstant(0, DL, MVT::i32);
11633 if (!C1)
11634 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
11635 return {N0, SDValue(C1, 0)};
11636}
11637
11638// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
11639// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
11640// pointed to by Offsets.
11641void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
11642 SelectionDAG &DAG, SDValue *Offsets,
11643 Align Alignment) const {
11644 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11645 SDLoc DL(CombinedOffset);
11646 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
11647 uint32_t Imm = C->getZExtValue();
11648 uint32_t SOffset, ImmOffset;
11649 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
11650 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
11651 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11652 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11653 return;
11654 }
11655 }
11656 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
11657 SDValue N0 = CombinedOffset.getOperand(0);
11658 SDValue N1 = CombinedOffset.getOperand(1);
11659 uint32_t SOffset, ImmOffset;
11660 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
11661 if (Offset >= 0 &&
11662 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
11663 Offsets[0] = N0;
11664 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11665 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11666 return;
11667 }
11668 }
11669
11670 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
11671 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
11672 : DAG.getConstant(0, DL, MVT::i32);
11673
11674 Offsets[0] = CombinedOffset;
11675 Offsets[1] = SOffsetZero;
11676 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
11677}
11678
11679SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
11680 SelectionDAG &DAG) const {
11681 if (!MaybePointer.getValueType().isScalarInteger())
11682 return MaybePointer;
11683
11684 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
11685 return Rsrc;
11686}
11687
// Wrap a global or flat pointer into a buffer intrinsic using the flags
// specified in the intrinsic.
SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
                                                   SelectionDAG &DAG) const {
  SDLoc Loc(Op);

  // Operands: (intrinsic id, base pointer, stride, num_records, flags).
  SDValue Pointer = Op->getOperand(1);
  SDValue Stride = Op->getOperand(2);
  SDValue NumRecords = Op->getOperand(3);
  SDValue Flags = Op->getOperand(4);

  SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
  SDValue Rsrc;

  if (Subtarget->has45BitNumRecordsBufferResource()) {
    SDValue Zero = DAG.getConstant(0, Loc, MVT::i32);
    // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
    // num_records.
    SDValue ExtPointer = DAG.getAnyExtOrTrunc(Pointer, Loc, MVT::i64);
    SDValue NumRecordsLHS =
        DAG.getNode(ISD::SHL, Loc, MVT::i64, NumRecords,
                    DAG.getShiftAmountConstant(57, MVT::i32, Loc));
    SDValue LowHalf =
        DAG.getNode(ISD::OR, Loc, MVT::i64, ExtPointer, NumRecordsLHS);

    // Build the higher 64-bit value, which has the higher 38-bit num_records,
    // 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
    SDValue NumRecordsRHS =
        DAG.getNode(ISD::SRL, Loc, MVT::i64, NumRecords,
                    DAG.getShiftAmountConstant(7, MVT::i32, Loc));
    // The stride (pre-shifted within the upper dword) is widened to 64 bits by
    // placing it in the high lane of a v2i32 with a zero low lane.
    SDValue ShiftedStride =
        DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
                    DAG.getShiftAmountConstant(12, MVT::i32, Loc));
    SDValue ExtShiftedStrideVec =
        DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedStride);
    SDValue ExtShiftedStride =
        DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedStrideVec);
    // Same v2i32 trick for the flag nibble, which lands in the very top bits
    // of the high half.
    SDValue ShiftedFlags =
        DAG.getNode(ISD::SHL, Loc, MVT::i32, Flags,
                    DAG.getShiftAmountConstant(28, MVT::i32, Loc));
    SDValue ExtShiftedFlagsVec =
        DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedFlags);
    SDValue ExtShiftedFlags =
        DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedFlagsVec);
    SDValue CombinedFields =
        DAG.getNode(ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
    SDValue HighHalf =
        DAG.getNode(ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);

    Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i64, LowHalf, HighHalf);
  } else {
    // Classic v4i32 descriptor: pointer low dword, pointer high 16 bits with
    // the stride in the upper 16 bits, then num_records and flags dwords.
    NumRecords = DAG.getAnyExtOrTrunc(NumRecords, Loc, MVT::i32);
    auto [LowHalf, HighHalf] =
        DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
    SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
    SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
    SDValue ShiftedStride =
        DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
                    DAG.getShiftAmountConstant(16, MVT::i32, Loc));
    SDValue NewHighHalf =
        DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);

    Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf, NewHighHalf,
                       NumRecords, Flags);
  }

  // Hand the descriptor back as a single 128-bit scalar value.
  SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
  return RsrcPtr;
}
11757
// Handle 8 bit and 16 bit buffer loads
SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
                                                     EVT LoadVT, SDLoc DL,
                                                     ArrayRef<SDValue> Ops,
                                                     MachineMemOperand *MMO,
                                                     bool IsTFE) const {
  // Loads are performed as the integer of the same width; FP results are
  // produced by a final bitcast.
  EVT IntVT = LoadVT.changeTypeToInteger();

  if (IsTFE) {
    // TFE variants additionally return a status dword, so the node produces a
    // v2i32: element 0 is the data, element 1 the status.
    unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
                       ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
                       : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
    MachineFunction &MF = DAG.getMachineFunction();
    MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
    SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
    SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
    SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
                                 DAG.getConstant(1, DL, MVT::i32));
    SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
                               DAG.getConstant(0, DL, MVT::i32));
    // Narrow the 32-bit result to the memory width, then cast to the
    // requested result type.
    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
    SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
    return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
  }

  unsigned Opc = LoadVT.getScalarType() == MVT::i8
                     ? AMDGPUISD::BUFFER_LOAD_UBYTE
                     : AMDGPUISD::BUFFER_LOAD_USHORT;

  // Non-TFE path: the node yields an i32; truncate and bitcast as above.
  SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
  SDValue BufferLoad =
      DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
  SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
  LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);

  return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
}
11795
11796// Handle 8 bit and 16 bit buffer stores
11797SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
11798 EVT VDataType, SDLoc DL,
11799 SDValue Ops[],
11800 MemSDNode *M) const {
11801 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11802 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
11803
11804 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
11805 Ops[1] = BufferStoreExt;
11806 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
11807 : AMDGPUISD::BUFFER_STORE_SHORT;
11808 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
11809 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
11810 M->getMemOperand());
11811}
11812
// Extend or truncate \p Op (the result of a widened load) to \p VT according
// to the original load's extension kind. NOTE(review): the opening signature
// line was dropped by extraction and is reconstructed here — confirm against
// upstream.
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
                                 SDValue Op, const SDLoc &SL, EVT VT) {
  // Narrowing: a plain truncate is correct for every extension kind.
  if (VT.bitsLT(Op.getValueType()))
    return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);

  // Widening (or same width): honor the load's extension semantics.
  switch (ExtType) {
  case ISD::SEXTLOAD:
    return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
  case ISD::ZEXTLOAD:
    return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
  case ISD::EXTLOAD:
    return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
  case ISD::NON_EXTLOAD:
    return Op;
  }

  llvm_unreachable("invalid ext type");
}
11831
// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
                                    DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  // Only profitable for dword-aligned, uniform loads (SMEM candidates).
  if (Ld->getAlign() < Align(4) || Ld->isDivergent())
    return SDValue();

  // FIXME: Constant loads should all be marked invariant.
  unsigned AS = Ld->getAddressSpace();
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
    return SDValue();

  // Don't do this early, since it may interfere with adjacent load merging for
  // illegal types. We can avoid losing alignment information for exotic types
  // pre-legalize.
  EVT MemVT = Ld->getMemoryVT();
  if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
      MemVT.getSizeInBits() >= 32)
    return SDValue();

  SDLoc SL(Ld);

  assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
         "unexpected vector extload");

  // TODO: Drop only high part of range.
  SDValue Ptr = Ld->getBasePtr();
  // Re-issue the access as a full 32-bit non-extending load. Range metadata
  // is dropped because it described the narrow value.
  SDValue NewLoad = DAG.getLoad(
      ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
      Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
      Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
      nullptr); // Drop ranges

  EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
  if (MemVT.isFloatingPoint()) {
    assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
           "unexpected fp extload");
    TruncVT = MemVT.changeTypeToInteger();
  }

  // Reproduce the original extension semantics on the low MemVT bits of the
  // widened 32-bit value.
  SDValue Cvt = NewLoad;
  if (Ld->getExtensionType() == ISD::SEXTLOAD) {
    Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
                      DAG.getValueType(TruncVT));
  } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
             Ld->getExtensionType() == ISD::EXTLOAD) {
    Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
  } else {
    assert(Ld->getExtensionType() == ISD::NON_EXTLOAD);
  }

  EVT VT = Ld->getValueType(0);
  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());

  DCI.AddToWorklist(Cvt.getNode());

  // We may need to handle exotic cases, such as i16->i64 extloads, so insert
  // the appropriate extension from the 32-bit load.
  Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
  DCI.AddToWorklist(Cvt.getNode());

  // Handle conversion back to floating point if necessary.
  Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);

  return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
}
11901
// Return true if a flat access described by \p MMO might touch scratch
// (private) memory; callers then legalize the flat access under the
// private-address rules. NOTE(review): the opening signature line was dropped
// by extraction and is reconstructed here — confirm against upstream.
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
                                          const SIMachineFunctionInfo &Info) {
  // TODO: Should check if the address can definitely not access stack.
  if (Info.isEntryFunction())
    return Info.getUserSGPRInfo().hasFlatScratchInit();
  return true;
}
11909
// Custom lowering for loads: handles sub-dword scalar loads and the various
// per-address-space legalization rules for vector loads. Several lines lost
// to extraction (address-space disjunctions, SmallVector decl, call heads)
// are reconstructed — confirm against upstream.
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  ISD::LoadExtType ExtType = Load->getExtensionType();
  EVT MemVT = Load->getMemoryVT();
  MachineMemOperand *MMO = Load->getMemOperand();

  if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
    if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
      return SDValue();

    // FIXME: Copied from PPC
    // First, load into 32 bits, then truncate to 1 bit.

    SDValue Chain = Load->getChain();
    SDValue BasePtr = Load->getBasePtr();

    EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;

    SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
                                   RealMemVT, MMO);

    if (!MemVT.isVector()) {
      SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
                       NewLD.getValue(1)};

      return DAG.getMergeValues(Ops, DL);
    }

    // Vector of i1: extract each lane from bit I of the loaded value.
    SmallVector<SDValue, 4> Elts;
    for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
      SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
                                DAG.getConstant(I, DL, MVT::i32));

      Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
    }

    SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};

    return DAG.getMergeValues(Ops, DL);
  }

  if (!MemVT.isVector())
    return SDValue();

  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
         "Custom lowering for non-i32 vectors hasn't been implemented.");

  Align Alignment = Load->getAlign();
  unsigned AS = Load->getAddressSpace();
  if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
      Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
    return SplitVectorLoad(Op, DAG);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instruction access scratch memory
  // then we need to use the same legalization rules we use for private.
  if (AS == AMDGPUAS::FLAT_ADDRESS &&
      !Subtarget->hasMultiDwordFlatScratchAddressing())
    AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
             ? AMDGPUAS::PRIVATE_ADDRESS
             : AMDGPUAS::GLOBAL_ADDRESS;

  unsigned NumElements = MemVT.getVectorNumElements();

  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      (AS == AMDGPUAS::GLOBAL_ADDRESS &&
       Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
       (Load->isInvariant() || isMemOpHasNoClobberedMemOperand(Load)))) {
    if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
        Alignment >= Align(4) && NumElements < 32) {
      if (MemVT.isPow2VectorType() ||
          (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
        return SDValue();
      return WidenOrSplitVectorLoad(Op, DAG);
    }
    // Non-uniform loads will be selected to MUBUF instructions, so they
    // have the same legalization requirements as global and private
    // loads.
    //
  }
  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
    if (NumElements > 4)
      return SplitVectorLoad(Op, DAG);
    // v3 loads not supported on SI.
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
      return WidenOrSplitVectorLoad(Op, DAG);

    // v3 and v4 loads are supported for private and global memory.
    return SDValue();
  }
  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    // Depending on the setting of the private_element_size field in the
    // resource descriptor, we can only make private accesses up to a certain
    // size.
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4: {
      auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
      return DAG.getMergeValues({Op0, Op1}, DL);
    }
    case 8:
      if (NumElements > 2)
        return SplitVectorLoad(Op, DAG);
      return SDValue();
    case 16:
      // Same as global/flat
      if (NumElements > 4)
        return SplitVectorLoad(Op, DAG);
      // v3 loads not supported on SI.
      if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
        return WidenOrSplitVectorLoad(Op, DAG);

      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    unsigned Fast = 0;
    auto Flags = Load->getMemOperand()->getFlags();
    // Keep the access as-is only if it is genuinely fast at this alignment.
    if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
                                           Load->getAlign(), Flags, &Fast) &&
        Fast > 1)
      return SDValue();

    if (MemVT.isVector())
      return SplitVectorLoad(Op, DAG);
  }

  if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                      MemVT, *Load->getMemOperand())) {
    auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
    return DAG.getMergeValues({Op0, Op1}, DL);
  }

  return SDValue();
}
12051
12052SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
12053 EVT VT = Op.getValueType();
12054 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
12055 VT.getSizeInBits() == 512)
12056 return splitTernaryVectorOp(Op, DAG);
12057
12058 assert(VT.getSizeInBits() == 64);
12059
12060 SDLoc DL(Op);
12061 SDValue Cond = DAG.getFreeze(Op.getOperand(0));
12062
12063 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
12064 SDValue One = DAG.getConstant(1, DL, MVT::i32);
12065
12066 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
12067 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
12068
12069 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
12070 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
12071
12072 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
12073
12074 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
12075 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
12076
12077 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
12078
12079 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
12080 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
12081}
12082
12083// Catch division cases where we can use shortcuts with rcp and rsq
12084// instructions.
12085SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
12086 SelectionDAG &DAG) const {
12087 SDLoc SL(Op);
12088 SDValue LHS = Op.getOperand(0);
12089 SDValue RHS = Op.getOperand(1);
12090 EVT VT = Op.getValueType();
12091 const SDNodeFlags Flags = Op->getFlags();
12092
12093 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
12094
12095 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
12096 // Without !fpmath accuracy information, we can't do more because we don't
12097 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
12098 // f16 is always accurate enough
12099 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
12100 return SDValue();
12101
12102 if (CLHS->isExactlyValue(1.0)) {
12103 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
12104 // the CI documentation has a worst case error of 1 ulp.
12105 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
12106 // use it as long as we aren't trying to use denormals.
12107 //
12108 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
12109
12110 // 1.0 / sqrt(x) -> rsq(x)
12111
12112 // XXX - Is afn sufficient to do this for f64? The maximum ULP
12113 // error seems really high at 2^29 ULP.
12114 // 1.0 / x -> rcp(x)
12115 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12116 }
12117
12118 // Same as for 1.0, but expand the sign out of the constant.
12119 if (CLHS->isExactlyValue(-1.0)) {
12120 // -1.0 / x -> rcp (fneg x)
12121 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
12122 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
12123 }
12124 }
12125
12126 // For f16 and bf16 require afn or arcp.
12127 // For f32 require afn.
12128 if (!AllowInaccurateRcp &&
12129 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
12130 return SDValue();
12131
12132 // Turn into multiply by the reciprocal.
12133 // x / y -> x * (1.0 / y)
12134 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12135 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
12136}
12137
12138SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
12139 SelectionDAG &DAG) const {
12140 SDLoc SL(Op);
12141 SDValue X = Op.getOperand(0);
12142 SDValue Y = Op.getOperand(1);
12143 EVT VT = Op.getValueType();
12144 const SDNodeFlags Flags = Op->getFlags();
12145
12146 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
12147 if (!AllowInaccurateDiv)
12148 return SDValue();
12149
12150 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
12151 SDValue One = DAG.getConstantFP(1.0, SL, VT);
12152
12153 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
12154 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12155
12156 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
12157 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12158 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
12159 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
12160 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
12161 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
12162}
12163
12164static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12165 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
12166 SDNodeFlags Flags) {
12167 if (GlueChain->getNumValues() <= 1) {
12168 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
12169 }
12170
12171 assert(GlueChain->getNumValues() == 3);
12172
12173 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12174 switch (Opcode) {
12175 default:
12176 llvm_unreachable("no chain equivalent for opcode");
12177 case ISD::FMUL:
12178 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12179 break;
12180 }
12181
12182 return DAG.getNode(Opcode, SL, VTList,
12183 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
12184 Flags);
12185}
12186
12187static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12188 EVT VT, SDValue A, SDValue B, SDValue C,
12189 SDValue GlueChain, SDNodeFlags Flags) {
12190 if (GlueChain->getNumValues() <= 1) {
12191 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
12192 }
12193
12194 assert(GlueChain->getNumValues() == 3);
12195
12196 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12197 switch (Opcode) {
12198 default:
12199 llvm_unreachable("no chain equivalent for opcode");
12200 case ISD::FMA:
12201 Opcode = AMDGPUISD::FMA_W_CHAIN;
12202 break;
12203 }
12204
12205 return DAG.getNode(Opcode, SL, VTList,
12206 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
12207 Flags);
12208}
12209
// Lower f16/bf16 FDIV by computing in f32. NOTE(review): the FMADOpCode
// initializer line was dropped by extraction and is reconstructed — confirm
// against upstream.
SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  SDLoc SL(Op);
  EVT VT = Op.getValueType();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  // Promote both operands to f32; the whole computation runs there.
  SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
  SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);

  if (VT == MVT::bf16) {
    // bf16: divide in f32 and round the result back down.
    SDValue ExtDiv =
        DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags());
    return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv,
                       DAG.getTargetConstant(0, SL, MVT::i32));
  }

  assert(VT == MVT::f16);

  // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
  // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
  // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
  // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
  // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
  // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
  // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
  // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
  // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
  // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
  // q16.u = opx(V_CVT_F16_F32, q32.u);
  // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)

  // We will use ISD::FMA on targets that don't support ISD::FMAD.
  unsigned FMADOpCode =
      isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA;
  SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
  SDValue Rcp =
      DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
  SDValue Quot =
      DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
  SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
                            Op->getFlags());
  Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
  Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
                    Op->getFlags());
  // Keep only sign+exponent bits of the final correction term (see the
  // 0xff800000 mask in the reference sequence above).
  SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
  SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
  TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
                        DAG.getConstant(0xff800000, SL, MVT::i32));
  Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
  Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
  SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
                             DAG.getTargetConstant(0, SL, MVT::i32));
  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
                     Op->getFlags());
}
12268
12269// Faster 2.5 ULP division that does not support denormals.
12270SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
12271 SDNodeFlags Flags = Op->getFlags();
12272 SDLoc SL(Op);
12273 SDValue LHS = Op.getOperand(1);
12274 SDValue RHS = Op.getOperand(2);
12275
12276 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
12277
12278 const APFloat K0Val(0x1p+96f);
12279 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
12280
12281 const APFloat K1Val(0x1p-32f);
12282 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
12283
12284 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12285
12286 EVT SetCCVT =
12287 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
12288
12289 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
12290
12291 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
12292
12293 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
12294
12295 // rcp does not support denormals.
12296 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
12297
12298 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
12299
12300 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
12301}
12302
// Returns immediate value for setting the F32 denorm mode when using the
// S_DENORM_MODE instruction. NOTE(review): the signature lines were dropped
// by extraction and are reconstructed here — confirm against upstream.
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
                                    const SIMachineFunctionInfo *Info,
                                    const GCNSubtarget *ST) {
  assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
  // Low 2 bits hold the requested FP32 mode; the next 2 bits re-assert the
  // function's existing FP64/FP16 mode so it is left unchanged.
  uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
  uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
  return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
}
12313
// Full-precision f32 FDIV lowering via div_scale/div_fmas/div_fixup, with
// explicit denormal-mode toggling around the refinement sequence when the
// function does not already preserve FP32 denormals. NOTE(review): one
// dropped call line (getSPDenormModeValue for the enable path) is
// reconstructed — confirm against upstream.
SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  // The selection matcher assumes anything with a chain selecting to a
  // mayRaiseFPException machine instruction. Since we're introducing a chain
  // here, we need to explicitly report nofpexcept for the regular fdiv
  // lowering.
  SDNodeFlags Flags = Op->getFlags();
  Flags.setNoFPExcept(true);

  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

  SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);

  SDValue DenominatorScaled =
      DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
  SDValue NumeratorScaled =
      DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);

  // Denominator is scaled to not be denormal, so using rcp is ok.
  SDValue ApproxRcp =
      DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
  SDValue NegDivScale0 =
      DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);

  using namespace AMDGPU::Hwreg;
  // Hardware-register slice holding the 4-bit FP denorm mode field.
  const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
  const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);

  const MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const DenormalMode DenormMode = Info->getMode().FP32Denormals;

  const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
  const bool HasDynamicDenormals =
      (DenormMode.Input == DenormalMode::Dynamic) ||
      (DenormMode.Output == DenormalMode::Dynamic);

  SDValue SavedDenormMode;

  if (!PreservesDenormals) {
    // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
    // lowering. The chain dependence is insufficient, and we need glue. We do
    // not need the glue variants in a strictfp function.

    SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);

    SDValue Glue = DAG.getEntryNode();
    if (HasDynamicDenormals) {
      // Dynamic mode: read and remember the current register value so it can
      // be restored afterwards.
      SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
                                          DAG.getVTList(MVT::i32, MVT::Glue),
                                          {BitField, Glue});
      SavedDenormMode = SDValue(GetReg, 0);

      Glue = DAG.getMergeValues(
          {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
    }

    SDNode *EnableDenorm;
    if (Subtarget->hasDenormModeInst()) {
      const SDValue EnableDenormValue =
          getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);

      EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
                                 EnableDenormValue)
                         .getNode();
    } else {
      const SDValue EnableDenormValue =
          DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
      EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
                                        {EnableDenormValue, BitField, Glue});
    }

    // Glue the FMA chain to the mode switch so it cannot be scheduled
    // outside the denormal-enabled region.
    SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
                      SDValue(EnableDenorm, 1)};

    NegDivScale0 = DAG.getMergeValues(Ops, SL);
  }

  SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
                             ApproxRcp, One, NegDivScale0, Flags);

  SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
                             ApproxRcp, Fma0, Flags);

  SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
                           Fma1, Flags);

  SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
                             NumeratorScaled, Mul, Flags);

  SDValue Fma3 =
      getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);

  SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
                             NumeratorScaled, Fma3, Flags);

  if (!PreservesDenormals) {
    // Restore the previous denormal mode once the glued sequence is done.
    SDNode *DisableDenorm;
    if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
      const SDValue DisableDenormValue = getSPDenormModeValue(
          FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);

      SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
      DisableDenorm =
          DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
                      Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2))
              .getNode();
    } else {
      assert(HasDynamicDenormals == (bool)SavedDenormMode);
      const SDValue DisableDenormValue =
          HasDynamicDenormals
              ? SavedDenormMode
              : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);

      DisableDenorm = DAG.getMachineNode(
          AMDGPU::S_SETREG_B32, SL, MVT::Other,
          {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
    }

    SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                                      SDValue(DisableDenorm, 0), DAG.getRoot());
    DAG.setRoot(OutputChain);
  }

  SDValue Scale = NumeratorScaled.getValue(1);
  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
                             {Fma4, Fma1, Fma3, Scale}, Flags);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
}
12450
12451SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
12452 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
12453 return FastLowered;
12454
12455 SDLoc SL(Op);
12456 SDValue X = Op.getOperand(0);
12457 SDValue Y = Op.getOperand(1);
12458
12459 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
12460
12461 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
12462
12463 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
12464
12465 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
12466
12467 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
12468
12469 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
12470
12471 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
12472
12473 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
12474
12475 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
12476
12477 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
12478 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
12479
12480 SDValue Fma4 =
12481 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
12482
12483 SDValue Scale;
12484
12485 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12486 // Workaround a hardware bug on SI where the condition output from div_scale
12487 // is not usable.
12488
12489 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
12490
12491 // Figure out if the scale to use for div_fmas.
12492 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
12493 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
12494 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
12495 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
12496
12497 SDValue NumHi =
12498 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
12499 SDValue DenHi =
12500 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
12501
12502 SDValue Scale0Hi =
12503 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
12504 SDValue Scale1Hi =
12505 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
12506
12507 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
12508 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
12509 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
12510 } else {
12511 Scale = DivScale1.getValue(1);
12512 }
12513
12514 SDValue Fmas =
12515 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
12516
12517 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
12518}
12519
12520SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
12521 EVT VT = Op.getValueType();
12522
12523 if (VT == MVT::f32)
12524 return LowerFDIV32(Op, DAG);
12525
12526 if (VT == MVT::f64)
12527 return LowerFDIV64(Op, DAG);
12528
12529 if (VT == MVT::f16 || VT == MVT::bf16)
12530 return LowerFDIV16(Op, DAG);
12531
12532 llvm_unreachable("Unexpected type for fdiv");
12533}
12534
// Lower FFREXP via the amdgcn frexp_mant/frexp_exp intrinsics.
// NOTE(review): two lines (the Mant getNode continuation and the Inf
// constant) were dropped by extraction and are reconstructed — confirm
// against upstream.
SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Val = Op.getOperand(0);
  EVT VT = Val.getValueType();
  EVT ResultExpVT = Op->getValueType(1);
  // The hardware exponent result is i16 for f16 inputs, i32 otherwise.
  EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;

  SDValue Mant = DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, dl, VT,
      DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);

  SDValue Exp = DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
      DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);

  if (Subtarget->hasFractBug()) {
    // On affected targets, patch up non-finite inputs: select a zero
    // exponent and pass the input through as the mantissa when
    // |Val| is not strictly less than infinity.
    SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
    SDValue Inf =
        DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT);

    SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
    SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
    Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
    Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
  }

  // Widen/narrow the exponent to the type the caller expects.
  SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
  return DAG.getMergeValues({Mant, CastExp}, dl);
}
12564
// Custom store lowering: i1 becomes a truncating i32 store, and vector stores
// are split or scalarized depending on the address space and subtarget
// limits. By this point all vector store values are i32-element vectors.
SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT VT = Store->getMemoryVT();

  if (VT == MVT::i1) {
    // Store an i1 as the low bit of an i32 value.
    return DAG.getTruncStore(
        Store->getChain(), DL,
        DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
        Store->getBasePtr(), MVT::i1, Store->getMemOperand());
  }

  assert(VT.isVector() &&
         Store->getValue().getValueType().getScalarType() == MVT::i32);

  unsigned AS = Store->getAddressSpace();
  // Split underaligned multi-dword flat stores on subtargets with the LDS
  // misaligned-access bug.
  if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
      Store->getAlign().value() < VT.getStoreSize() &&
      VT.getSizeInBits() > 32) {
    return SplitVectorStore(Op, DAG);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instruction access scratch memory
  // then we need to use the same legalization rules we use for private.
  // NOTE(review): the value lines of this assignment's conditional appear to
  // be missing from the listing (presumably selecting between the private
  // and global address spaces) -- confirm upstream.
  if (AS == AMDGPUAS::FLAT_ADDRESS &&
      !Subtarget->hasMultiDwordFlatScratchAddressing())
    AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)

  unsigned NumElements = VT.getVectorNumElements();
  // NOTE(review): an `if (AS == ...)` header line appears to be missing here;
  // the unmatched closing brace after `return SDValue();` below pairs with it.
  if (NumElements > 4)
    return SplitVectorStore(Op, DAG);
  // v3 stores not supported on SI.
  if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
    return SplitVectorStore(Op, DAG);

  // NOTE(review): the first line of this alignment-query condition appears to
  // be missing from the listing.
      VT, *Store->getMemOperand()))
    return expandUnalignedStore(Store, DAG);

  return SDValue();
  }
  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4:
      // Private memory only supports dword-at-a-time access here.
      return scalarizeVectorStore(Store, DAG);
    case 8:
      if (NumElements > 2)
        return SplitVectorStore(Op, DAG);
      return SDValue();
    case 16:
      if (NumElements > 4 ||
          (NumElements == 3 && !Subtarget->enableFlatScratch()))
        return SplitVectorStore(Op, DAG);
      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    unsigned Fast = 0;
    auto Flags = Store->getMemOperand()->getFlags();
    // NOTE(review): the first line of this misaligned-access query appears to
    // be missing from the listing.
        Store->getAlign(), Flags, &Fast) &&
        Fast > 1)
      return SDValue();

    if (VT.isVector())
      return SplitVectorStore(Op, DAG);

    // Fall back to the generic expansion for underaligned LDS/region access.
    return expandUnalignedStore(Store, DAG);
  }

  // Probably an invalid store. If so we'll end up emitting a selection error.
  return SDValue();
}
12644
12645// Avoid the full correct expansion for f32 sqrt when promoting from f16.
12646SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
12647 SDLoc SL(Op);
12648 assert(!Subtarget->has16BitInsts());
12649 SDNodeFlags Flags = Op->getFlags();
12650 SDValue Ext =
12651 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
12652
12653 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
12654 SDValue Sqrt =
12655 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
12656
12657 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
12658 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
12659}
12660
// Expand f32 sqrt. Inputs below 2^-96 are pre-scaled by 2^32 (and the result
// rescaled by 2^-16) so the core computation avoids the subnormal range.
SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDNodeFlags Flags = Op->getFlags();
  MVT VT = Op.getValueType().getSimpleVT();
  const SDValue X = Op.getOperand(0);

  if (allowApproxFunc(DAG, Flags)) {
    // Instruction is 1ulp but ignores denormals.
    // NOTE(review): a continuation line appears to be missing here
    // (presumably "ISD::INTRINSIC_WO_CHAIN, DL, VT,") -- confirm upstream.
    return DAG.getNode(
        DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
  }

  SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
  SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);

  SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);

  SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);

  SDValue SqrtX =
      DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);

  SDValue SqrtS;
  if (needsDenormHandlingF32(DAG, X, Flags)) {
    // Start from the hardware sqrt and nudge the result by one ulp if a
    // neighboring representable value fits better.
    SDValue SqrtID =
        DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
    SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);

    SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
    // Next representable value below the estimate (bit pattern minus one).
    SDValue SqrtSNextDownInt =
        DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
                    DAG.getAllOnesConstant(DL, MVT::i32));
    SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);

    SDValue NegSqrtSNextDown =
        DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);

    // Residual against the decremented estimate: x - s_down * s.
    SDValue SqrtVP =
        DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);

    // Next representable value above the estimate (bit pattern plus one).
    SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
                                         DAG.getConstant(1, DL, MVT::i32));
    SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);

    SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
    // Residual against the incremented estimate: x - s_up * s.
    SDValue SqrtVS =
        DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);

    SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
    SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);

    SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
                        Flags);

    SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
    SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
                        Flags);
  } else {
    // Denormals are flushed: refine an rsq estimate instead.
    SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);

    SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);

    SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
    SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
    SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);

    SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
    SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
    SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);

    SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
    SDValue SqrtD =
        DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
    SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
  }

  // Undo the input scaling on the result: sqrt(x * 2^32) * 2^-16 == sqrt(x).
  SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);

  SDValue ScaledDown =
      DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);

  SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
  // Pass zeroes and +inf straight through.
  SDValue IsZeroOrInf =
      DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
                  DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));

  return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
}
12750
12751SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
12752 // For double type, the SQRT and RSQ instructions don't have required
12753 // precision, we apply Goldschmidt's algorithm to improve the result:
12754 //
12755 // y0 = rsq(x)
12756 // g0 = x * y0
12757 // h0 = 0.5 * y0
12758 //
12759 // r0 = 0.5 - h0 * g0
12760 // g1 = g0 * r0 + g0
12761 // h1 = h0 * r0 + h0
12762 //
12763 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
12764 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
12765 // h2 = h1 * r1 + h1
12766 //
12767 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
12768 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
12769 //
12770 // sqrt(x) = g3
12771
12772 SDNodeFlags Flags = Op->getFlags();
12773
12774 SDLoc DL(Op);
12775
12776 SDValue X = Op.getOperand(0);
12777 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
12778
12779 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
12780
12781 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
12782
12783 // Scale up input if it is too small.
12784 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
12785 SDValue ScaleUp =
12786 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
12787 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
12788
12789 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
12790
12791 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
12792
12793 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
12794 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
12795
12796 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
12797 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
12798
12799 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
12800
12801 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
12802
12803 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
12804 SDValue SqrtD0 =
12805 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
12806
12807 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
12808
12809 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
12810 SDValue SqrtD1 =
12811 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
12812
12813 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
12814
12815 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
12816 SDValue ScaleDown =
12817 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
12818 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
12819
12820 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
12821 // with finite only or nsz because rsq(+/-0) = +/-inf
12822
12823 // TODO: Check for DAZ and expand to subnormals
12824 SDValue IsZeroOrInf =
12825 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12826 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12827
12828 // If x is +INF, +0, or -0, use its original value
12829 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
12830 Flags);
12831}
12832
12833SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
12834 SDLoc DL(Op);
12835 EVT VT = Op.getValueType();
12836 SDValue Arg = Op.getOperand(0);
12837 SDValue TrigVal;
12838
12839 // Propagate fast-math flags so that the multiply we introduce can be folded
12840 // if Arg is already the result of a multiply by constant.
12841 auto Flags = Op->getFlags();
12842
12843 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
12844
12845 if (Subtarget->hasTrigReducedRange()) {
12846 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12847 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
12848 } else {
12849 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12850 }
12851
12852 switch (Op.getOpcode()) {
12853 case ISD::FCOS:
12854 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
12855 case ISD::FSIN:
12856 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
12857 default:
12858 llvm_unreachable("Wrong trig opcode");
12859 }
12860}
12861
// Lower cmpxchg for non-local address spaces to the target memory-intrinsic
// node, packing {new, old} into a two-element vector operand.
SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
                                               SelectionDAG &DAG) const {
  AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
  assert(AtomicNode->isCompareAndSwap());
  unsigned AS = AtomicNode->getAddressSpace();

  // No custom lowering required for local address space
  // NOTE(review): the guarding `if` on the address space appears to be
  // missing from this listing; this return is presumably conditional on AS
  // being the local address space -- confirm upstream.
    return Op;

  // Non-local address space requires custom lowering for atomic compare
  // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
  SDLoc DL(Op);
  SDValue ChainIn = Op.getOperand(0);
  SDValue Addr = Op.getOperand(1);
  SDValue Old = Op.getOperand(2); // expected value
  SDValue New = Op.getOperand(3); // replacement value
  EVT VT = Op.getValueType();
  MVT SimpleVT = VT.getSimpleVT();
  MVT VecType = MVT::getVectorVT(SimpleVT, 2);

  SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
  SDValue Ops[] = {ChainIn, Addr, NewOld};

  return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
                                 Op->getVTList(), Ops, VT,
                                 AtomicNode->getMemOperand());
}
12890
12891//===----------------------------------------------------------------------===//
12892// Custom DAG optimizations
12893//===----------------------------------------------------------------------===//
12894
12895SDValue
12896SITargetLowering::performUCharToFloatCombine(SDNode *N,
12897 DAGCombinerInfo &DCI) const {
12898 EVT VT = N->getValueType(0);
12899 EVT ScalarVT = VT.getScalarType();
12900 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
12901 return SDValue();
12902
12903 SelectionDAG &DAG = DCI.DAG;
12904 SDLoc DL(N);
12905
12906 SDValue Src = N->getOperand(0);
12907 EVT SrcVT = Src.getValueType();
12908
12909 // TODO: We could try to match extracting the higher bytes, which would be
12910 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
12911 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
12912 // about in practice.
12913 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
12914 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
12915 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
12916 DCI.AddToWorklist(Cvt.getNode());
12917
12918 // For the f16 case, fold to a cast to f32 and then cast back to f16.
12919 if (ScalarVT != MVT::f32) {
12920 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
12921 DAG.getTargetConstant(0, DL, MVT::i32));
12922 }
12923 return Cvt;
12924 }
12925 }
12926
12927 return SDValue();
12928}
12929
SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  SDValue MagnitudeOp = N->getOperand(0);
  SDValue SignOp = N->getOperand(1);

  // The generic combine for fcopysign + fp cast is too conservative with
  // vectors, and also gets confused by the splitting we will perform here, so
  // peek through FP casts.
  if (SignOp.getOpcode() == ISD::FP_EXTEND ||
      SignOp.getOpcode() == ISD::FP_ROUND)
    SignOp = SignOp.getOperand(0);

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  EVT SignVT = SignOp.getValueType();

  // f64 fcopysign is really an f32 copysign on the high bits, so replace the
  // lower half with a copy.
  // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
  EVT MagVT = MagnitudeOp.getValueType();

  unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;

  if (MagVT.getScalarType() == MVT::f64) {
    // View each f64 element as a pair of f32s so the low half can be kept
    // verbatim.
    EVT F32VT = MagVT.isVector()
                    ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
                    : MVT::v2f32;

    SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);

    // NOTE(review): the declaration of NewElts (presumably a
    // SmallVector<SDValue>) appears to be missing from this listing.
    for (unsigned I = 0; I != NumElts; ++I) {
      SDValue MagLo =
          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
                      DAG.getConstant(2 * I, DL, MVT::i32));
      SDValue MagHi =
          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
                      DAG.getConstant(2 * I + 1, DL, MVT::i32));

      // NOTE(review): the true arm of this conditional (extracting element I
      // of SignOp) appears to be partially missing from this listing.
      SDValue SignOpElt =
          MagVT.isVector()
                            SignOp, DAG.getConstant(I, DL, MVT::i32))
              : SignOp;

      SDValue HiOp =
          DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);

      SDValue Vector =
          DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);

      SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
      NewElts.push_back(NewElt);
    }

    if (NewElts.size() == 1)
      return NewElts[0];

    return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
  }

  if (SignVT.getScalarType() != MVT::f64)
    return SDValue();

  // Reduce width of sign operand, we only need the highest bit.
  //
  // fcopysign f64:x, f64:y ->
  // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
  // TODO: In some cases it might make sense to go all the way to f16.

  EVT F32VT = MagVT.isVector()
                  ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
                  : MVT::v2f32;

  SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);

  SmallVector<SDValue, 8> F32Signs;
  for (unsigned I = 0; I != NumElts; ++I) {
    // Take sign from odd elements of cast vector
    SDValue SignAsF32 =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
                    DAG.getConstant(2 * I + 1, DL, MVT::i32));
    F32Signs.push_back(SignAsF32);
  }

  // NOTE(review): the BUILD_VECTOR arm of this conditional appears to be
  // partially missing from this listing.
  SDValue NewSign =
      NumElts == 1
          ? F32Signs.back()
                EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
                F32Signs);

  return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
                     NewSign);
}
13025
13026// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
13027// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
13028// bits
13029
13030// This is a variant of
13031// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
13032//
13033// The normal DAG combiner will do this, but only if the add has one use since
13034// that would increase the number of instructions.
13035//
13036// This prevents us from seeing a constant offset that can be folded into a
13037// memory instruction's addressing mode. If we know the resulting add offset of
13038// a pointer can be folded into an addressing offset, we can replace the pointer
13039// operand with the add of new constant offset. This eliminates one of the uses,
13040// and may allow the remaining use to also be simplified.
13041//
13042SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
13043 EVT MemVT,
13044 DAGCombinerInfo &DCI) const {
13045 SDValue N0 = N->getOperand(0);
13046 SDValue N1 = N->getOperand(1);
13047
13048 // We only do this to handle cases where it's profitable when there are
13049 // multiple uses of the add, so defer to the standard combine.
13050 if ((!N0->isAnyAdd() && N0.getOpcode() != ISD::OR) || N0->hasOneUse())
13051 return SDValue();
13052
13053 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
13054 if (!CN1)
13055 return SDValue();
13056
13057 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
13058 if (!CAdd)
13059 return SDValue();
13060
13061 SelectionDAG &DAG = DCI.DAG;
13062
13063 if (N0->getOpcode() == ISD::OR &&
13064 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
13065 return SDValue();
13066
13067 // If the resulting offset is too large, we can't fold it into the
13068 // addressing mode offset.
13069 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
13070 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
13071
13072 AddrMode AM;
13073 AM.HasBaseReg = true;
13074 AM.BaseOffs = Offset.getSExtValue();
13075 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
13076 return SDValue();
13077
13078 SDLoc SL(N);
13079 EVT VT = N->getValueType(0);
13080
13081 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
13082 SDValue COffset = DAG.getConstant(Offset, SL, VT);
13083
13084 SDNodeFlags Flags;
13085 Flags.setNoUnsignedWrap(
13086 N->getFlags().hasNoUnsignedWrap() &&
13087 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
13088
13089 // Use ISD::ADD even if the original operation was ISD::PTRADD, since we can't
13090 // be sure that the new left operand is a proper base pointer.
13091 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
13092}
13093
/// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset
/// by the chain and intrinsic ID. Theoretically we would also need to check the
/// specific intrinsic, but they all place the pointer operand first.
static unsigned getBasePtrIndex(const MemSDNode *N) {
  switch (N->getOpcode()) {
  case ISD::STORE:
  // NOTE(review): additional case labels appear to be missing from this
  // listing (other store-like opcodes whose pointer is operand 2).
    return 2;
  default:
    // Loads and memory intrinsics: pointer follows the chain (and, for
    // intrinsics, the ID operand is accounted for by the callers' layout).
    return 1;
  }
}
13107
13108SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
13109 DAGCombinerInfo &DCI) const {
13110 SelectionDAG &DAG = DCI.DAG;
13111
13112 unsigned PtrIdx = getBasePtrIndex(N);
13113 SDValue Ptr = N->getOperand(PtrIdx);
13114
13115 // TODO: We could also do this for multiplies.
13116 if (Ptr.getOpcode() == ISD::SHL) {
13117 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
13118 N->getMemoryVT(), DCI);
13119 if (NewPtr) {
13120 SmallVector<SDValue, 8> NewOps(N->ops());
13121
13122 NewOps[PtrIdx] = NewPtr;
13123 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
13124 }
13125 }
13126
13127 return SDValue();
13128}
13129
13130static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
13131 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13132 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13133 (Opc == ISD::XOR && Val == 0);
13134}
13135
// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
// integer combine opportunities since most 64-bit operations are decomposed
// this way. TODO: We won't want this for SALU especially if it is an inline
// immediate.
SDValue SITargetLowering::splitBinaryBitConstantOp(
    DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
    const ConstantSDNode *CRHS) const {
  uint64_t Val = CRHS->getZExtValue();
  uint32_t ValLo = Lo_32(Val);
  uint32_t ValHi = Hi_32(Val);
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  // Split when either 32-bit half trivially folds, or when the constant
  // would otherwise need a non-inline literal.
  // NOTE(review): a continuation line of this condition appears to be
  // missing from the listing (presumably the reducibility check on ValHi).
  if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
      (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
    // We have 64-bit scalar and/or/xor, but do not have vector forms.
    if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
        !CRHS->user_begin()->isDivergent())
      return SDValue();

    // If we need to materialize a 64-bit immediate, it will be split up later
    // anyway. Avoid creating the harder to understand 64-bit immediate
    // materialization.
    return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
  }

  return SDValue();
}
13165
  // NOTE(review): the signature line of this function (isBoolSGPR, taking the
  // SDValue V) appears to be missing from the listing.
  if (V.getValueType() != MVT::i1)
    return false;
  switch (V.getOpcode()) {
  default:
    break;
  // Comparison/classification nodes always produce a bool.
  case ISD::SETCC:
  case ISD::IS_FPCLASS:
  case AMDGPUISD::FP_CLASS:
    return true;
  // A logical combination is boolean iff both inputs are.
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
  // The overflow output (result 1) of the *O arithmetic nodes is a bool.
  case ISD::SADDO:
  case ISD::UADDO:
  case ISD::SSUBO:
  case ISD::USUBO:
  case ISD::SMULO:
  case ISD::UMULO:
    return V.getResNo() == 1;
  // NOTE(review): a case label appears to be missing from the listing here
  // (presumably ISD::INTRINSIC_WO_CHAIN, matching the block below).
    unsigned IntrinsicID = V.getConstantOperandVal(0);
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_is_shared:
    case Intrinsic::amdgcn_is_private:
      return true;
    default:
      return false;
    }

    return false;
  }
  return false;
}
13202
// If a constant has all zeroes or all ones within each byte return it.
// Otherwise return 0.
// NOTE(review): the signature line of this function (getConstantPermuteMask,
// taking the uint32_t constant C) appears to be missing from the listing.
  // 0xff for any zero byte in the mask
  uint32_t ZeroByteMask = 0;
  if (!(C & 0x000000ff))
    ZeroByteMask |= 0x000000ff;
  if (!(C & 0x0000ff00))
    ZeroByteMask |= 0x0000ff00;
  if (!(C & 0x00ff0000))
    ZeroByteMask |= 0x00ff0000;
  if (!(C & 0xff000000))
    ZeroByteMask |= 0xff000000;
  uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
  // A byte that is neither fully set nor fully clear disqualifies C.
  if ((NonZeroByteMask & C) != NonZeroByteMask)
    return 0; // Partial bytes selected.
  return C;
}
13221
// Check if a node selects whole bytes from its operand 0 starting at a byte
// boundary while masking the rest. Returns select mask as in the v_perm_b32
// or -1 if not succeeded.
// Note byte select encoding:
// value 0-3 selects corresponding source byte;
// value 0xc selects zero;
// value 0xff selects 0xff.
// NOTE(review): the signature line of this function (getPermuteMask, taking
// the SDValue V) appears to be missing from the listing.
  assert(V.getValueSizeInBits() == 32);

  if (V.getNumOperands() != 2)
    return ~0;

  ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
  if (!N1)
    return ~0;

  uint32_t C = N1->getZExtValue();

  switch (V.getOpcode()) {
  default:
    break;
  case ISD::AND:
    // Kept bytes select themselves; cleared bytes select zero (0xc).
    if (uint32_t ConstMask = getConstantPermuteMask(C))
      return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
    break;

  case ISD::OR:
    // Untouched bytes select themselves; saturated bytes select 0xff.
    if (uint32_t ConstMask = getConstantPermuteMask(C))
      return (0x03020100 & ~ConstMask) | ConstMask;
    break;

  case ISD::SHL:
    // Only byte-aligned shifts map onto a byte permute.
    if (C % 8)
      return ~0;

    return uint32_t((0x030201000c0c0c0cull << C) >> 32);

  case ISD::SRL:
    // Only byte-aligned shifts map onto a byte permute.
    if (C % 8)
      return ~0;

    return uint32_t(0x0c0c0c0c03020100ull >> C);
  }

  return ~0;
}
13269
// DAG combines for ISD::AND: 64-bit constant splitting, srl+mask -> BFE,
// folding masks into v_perm, and fcmp/fp_class merging.
SDValue SITargetLowering::performAndCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (DCI.isBeforeLegalize())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
  if (VT == MVT::i64 && CRHS) {
    if (SDValue Split =
            splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
      return Split;
  }

  if (CRHS && VT == MVT::i32) {
    // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
    // nb = number of trailing zeroes in mask
    // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
    // given that we are selecting 8 or 16 bit fields starting at byte boundary.
    uint64_t Mask = CRHS->getZExtValue();
    unsigned Bits = llvm::popcount(Mask);
    if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
        (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
      if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
        unsigned Shift = CShift->getZExtValue();
        unsigned NB = CRHS->getAPIntValue().countr_zero();
        unsigned Offset = NB + Shift;
        if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
          SDLoc SL(N);
          SDValue BFE =
              DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
                          DAG.getConstant(Offset, SL, MVT::i32),
                          DAG.getConstant(Bits, SL, MVT::i32));
          EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
          SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
                                    DAG.getValueType(NarrowVT));
          SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
                                    DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
          return Shl;
        }
      }
    }

    // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
    if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
        isa<ConstantSDNode>(LHS.getOperand(2))) {
      uint32_t Sel = getConstantPermuteMask(Mask);
      if (!Sel)
        return SDValue();

      // Select 0xc for all zero bytes
      Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
      SDLoc DL(N);
      return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
                         LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
    }
  }

  // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
  // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
    ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
    ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();

    SDValue X = LHS.getOperand(0);
    SDValue Y = RHS.getOperand(0);
    if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
        !isTypeLegal(X.getValueType()))
      return SDValue();

    if (LCC == ISD::SETO) {
      if (X != LHS.getOperand(1))
        return SDValue();

      if (RCC == ISD::SETUNE) {
        const ConstantFPSDNode *C1 =
            dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
        if (!C1 || !C1->isInfinity() || C1->isNegative())
          return SDValue();

        // NOTE(review): the continuation lines of this mask expression and
        // parts of the static_assert below appear to be missing from the
        // listing -- confirm against upstream.
        const uint32_t Mask = SIInstrFlags::N_NORMAL |

        static_assert(
                0x3ff) == Mask,
            "mask not equal");

        SDLoc DL(N);
        return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
                           DAG.getConstant(Mask, DL, MVT::i32));
      }
    }
  }

  // Canonicalize the fp_class operand to the RHS.
  if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
    std::swap(LHS, RHS);

  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
      RHS.hasOneUse()) {
    ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
    // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
    // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
    // | n_nan)
    const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
    if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
        (RHS.getOperand(0) == LHS.getOperand(0) &&
         LHS.getOperand(0) == LHS.getOperand(1))) {
      const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
      unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
                                          : Mask->getZExtValue() & OrdMask;

      SDLoc DL(N);
      return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
                         DAG.getConstant(NewMask, DL, MVT::i32));
    }
  }

  if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
                         LHS.getOpcode() == ISD::SIGN_EXTEND)) {
    // and x, (sext cc from i1) => select cc, x, 0
    if (RHS.getOpcode() != ISD::SIGN_EXTEND)
      std::swap(LHS, RHS);
    if (isBoolSGPR(RHS.getOperand(0)))
      return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
                           DAG.getConstant(0, SDLoc(N), MVT::i32));
  }

  // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
      N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
    uint32_t LHSMask = getPermuteMask(LHS);
    uint32_t RHSMask = getPermuteMask(RHS);
    if (LHSMask != ~0u && RHSMask != ~0u) {
      // Canonicalize the expression in an attempt to have fewer unique masks
      // and therefore fewer registers used to hold the masks.
      if (LHSMask > RHSMask) {
        std::swap(LHSMask, RHSMask);
        std::swap(LHS, RHS);
      }

      // Select 0xc for each lane used from source operand. Zero has 0xc mask
      // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;

      // Check if we need to combine values from two sources within a byte.
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we select high and lower word keep it for SDWA.
          // TODO: teach SDWA to work with v_perm_b32 and remove the check.
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // Each byte in each mask is either selector mask 0-3, or has higher
        // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
        // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
        // mask which is not 0xff wins. By anding both masks we have a correct
        // result except that 0x0c shall be corrected to give 0x0c only.
        uint32_t Mask = LHSMask & RHSMask;
        for (unsigned I = 0; I < 32; I += 8) {
          uint32_t ByteSel = 0xff << I;
          if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
            Mask &= (0x0c << I) & 0xffffffff;
        }

        // Add 4 to each active LHS lane. It will not affect any existing 0xff
        // or 0x0c.
        uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
        SDLoc DL(N);

        return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
                           RHS.getOperand(0),
                           DAG.getConstant(Sel, DL, MVT::i32));
      }
    }
  }

  return SDValue();
}
13454
13455// A key component of v_perm is a mapping between byte position of the src
13456// operands, and the byte position of the dest. To provide such, we need: 1. the
13457// node that provides x byte of the dest of the OR, and 2. the byte of the node
// used to provide that x byte. calculateByteProvider finds which node provides
// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
// and finds an ultimate src and byte position. For example: the supported
// LoadCombine pattern for vector loads is as follows
13462// t1
13463// or
13464// / \
13465// t2 t3
13466// zext shl
13467// | | \
13468// t4 t5 16
13469// or anyext
13470// / \ |
13471// t6 t7 t8
13472// srl shl or
13473// / | / \ / \
13474// t9 t10 t11 t12 t13 t14
13475// trunc* 8 trunc* 8 and and
13476// | | / | | \
13477// t15 t16 t17 t18 t19 t20
13478// trunc* 255 srl -256
13479// | / \
13480// t15 t15 16
13481//
13482// *In this example, the truncs are from i32->i16
13483//
13484// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
13485// respectively. calculateSrcByte would find (given node) -> ultimate src &
13486// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
13487// After finding the mapping, we can combine the tree into vperm t15, t16,
13488// 0x05000407
13489
13490// Find the source and byte position from a node.
13491// \p DestByte is the byte position of the dest of the or that the src
13492// ultimately provides. \p SrcIndex is the byte of the src that maps to this
13493// dest of the or byte. \p Depth tracks how many recursive iterations we have
13494// performed.
13495static const std::optional<ByteProvider<SDValue>>
13496calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
13497 unsigned Depth = 0) {
13498 // We may need to recursively traverse a series of SRLs
13499 if (Depth >= 6)
13500 return std::nullopt;
13501
13502 if (Op.getValueSizeInBits() < 8)
13503 return std::nullopt;
13504
13505 if (Op.getValueType().isVector())
13506 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13507
13508 switch (Op->getOpcode()) {
13509 case ISD::TRUNCATE: {
13510 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13511 }
13512
13513 case ISD::SIGN_EXTEND:
13514 case ISD::ZERO_EXTEND:
13516 SDValue NarrowOp = Op->getOperand(0);
13517 auto NarrowVT = NarrowOp.getValueType();
13518 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
13519 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13520 NarrowVT = VTSign->getVT();
13521 }
13522 if (!NarrowVT.isByteSized())
13523 return std::nullopt;
13524 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
13525
13526 if (SrcIndex >= NarrowByteWidth)
13527 return std::nullopt;
13528 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13529 }
13530
13531 case ISD::SRA:
13532 case ISD::SRL: {
13533 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13534 if (!ShiftOp)
13535 return std::nullopt;
13536
13537 uint64_t BitShift = ShiftOp->getZExtValue();
13538
13539 if (BitShift % 8 != 0)
13540 return std::nullopt;
13541
13542 SrcIndex += BitShift / 8;
13543
13544 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13545 }
13546
13547 default: {
13548 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13549 }
13550 }
13551 llvm_unreachable("fully handled switch");
13552}
13553
13554// For a byte position in the result of an Or, traverse the tree and find the
13555// node (and the byte of the node) which ultimately provides this {Or,
13556// BytePosition}. \p Op is the operand we are currently examining. \p Index is
13557// the byte position of the Op that corresponds with the originally requested
13558// byte of the Or \p Depth tracks how many recursive iterations we have
13559// performed. \p StartingIndex is the originally requested byte of the Or
13560static const std::optional<ByteProvider<SDValue>>
13561calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
13562 unsigned StartingIndex = 0) {
13563 // Finding Src tree of RHS of or typically requires at least 1 additional
13564 // depth
13565 if (Depth > 6)
13566 return std::nullopt;
13567
13568 unsigned BitWidth = Op.getScalarValueSizeInBits();
13569 if (BitWidth % 8 != 0)
13570 return std::nullopt;
13571 if (Index > BitWidth / 8 - 1)
13572 return std::nullopt;
13573
13574 bool IsVec = Op.getValueType().isVector();
13575 switch (Op.getOpcode()) {
13576 case ISD::OR: {
13577 if (IsVec)
13578 return std::nullopt;
13579
13580 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
13581 StartingIndex);
13582 if (!RHS)
13583 return std::nullopt;
13584 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13585 StartingIndex);
13586 if (!LHS)
13587 return std::nullopt;
13588 // A well formed Or will have two ByteProviders for each byte, one of which
13589 // is constant zero
13590 if (!LHS->isConstantZero() && !RHS->isConstantZero())
13591 return std::nullopt;
13592 if (!LHS || LHS->isConstantZero())
13593 return RHS;
13594 if (!RHS || RHS->isConstantZero())
13595 return LHS;
13596 return std::nullopt;
13597 }
13598
13599 case ISD::AND: {
13600 if (IsVec)
13601 return std::nullopt;
13602
13603 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13604 if (!BitMaskOp)
13605 return std::nullopt;
13606
13607 uint32_t BitMask = BitMaskOp->getZExtValue();
13608 // Bits we expect for our StartingIndex
13609 uint32_t IndexMask = 0xFF << (Index * 8);
13610
13611 if ((IndexMask & BitMask) != IndexMask) {
13612 // If the result of the and partially provides the byte, then it
13613 // is not well formatted
13614 if (IndexMask & BitMask)
13615 return std::nullopt;
13617 }
13618
13619 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
13620 }
13621
13622 case ISD::FSHR: {
13623 if (IsVec)
13624 return std::nullopt;
13625
13626 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
13627 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13628 if (!ShiftOp || Op.getValueType().isVector())
13629 return std::nullopt;
13630
13631 uint64_t BitsProvided = Op.getValueSizeInBits();
13632 if (BitsProvided % 8 != 0)
13633 return std::nullopt;
13634
13635 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13636 if (BitShift % 8)
13637 return std::nullopt;
13638
13639 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13640 uint64_t ByteShift = BitShift / 8;
13641
13642 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13643 uint64_t BytesProvided = BitsProvided / 8;
13644 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13645 NewIndex %= BytesProvided;
13646 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
13647 }
13648
13649 case ISD::SRA:
13650 case ISD::SRL: {
13651 if (IsVec)
13652 return std::nullopt;
13653
13654 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13655 if (!ShiftOp)
13656 return std::nullopt;
13657
13658 uint64_t BitShift = ShiftOp->getZExtValue();
13659 if (BitShift % 8)
13660 return std::nullopt;
13661
13662 auto BitsProvided = Op.getScalarValueSizeInBits();
13663 if (BitsProvided % 8 != 0)
13664 return std::nullopt;
13665
13666 uint64_t BytesProvided = BitsProvided / 8;
13667 uint64_t ByteShift = BitShift / 8;
13668 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
13669 // If the byte we are trying to provide (as tracked by index) falls in this
13670 // range, then the SRL provides the byte. The byte of interest of the src of
13671 // the SRL is Index + ByteShift
13672 return BytesProvided - ByteShift > Index
13673 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
13674 Index + ByteShift)
13676 }
13677
13678 case ISD::SHL: {
13679 if (IsVec)
13680 return std::nullopt;
13681
13682 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13683 if (!ShiftOp)
13684 return std::nullopt;
13685
13686 uint64_t BitShift = ShiftOp->getZExtValue();
13687 if (BitShift % 8 != 0)
13688 return std::nullopt;
13689 uint64_t ByteShift = BitShift / 8;
13690
13691 // If we are shifting by an amount greater than (or equal to)
13692 // the index we are trying to provide, then it provides 0s. If not,
13693 // then this bytes are not definitively 0s, and the corresponding byte
13694 // of interest is Index - ByteShift of the src
13695 return Index < ByteShift
13697 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
13698 Depth + 1, StartingIndex);
13699 }
13700 case ISD::ANY_EXTEND:
13701 case ISD::SIGN_EXTEND:
13702 case ISD::ZERO_EXTEND:
13704 case ISD::AssertZext:
13705 case ISD::AssertSext: {
13706 if (IsVec)
13707 return std::nullopt;
13708
13709 SDValue NarrowOp = Op->getOperand(0);
13710 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
13711 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
13712 Op->getOpcode() == ISD::AssertZext ||
13713 Op->getOpcode() == ISD::AssertSext) {
13714 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13715 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13716 }
13717 if (NarrowBitWidth % 8 != 0)
13718 return std::nullopt;
13719 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13720
13721 if (Index >= NarrowByteWidth)
13722 return Op.getOpcode() == ISD::ZERO_EXTEND
13723 ? std::optional<ByteProvider<SDValue>>(
13725 : std::nullopt;
13726 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
13727 }
13728
13729 case ISD::TRUNCATE: {
13730 if (IsVec)
13731 return std::nullopt;
13732
13733 uint64_t NarrowByteWidth = BitWidth / 8;
13734
13735 if (NarrowByteWidth >= Index) {
13736 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13737 StartingIndex);
13738 }
13739
13740 return std::nullopt;
13741 }
13742
13743 case ISD::CopyFromReg: {
13744 if (BitWidth / 8 > Index)
13745 return calculateSrcByte(Op, StartingIndex, Index);
13746
13747 return std::nullopt;
13748 }
13749
13750 case ISD::LOAD: {
13751 auto *L = cast<LoadSDNode>(Op.getNode());
13752
13753 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13754 if (NarrowBitWidth % 8 != 0)
13755 return std::nullopt;
13756 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13757
13758 // If the width of the load does not reach byte we are trying to provide for
13759 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
13760 // question
13761 if (Index >= NarrowByteWidth) {
13762 return L->getExtensionType() == ISD::ZEXTLOAD
13763 ? std::optional<ByteProvider<SDValue>>(
13765 : std::nullopt;
13766 }
13767
13768 if (NarrowByteWidth > Index) {
13769 return calculateSrcByte(Op, StartingIndex, Index);
13770 }
13771
13772 return std::nullopt;
13773 }
13774
13775 case ISD::BSWAP: {
13776 if (IsVec)
13777 return std::nullopt;
13778
13779 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
13780 Depth + 1, StartingIndex);
13781 }
13782
13784 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13785 if (!IdxOp)
13786 return std::nullopt;
13787 auto VecIdx = IdxOp->getZExtValue();
13788 auto ScalarSize = Op.getScalarValueSizeInBits();
13789 if (ScalarSize < 32)
13790 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13791 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
13792 StartingIndex, Index);
13793 }
13794
13795 case AMDGPUISD::PERM: {
13796 if (IsVec)
13797 return std::nullopt;
13798
13799 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13800 if (!PermMask)
13801 return std::nullopt;
13802
13803 auto IdxMask =
13804 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13805 if (IdxMask > 0x07 && IdxMask != 0x0c)
13806 return std::nullopt;
13807
13808 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13809 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13810
13811 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
13814 }
13815
13816 default: {
13817 return std::nullopt;
13818 }
13819 }
13820
13821 llvm_unreachable("fully handled switch");
13822}
13823
13824// Returns true if the Operand is a scalar and is 16 bits
13825static bool isExtendedFrom16Bits(SDValue &Operand) {
13826
13827 switch (Operand.getOpcode()) {
13828 case ISD::ANY_EXTEND:
13829 case ISD::SIGN_EXTEND:
13830 case ISD::ZERO_EXTEND: {
13831 auto OpVT = Operand.getOperand(0).getValueType();
13832 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
13833 }
13834 case ISD::LOAD: {
13835 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
13836 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
13837 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
13838 ExtType == ISD::EXTLOAD) {
13839 auto MemVT = L->getMemoryVT();
13840 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
13841 }
13842 return L->getMemoryVT().getSizeInBits() == 16;
13843 }
13844 default:
13845 return false;
13846 }
13847}
13848
// Returns true if the mask selects consecutive bytes (in increasing address
// order) and the first byte begins at an even byte offset, i.e. the two bytes
// are addressable as one aligned 16-bit quantity.
static bool addresses16Bits(int Mask) {
  int Low8 = Mask & 0xff;
  int Hi8 = (Mask & 0xff00) >> 8;

  // Byte selectors must be in the 0-7 lane range here.
  assert(Low8 < 8 && Hi8 < 8);
  // Are the bytes contiguous in the order of increasing addresses.
  bool IsConsecutive = (Hi8 - Low8 == 1);
  // Is the first byte at a location that is aligned for 16 bit instructions.
  // A counter example is taking 2 consecutive bytes starting at the 8th bit.
  // In this case, we still need code to extract the 16 bit operand, so it
  // is better to use i8 v_perm.
  bool Is16Aligned = !(Low8 % 2);

  return IsConsecutive && Is16Aligned;
}
13866
13867// Do not lower into v_perm if the operands are actually 16 bit
13868// and the selected bits (based on PermMask) correspond with two
13869// easily addressable 16 bit operands.
13871 SDValue &OtherOp) {
13872 int Low16 = PermMask & 0xffff;
13873 int Hi16 = (PermMask & 0xffff0000) >> 16;
13874
13875 auto TempOp = peekThroughBitcasts(Op);
13876 auto TempOtherOp = peekThroughBitcasts(OtherOp);
13877
13878 auto OpIs16Bit =
13879 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
13880 if (!OpIs16Bit)
13881 return true;
13882
13883 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
13884 isExtendedFrom16Bits(TempOtherOp);
13885 if (!OtherOpIs16Bit)
13886 return true;
13887
13888 // Do we cleanly address both
13889 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
13890}
13891
13893 unsigned DWordOffset) {
13894 SDValue Ret;
13895
13896 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
13897 // ByteProvider must be at least 8 bits
13898 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
13899
13900 if (TypeSize <= 32)
13901 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
13902
13903 if (Src.getValueType().isVector()) {
13904 auto ScalarTySize = Src.getScalarValueSizeInBits();
13905 auto ScalarTy = Src.getValueType().getScalarType();
13906 if (ScalarTySize == 32) {
13907 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
13908 DAG.getConstant(DWordOffset, SL, MVT::i32));
13909 }
13910 if (ScalarTySize > 32) {
13911 Ret = DAG.getNode(
13912 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
13913 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
13914 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
13915 if (ShiftVal)
13916 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
13917 DAG.getConstant(ShiftVal, SL, MVT::i32));
13918 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13919 }
13920
13921 assert(ScalarTySize < 32);
13922 auto NumElements = TypeSize / ScalarTySize;
13923 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
13924 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
13925 auto NumElementsIn32 = 32 / ScalarTySize;
13926 auto NumAvailElements = DWordOffset < Trunc32Elements
13927 ? NumElementsIn32
13928 : NumElements - NormalizedTrunc;
13929
13931 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
13932 NumAvailElements);
13933
13934 Ret = DAG.getBuildVector(
13935 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
13936 VecSrcs);
13937 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13938 }
13939
13940 /// Scalar Type
13941 auto ShiftVal = 32 * DWordOffset;
13942 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
13943 DAG.getConstant(ShiftVal, SL, MVT::i32));
13944 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13945}
13946
13948 SelectionDAG &DAG = DCI.DAG;
13949 [[maybe_unused]] EVT VT = N->getValueType(0);
13951
13952 // VT is known to be MVT::i32, so we need to provide 4 bytes.
13953 assert(VT == MVT::i32);
13954 for (int i = 0; i < 4; i++) {
13955 // Find the ByteProvider that provides the ith byte of the result of OR
13956 std::optional<ByteProvider<SDValue>> P =
13957 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
13958 // TODO support constantZero
13959 if (!P || P->isConstantZero())
13960 return SDValue();
13961
13962 PermNodes.push_back(*P);
13963 }
13964 if (PermNodes.size() != 4)
13965 return SDValue();
13966
13967 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
13968 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
13969 uint64_t PermMask = 0x00000000;
13970 for (size_t i = 0; i < PermNodes.size(); i++) {
13971 auto PermOp = PermNodes[i];
13972 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
13973 // by sizeof(Src2) = 4
13974 int SrcByteAdjust = 4;
13975
13976 // If the Src uses a byte from a different DWORD, then it corresponds
13977 // with a difference source
13978 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
13979 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
13980 if (SecondSrc)
13981 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
13982 ((PermOp.SrcOffset / 4) != SecondSrc->second))
13983 return SDValue();
13984
13985 // Set the index of the second distinct Src node
13986 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
13987 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
13988 SrcByteAdjust = 0;
13989 }
13990 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
13992 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
13993 }
13994 SDLoc DL(N);
13995 SDValue Op = *PermNodes[FirstSrc.first].Src;
13996 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
13997 assert(Op.getValueSizeInBits() == 32);
13998
13999 // Check that we are not just extracting the bytes in order from an op
14000 if (!SecondSrc) {
14001 int Low16 = PermMask & 0xffff;
14002 int Hi16 = (PermMask & 0xffff0000) >> 16;
14003
14004 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
14005 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
14006
14007 // The perm op would really just produce Op. So combine into Op
14008 if (WellFormedLow && WellFormedHi)
14009 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
14010 }
14011
14012 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
14013
14014 if (SecondSrc) {
14015 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
14016 assert(OtherOp.getValueSizeInBits() == 32);
14017 }
14018
14019 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
14020
14021 assert(Op.getValueType().isByteSized() &&
14022 OtherOp.getValueType().isByteSized());
14023
14024 // If the ultimate src is less than 32 bits, then we will only be
14025 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
14026 // CalculateByteProvider would not have returned Op as source if we
14027 // used a byte that is outside its ValueType. Thus, we are free to
14028 // ANY_EXTEND as the extended bits are dont-cares.
14029 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
14030 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
14031
14032 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
14033 DAG.getConstant(PermMask, DL, MVT::i32));
14034 }
14035 return SDValue();
14036}
14037
14038SDValue SITargetLowering::performOrCombine(SDNode *N,
14039 DAGCombinerInfo &DCI) const {
14040 SelectionDAG &DAG = DCI.DAG;
14041 SDValue LHS = N->getOperand(0);
14042 SDValue RHS = N->getOperand(1);
14043
14044 EVT VT = N->getValueType(0);
14045 if (VT == MVT::i1) {
14046 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
14047 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
14048 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
14049 SDValue Src = LHS.getOperand(0);
14050 if (Src != RHS.getOperand(0))
14051 return SDValue();
14052
14053 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
14054 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
14055 if (!CLHS || !CRHS)
14056 return SDValue();
14057
14058 // Only 10 bits are used.
14059 static const uint32_t MaxMask = 0x3ff;
14060
14061 uint32_t NewMask =
14062 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
14063 SDLoc DL(N);
14064 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
14065 DAG.getConstant(NewMask, DL, MVT::i32));
14066 }
14067
14068 return SDValue();
14069 }
14070
14071 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
14073 LHS.getOpcode() == AMDGPUISD::PERM &&
14074 isa<ConstantSDNode>(LHS.getOperand(2))) {
14075 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
14076 if (!Sel)
14077 return SDValue();
14078
14079 Sel |= LHS.getConstantOperandVal(2);
14080 SDLoc DL(N);
14081 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14082 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
14083 }
14084
14085 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
14086 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14087 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
14088 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14089
14090 // If all the uses of an or need to extract the individual elements, do not
14091 // attempt to lower into v_perm
14092 auto usesCombinedOperand = [](SDNode *OrUse) {
14093 // If we have any non-vectorized use, then it is a candidate for v_perm
14094 if (OrUse->getOpcode() != ISD::BITCAST ||
14095 !OrUse->getValueType(0).isVector())
14096 return true;
14097
14098 // If we have any non-vectorized use, then it is a candidate for v_perm
14099 for (auto *VUser : OrUse->users()) {
14100 if (!VUser->getValueType(0).isVector())
14101 return true;
14102
14103 // If the use of a vector is a store, then combining via a v_perm
14104 // is beneficial.
14105 // TODO -- whitelist more uses
14106 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
14107 if (VUser->getOpcode() == VectorwiseOp)
14108 return true;
14109 }
14110 return false;
14111 };
14112
14113 if (!any_of(N->users(), usesCombinedOperand))
14114 return SDValue();
14115
14116 uint32_t LHSMask = getPermuteMask(LHS);
14117 uint32_t RHSMask = getPermuteMask(RHS);
14118
14119 if (LHSMask != ~0u && RHSMask != ~0u) {
14120 // Canonicalize the expression in an attempt to have fewer unique masks
14121 // and therefore fewer registers used to hold the masks.
14122 if (LHSMask > RHSMask) {
14123 std::swap(LHSMask, RHSMask);
14124 std::swap(LHS, RHS);
14125 }
14126
14127 // Select 0xc for each lane used from source operand. Zero has 0xc mask
14128 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
14129 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14130 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14131
14132 // Check of we need to combine values from two sources within a byte.
14133 if (!(LHSUsedLanes & RHSUsedLanes) &&
14134 // If we select high and lower word keep it for SDWA.
14135 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
14136 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14137 // Kill zero bytes selected by other mask. Zero value is 0xc.
14138 LHSMask &= ~RHSUsedLanes;
14139 RHSMask &= ~LHSUsedLanes;
14140 // Add 4 to each active LHS lane
14141 LHSMask |= LHSUsedLanes & 0x04040404;
14142 // Combine masks
14143 uint32_t Sel = LHSMask | RHSMask;
14144 SDLoc DL(N);
14145
14146 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14147 RHS.getOperand(0),
14148 DAG.getConstant(Sel, DL, MVT::i32));
14149 }
14150 }
14151 if (LHSMask == ~0u || RHSMask == ~0u) {
14152 if (SDValue Perm = matchPERM(N, DCI))
14153 return Perm;
14154 }
14155 }
14156
14157 // Detect identity v2i32 OR and replace with identity source node.
14158 // Specifically an Or that has operands constructed from the same source node
14159 // via extract_vector_elt and build_vector. I.E.
14160 // v2i32 or(
14161 // v2i32 build_vector(
14162 // i32 extract_elt(%IdentitySrc, 0),
14163 // i32 0
14164 // ),
14165 // v2i32 build_vector(
14166 // i32 0,
14167 // i32 extract_elt(%IdentitySrc, 1)
14168 // ) )
14169 // =>
14170 // v2i32 %IdentitySrc
14171
14172 if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
14173 RHS->getOpcode() == ISD::BUILD_VECTOR) {
14174
14175 ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
14176 ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
14177
14178 // Test for and normalise build vectors.
14179 if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
14180
14181 // Get the extract_vector_element operands.
14182 SDValue LEVE = LHS->getOperand(0);
14183 SDValue REVE = RHS->getOperand(1);
14184
14185 if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
14187 // Check that different elements from the same vector are
14188 // extracted.
14189 if (LEVE->getOperand(0) == REVE->getOperand(0) &&
14190 LEVE->getOperand(1) != REVE->getOperand(1)) {
14191 SDValue IdentitySrc = LEVE.getOperand(0);
14192 return IdentitySrc;
14193 }
14194 }
14195 }
14196 }
14197
14198 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14199 return SDValue();
14200
14201 // TODO: This could be a generic combine with a predicate for extracting the
14202 // high half of an integer being free.
14203
14204 // (or i64:x, (zero_extend i32:y)) ->
14205 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
14206 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
14207 RHS.getOpcode() != ISD::ZERO_EXTEND)
14208 std::swap(LHS, RHS);
14209
14210 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
14211 SDValue ExtSrc = RHS.getOperand(0);
14212 EVT SrcVT = ExtSrc.getValueType();
14213 if (SrcVT == MVT::i32) {
14214 SDLoc SL(N);
14215 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
14216 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
14217
14218 DCI.AddToWorklist(LowOr.getNode());
14219 DCI.AddToWorklist(HiBits.getNode());
14220
14221 SDValue Vec =
14222 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
14223 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
14224 }
14225 }
14226
14227 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
14228 if (CRHS) {
14229 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
14230 N->getOperand(0), CRHS))
14231 return Split;
14232 }
14233
14234 return SDValue();
14235}
14236
14237SDValue SITargetLowering::performXorCombine(SDNode *N,
14238 DAGCombinerInfo &DCI) const {
14239 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
14240 return RV;
14241
14242 SDValue LHS = N->getOperand(0);
14243 SDValue RHS = N->getOperand(1);
14244
14245 const ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
14246 SelectionDAG &DAG = DCI.DAG;
14247
14248 EVT VT = N->getValueType(0);
14249 if (CRHS && VT == MVT::i64) {
14250 if (SDValue Split =
14251 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
14252 return Split;
14253 }
14254
14255 // v2i32 (xor (vselect cc, x, y), K) ->
14256 // (v2i32 svelect cc, (xor x, K), (xor y, K)) This enables the xor to be
14257 // replaced with source modifiers when the select is lowered to CNDMASK.
14258 unsigned Opc = LHS.getOpcode();
14259 if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
14260 (Opc == ISD::SELECT && VT == MVT::i64)) &&
14261 CRHS && CRHS->getAPIntValue().isSignMask()) {
14262 SDValue CC = LHS->getOperand(0);
14263 SDValue TRUE = LHS->getOperand(1);
14264 SDValue FALSE = LHS->getOperand(2);
14265 SDValue XTrue = DAG.getNode(ISD::XOR, SDLoc(N), VT, TRUE, RHS);
14266 SDValue XFalse = DAG.getNode(ISD::XOR, SDLoc(N), VT, FALSE, RHS);
14267 SDValue XSelect =
14268 DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse);
14269 return XSelect;
14270 }
14271
14272 // Make sure to apply the 64-bit constant splitting fold before trying to fold
14273 // fneg-like xors into 64-bit select.
14274 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
14275 // This looks like an fneg, try to fold as a source modifier.
14276 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
14278 // xor (select c, a, b), 0x80000000 ->
14279 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
14280 SDLoc DL(N);
14281 SDValue CastLHS =
14282 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
14283 SDValue CastRHS =
14284 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
14285 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
14286 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
14287 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
14288 LHS->getOperand(0), FNegLHS, FNegRHS);
14289 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
14290 }
14291 }
14292
14293 return SDValue();
14294}
14295
14296SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
14297 DAGCombinerInfo &DCI) const {
14298 if (!Subtarget->has16BitInsts() ||
14299 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14300 return SDValue();
14301
14302 EVT VT = N->getValueType(0);
14303 if (VT != MVT::i32)
14304 return SDValue();
14305
14306 SDValue Src = N->getOperand(0);
14307 if (Src.getValueType() != MVT::i16)
14308 return SDValue();
14309
14310 return SDValue();
14311}
14312
/// Fold sign_extend_inreg of a zero-extending scalar or VMEM buffer load into
/// the corresponding sign-extending buffer-load node, removing the separate
/// extension.
SDValue
SITargetLowering::performSignExtendInRegCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SDValue Src = N->getOperand(0);
  auto *VTSign = cast<VTSDNode>(N->getOperand(1));

  // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
  // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
  if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
        VTSign->getVT() == MVT::i8) ||
       (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
        VTSign->getVT() == MVT::i16))) {
    assert(Subtarget->hasScalarSubwordLoads() &&
           "s_buffer_load_{u8, i8} are supported "
           "in GFX12 (or newer) architectures.");
    EVT VT = Src.getValueType();
    unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
                       ? AMDGPUISD::SBUFFER_LOAD_BYTE
                       : AMDGPUISD::SBUFFER_LOAD_SHORT;
    SDLoc DL(N);
    SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
    // Operand order must match the SBUFFER_LOAD_* node layout.
    SDValue Ops[] = {
        Src.getOperand(0), // source register
        Src.getOperand(1), // offset
        Src.getOperand(2)  // cachePolicy
    };
    auto *M = cast<MemSDNode>(Src);
    // Build the signed load reusing the original memory operand, then
    // truncate the i32 result back to the original value type.
    SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
        Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
    SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
    return LoadVal;
  }
  // Same fold for VMEM buffer loads. Only safe when the unsigned load has a
  // single use, since its result is replaced outright by the signed variant.
  if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
        VTSign->getVT() == MVT::i8) ||
       (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
        VTSign->getVT() == MVT::i16)) &&
      Src.hasOneUse()) {
    auto *M = cast<MemSDNode>(Src);
    // Operand order must match the BUFFER_LOAD_* node layout.
    SDValue Ops[] = {Src.getOperand(0), // Chain
                     Src.getOperand(1), // rsrc
                     Src.getOperand(2), // vindex
                     Src.getOperand(3), // voffset
                     Src.getOperand(4), // soffset
                     Src.getOperand(5), // offset
                     Src.getOperand(6), Src.getOperand(7)};
    // replace with BUFFER_LOAD_BYTE/SHORT
    SDVTList ResList =
        DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
    unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
                       ? AMDGPUISD::BUFFER_LOAD_BYTE
                       : AMDGPUISD::BUFFER_LOAD_SHORT;
    SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
        Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
    // Merge the value and chain results so both uses are replaced.
    return DCI.DAG.getMergeValues(
        {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
  }
  return SDValue();
}
14371
14372SDValue SITargetLowering::performClassCombine(SDNode *N,
14373 DAGCombinerInfo &DCI) const {
14374 SelectionDAG &DAG = DCI.DAG;
14375 SDValue Mask = N->getOperand(1);
14376
14377 // fp_class x, 0 -> false
14378 if (isNullConstant(Mask))
14379 return DAG.getConstant(0, SDLoc(N), MVT::i1);
14380
14381 if (N->getOperand(0).isUndef())
14382 return DAG.getUNDEF(MVT::i1);
14383
14384 return SDValue();
14385}
14386
14387SDValue SITargetLowering::performRcpCombine(SDNode *N,
14388 DAGCombinerInfo &DCI) const {
14389 EVT VT = N->getValueType(0);
14390 SDValue N0 = N->getOperand(0);
14391
14392 if (N0.isUndef()) {
14393 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
14394 SDLoc(N), VT);
14395 }
14396
14397 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
14398 N0.getOpcode() == ISD::SINT_TO_FP)) {
14399 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
14400 N->getFlags());
14401 }
14402
14403 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
14404 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
14405 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
14406 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
14407 N->getFlags());
14408 }
14409
14411}
14412
14414 unsigned MaxDepth) const {
14415 unsigned Opcode = Op.getOpcode();
14416 if (Opcode == ISD::FCANONICALIZE)
14417 return true;
14418
14419 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14420 const auto &F = CFP->getValueAPF();
14421 if (F.isNaN() && F.isSignaling())
14422 return false;
14423 if (!F.isDenormal())
14424 return true;
14425
14426 DenormalMode Mode =
14427 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
14428 return Mode == DenormalMode::getIEEE();
14429 }
14430
14431 // If source is a result of another standard FP operation it is already in
14432 // canonical form.
14433 if (MaxDepth == 0)
14434 return false;
14435
14436 switch (Opcode) {
14437 // These will flush denorms if required.
14438 case ISD::FADD:
14439 case ISD::FSUB:
14440 case ISD::FMUL:
14441 case ISD::FCEIL:
14442 case ISD::FFLOOR:
14443 case ISD::FMA:
14444 case ISD::FMAD:
14445 case ISD::FSQRT:
14446 case ISD::FDIV:
14447 case ISD::FREM:
14448 case ISD::FP_ROUND:
14449 case ISD::FP_EXTEND:
14450 case ISD::FP16_TO_FP:
14451 case ISD::FP_TO_FP16:
14452 case ISD::BF16_TO_FP:
14453 case ISD::FP_TO_BF16:
14454 case ISD::FLDEXP:
14455 case AMDGPUISD::FMUL_LEGACY:
14456 case AMDGPUISD::FMAD_FTZ:
14457 case AMDGPUISD::RCP:
14458 case AMDGPUISD::RSQ:
14459 case AMDGPUISD::RSQ_CLAMP:
14460 case AMDGPUISD::RCP_LEGACY:
14461 case AMDGPUISD::RCP_IFLAG:
14462 case AMDGPUISD::LOG:
14463 case AMDGPUISD::EXP:
14464 case AMDGPUISD::DIV_SCALE:
14465 case AMDGPUISD::DIV_FMAS:
14466 case AMDGPUISD::DIV_FIXUP:
14467 case AMDGPUISD::FRACT:
14468 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14469 case AMDGPUISD::CVT_F32_UBYTE0:
14470 case AMDGPUISD::CVT_F32_UBYTE1:
14471 case AMDGPUISD::CVT_F32_UBYTE2:
14472 case AMDGPUISD::CVT_F32_UBYTE3:
14473 case AMDGPUISD::FP_TO_FP16:
14474 case AMDGPUISD::SIN_HW:
14475 case AMDGPUISD::COS_HW:
14476 return true;
14477
14478 // It can/will be lowered or combined as a bit operation.
14479 // Need to check their input recursively to handle.
14480 case ISD::FNEG:
14481 case ISD::FABS:
14482 case ISD::FCOPYSIGN:
14483 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14484
14485 case ISD::AND:
14486 if (Op.getValueType() == MVT::i32) {
14487 // Be careful as we only know it is a bitcast floating point type. It
14488 // could be f32, v2f16, we have no way of knowing. Luckily the constant
14489 // value that we optimize for, which comes up in fp32 to bf16 conversions,
14490 // is valid to optimize for all types.
14491 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
14492 if (RHS->getZExtValue() == 0xffff0000) {
14493 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14494 }
14495 }
14496 }
14497 break;
14498
14499 case ISD::FSIN:
14500 case ISD::FCOS:
14501 case ISD::FSINCOS:
14502 return Op.getValueType().getScalarType() != MVT::f16;
14503
14504 case ISD::FMINNUM:
14505 case ISD::FMAXNUM:
14506 case ISD::FMINNUM_IEEE:
14507 case ISD::FMAXNUM_IEEE:
14508 case ISD::FMINIMUM:
14509 case ISD::FMAXIMUM:
14510 case ISD::FMINIMUMNUM:
14511 case ISD::FMAXIMUMNUM:
14512 case AMDGPUISD::CLAMP:
14513 case AMDGPUISD::FMED3:
14514 case AMDGPUISD::FMAX3:
14515 case AMDGPUISD::FMIN3:
14516 case AMDGPUISD::FMAXIMUM3:
14517 case AMDGPUISD::FMINIMUM3: {
14518 // FIXME: Shouldn't treat the generic operations different based these.
14519 // However, we aren't really required to flush the result from
14520 // minnum/maxnum..
14521
14522 // snans will be quieted, so we only need to worry about denormals.
14523 if (Subtarget->supportsMinMaxDenormModes() ||
14524 // FIXME: denormalsEnabledForType is broken for dynamic
14525 denormalsEnabledForType(DAG, Op.getValueType()))
14526 return true;
14527
14528 // Flushing may be required.
14529 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
14530 // targets need to check their input recursively.
14531
14532 // FIXME: Does this apply with clamp? It's implemented with max.
14533 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
14534 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
14535 return false;
14536 }
14537
14538 return true;
14539 }
14540 case ISD::SELECT: {
14541 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
14542 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
14543 }
14544 case ISD::BUILD_VECTOR: {
14545 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
14546 SDValue SrcOp = Op.getOperand(i);
14547 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
14548 return false;
14549 }
14550
14551 return true;
14552 }
14555 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14556 }
14558 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
14559 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
14560 }
14561 case ISD::UNDEF:
14562 // Could be anything.
14563 return false;
14564
14565 case ISD::BITCAST:
14566 // TODO: This is incorrect as it loses track of the operand's type. We may
14567 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
14568 // same bits that are canonicalized in one type need not be in the other.
14569 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14570 case ISD::TRUNCATE: {
14571 // Hack round the mess we make when legalizing extract_vector_elt
14572 if (Op.getValueType() == MVT::i16) {
14573 SDValue TruncSrc = Op.getOperand(0);
14574 if (TruncSrc.getValueType() == MVT::i32 &&
14575 TruncSrc.getOpcode() == ISD::BITCAST &&
14576 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
14577 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
14578 }
14579 }
14580 return false;
14581 }
14583 unsigned IntrinsicID = Op.getConstantOperandVal(0);
14584 // TODO: Handle more intrinsics
14585 switch (IntrinsicID) {
14586 case Intrinsic::amdgcn_cvt_pkrtz:
14587 case Intrinsic::amdgcn_cubeid:
14588 case Intrinsic::amdgcn_frexp_mant:
14589 case Intrinsic::amdgcn_fdot2:
14590 case Intrinsic::amdgcn_rcp:
14591 case Intrinsic::amdgcn_rsq:
14592 case Intrinsic::amdgcn_rsq_clamp:
14593 case Intrinsic::amdgcn_rcp_legacy:
14594 case Intrinsic::amdgcn_rsq_legacy:
14595 case Intrinsic::amdgcn_trig_preop:
14596 case Intrinsic::amdgcn_tanh:
14597 case Intrinsic::amdgcn_log:
14598 case Intrinsic::amdgcn_exp2:
14599 case Intrinsic::amdgcn_sqrt:
14600 return true;
14601 default:
14602 break;
14603 }
14604
14605 break;
14606 }
14607 default:
14608 break;
14609 }
14610
14611 // FIXME: denormalsEnabledForType is broken for dynamic
14612 return denormalsEnabledForType(DAG, Op.getValueType()) &&
14613 DAG.isKnownNeverSNaN(Op);
14614}
14615
14617 unsigned MaxDepth) const {
14618 const MachineRegisterInfo &MRI = MF.getRegInfo();
14619 MachineInstr *MI = MRI.getVRegDef(Reg);
14620 unsigned Opcode = MI->getOpcode();
14621
14622 if (Opcode == AMDGPU::G_FCANONICALIZE)
14623 return true;
14624
14625 std::optional<FPValueAndVReg> FCR;
14626 // Constant splat (can be padded with undef) or scalar constant.
14628 if (FCR->Value.isSignaling())
14629 return false;
14630 if (!FCR->Value.isDenormal())
14631 return true;
14632
14633 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
14634 return Mode == DenormalMode::getIEEE();
14635 }
14636
14637 if (MaxDepth == 0)
14638 return false;
14639
14640 switch (Opcode) {
14641 case AMDGPU::G_FADD:
14642 case AMDGPU::G_FSUB:
14643 case AMDGPU::G_FMUL:
14644 case AMDGPU::G_FCEIL:
14645 case AMDGPU::G_FFLOOR:
14646 case AMDGPU::G_FRINT:
14647 case AMDGPU::G_FNEARBYINT:
14648 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
14649 case AMDGPU::G_INTRINSIC_TRUNC:
14650 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
14651 case AMDGPU::G_FMA:
14652 case AMDGPU::G_FMAD:
14653 case AMDGPU::G_FSQRT:
14654 case AMDGPU::G_FDIV:
14655 case AMDGPU::G_FREM:
14656 case AMDGPU::G_FPOW:
14657 case AMDGPU::G_FPEXT:
14658 case AMDGPU::G_FLOG:
14659 case AMDGPU::G_FLOG2:
14660 case AMDGPU::G_FLOG10:
14661 case AMDGPU::G_FPTRUNC:
14662 case AMDGPU::G_AMDGPU_RCP_IFLAG:
14663 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
14664 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
14665 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
14666 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14667 return true;
14668 case AMDGPU::G_FNEG:
14669 case AMDGPU::G_FABS:
14670 case AMDGPU::G_FCOPYSIGN:
14671 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
14672 case AMDGPU::G_FMINNUM:
14673 case AMDGPU::G_FMAXNUM:
14674 case AMDGPU::G_FMINNUM_IEEE:
14675 case AMDGPU::G_FMAXNUM_IEEE:
14676 case AMDGPU::G_FMINIMUM:
14677 case AMDGPU::G_FMAXIMUM:
14678 case AMDGPU::G_FMINIMUMNUM:
14679 case AMDGPU::G_FMAXIMUMNUM: {
14680 if (Subtarget->supportsMinMaxDenormModes() ||
14681 // FIXME: denormalsEnabledForType is broken for dynamic
14682 denormalsEnabledForType(MRI.getType(Reg), MF))
14683 return true;
14684
14685 [[fallthrough]];
14686 }
14687 case AMDGPU::G_BUILD_VECTOR:
14688 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
14689 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
14690 return false;
14691 return true;
14692 case AMDGPU::G_INTRINSIC:
14693 case AMDGPU::G_INTRINSIC_CONVERGENT:
14694 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
14695 case Intrinsic::amdgcn_fmul_legacy:
14696 case Intrinsic::amdgcn_fmad_ftz:
14697 case Intrinsic::amdgcn_sqrt:
14698 case Intrinsic::amdgcn_fmed3:
14699 case Intrinsic::amdgcn_sin:
14700 case Intrinsic::amdgcn_cos:
14701 case Intrinsic::amdgcn_log:
14702 case Intrinsic::amdgcn_exp2:
14703 case Intrinsic::amdgcn_log_clamp:
14704 case Intrinsic::amdgcn_rcp:
14705 case Intrinsic::amdgcn_rcp_legacy:
14706 case Intrinsic::amdgcn_rsq:
14707 case Intrinsic::amdgcn_rsq_clamp:
14708 case Intrinsic::amdgcn_rsq_legacy:
14709 case Intrinsic::amdgcn_div_scale:
14710 case Intrinsic::amdgcn_div_fmas:
14711 case Intrinsic::amdgcn_div_fixup:
14712 case Intrinsic::amdgcn_fract:
14713 case Intrinsic::amdgcn_cvt_pkrtz:
14714 case Intrinsic::amdgcn_cubeid:
14715 case Intrinsic::amdgcn_cubema:
14716 case Intrinsic::amdgcn_cubesc:
14717 case Intrinsic::amdgcn_cubetc:
14718 case Intrinsic::amdgcn_frexp_mant:
14719 case Intrinsic::amdgcn_fdot2:
14720 case Intrinsic::amdgcn_trig_preop:
14721 case Intrinsic::amdgcn_tanh:
14722 return true;
14723 default:
14724 break;
14725 }
14726
14727 [[fallthrough]];
14728 default:
14729 return false;
14730 }
14731
14732 llvm_unreachable("invalid operation");
14733}
14734
14735// Constant fold canonicalize.
14736SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
14737 const SDLoc &SL, EVT VT,
14738 const APFloat &C) const {
14739 // Flush denormals to 0 if not enabled.
14740 if (C.isDenormal()) {
14741 DenormalMode Mode =
14742 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
14743 if (Mode == DenormalMode::getPreserveSign()) {
14744 return DAG.getConstantFP(
14745 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
14746 }
14747
14748 if (Mode != DenormalMode::getIEEE())
14749 return SDValue();
14750 }
14751
14752 if (C.isNaN()) {
14753 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
14754 if (C.isSignaling()) {
14755 // Quiet a signaling NaN.
14756 // FIXME: Is this supposed to preserve payload bits?
14757 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14758 }
14759
14760 // Make sure it is the canonical NaN bitpattern.
14761 //
14762 // TODO: Can we use -1 as the canonical NaN value since it's an inline
14763 // immediate?
14764 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
14765 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14766 }
14767
14768 // Already canonical.
14769 return DAG.getConstantFP(C, SL, VT);
14770}
14771
14773 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
14774}
14775
14776SDValue
14777SITargetLowering::performFCanonicalizeCombine(SDNode *N,
14778 DAGCombinerInfo &DCI) const {
14779 SelectionDAG &DAG = DCI.DAG;
14780 SDValue N0 = N->getOperand(0);
14781 EVT VT = N->getValueType(0);
14782
14783 // fcanonicalize undef -> qnan
14784 if (N0.isUndef()) {
14786 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
14787 }
14788
14789 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
14790 EVT VT = N->getValueType(0);
14791 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
14792 }
14793
14794 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
14795 // (fcanonicalize k)
14796 //
14797 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
14798
14799 // TODO: This could be better with wider vectors that will be split to v2f16,
14800 // and to consider uses since there aren't that many packed operations.
14801 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
14802 isTypeLegal(MVT::v2f16)) {
14803 SDLoc SL(N);
14804 SDValue NewElts[2];
14805 SDValue Lo = N0.getOperand(0);
14806 SDValue Hi = N0.getOperand(1);
14807 EVT EltVT = Lo.getValueType();
14808
14810 for (unsigned I = 0; I != 2; ++I) {
14811 SDValue Op = N0.getOperand(I);
14812 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14813 NewElts[I] =
14814 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14815 } else if (Op.isUndef()) {
14816 // Handled below based on what the other operand is.
14817 NewElts[I] = Op;
14818 } else {
14819 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
14820 }
14821 }
14822
14823 // If one half is undef, and one is constant, prefer a splat vector rather
14824 // than the normal qNaN. If it's a register, prefer 0.0 since that's
14825 // cheaper to use and may be free with a packed operation.
14826 if (NewElts[0].isUndef()) {
14827 if (isa<ConstantFPSDNode>(NewElts[1]))
14828 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
14829 ? NewElts[1]
14830 : DAG.getConstantFP(0.0f, SL, EltVT);
14831 }
14832
14833 if (NewElts[1].isUndef()) {
14834 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
14835 ? NewElts[0]
14836 : DAG.getConstantFP(0.0f, SL, EltVT);
14837 }
14838
14839 return DAG.getBuildVector(VT, SL, NewElts);
14840 }
14841 }
14842
14843 return SDValue();
14844}
14845
14846static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
14847 switch (Opc) {
14848 case ISD::FMAXNUM:
14849 case ISD::FMAXNUM_IEEE:
14850 case ISD::FMAXIMUMNUM:
14851 return AMDGPUISD::FMAX3;
14852 case ISD::FMAXIMUM:
14853 return AMDGPUISD::FMAXIMUM3;
14854 case ISD::SMAX:
14855 return AMDGPUISD::SMAX3;
14856 case ISD::UMAX:
14857 return AMDGPUISD::UMAX3;
14858 case ISD::FMINNUM:
14859 case ISD::FMINNUM_IEEE:
14860 case ISD::FMINIMUMNUM:
14861 return AMDGPUISD::FMIN3;
14862 case ISD::FMINIMUM:
14863 return AMDGPUISD::FMINIMUM3;
14864 case ISD::SMIN:
14865 return AMDGPUISD::SMIN3;
14866 case ISD::UMIN:
14867 return AMDGPUISD::UMIN3;
14868 default:
14869 llvm_unreachable("Not a min/max opcode");
14870 }
14871}
14872
14873SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
14874 const SDLoc &SL, SDValue Src,
14875 SDValue MinVal,
14876 SDValue MaxVal,
14877 bool Signed) const {
14878
14879 // med3 comes from
14880 // min(max(x, K0), K1), K0 < K1
14881 // max(min(x, K0), K1), K1 < K0
14882 //
14883 // "MinVal" and "MaxVal" respectively refer to the rhs of the
14884 // min/max op.
14885 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
14886 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
14887
14888 if (!MinK || !MaxK)
14889 return SDValue();
14890
14891 if (Signed) {
14892 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
14893 return SDValue();
14894 } else {
14895 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
14896 return SDValue();
14897 }
14898
14899 EVT VT = MinK->getValueType(0);
14900 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
14901 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
14902 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
14903
14904 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
14905 // not available, but this is unlikely to be profitable as constants
14906 // will often need to be materialized & extended, especially on
14907 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
14908 return SDValue();
14909}
14910
14913 return C;
14914
14916 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
14917 return C;
14918 }
14919
14920 return nullptr;
14921}
14922
/// Try to fold min(max(x, K0), K1) (with splat FP constants K0 <= K1) into
/// clamp or fmed3. \p Op0 is the inner max node, \p Op1 the outer min's
/// constant operand.
SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
                                                  const SDLoc &SL, SDValue Op0,
                                                  SDValue Op1) const {
  ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
  if (!K1)
    return SDValue();

  ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
  if (!K0)
    return SDValue();

  // Ordered >= (although NaN inputs should have folded away by now).
  if (K0->getValueAPF() > K1->getValueAPF())
    return SDValue();

  // med3 with a nan input acts like
  // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
  //
  // So the result depends on whether the IEEE mode bit is enabled or not with a
  // signaling nan input.
  // ieee=1
  // s0 snan: yields s2
  // s1 snan: yields s2
  // s2 snan: qnan

  // s0 qnan: min(s1, s2)
  // s1 qnan: min(s0, s2)
  // s2 qnan: min(s0, s1)

  // ieee=0
  // s0 snan: min(s1, s2)
  // s1 snan: min(s0, s2)
  // s2 snan: qnan

  // s0 qnan: min(s1, s2)
  // s1 qnan: min(s0, s2)
  // s2 qnan: min(s0, s1)
  const MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Check IEEE bit enabled. We can form fmed3 with IEEE=0 regardless of
  // whether the input is a signaling nan if op0 is fmaximum or fmaximumnum. We
  // can only form if op0 is fmaxnum_ieee if IEEE=1.
  EVT VT = Op0.getValueType();
  if (Info->getMode().DX10Clamp) {
    // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
    // hardware fmed3 behavior converting to a min.
    // FIXME: Should this be allowing -0.0?
    if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
      return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
  }

  // med3 for f16 is only available on gfx9+, and not available for v2f16.
  if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
    // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
    // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
    // then give the other result, which is different from med3 with a NaN
    // input.
    SDValue Var = Op0.getOperand(0);
    if (!DAG.isKnownNeverSNaN(Var))
      return SDValue();

    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

    // Only profitable when the constants don't each need a separate
    // materializing instruction (inline immediates or single-use literals).
    if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
        (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
      return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
                         SDValue(K0, 0), SDValue(K1, 0));
    }
  }

  return SDValue();
}
14996
14997/// \return true if the subtarget supports minimum3 and maximum3 with the given
14998/// base min/max opcode \p Opc for type \p VT.
14999static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
15000 EVT VT) {
15001 switch (Opc) {
15002 case ISD::FMINNUM:
15003 case ISD::FMAXNUM:
15004 case ISD::FMINNUM_IEEE:
15005 case ISD::FMAXNUM_IEEE:
15006 case ISD::FMINIMUMNUM:
15007 case ISD::FMAXIMUMNUM:
15008 case AMDGPUISD::FMIN_LEGACY:
15009 case AMDGPUISD::FMAX_LEGACY:
15010 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
15011 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
15012 case ISD::FMINIMUM:
15013 case ISD::FMAXIMUM:
15014 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
15015 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
15016 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
15017 case ISD::SMAX:
15018 case ISD::SMIN:
15019 case ISD::UMAX:
15020 case ISD::UMIN:
15021 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
15022 default:
15023 return false;
15024 }
15025
15026 llvm_unreachable("not a min/max opcode");
15027}
15028
/// Combine nested min/max nodes into min3/max3, fold min/max-of-constant
/// pairs into med3/clamp, and relax fminimum/fmaximum to the cheaper IEEE
/// forms when NaNs are known absent.
SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  EVT VT = N->getValueType(0);
  unsigned Opc = N->getOpcode();
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // Only do this if the inner op has one use since this will just increases
  // register pressure for no benefit.

  if (supportsMin3Max3(*Subtarget, Opc, VT)) {
    // max(max(a, b), c) -> max3(a, b, c)
    // min(min(a, b), c) -> min3(a, b, c)
    if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
      SDLoc DL(N);
      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
                         Op0.getOperand(0), Op0.getOperand(1), Op1);
    }

    // Try commuted.
    // max(a, max(b, c)) -> max3(a, b, c)
    // min(a, min(b, c)) -> min3(a, b, c)
    if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
      SDLoc DL(N);
      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
                         Op0, Op1.getOperand(0), Op1.getOperand(1));
    }
  }

  // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
  // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
  if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(
            DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
      return Med3;
  }
  if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(
            DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
      return Med3;
  }

  // Unsigned variants of the same med3 folds.
  if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(
            DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
      return Med3;
  }
  if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(
            DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
      return Med3;
  }

  // if !is_snan(x):
  //   fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
  //   fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
  //   fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
  //   fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
  if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
       (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
       (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
       (Opc == AMDGPUISD::FMIN_LEGACY &&
        Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
      (VT == MVT::f32 || VT == MVT::f64 ||
       (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
       (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
       (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
       (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
      Op0.hasOneUse()) {
    if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
      return Res;
  }

  // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
  // for some types, but at a higher cost since it's implemented with a 3
  // operand form.
  const SDNodeFlags Flags = N->getFlags();
  if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
      !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) {
    unsigned NewOpc =
        Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
    return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
  }

  return SDValue();
}
15117
15121 // FIXME: Should this be allowing -0.0?
15122 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
15123 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
15124 }
15125 }
15126
15127 return false;
15128}
15129
// FIXME: Should only worry about snans for version with chain.
/// Fold fmed3 nodes whose constant operands form a [0, 1] clamp into the
/// cheaper CLAMP node.
SDValue SITargetLowering::performFMed3Combine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
  // NaNs. With a NaN input, the order of the operands may change the result.

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  SDValue Src0 = N->getOperand(0);
  SDValue Src1 = N->getOperand(1);
  SDValue Src2 = N->getOperand(2);

  if (isClampZeroToOne(Src0, Src1)) {
    // const_a, const_b, x -> clamp is safe in all cases including signaling
    // nans.
    // FIXME: Should this be allowing -0.0?
    return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
  }

  const MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
  // handling no dx10-clamp?
  if (Info->getMode().DX10Clamp) {
    // If NaNs is clamped to 0, we are free to reorder the inputs.

    // The three conditional swaps bubble any constant operands into the Src1
    // and Src2 slots so the clamp pattern below can match.
    if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
      std::swap(Src0, Src1);

    if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
      std::swap(Src1, Src2);

    if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
      std::swap(Src0, Src1);

    if (isClampZeroToOne(Src1, Src2))
      return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
  }

  return SDValue();
}
15174
15175SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
15176 DAGCombinerInfo &DCI) const {
15177 SDValue Src0 = N->getOperand(0);
15178 SDValue Src1 = N->getOperand(1);
15179 if (Src0.isUndef() && Src1.isUndef())
15180 return DCI.DAG.getUNDEF(N->getValueType(0));
15181 return SDValue();
15182}
15183
15184// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
15185// expanded into a set of cmp/select instructions.
15187 unsigned NumElem,
15188 bool IsDivergentIdx,
15189 const GCNSubtarget *Subtarget) {
15191 return false;
15192
15193 unsigned VecSize = EltSize * NumElem;
15194
15195 // Sub-dword vectors of size 2 dword or less have better implementation.
15196 if (VecSize <= 64 && EltSize < 32)
15197 return false;
15198
15199 // Always expand the rest of sub-dword instructions, otherwise it will be
15200 // lowered via memory.
15201 if (EltSize < 32)
15202 return true;
15203
15204 // Always do this if var-idx is divergent, otherwise it will become a loop.
15205 if (IsDivergentIdx)
15206 return true;
15207
15208 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
15209 unsigned NumInsts = NumElem /* Number of compares */ +
15210 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
15211
15212 // On some architectures (GFX9) movrel is not available and it's better
15213 // to expand.
15214 if (Subtarget->useVGPRIndexMode())
15215 return NumInsts <= 16;
15216
15217 // If movrel is available, use it instead of expanding for vector of 8
15218 // elements.
15219 if (Subtarget->hasMovrel())
15220 return NumInsts <= 15;
15221
15222 return true;
15223}
15224
15226 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
15227 if (isa<ConstantSDNode>(Idx))
15228 return false;
15229
15230 SDValue Vec = N->getOperand(0);
15231 EVT VecVT = Vec.getValueType();
15232 EVT EltVT = VecVT.getVectorElementType();
15233 unsigned EltSize = EltVT.getSizeInBits();
15234 unsigned NumElem = VecVT.getVectorNumElements();
15235
15237 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
15238}
15239
15240SDValue
15241SITargetLowering::performExtractVectorEltCombine(SDNode *N,
15242 DAGCombinerInfo &DCI) const {
15243 SDValue Vec = N->getOperand(0);
15244 SelectionDAG &DAG = DCI.DAG;
15245
15246 EVT VecVT = Vec.getValueType();
15247 EVT VecEltVT = VecVT.getVectorElementType();
15248 EVT ResVT = N->getValueType(0);
15249
15250 unsigned VecSize = VecVT.getSizeInBits();
15251 unsigned VecEltSize = VecEltVT.getSizeInBits();
15252
15253 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
15255 SDLoc SL(N);
15256 SDValue Idx = N->getOperand(1);
15257 SDValue Elt =
15258 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
15259 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
15260 }
15261
15262 // (extract_vector_element (and {y0, y1}, (build_vector 0x1f, 0x1f)), index)
15263 // -> (and (extract_vector_element {y0, y1}, index), 0x1f)
15264 // There are optimisations to transform 64-bit shifts into 32-bit shifts
15265 // depending on the shift operand. See e.g. performSraCombine().
15266 // This combine ensures that the optimisation is compatible with v2i32
15267 // legalised AND.
15268 if (VecVT == MVT::v2i32 && Vec->getOpcode() == ISD::AND &&
15269 Vec->getOperand(1)->getOpcode() == ISD::BUILD_VECTOR) {
15270
15272 if (!C || C->getZExtValue() != 0x1f)
15273 return SDValue();
15274
15275 SDLoc SL(N);
15276 SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
15277 SDValue EVE = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
15278 Vec->getOperand(0), N->getOperand(1));
15279 SDValue A = DAG.getNode(ISD::AND, SL, MVT::i32, EVE, AndMask);
15280 DAG.ReplaceAllUsesWith(N, A.getNode());
15281 }
15282
15283 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
15284 // =>
15285 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
15286 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
15287 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
15288 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15289 SDLoc SL(N);
15290 SDValue Idx = N->getOperand(1);
15291 unsigned Opc = Vec.getOpcode();
15292
15293 switch (Opc) {
15294 default:
15295 break;
15296 // TODO: Support other binary operations.
15297 case ISD::FADD:
15298 case ISD::FSUB:
15299 case ISD::FMUL:
15300 case ISD::ADD:
15301 case ISD::UMIN:
15302 case ISD::UMAX:
15303 case ISD::SMIN:
15304 case ISD::SMAX:
15305 case ISD::FMAXNUM:
15306 case ISD::FMINNUM:
15307 case ISD::FMAXNUM_IEEE:
15308 case ISD::FMINNUM_IEEE:
15309 case ISD::FMAXIMUM:
15310 case ISD::FMINIMUM: {
15311 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15312 Vec.getOperand(0), Idx);
15313 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15314 Vec.getOperand(1), Idx);
15315
15316 DCI.AddToWorklist(Elt0.getNode());
15317 DCI.AddToWorklist(Elt1.getNode());
15318 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
15319 }
15320 }
15321 }
15322
15323 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
15325 SDLoc SL(N);
15326 SDValue Idx = N->getOperand(1);
15327 SDValue V;
15328 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15329 SDValue IC = DAG.getVectorIdxConstant(I, SL);
15330 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
15331 if (I == 0)
15332 V = Elt;
15333 else
15334 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
15335 }
15336 return V;
15337 }
15338
15339 if (!DCI.isBeforeLegalize())
15340 return SDValue();
15341
15342 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
15343 // elements. This exposes more load reduction opportunities by replacing
15344 // multiple small extract_vector_elements with a single 32-bit extract.
15345 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
15346 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
15347 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15348 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
15349
15350 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15351 unsigned EltIdx = BitIndex / 32;
15352 unsigned LeftoverBitIdx = BitIndex % 32;
15353 SDLoc SL(N);
15354
15355 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
15356 DCI.AddToWorklist(Cast.getNode());
15357
15358 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
15359 DAG.getConstant(EltIdx, SL, MVT::i32));
15360 DCI.AddToWorklist(Elt.getNode());
15361 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
15362 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
15363 DCI.AddToWorklist(Srl.getNode());
15364
15365 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
15366 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
15367 DCI.AddToWorklist(Trunc.getNode());
15368
15369 if (VecEltVT == ResVT) {
15370 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
15371 }
15372
15373 assert(ResVT.isScalarInteger());
15374 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
15375 }
15376
15377 return SDValue();
15378}
15379
15380SDValue
15381SITargetLowering::performInsertVectorEltCombine(SDNode *N,
15382 DAGCombinerInfo &DCI) const {
15383 SDValue Vec = N->getOperand(0);
15384 SDValue Idx = N->getOperand(2);
15385 EVT VecVT = Vec.getValueType();
15386 EVT EltVT = VecVT.getVectorElementType();
15387
15388 // INSERT_VECTOR_ELT (<n x e>, var-idx)
15389 // => BUILD_VECTOR n x select (e, const-idx)
15391 return SDValue();
15392
15393 SelectionDAG &DAG = DCI.DAG;
15394 SDLoc SL(N);
15395 SDValue Ins = N->getOperand(1);
15396 EVT IdxVT = Idx.getValueType();
15397
15399 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15400 SDValue IC = DAG.getConstant(I, SL, IdxVT);
15401 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
15402 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
15403 Ops.push_back(V);
15404 }
15405
15406 return DAG.getBuildVector(VecVT, SL, Ops);
15407}
15408
15409/// Return the source of an fp_extend from f16 to f32, or a converted FP
15410/// constant.
15412 if (Src.getOpcode() == ISD::FP_EXTEND &&
15413 Src.getOperand(0).getValueType() == MVT::f16) {
15414 return Src.getOperand(0);
15415 }
15416
15417 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
15418 APFloat Val = CFP->getValueAPF();
15419 bool LosesInfo = true;
15421 if (!LosesInfo)
15422 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
15423 }
15424
15425 return SDValue();
15426}
15427
15428SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
15429 DAGCombinerInfo &DCI) const {
15430 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15431 "combine only useful on gfx8");
15432
15433 SDValue TruncSrc = N->getOperand(0);
15434 EVT VT = N->getValueType(0);
15435 if (VT != MVT::f16)
15436 return SDValue();
15437
15438 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
15439 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
15440 return SDValue();
15441
15442 SelectionDAG &DAG = DCI.DAG;
15443 SDLoc SL(N);
15444
15445 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
15446 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
15447 // casting back.
15448
15449 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
15450 // fmin(fmax(a, b), fmax(fmin(a, b), c))
15451 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
15452 if (!A)
15453 return SDValue();
15454
15455 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
15456 if (!B)
15457 return SDValue();
15458
15459 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
15460 if (!C)
15461 return SDValue();
15462
15463 // This changes signaling nan behavior. If an input is a signaling nan, it
15464 // would have been quieted by the fpext originally. We don't care because
15465 // these are unconstrained ops. If we needed to insert quieting canonicalizes
15466 // we would be worse off than just doing the promotion.
15467 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
15468 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
15469 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
15470 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
15471}
15472
15473unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
15474 const SDNode *N0,
15475 const SDNode *N1) const {
15476 EVT VT = N0->getValueType(0);
15477
15478 // Only do this if we are not trying to support denormals. v_mad_f32 does not
15479 // support denormals ever.
15480 if (((VT == MVT::f32 &&
15482 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15485 return ISD::FMAD;
15486
15487 const TargetOptions &Options = DAG.getTarget().Options;
15488 if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
15489 (N0->getFlags().hasAllowContract() &&
15490 N1->getFlags().hasAllowContract())) &&
15492 return ISD::FMA;
15493 }
15494
15495 return 0;
15496}
15497
15498// For a reassociatable opcode perform:
15499// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
15500SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
15501 SelectionDAG &DAG) const {
15502 EVT VT = N->getValueType(0);
15503 if (VT != MVT::i32 && VT != MVT::i64)
15504 return SDValue();
15505
15506 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
15507 return SDValue();
15508
15509 unsigned Opc = N->getOpcode();
15510 SDValue Op0 = N->getOperand(0);
15511 SDValue Op1 = N->getOperand(1);
15512
15513 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
15514 return SDValue();
15515
15516 if (Op0->isDivergent())
15517 std::swap(Op0, Op1);
15518
15519 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
15520 return SDValue();
15521
15522 SDValue Op2 = Op1.getOperand(1);
15523 Op1 = Op1.getOperand(0);
15524 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
15525 return SDValue();
15526
15527 if (Op1->isDivergent())
15528 std::swap(Op1, Op2);
15529
15530 SDLoc SL(N);
15531 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
15532 return DAG.getNode(Opc, SL, VT, Add1, Op2);
15533}
15534
15535static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
15536 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
15538 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
15539 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
15540 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
15541}
15542
15543// Fold
15544// y = lshr i64 x, 32
15545// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
15546// with Const.hi == -1
15547// To
15548// res = mad_u64_u32 y.lo ,Const.lo, x.lo
15550 SDValue MulLHS, SDValue MulRHS,
15551 SDValue AddRHS) {
15552 if (MulRHS.getOpcode() == ISD::SRL)
15553 std::swap(MulLHS, MulRHS);
15554
15555 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
15556 return SDValue();
15557
15558 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
15559 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
15560 MulLHS.getOperand(0) != AddRHS)
15561 return SDValue();
15562
15564 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
15565 return SDValue();
15566
15567 SDValue ConstMul =
15568 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
15569 return getMad64_32(DAG, SL, MVT::i64,
15570 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
15571 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
15572}
15573
15574// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
15575// multiplies, if any.
15576//
15577// Full 64-bit multiplies that feed into an addition are lowered here instead
15578// of using the generic expansion. The generic expansion ends up with
15579// a tree of ADD nodes that prevents us from using the "add" part of the
15580// MAD instruction. The expansion produced here results in a chain of ADDs
15581// instead of a tree.
15582SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
15583 DAGCombinerInfo &DCI) const {
15584 assert(N->isAnyAdd());
15585
15586 SelectionDAG &DAG = DCI.DAG;
15587 EVT VT = N->getValueType(0);
15588 SDLoc SL(N);
15589 SDValue LHS = N->getOperand(0);
15590 SDValue RHS = N->getOperand(1);
15591
15592 if (VT.isVector())
15593 return SDValue();
15594
15595 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
15596 // result in scalar registers for uniform values.
15597 if (!N->isDivergent() && Subtarget->hasSMulHi())
15598 return SDValue();
15599
15600 unsigned NumBits = VT.getScalarSizeInBits();
15601 if (NumBits <= 32 || NumBits > 64)
15602 return SDValue();
15603
15604 if (LHS.getOpcode() != ISD::MUL) {
15605 assert(RHS.getOpcode() == ISD::MUL);
15606 std::swap(LHS, RHS);
15607 }
15608
15609 // Avoid the fold if it would unduly increase the number of multiplies due to
15610 // multiple uses, except on hardware with full-rate multiply-add (which is
15611 // part of full-rate 64-bit ops).
15612 if (!Subtarget->hasFullRate64Ops()) {
15613 unsigned NumUsers = 0;
15614 for (SDNode *User : LHS->users()) {
15615 // There is a use that does not feed into addition, so the multiply can't
15616 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
15617 if (!User->isAnyAdd())
15618 return SDValue();
15619
15620 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
15621 // MUL + 3xADD + 3xADDC over 3xMAD.
15622 ++NumUsers;
15623 if (NumUsers >= 3)
15624 return SDValue();
15625 }
15626 }
15627
15628 SDValue MulLHS = LHS.getOperand(0);
15629 SDValue MulRHS = LHS.getOperand(1);
15630 SDValue AddRHS = RHS;
15631
15632 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
15633 return FoldedMAD;
15634
15635 // Always check whether operands are small unsigned values, since that
15636 // knowledge is useful in more cases. Check for small signed values only if
15637 // doing so can unlock a shorter code sequence.
15638 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
15639 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
15640
15641 bool MulSignedLo = false;
15642 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
15643 MulSignedLo =
15644 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
15645 }
15646
15647 // The operands and final result all have the same number of bits. If
15648 // operands need to be extended, they can be extended with garbage. The
15649 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
15650 // truncated away in the end.
15651 if (VT != MVT::i64) {
15652 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
15653 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
15654 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
15655 }
15656
15657 // The basic code generated is conceptually straightforward. Pseudo code:
15658 //
15659 // accum = mad_64_32 lhs.lo, rhs.lo, accum
15660 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
15661 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
15662 //
15663 // The second and third lines are optional, depending on whether the factors
15664 // are {sign,zero}-extended or not.
15665 //
15666 // The actual DAG is noisier than the pseudo code, but only due to
15667 // instructions that disassemble values into low and high parts, and
15668 // assemble the final result.
15669 SDValue One = DAG.getConstant(1, SL, MVT::i32);
15670
15671 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
15672 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
15673 SDValue Accum =
15674 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
15675
15676 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
15677 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
15678
15679 if (!MulLHSUnsigned32) {
15680 auto MulLHSHi =
15681 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
15682 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
15683 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15684 }
15685
15686 if (!MulRHSUnsigned32) {
15687 auto MulRHSHi =
15688 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
15689 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
15690 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15691 }
15692
15693 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
15694 Accum = DAG.getBitcast(MVT::i64, Accum);
15695 }
15696
15697 if (VT != MVT::i64)
15698 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
15699 return Accum;
15700}
15701
15702SDValue
15703SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
15704 DAGCombinerInfo &DCI) const {
15705 SDValue RHS = N->getOperand(1);
15706 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
15707 if (!CRHS)
15708 return SDValue();
15709
15710 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
15711 // common.
15712 uint64_t Val = CRHS->getZExtValue();
15713 if (countr_zero(Val) >= 32) {
15714 SelectionDAG &DAG = DCI.DAG;
15715 SDLoc SL(N);
15716 SDValue LHS = N->getOperand(0);
15717
15718 // Avoid carry machinery if we know the low half of the add does not
15719 // contribute to the final result.
15720 //
15721 // add i64:x, K if computeTrailingZeros(K) >= 32
15722 // => build_pair (add x.hi, K.hi), x.lo
15723
15724 // Breaking the 64-bit add here with this strange constant is unlikely
15725 // to interfere with addressing mode patterns.
15726
15727 SDValue Hi = getHiHalf64(LHS, DAG);
15728 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
15729 unsigned Opcode = N->getOpcode();
15730 if (Opcode == ISD::PTRADD)
15731 Opcode = ISD::ADD;
15732 SDValue AddHi =
15733 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
15734
15735 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
15736 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
15737 }
15738
15739 return SDValue();
15740}
15741
15742// Collect the ultimate src of each of the mul node's operands, and confirm
15743// each operand is 8 bytes.
15744static std::optional<ByteProvider<SDValue>>
15745handleMulOperand(const SDValue &MulOperand) {
15746 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
15747 if (!Byte0 || Byte0->isConstantZero()) {
15748 return std::nullopt;
15749 }
15750 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
15751 if (Byte1 && !Byte1->isConstantZero()) {
15752 return std::nullopt;
15753 }
15754 return Byte0;
15755}
15756
// Merge two v_perm byte-select masks. The selector 0x0c means "produce zero";
// the asserts require that in every byte position at least one of the two
// masks carries 0x0c-selector bits, so the real selections never collide.
// The merged mask keeps every non-zero selector and remains a zero selector
// only where both inputs select zero.
static unsigned addPermMasks(unsigned First, unsigned Second) {
  constexpr unsigned ZeroSelBits = 0x0c0c0c0c;

  unsigned FirstCs = First & ZeroSelBits;
  unsigned SecondCs = Second & ZeroSelBits;
  unsigned FirstNoCs = First & ~ZeroSelBits;
  unsigned SecondNoCs = Second & ~ZeroSelBits;

  assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
  assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
  assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
  assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));

  return FirstNoCs | SecondNoCs | (FirstCs & SecondCs);
}
15770
15771struct DotSrc {
15773 int64_t PermMask;
15775};
15776
15780 SmallVectorImpl<DotSrc> &Src1s, int Step) {
15781
15782 assert(Src0.Src.has_value() && Src1.Src.has_value());
15783 // Src0s and Src1s are empty, just place arbitrarily.
15784 if (Step == 0) {
15785 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
15786 Src0.SrcOffset / 4});
15787 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
15788 Src1.SrcOffset / 4});
15789 return;
15790 }
15791
15792 for (int BPI = 0; BPI < 2; BPI++) {
15793 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
15794 if (BPI == 1) {
15795 BPP = {Src1, Src0};
15796 }
15797 unsigned ZeroMask = 0x0c0c0c0c;
15798 unsigned FMask = 0xFF << (8 * (3 - Step));
15799
15800 unsigned FirstMask =
15801 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15802 unsigned SecondMask =
15803 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15804 // Attempt to find Src vector which contains our SDValue, if so, add our
15805 // perm mask to the existing one. If we are unable to find a match for the
15806 // first SDValue, attempt to find match for the second.
15807 int FirstGroup = -1;
15808 for (int I = 0; I < 2; I++) {
15809 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
15810 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
15811 return IterElt.SrcOp == *BPP.first.Src &&
15812 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15813 };
15814
15815 auto *Match = llvm::find_if(Srcs, MatchesFirst);
15816 if (Match != Srcs.end()) {
15817 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
15818 FirstGroup = I;
15819 break;
15820 }
15821 }
15822 if (FirstGroup != -1) {
15823 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
15824 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
15825 return IterElt.SrcOp == *BPP.second.Src &&
15826 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15827 };
15828 auto *Match = llvm::find_if(Srcs, MatchesSecond);
15829 if (Match != Srcs.end()) {
15830 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
15831 } else
15832 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15833 return;
15834 }
15835 }
15836
15837 // If we have made it here, then we could not find a match in Src0s or Src1s
15838 // for either Src0 or Src1, so just place them arbitrarily.
15839
15840 unsigned ZeroMask = 0x0c0c0c0c;
15841 unsigned FMask = 0xFF << (8 * (3 - Step));
15842
15843 Src0s.push_back(
15844 {*Src0.Src,
15845 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15846 Src0.SrcOffset / 4});
15847 Src1s.push_back(
15848 {*Src1.Src,
15849 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15850 Src1.SrcOffset / 4});
15851}
15852
15854 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
15855 bool IsAny) {
15856
15857 // If we just have one source, just permute it accordingly.
15858 if (Srcs.size() == 1) {
15859 auto *Elt = Srcs.begin();
15860 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
15861
15862 // v_perm will produce the original value
15863 if (Elt->PermMask == 0x3020100)
15864 return EltOp;
15865
15866 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15867 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
15868 }
15869
15870 auto *FirstElt = Srcs.begin();
15871 auto *SecondElt = std::next(FirstElt);
15872
15874
15875 // If we have multiple sources in the chain, combine them via perms (using
15876 // calculated perm mask) and Ors.
15877 while (true) {
15878 auto FirstMask = FirstElt->PermMask;
15879 auto SecondMask = SecondElt->PermMask;
15880
15881 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
15882 unsigned FirstPlusFour = FirstMask | 0x04040404;
15883 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produced 0x00 for any
15884 // original 0x0C.
15885 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
15886
15887 auto PermMask = addPermMasks(FirstMask, SecondMask);
15888 auto FirstVal =
15889 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15890 auto SecondVal =
15891 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
15892
15893 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
15894 SecondVal,
15895 DAG.getConstant(PermMask, SL, MVT::i32)));
15896
15897 FirstElt = std::next(SecondElt);
15898 if (FirstElt == Srcs.end())
15899 break;
15900
15901 SecondElt = std::next(FirstElt);
15902 // If we only have a FirstElt, then just combine that into the cumulative
15903 // source node.
15904 if (SecondElt == Srcs.end()) {
15905 auto EltOp =
15906 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15907
15908 Perms.push_back(
15909 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15910 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
15911 break;
15912 }
15913 }
15914
15915 assert(Perms.size() == 1 || Perms.size() == 2);
15916 return Perms.size() == 2
15917 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
15918 : Perms[0];
15919}
15920
15921static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
15922 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
15923 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
15924 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
15925 EntryMask += ZeroMask;
15926 }
15927}
15928
15929static bool isMul(const SDValue Op) {
15930 auto Opcode = Op.getOpcode();
15931
15932 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
15933 Opcode == AMDGPUISD::MUL_I24);
15934}
15935
15936static std::optional<bool>
15938 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
15939 const SDValue &S1Op, const SelectionDAG &DAG) {
15940 // If we both ops are i8s (pre legalize-dag), then the signedness semantics
15941 // of the dot4 is irrelevant.
15942 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
15943 return false;
15944
15945 auto Known0 = DAG.computeKnownBits(S0Op, 0);
15946 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
15947 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
15948 auto Known1 = DAG.computeKnownBits(S1Op, 0);
15949 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
15950 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
15951
15952 assert(!(S0IsUnsigned && S0IsSigned));
15953 assert(!(S1IsUnsigned && S1IsSigned));
15954
15955 // There are 9 possible permutations of
15956 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
15957
15958 // In two permutations, the sign bits are known to be the same for both Ops,
15959 // so simply return Signed / Unsigned corresponding to the MSB
15960
15961 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
15962 return S0IsSigned;
15963
15964 // In another two permutations, the sign bits are known to be opposite. In
15965 // this case return std::nullopt to indicate a bad match.
15966
15967 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
15968 return std::nullopt;
15969
15970 // In the remaining five permutations, we don't know the value of the sign
15971 // bit for at least one Op. Since we have a valid ByteProvider, we know that
15972 // the upper bits must be extension bits. Thus, the only ways for the sign
15973 // bit to be unknown is if it was sign extended from unknown value, or if it
15974 // was any extended. In either case, it is correct to use the signed
15975 // version of the signedness semantics of dot4
15976
15977 // In two of such permutations, we known the sign bit is set for
15978 // one op, and the other is unknown. It is okay to used signed version of
15979 // dot4.
15980 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
15981 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
15982 return true;
15983
15984 // In one such permutation, we don't know either of the sign bits. It is okay
15985 // to used the signed version of dot4.
15986 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
15987 return true;
15988
15989 // In two of such permutations, we known the sign bit is unset for
15990 // one op, and the other is unknown. Return std::nullopt to indicate a
15991 // bad match.
15992 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
15993 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
15994 return std::nullopt;
15995
15996 llvm_unreachable("Fully covered condition");
15997}
15998
15999SDValue SITargetLowering::performAddCombine(SDNode *N,
16000 DAGCombinerInfo &DCI) const {
16001 SelectionDAG &DAG = DCI.DAG;
16002 EVT VT = N->getValueType(0);
16003 SDLoc SL(N);
16004 SDValue LHS = N->getOperand(0);
16005 SDValue RHS = N->getOperand(1);
16006
16007 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
16008 if (Subtarget->hasMad64_32()) {
16009 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16010 return Folded;
16011 }
16012 }
16013
16014 if (SDValue V = reassociateScalarOps(N, DAG)) {
16015 return V;
16016 }
16017
16018 if (VT == MVT::i64) {
16019 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16020 return Folded;
16021 }
16022
16023 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
16024 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
16025 SDValue TempNode(N, 0);
16026 std::optional<bool> IsSigned;
16030
16031 // Match the v_dot4 tree, while collecting src nodes.
16032 int ChainLength = 0;
16033 for (int I = 0; I < 4; I++) {
16034 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
16035 if (MulIdx == -1)
16036 break;
16037 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
16038 if (!Src0)
16039 break;
16040 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
16041 if (!Src1)
16042 break;
16043
16044 auto IterIsSigned = checkDot4MulSignedness(
16045 TempNode->getOperand(MulIdx), *Src0, *Src1,
16046 TempNode->getOperand(MulIdx)->getOperand(0),
16047 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
16048 if (!IterIsSigned)
16049 break;
16050 if (!IsSigned)
16051 IsSigned = *IterIsSigned;
16052 if (*IterIsSigned != *IsSigned)
16053 break;
16054 placeSources(*Src0, *Src1, Src0s, Src1s, I);
16055 auto AddIdx = 1 - MulIdx;
16056 // Allow the special case where add (add (mul24, 0), mul24) became ->
16057 // add (mul24, mul24).
16058 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
16059 Src2s.push_back(TempNode->getOperand(AddIdx));
16060 auto Src0 =
16061 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
16062 if (!Src0)
16063 break;
16064 auto Src1 =
16065 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
16066 if (!Src1)
16067 break;
16068 auto IterIsSigned = checkDot4MulSignedness(
16069 TempNode->getOperand(AddIdx), *Src0, *Src1,
16070 TempNode->getOperand(AddIdx)->getOperand(0),
16071 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
16072 if (!IterIsSigned)
16073 break;
16074 assert(IsSigned);
16075 if (*IterIsSigned != *IsSigned)
16076 break;
16077 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
16078 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
16079 ChainLength = I + 2;
16080 break;
16081 }
16082
16083 TempNode = TempNode->getOperand(AddIdx);
16084 Src2s.push_back(TempNode);
16085 ChainLength = I + 1;
16086 if (TempNode->getNumOperands() < 2)
16087 break;
16088 LHS = TempNode->getOperand(0);
16089 RHS = TempNode->getOperand(1);
16090 }
16091
16092 if (ChainLength < 2)
16093 return SDValue();
16094
16095 // Masks were constructed with assumption that we would find a chain of
16096 // length 4. If not, then we need to 0 out the MSB bits (via perm mask of
16097 // 0x0c) so they do not affect dot calculation.
16098 if (ChainLength < 4) {
16099 fixMasks(Src0s, ChainLength);
16100 fixMasks(Src1s, ChainLength);
16101 }
16102
16103 SDValue Src0, Src1;
16104
16105 // If we are just using a single source for both, and have permuted the
16106 // bytes consistently, we can just use the sources without permuting
16107 // (commutation).
16108 bool UseOriginalSrc = false;
16109 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
16110 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
16111 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
16112 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
16113 SmallVector<unsigned, 4> SrcBytes;
16114 auto Src0Mask = Src0s.begin()->PermMask;
16115 SrcBytes.push_back(Src0Mask & 0xFF000000);
16116 bool UniqueEntries = true;
16117 for (auto I = 1; I < 4; I++) {
16118 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
16119
16120 if (is_contained(SrcBytes, NextByte)) {
16121 UniqueEntries = false;
16122 break;
16123 }
16124 SrcBytes.push_back(NextByte);
16125 }
16126
16127 if (UniqueEntries) {
16128 UseOriginalSrc = true;
16129
16130 auto *FirstElt = Src0s.begin();
16131 auto FirstEltOp =
16132 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
16133
16134 auto *SecondElt = Src1s.begin();
16135 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
16136 SecondElt->DWordOffset);
16137
16138 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
16139 MVT::getIntegerVT(32));
16140 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
16141 MVT::getIntegerVT(32));
16142 }
16143 }
16144
16145 if (!UseOriginalSrc) {
16146 Src0 = resolveSources(DAG, SL, Src0s, false, true);
16147 Src1 = resolveSources(DAG, SL, Src1s, false, true);
16148 }
16149
16150 assert(IsSigned);
16151 SDValue Src2 =
16152 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
16153
16154 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
16155 : Intrinsic::amdgcn_udot4,
16156 SL, MVT::i64);
16157
16158 assert(!VT.isVector());
16159 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
16160 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
16161
16162 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
16163 }
16164
16165 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16166 return SDValue();
16167
16168 // add x, zext (setcc) => uaddo_carry x, 0, setcc
16169 // add x, sext (setcc) => usubo_carry x, 0, setcc
16170 unsigned Opc = LHS.getOpcode();
16173 std::swap(RHS, LHS);
16174
16175 Opc = RHS.getOpcode();
16176 switch (Opc) {
16177 default:
16178 break;
16179 case ISD::ZERO_EXTEND:
16180 case ISD::SIGN_EXTEND:
16181 case ISD::ANY_EXTEND: {
16182 auto Cond = RHS.getOperand(0);
16183 // If this won't be a real VOPC output, we would still need to insert an
16184 // extra instruction anyway.
16185 if (!isBoolSGPR(Cond))
16186 break;
16187 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16188 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16190 return DAG.getNode(Opc, SL, VTList, Args);
16191 }
16192 case ISD::UADDO_CARRY: {
16193 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
16194 if (!isNullConstant(RHS.getOperand(1)))
16195 break;
16196 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
16197 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
16198 }
16199 }
16200 return SDValue();
16201}
16202
16203SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
16204 DAGCombinerInfo &DCI) const {
16205 SelectionDAG &DAG = DCI.DAG;
16206 SDLoc DL(N);
16207 EVT VT = N->getValueType(0);
16208 SDValue N0 = N->getOperand(0);
16209 SDValue N1 = N->getOperand(1);
16210
16211 // The following folds transform PTRADDs into regular arithmetic in cases
16212 // where the PTRADD wouldn't be folded as an immediate offset into memory
16213 // instructions anyway. They are target-specific in that other targets might
16214 // prefer to not lose information about the pointer arithmetic.
16215
16216 // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
16217 // Adapted from DAGCombiner::visitADDLikeCommutative.
16218 SDValue V, K;
16219 if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
16220 SDNodeFlags ShlFlags = N1->getFlags();
16221 // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
16222 // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
16223 // preserved.
16224 SDNodeFlags NewShlFlags =
16225 ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
16227 : SDNodeFlags();
16228 SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
16229 DCI.AddToWorklist(Inner.getNode());
16230 return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
16231 }
16232
16233 // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
16234 // performAddCombine.
16235 if (N1.getOpcode() == ISD::MUL) {
16236 if (Subtarget->hasMad64_32()) {
16237 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16238 return Folded;
16239 }
16240 }
16241
16242 // If the 32 low bits of the constant are all zero, there is nothing to fold
16243 // into an immediate offset, so it's better to eliminate the unnecessary
16244 // addition for the lower 32 bits than to preserve the PTRADD.
16245 // Analogous to a fold in performAddCombine.
16246 if (VT == MVT::i64) {
16247 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16248 return Folded;
16249 }
16250
16251 if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
16252 return SDValue();
16253
16254 SDValue X = N0;
16255 SDValue Y = N1.getOperand(0);
16256 SDValue Z = N1.getOperand(1);
16257 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
16258 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
16259
16260 if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
16261 Y->isDivergent() != Z->isDivergent()) {
16262 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
16263 // y are uniform and z isn't.
16264 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
16265 // z are uniform and y isn't.
16266 // The goal is to push uniform operands up in the computation, so that they
16267 // can be handled with scalar operations. We can't use reassociateScalarOps
16268 // for this since it requires two identical commutative operations to
16269 // reassociate.
16270 if (Y->isDivergent())
16271 std::swap(Y, Z);
16272 // If both additions in the original were NUW, reassociation preserves that.
16273 SDNodeFlags ReassocFlags =
16274 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
16275 SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
16276 DCI.AddToWorklist(UniformInner.getNode());
16277 return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
16278 }
16279
16280 return SDValue();
16281}
16282
16283SDValue SITargetLowering::performSubCombine(SDNode *N,
16284 DAGCombinerInfo &DCI) const {
16285 SelectionDAG &DAG = DCI.DAG;
16286 EVT VT = N->getValueType(0);
16287
16288 if (VT == MVT::i64) {
16289 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16290 return Folded;
16291 }
16292
16293 if (VT != MVT::i32)
16294 return SDValue();
16295
16296 SDLoc SL(N);
16297 SDValue LHS = N->getOperand(0);
16298 SDValue RHS = N->getOperand(1);
16299
16300 // sub x, zext (setcc) => usubo_carry x, 0, setcc
16301 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
16302 unsigned Opc = RHS.getOpcode();
16303 switch (Opc) {
16304 default:
16305 break;
16306 case ISD::ZERO_EXTEND:
16307 case ISD::SIGN_EXTEND:
16308 case ISD::ANY_EXTEND: {
16309 auto Cond = RHS.getOperand(0);
16310 // If this won't be a real VOPC output, we would still need to insert an
16311 // extra instruction anyway.
16312 if (!isBoolSGPR(Cond))
16313 break;
16314 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16315 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16317 return DAG.getNode(Opc, SL, VTList, Args);
16318 }
16319 }
16320
16321 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
16322 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
16323 if (!isNullConstant(LHS.getOperand(1)))
16324 return SDValue();
16325 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
16326 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
16327 }
16328 return SDValue();
16329}
16330
16331SDValue
16332SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
16333 DAGCombinerInfo &DCI) const {
16334
16335 if (N->getValueType(0) != MVT::i32)
16336 return SDValue();
16337
16338 if (!isNullConstant(N->getOperand(1)))
16339 return SDValue();
16340
16341 SelectionDAG &DAG = DCI.DAG;
16342 SDValue LHS = N->getOperand(0);
16343
16344 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
16345 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
16346 unsigned LHSOpc = LHS.getOpcode();
16347 unsigned Opc = N->getOpcode();
16348 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
16349 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
16350 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
16351 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
16352 }
16353 return SDValue();
16354}
16355
16356SDValue SITargetLowering::performFAddCombine(SDNode *N,
16357 DAGCombinerInfo &DCI) const {
16358 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16359 return SDValue();
16360
16361 SelectionDAG &DAG = DCI.DAG;
16362 EVT VT = N->getValueType(0);
16363
16364 SDLoc SL(N);
16365 SDValue LHS = N->getOperand(0);
16366 SDValue RHS = N->getOperand(1);
16367
16368 // These should really be instruction patterns, but writing patterns with
16369 // source modifiers is a pain.
16370
16371 // fadd (fadd (a, a), b) -> mad 2.0, a, b
16372 if (LHS.getOpcode() == ISD::FADD) {
16373 SDValue A = LHS.getOperand(0);
16374 if (A == LHS.getOperand(1)) {
16375 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16376 if (FusedOp != 0) {
16377 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16378 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
16379 }
16380 }
16381 }
16382
16383 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
16384 if (RHS.getOpcode() == ISD::FADD) {
16385 SDValue A = RHS.getOperand(0);
16386 if (A == RHS.getOperand(1)) {
16387 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16388 if (FusedOp != 0) {
16389 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16390 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
16391 }
16392 }
16393 }
16394
16395 return SDValue();
16396}
16397
16398SDValue SITargetLowering::performFSubCombine(SDNode *N,
16399 DAGCombinerInfo &DCI) const {
16400 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16401 return SDValue();
16402
16403 SelectionDAG &DAG = DCI.DAG;
16404 SDLoc SL(N);
16405 EVT VT = N->getValueType(0);
16406 assert(!VT.isVector());
16407
16408 // Try to get the fneg to fold into the source modifier. This undoes generic
16409 // DAG combines and folds them into the mad.
16410 //
16411 // Only do this if we are not trying to support denormals. v_mad_f32 does
16412 // not support denormals ever.
16413 SDValue LHS = N->getOperand(0);
16414 SDValue RHS = N->getOperand(1);
16415 if (LHS.getOpcode() == ISD::FADD) {
16416 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
16417 SDValue A = LHS.getOperand(0);
16418 if (A == LHS.getOperand(1)) {
16419 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16420 if (FusedOp != 0) {
16421 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16422 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
16423
16424 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
16425 }
16426 }
16427 }
16428
16429 if (RHS.getOpcode() == ISD::FADD) {
16430 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
16431
16432 SDValue A = RHS.getOperand(0);
16433 if (A == RHS.getOperand(1)) {
16434 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16435 if (FusedOp != 0) {
16436 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
16437 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
16438 }
16439 }
16440 }
16441
16442 return SDValue();
16443}
16444
16445SDValue SITargetLowering::performFDivCombine(SDNode *N,
16446 DAGCombinerInfo &DCI) const {
16447 SelectionDAG &DAG = DCI.DAG;
16448 SDLoc SL(N);
16449 EVT VT = N->getValueType(0);
16450 if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts())
16451 return SDValue();
16452
16453 SDValue LHS = N->getOperand(0);
16454 SDValue RHS = N->getOperand(1);
16455
16456 SDNodeFlags Flags = N->getFlags();
16457 SDNodeFlags RHSFlags = RHS->getFlags();
16458 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
16459 !RHS->hasOneUse())
16460 return SDValue();
16461
16462 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
16463 bool IsNegative = false;
16464 if (CLHS->isExactlyValue(1.0) ||
16465 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16466 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
16467 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
16468 if (RHS.getOpcode() == ISD::FSQRT) {
16469 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
16470 SDValue Rsq =
16471 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
16472 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
16473 }
16474 }
16475 }
16476
16477 return SDValue();
16478}
16479
16480SDValue SITargetLowering::performFMulCombine(SDNode *N,
16481 DAGCombinerInfo &DCI) const {
16482 SelectionDAG &DAG = DCI.DAG;
16483 EVT VT = N->getValueType(0);
16484 EVT ScalarVT = VT.getScalarType();
16485 EVT IntVT = VT.changeElementType(MVT::i32);
16486
16487 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
16488 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16489 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
16490 return SDValue();
16491 }
16492
16493 SDValue LHS = N->getOperand(0);
16494 SDValue RHS = N->getOperand(1);
16495
16496 // It is cheaper to realize i32 inline constants as compared against
16497 // materializing f16 or f64 (or even non-inline f32) values,
16498 // possible via ldexp usage, as shown below :
16499 //
16500 // Given : A = 2^a & B = 2^b ; where a and b are integers.
16501 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
16502 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
16503 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16504 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
16505 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
16506 if (!TrueNode)
16507 return SDValue();
16508 const ConstantFPSDNode *FalseNode =
16509 isConstOrConstSplatFP(RHS.getOperand(2));
16510 if (!FalseNode)
16511 return SDValue();
16512
16513 if (TrueNode->isNegative() != FalseNode->isNegative())
16514 return SDValue();
16515
16516 // For f32, only non-inline constants should be transformed.
16517 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16518 if (ScalarVT == MVT::f32 &&
16519 TII->isInlineConstant(TrueNode->getValueAPF()) &&
16520 TII->isInlineConstant(FalseNode->getValueAPF()))
16521 return SDValue();
16522
16523 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
16524 if (TrueNodeExpVal == INT_MIN)
16525 return SDValue();
16526 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
16527 if (FalseNodeExpVal == INT_MIN)
16528 return SDValue();
16529
16530 SDLoc SL(N);
16531 SDValue SelectNode =
16532 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
16533 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
16534 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
16535
16536 LHS = TrueNode->isNegative()
16537 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
16538 : LHS;
16539
16540 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
16541 }
16542
16543 return SDValue();
16544}
16545
16546SDValue SITargetLowering::performFMACombine(SDNode *N,
16547 DAGCombinerInfo &DCI) const {
16548 SelectionDAG &DAG = DCI.DAG;
16549 EVT VT = N->getValueType(0);
16550 SDLoc SL(N);
16551
16552 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16553 return SDValue();
16554
16555 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
16556 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
16557 SDValue Op1 = N->getOperand(0);
16558 SDValue Op2 = N->getOperand(1);
16559 SDValue FMA = N->getOperand(2);
16560
16561 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
16562 Op2.getOpcode() != ISD::FP_EXTEND)
16563 return SDValue();
16564
16565 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
16566 // regardless of the denorm mode setting. Therefore,
16567 // fp-contract is sufficient to allow generating fdot2.
16568 const TargetOptions &Options = DAG.getTarget().Options;
16569 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
16570 (N->getFlags().hasAllowContract() &&
16571 FMA->getFlags().hasAllowContract())) {
16572 Op1 = Op1.getOperand(0);
16573 Op2 = Op2.getOperand(0);
16574 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16576 return SDValue();
16577
16578 SDValue Vec1 = Op1.getOperand(0);
16579 SDValue Idx1 = Op1.getOperand(1);
16580 SDValue Vec2 = Op2.getOperand(0);
16581
16582 SDValue FMAOp1 = FMA.getOperand(0);
16583 SDValue FMAOp2 = FMA.getOperand(1);
16584 SDValue FMAAcc = FMA.getOperand(2);
16585
16586 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
16587 FMAOp2.getOpcode() != ISD::FP_EXTEND)
16588 return SDValue();
16589
16590 FMAOp1 = FMAOp1.getOperand(0);
16591 FMAOp2 = FMAOp2.getOperand(0);
16592 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16594 return SDValue();
16595
16596 SDValue Vec3 = FMAOp1.getOperand(0);
16597 SDValue Vec4 = FMAOp2.getOperand(0);
16598 SDValue Idx2 = FMAOp1.getOperand(1);
16599
16600 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
16601 // Idx1 and Idx2 cannot be the same.
16602 Idx1 == Idx2)
16603 return SDValue();
16604
16605 if (Vec1 == Vec2 || Vec3 == Vec4)
16606 return SDValue();
16607
16608 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
16609 return SDValue();
16610
16611 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16612 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
16613 DAG.getTargetConstant(0, SL, MVT::i1));
16614 }
16615 }
16616 return SDValue();
16617}
16618
16619SDValue SITargetLowering::performSetCCCombine(SDNode *N,
16620 DAGCombinerInfo &DCI) const {
16621 SelectionDAG &DAG = DCI.DAG;
16622 SDLoc SL(N);
16623
16624 SDValue LHS = N->getOperand(0);
16625 SDValue RHS = N->getOperand(1);
16626 EVT VT = LHS.getValueType();
16627 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
16628
16629 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
16630 if (!CRHS) {
16632 if (CRHS) {
16633 std::swap(LHS, RHS);
16634 CC = getSetCCSwappedOperands(CC);
16635 }
16636 }
16637
16638 if (CRHS) {
16639 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
16640 isBoolSGPR(LHS.getOperand(0))) {
16641 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
16642 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
16643 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
16644 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
16645 if ((CRHS->isAllOnes() &&
16646 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
16647 (CRHS->isZero() &&
16648 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
16649 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16650 DAG.getAllOnesConstant(SL, MVT::i1));
16651 if ((CRHS->isAllOnes() &&
16652 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
16653 (CRHS->isZero() &&
16654 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
16655 return LHS.getOperand(0);
16656 }
16657
16658 const APInt &CRHSVal = CRHS->getAPIntValue();
16659 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
16660 LHS.getOpcode() == ISD::SELECT &&
16661 isa<ConstantSDNode>(LHS.getOperand(1)) &&
16662 isa<ConstantSDNode>(LHS.getOperand(2)) &&
16663 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
16664 isBoolSGPR(LHS.getOperand(0))) {
16665 // Given CT != FT:
16666 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
16667 // setcc (select cc, CT, CF), CF, ne => cc
16668 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
16669 // setcc (select cc, CT, CF), CT, eq => cc
16670 const APInt &CT = LHS.getConstantOperandAPInt(1);
16671 const APInt &CF = LHS.getConstantOperandAPInt(2);
16672
16673 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
16674 (CT == CRHSVal && CC == ISD::SETNE))
16675 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16676 DAG.getAllOnesConstant(SL, MVT::i1));
16677 if ((CF == CRHSVal && CC == ISD::SETNE) ||
16678 (CT == CRHSVal && CC == ISD::SETEQ))
16679 return LHS.getOperand(0);
16680 }
16681 }
16682
16683 // Eliminate setcc by using carryout from add/sub instruction
16684
16685 // LHS = ADD i64 RHS, Z LHSlo = UADDO i32 RHSlo, Zlo
16686 // setcc LHS ult RHS -> LHSHi = UADDO_CARRY i32 RHShi, Zhi
16687 // similarly for subtraction
16688
16689 // LHS = ADD i64 Y, 1 LHSlo = UADDO i32 Ylo, 1
16690 // setcc LHS eq 0 -> LHSHi = UADDO_CARRY i32 Yhi, 0
16691
16692 if (VT == MVT::i64 && ((CC == ISD::SETULT &&
16694 (CC == ISD::SETUGT &&
16696 (CC == ISD::SETEQ && CRHS && CRHS->isZero() &&
16697 sd_match(LHS, m_Add(m_Value(), m_One()))))) {
16698 bool IsAdd = LHS.getOpcode() == ISD::ADD;
16699
16700 SDValue Op0 = LHS.getOperand(0);
16701 SDValue Op1 = LHS.getOperand(1);
16702
16703 SDValue Op0Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op0);
16704 SDValue Op1Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op1);
16705
16706 SDValue Op0Hi = getHiHalf64(Op0, DAG);
16707 SDValue Op1Hi = getHiHalf64(Op1, DAG);
16708
16709 SDValue NodeLo =
16710 DAG.getNode(IsAdd ? ISD::UADDO : ISD::USUBO, SL,
16711 DAG.getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
16712
16713 SDValue CarryInHi = NodeLo.getValue(1);
16714 SDValue NodeHi = DAG.getNode(IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY,
16715 SL, DAG.getVTList(MVT::i32, MVT::i1),
16716 {Op0Hi, Op1Hi, CarryInHi});
16717
16718 SDValue ResultLo = NodeLo.getValue(0);
16719 SDValue ResultHi = NodeHi.getValue(0);
16720
16721 SDValue JoinedResult =
16722 DAG.getBuildVector(MVT::v2i32, SL, {ResultLo, ResultHi});
16723
16724 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, JoinedResult);
16725 SDValue Overflow = NodeHi.getValue(1);
16726 DCI.CombineTo(LHS.getNode(), Result);
16727 return Overflow;
16728 }
16729
16730 if (VT != MVT::f32 && VT != MVT::f64 &&
16731 (!Subtarget->has16BitInsts() || VT != MVT::f16))
16732 return SDValue();
16733
16734 // Match isinf/isfinite pattern
16735 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
16736 // (fcmp one (fabs x), inf) -> (fp_class x,
16737 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
16738 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
16739 LHS.getOpcode() == ISD::FABS) {
16740 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
16741 if (!CRHS)
16742 return SDValue();
16743
16744 const APFloat &APF = CRHS->getValueAPF();
16745 if (APF.isInfinity() && !APF.isNegative()) {
16746 const unsigned IsInfMask =
16748 const unsigned IsFiniteMask =
16752 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
16753 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
16754 DAG.getConstant(Mask, SL, MVT::i32));
16755 }
16756 }
16757
16758 return SDValue();
16759}
16760
16761SDValue
16762SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
16763 DAGCombinerInfo &DCI) const {
16764 SelectionDAG &DAG = DCI.DAG;
16765 SDLoc SL(N);
16766 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
16767
16768 SDValue Src = N->getOperand(0);
16769 SDValue Shift = N->getOperand(0);
16770
16771 // TODO: Extend type shouldn't matter (assuming legal types).
16772 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
16773 Shift = Shift.getOperand(0);
16774
16775 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
16776 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
16777 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
16778 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
16779 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
16780 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
16781 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
16782 SDValue Shifted = DAG.getZExtOrTrunc(
16783 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
16784
16785 unsigned ShiftOffset = 8 * Offset;
16786 if (Shift.getOpcode() == ISD::SHL)
16787 ShiftOffset -= C->getZExtValue();
16788 else
16789 ShiftOffset += C->getZExtValue();
16790
16791 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16792 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
16793 MVT::f32, Shifted);
16794 }
16795 }
16796 }
16797
16798 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16799 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
16800 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
16801 // We simplified Src. If this node is not dead, visit it again so it is
16802 // folded properly.
16803 if (N->getOpcode() != ISD::DELETED_NODE)
16804 DCI.AddToWorklist(N);
16805 return SDValue(N, 0);
16806 }
16807
16808 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
16809 if (SDValue DemandedSrc =
16810 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
16811 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
16812
16813 return SDValue();
16814}
16815
16816SDValue SITargetLowering::performClampCombine(SDNode *N,
16817 DAGCombinerInfo &DCI) const {
16818 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
16819 if (!CSrc)
16820 return SDValue();
16821
16822 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16823 const APFloat &F = CSrc->getValueAPF();
16824 APFloat Zero = APFloat::getZero(F.getSemantics());
16825 if (F < Zero ||
16826 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16827 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
16828 }
16829
16830 APFloat One(F.getSemantics(), "1.0");
16831 if (F > One)
16832 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
16833
16834 return SDValue(CSrc, 0);
16835}
16836
16837SDValue SITargetLowering::performSelectCombine(SDNode *N,
16838 DAGCombinerInfo &DCI) const {
16839
16840 // Try to fold CMP + SELECT patterns with shared constants (both FP and
16841 // integer).
16842 // Detect when CMP and SELECT use the same constant and fold them to avoid
16843 // loading the constant twice. Specifically handles patterns like:
16844 // %cmp = icmp eq i32 %val, 4242
16845 // %sel = select i1 %cmp, i32 4242, i32 %other
16846 // It can be optimized to reuse %val instead of 4242 in select.
16847 SDValue Cond = N->getOperand(0);
16848 SDValue TrueVal = N->getOperand(1);
16849 SDValue FalseVal = N->getOperand(2);
16850
16851 // Check if condition is a comparison.
16852 if (Cond.getOpcode() != ISD::SETCC)
16853 return SDValue();
16854
16855 SDValue LHS = Cond.getOperand(0);
16856 SDValue RHS = Cond.getOperand(1);
16857 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
16858
16859 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
16860 bool isInteger = LHS.getValueType().isInteger();
16861
16862 // Handle simple floating-point and integer types only.
16863 if (!isFloatingPoint && !isInteger)
16864 return SDValue();
16865
16866 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
16867 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
16868 if (!isEquality && !isNonEquality)
16869 return SDValue();
16870
16871 SDValue ArgVal, ConstVal;
16872 if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
16873 (isInteger && isa<ConstantSDNode>(RHS))) {
16874 ConstVal = RHS;
16875 ArgVal = LHS;
16876 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
16877 (isInteger && isa<ConstantSDNode>(LHS))) {
16878 ConstVal = LHS;
16879 ArgVal = RHS;
16880 } else {
16881 return SDValue();
16882 }
16883
16884 // Skip optimization for inlinable immediates.
16885 if (isFloatingPoint) {
16886 const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
16887 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
16888 return SDValue();
16889 } else {
16891 cast<ConstantSDNode>(ConstVal)->getSExtValue()))
16892 return SDValue();
16893 }
16894
16895 // For equality and non-equality comparisons, patterns:
16896 // select (setcc x, const), const, y -> select (setcc x, const), x, y
16897 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
16898 if (!(isEquality && TrueVal == ConstVal) &&
16899 !(isNonEquality && FalseVal == ConstVal))
16900 return SDValue();
16901
16902 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
16903 SDValue SelectRHS =
16904 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
16905 return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
16906 SelectLHS, SelectRHS);
16907}
16908
16910 DAGCombinerInfo &DCI) const {
16911 switch (N->getOpcode()) {
16912 case ISD::ADD:
16913 case ISD::SUB:
16914 case ISD::SHL:
16915 case ISD::SRL:
16916 case ISD::SRA:
16917 case ISD::AND:
16918 case ISD::OR:
16919 case ISD::XOR:
16920 case ISD::MUL:
16921 case ISD::SETCC:
16922 case ISD::SELECT:
16923 case ISD::SMIN:
16924 case ISD::SMAX:
16925 case ISD::UMIN:
16926 case ISD::UMAX:
16927 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
16928 return Res;
16929 break;
16930 default:
16931 break;
16932 }
16933
16934 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
16935 return SDValue();
16936
16937 switch (N->getOpcode()) {
16938 case ISD::ADD:
16939 return performAddCombine(N, DCI);
16940 case ISD::PTRADD:
16941 return performPtrAddCombine(N, DCI);
16942 case ISD::SUB:
16943 return performSubCombine(N, DCI);
16944 case ISD::UADDO_CARRY:
16945 case ISD::USUBO_CARRY:
16946 return performAddCarrySubCarryCombine(N, DCI);
16947 case ISD::FADD:
16948 return performFAddCombine(N, DCI);
16949 case ISD::FSUB:
16950 return performFSubCombine(N, DCI);
16951 case ISD::FDIV:
16952 return performFDivCombine(N, DCI);
16953 case ISD::FMUL:
16954 return performFMulCombine(N, DCI);
16955 case ISD::SETCC:
16956 return performSetCCCombine(N, DCI);
16957 case ISD::SELECT:
16958 if (auto Res = performSelectCombine(N, DCI))
16959 return Res;
16960 break;
16961 case ISD::FMAXNUM:
16962 case ISD::FMINNUM:
16963 case ISD::FMAXNUM_IEEE:
16964 case ISD::FMINNUM_IEEE:
16965 case ISD::FMAXIMUM:
16966 case ISD::FMINIMUM:
16967 case ISD::FMAXIMUMNUM:
16968 case ISD::FMINIMUMNUM:
16969 case ISD::SMAX:
16970 case ISD::SMIN:
16971 case ISD::UMAX:
16972 case ISD::UMIN:
16973 case AMDGPUISD::FMIN_LEGACY:
16974 case AMDGPUISD::FMAX_LEGACY:
16975 return performMinMaxCombine(N, DCI);
16976 case ISD::FMA:
16977 return performFMACombine(N, DCI);
16978 case ISD::AND:
16979 return performAndCombine(N, DCI);
16980 case ISD::OR:
16981 return performOrCombine(N, DCI);
16982 case ISD::FSHR: {
16984 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
16985 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
16986 return matchPERM(N, DCI);
16987 }
16988 break;
16989 }
16990 case ISD::XOR:
16991 return performXorCombine(N, DCI);
16992 case ISD::ZERO_EXTEND:
16993 return performZeroExtendCombine(N, DCI);
16995 return performSignExtendInRegCombine(N, DCI);
16996 case AMDGPUISD::FP_CLASS:
16997 return performClassCombine(N, DCI);
16998 case ISD::FCANONICALIZE:
16999 return performFCanonicalizeCombine(N, DCI);
17000 case AMDGPUISD::RCP:
17001 return performRcpCombine(N, DCI);
17002 case ISD::FLDEXP:
17003 case AMDGPUISD::FRACT:
17004 case AMDGPUISD::RSQ:
17005 case AMDGPUISD::RCP_LEGACY:
17006 case AMDGPUISD::RCP_IFLAG:
17007 case AMDGPUISD::RSQ_CLAMP: {
17008 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
17009 SDValue Src = N->getOperand(0);
17010 if (Src.isUndef())
17011 return Src;
17012 break;
17013 }
17014 case ISD::SINT_TO_FP:
17015 case ISD::UINT_TO_FP:
17016 return performUCharToFloatCombine(N, DCI);
17017 case ISD::FCOPYSIGN:
17018 return performFCopySignCombine(N, DCI);
17019 case AMDGPUISD::CVT_F32_UBYTE0:
17020 case AMDGPUISD::CVT_F32_UBYTE1:
17021 case AMDGPUISD::CVT_F32_UBYTE2:
17022 case AMDGPUISD::CVT_F32_UBYTE3:
17023 return performCvtF32UByteNCombine(N, DCI);
17024 case AMDGPUISD::FMED3:
17025 return performFMed3Combine(N, DCI);
17026 case AMDGPUISD::CVT_PKRTZ_F16_F32:
17027 return performCvtPkRTZCombine(N, DCI);
17028 case AMDGPUISD::CLAMP:
17029 return performClampCombine(N, DCI);
17030 case ISD::SCALAR_TO_VECTOR: {
17031 SelectionDAG &DAG = DCI.DAG;
17032 EVT VT = N->getValueType(0);
17033
17034 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
17035 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
17036 SDLoc SL(N);
17037 SDValue Src = N->getOperand(0);
17038 EVT EltVT = Src.getValueType();
17039 if (EltVT != MVT::i16)
17040 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
17041
17042 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
17043 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
17044 }
17045
17046 break;
17047 }
17049 return performExtractVectorEltCombine(N, DCI);
17051 return performInsertVectorEltCombine(N, DCI);
17052 case ISD::FP_ROUND:
17053 return performFPRoundCombine(N, DCI);
17054 case ISD::LOAD: {
17055 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
17056 return Widened;
17057 [[fallthrough]];
17058 }
17059 default: {
17060 if (!DCI.isBeforeLegalize()) {
17061 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
17062 return performMemSDNodeCombine(MemNode, DCI);
17063 }
17064
17065 break;
17066 }
17067 }
17068
17070}
17071
17072/// Helper function for adjustWritemask
17073static unsigned SubIdx2Lane(unsigned Idx) {
17074 switch (Idx) {
17075 default:
17076 return ~0u;
17077 case AMDGPU::sub0:
17078 return 0;
17079 case AMDGPU::sub1:
17080 return 1;
17081 case AMDGPU::sub2:
17082 return 2;
17083 case AMDGPU::sub3:
17084 return 3;
17085 case AMDGPU::sub4:
17086 return 4; // Possible with TFE/LWE
17087 }
17088}
17089
/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions so that only
/// the result components that actually have users are requested via the dmask
/// operand. Returns the (possibly unchanged) node, or nullptr when all users
/// were rewired to a replacement node here.
SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
                                          SelectionDAG &DAG) const {
  unsigned Opcode = Node->getMachineOpcode();

  // Subtract 1 because the vdata output is not a MachineSDNode operand.
  int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
  if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
    return Node; // not implemented for D16

  // One slot per possible result lane: up to 4 data lanes plus the extra
  // TFE/LWE status lane.
  SDNode *Users[5] = {nullptr};
  unsigned Lane = 0;
  unsigned DmaskIdx =
      AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
  unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
  unsigned NewDmask = 0;
  unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
  unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
  bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
                 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
  unsigned TFCLane = 0;
  bool HasChain = Node->getNumValues() > 1;

  if (OldDmask == 0) {
    // These are folded out, but on the chance it happens don't assert.
    return Node;
  }

  unsigned OldBitsSet = llvm::popcount(OldDmask);
  // Work out which is the TFE/LWE lane if that is enabled.
  if (UsesTFC) {
    TFCLane = OldBitsSet;
  }

  // Try to figure out the used register components
  for (SDUse &Use : Node->uses()) {

    // Don't look at users of the chain.
    if (Use.getResNo() != 0)
      continue;

    SDNode *User = Use.getUser();

    // Abort if we can't understand the usage
    if (!User->isMachineOpcode() ||
        User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
      return Node;

    // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
    // Note that subregs are packed, i.e. Lane==0 is the first bit set
    // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
    // set, etc.
    Lane = SubIdx2Lane(User->getConstantOperandVal(1));
    if (Lane == ~0u)
      return Node;

    // Check if the use is for the TFE/LWE generated result at VGPRn+1.
    if (UsesTFC && Lane == TFCLane) {
      Users[Lane] = User;
    } else {
      // Set which texture component corresponds to the lane.
      unsigned Comp;
      for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
        Comp = llvm::countr_zero(Dmask);
        Dmask &= ~(1 << Comp);
      }

      // Abort if we have more than one user per component.
      if (Users[Lane])
        return Node;

      Users[Lane] = User;
      NewDmask |= 1 << Comp;
    }
  }

  // Don't allow 0 dmask, as hardware assumes one channel enabled.
  bool NoChannels = !NewDmask;
  if (NoChannels) {
    if (!UsesTFC) {
      // No uses of the result and not using TFC. Then do nothing.
      return Node;
    }
    // If the original dmask has one channel - then nothing to do
    if (OldBitsSet == 1)
      return Node;
    // Use an arbitrary dmask - required for the instruction to work
    NewDmask = 1;
  }
  // Abort if there's no change
  if (NewDmask == OldDmask)
    return Node;

  unsigned BitsSet = llvm::popcount(NewDmask);

  // Check for TFE or LWE - increase the number of channels by one to account
  // for the extra return value
  // This will need adjustment for D16 if this is also included in
  // adjustWriteMask (this function) but at present D16 are excluded.
  unsigned NewChannels = BitsSet + UsesTFC;

  int NewOpcode =
      AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
  assert(NewOpcode != -1 &&
         NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
         "failed to find equivalent MIMG op");

  // Adjust the writemask in the node: rebuild the operand list with the new
  // dmask constant in place of the old one.
  llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
  Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
  llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));

  MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();

  // 3 channels are rounded up to a 4-element vector and 5 (4 data + TFE/LWE)
  // to an 8-element vector, matching the register tuples the instruction
  // variants are defined for.
  MVT ResultVT = NewChannels == 1
                     ? SVT
                     : MVT::getVectorVT(SVT, NewChannels == 3   ? 4
                                             : NewChannels == 5 ? 8
                                                                : NewChannels);
  SDVTList NewVTList =
      HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);

  MachineSDNode *NewNode =
      DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);

  if (HasChain) {
    // Update chain.
    DAG.setNodeMemRefs(NewNode, Node->memoperands());
    DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
  }

  if (NewChannels == 1) {
    // Scalar result: replace the single EXTRACT_SUBREG user with a plain COPY
    // of the new node's scalar value.
    assert(Node->hasNUsesOfValue(1, 0));
    SDNode *Copy =
        DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
                           Users[Lane]->getValueType(0), SDValue(NewNode, 0));
    DAG.ReplaceAllUsesWith(Users[Lane], Copy);
    return nullptr;
  }

  // Update the users of the node with the new indices
  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
    SDNode *User = Users[i];
    if (!User) {
      // Handle the special case of NoChannels. We set NewDmask to 1 above, but
      // Users[0] is still nullptr because channel 0 doesn't really have a use.
      if (i || !NoChannels)
        continue;
    } else {
      // Retarget the EXTRACT_SUBREG at the new node with the packed subreg
      // index for this lane.
      SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
      SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
      if (NewUser != User) {
        DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
        DAG.RemoveDeadNode(User);
      }
    }

    // Advance to the next subregister index for the next lane.
    switch (Idx) {
    default:
      break;
    case AMDGPU::sub0:
      Idx = AMDGPU::sub1;
      break;
    case AMDGPU::sub1:
      Idx = AMDGPU::sub2;
      break;
    case AMDGPU::sub2:
      Idx = AMDGPU::sub3;
      break;
    case AMDGPU::sub3:
      Idx = AMDGPU::sub4;
      break;
    }
  }

  DAG.RemoveDeadNode(Node);
  return nullptr;
}
17269
  // Look through AssertZext so a zero-extended frame index still counts as a
  // frame index operand.
  if (Op.getOpcode() == ISD::AssertZext)
    Op = Op.getOperand(0);

  return isa<FrameIndexSDNode>(Op);
}
17276
/// Legalize target independent instructions (e.g. INSERT_SUBREG)
/// with frame index operands.
/// LLVM assumes that inputs to these instructions are registers.
SDNode *
                                                SelectionDAG &DAG) const {
  if (Node->getOpcode() == ISD::CopyToReg) {
    RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
    SDValue SrcVal = Node->getOperand(2);

    // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
    // to try understanding copies to physical registers.
    if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
      SDLoc SL(Node);
      SDValue VReg = DAG.getRegister(
          MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);

      // Chain the two copies together, threading glue through if the original
      // node was glued.
      SDNode *Glued = Node->getGluedNode();
      SDValue ToVReg = DAG.getCopyToReg(
          Node->getOperand(0), SL, VReg, SrcVal,
          SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
      SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
                                             VReg, ToVReg.getValue(1));
      DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
      DAG.RemoveDeadNode(Node);
      return ToResultReg.getNode();
    }
  }

  // Materialize every frame-index operand into an SGPR via S_MOV_B32 so the
  // instruction only sees register inputs.
  for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
    if (!isFrameIndexOp(Node->getOperand(i))) {
      Ops.push_back(Node->getOperand(i));
      continue;
    }

    SDLoc DL(Node);
    Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
                                             Node->getOperand(i).getValueType(),
                                             Node->getOperand(i)),
                          0));
  }

  return DAG.UpdateNodeOperands(Node, Ops);
}
17323
/// Fold the instructions after selecting them.
/// Returns null if users were already updated.
                                          SelectionDAG &DAG) const {
  unsigned Opcode = Node->getMachineOpcode();

  // Shrink the dmask of non-storing, non-gather image ops to only the lanes
  // that actually have users.
  if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
      !TII->isGather4(Opcode) &&
      AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
    return adjustWritemask(Node, DAG);
  }

  if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
    return Node;
  }

  switch (Opcode) {
  case AMDGPU::V_DIV_SCALE_F32_e64:
  case AMDGPU::V_DIV_SCALE_F64_e64: {
    // Satisfy the operand register constraint when one of the inputs is
    // undefined. Ordinarily each undef value will have its own implicit_def of
    // a vreg, so force these to use a single register.
    SDValue Src0 = Node->getOperand(1);
    SDValue Src1 = Node->getOperand(3);
    SDValue Src2 = Node->getOperand(5);

    // Constraint already satisfied: src0 is defined and equals src1 or src2.
    if ((Src0.isMachineOpcode() &&
         Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
        (Src0 == Src1 || Src0 == Src2))
      break;

    MVT VT = Src0.getValueType().getSimpleVT();
    const TargetRegisterClass *RC =
        getRegClassFor(VT, Src0.getNode()->isDivergent());

    SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);

    SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
                                      Src0, SDValue());

    // src0 must be the same register as src1 or src2, even if the value is
    // undefined, so make sure we don't violate this constraint.
    if (Src0.isMachineOpcode() &&
        Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
      if (Src1.isMachineOpcode() &&
          Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
        Src0 = Src1;
      else if (Src2.isMachineOpcode() &&
               Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
        Src0 = Src2;
      else {
        // All inputs undefined: route src0 and src1 through one shared vreg.
        assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
        Src0 = UndefReg;
        Src1 = UndefReg;
      }
    } else
      break;

    Ops[1] = Src0;
    Ops[3] = Src1;
    Ops[5] = Src2;
    Ops.push_back(ImpDef.getValue(1));
    return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
  }
  default:
    break;
  }

  return Node;
}
17398
// Any MIMG instructions that use tfe or lwe require an initialization of the
// result register that will be written in the case of a memory access failure.
// The required code is also added to tie this init code to the result of the
// img instruction.
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
  MachineBasicBlock &MBB = *MI.getParent();

  int DstIdx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
  // Number of 32-bit dwords of the result that need initialization.
  unsigned InitIdx = 0;

  if (TII->isImage(MI)) {
    MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
    MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
    MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);

    if (!TFE && !LWE) // intersect_ray
      return;

    unsigned TFEVal = TFE ? TFE->getImm() : 0;
    unsigned LWEVal = LWE ? LWE->getImm() : 0;
    unsigned D16Val = D16 ? D16->getImm() : 0;

    if (!TFEVal && !LWEVal)
      return;

    // At least one of TFE or LWE are non-zero
    // We have to insert a suitable initialization of the result value and
    // tie this to the dest of the image instruction.

    // Calculate which dword we have to initialize to 0.
    MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);

    // check that dmask operand is found.
    assert(MO_Dmask && "Expected dmask operand in instruction");

    unsigned dmask = MO_Dmask->getImm();
    // Determine the number of active lanes taking into account the
    // Gather4 special case
    unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);

    bool Packed = !Subtarget->hasUnpackedD16VMem();

    // Packed D16 results occupy half as many dwords; +1 covers the TFE/LWE
    // status dword.
    InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;

    // Abandon attempt if the dst size isn't large enough
    // - this is in fact an error but this is picked up elsewhere and
    // reported correctly.
    const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);

    uint32_t DstSize = TRI.getRegSizeInBits(*DstRC) / 32;
    if (DstSize < InitIdx)
      return;
  } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
    // MUBUF with TFE: initialize the whole destination tuple.
    const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
    InitIdx = TRI.getRegSizeInBits(*DstRC) / 32;
  } else {
    return;
  }

  const DebugLoc &DL = MI.getDebugLoc();

  // Create a register for the initialization value.
  Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
  unsigned NewDst = 0; // Final initialized value will be in here

  // If PRTStrictNull feature is enabled (the default) then initialize
  // all the result registers to 0, otherwise just the error indication
  // register (VGPRn+1)
  unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
  unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);

  // Build the initialized tuple one dword at a time via INSERT_SUBREG.
  BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
  for (; SizeLeft; SizeLeft--, CurrIdx++) {
    NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
    // Initialize dword
    Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    // clang-format off
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
        .addImm(0);
    // clang-format on
    // Insert into the super-reg
    BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
        .addReg(PrevDst)
        .addReg(SubReg)

    PrevDst = NewDst;
  }

  // Add as an implicit operand
  MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));

  // Tie the just added implicit operand to the dst
  MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
}
17498
/// Assign the register class depending on the number of
/// bits set in the writemask
                                                     SDNode *Node) const {

  MachineFunction *MF = MI.getMF();

  if (TII->isVOP3(MI.getOpcode())) {
    // Make sure constant bus requirements are respected.
    TII->legalizeOperandsVOP3(MRI, MI);

    if (TII->isMAI(MI)) {
      // The ordinary src0, src1, src2 were legalized above.
      //
      // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
      // as a separate instruction.
      int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                               AMDGPU::OpName::scale_src0);
      if (Src0Idx != -1) {
        int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                 AMDGPU::OpName::scale_src1);
        // If both scale operands would read the constant bus, move one into a
        // register with a move.
        if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
            TII->usesConstantBus(MRI, MI, Src1Idx))
          TII->legalizeOpWithMove(MI, Src1Idx);
      }
    }

    return;
  }

  // Image operands have subtarget alignment requirements on their vaddr
  // register tuples.
  if (TII->isImage(MI))
    TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
}
17534
                               uint64_t Val) {
  // Materialize a 32-bit immediate into an SGPR with S_MOV_B32.
  SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
  return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
}
17540
                                                const SDLoc &DL,
                                                SDValue Ptr) const {

  // Build the half of the subregister with the constants before building the
  // full 128-bit register. If we are building multiple resource descriptors,
  // this will allow CSEing of the 2-component register.
  const SDValue Ops0[] = {
      DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
      buildSMovImm32(DAG, DL, 0),
      DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
      DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};

  // High 64 bits: 0 in the low dword, the default resource data format in the
  // high dword.
  SDValue SubRegHi = SDValue(
      DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);

  // Combine the constants and the pointer.
  const SDValue Ops1[] = {
      DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
      DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
      DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};

  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
}
17567
/// Return a resource descriptor with the 'Add TID' bit enabled
/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
/// of the resource descriptor) to create an offset, which is added to
/// the resource pointer.
                                           SDValue Ptr, uint32_t RsrcDword1,
                                           uint64_t RsrcDword2And3) const {
  // Split the 64-bit pointer into its two 32-bit halves.
  SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
  SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
  if (RsrcDword1) {
    // OR the extra descriptor bits into the high half of the pointer.
    PtrHi =
        SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
                                   DAG.getConstant(RsrcDword1, DL, MVT::i32)),
                0);
  }

  SDValue DataLo =
      buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
  SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);

  // Assemble the four dwords into a 128-bit SGPR resource descriptor.
  const SDValue Ops[] = {
      DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
      PtrLo,
      DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      PtrHi,
      DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
      DataLo,
      DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
      DataHi,
      DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};

  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
}
17601
17602//===----------------------------------------------------------------------===//
17603// SI Inline Assembly Support
17604//===----------------------------------------------------------------------===//
17605
// Map an inline asm register constraint ('s'/'r', 'v', 'a', "VA", or a
// physical-register spelling) to a register / register class pair.
std::pair<unsigned, const TargetRegisterClass *>
                                                 StringRef Constraint,
                                                 MVT VT) const {
  const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);

  const TargetRegisterClass *RC = nullptr;
  if (Constraint.size() == 1) {
    // Check if we cannot determine the bit size of the given value type. This
    // can happen, for example, in this situation where we have an empty struct
    // (size 0): `call void asm "", "v"({} poison)`.
    if (VT == MVT::Other)
      return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
    const unsigned BitWidth = VT.getSizeInBits();
    switch (Constraint[0]) {
    default:
      return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
    case 's':
    case 'r':
      // Scalar (SGPR) constraint.
      switch (BitWidth) {
      case 16:
        RC = &AMDGPU::SReg_32RegClass;
        break;
      case 64:
        RC = &AMDGPU::SGPR_64RegClass;
        break;
      default:
        if (!RC)
          return std::pair(0U, nullptr);
        break;
      }
      break;
    case 'v':
      // Vector (VGPR) constraint.
      switch (BitWidth) {
      case 16:
        RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
                                             : &AMDGPU::VGPR_32_Lo256RegClass;
        break;
      default:
        RC = Subtarget->has1024AddressableVGPRs()
                 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
                 : TRI->getVGPRClassForBitWidth(BitWidth);
        if (!RC)
          return std::pair(0U, nullptr);
        break;
      }
      break;
    case 'a':
      // AGPR constraint; only valid on subtargets with MAI instructions.
      if (!Subtarget->hasMAIInsts())
        break;
      switch (BitWidth) {
      case 16:
        RC = &AMDGPU::AGPR_32RegClass;
        break;
      default:
        RC = TRI->getAGPRClassForBitWidth(BitWidth);
        if (!RC)
          return std::pair(0U, nullptr);
        break;
      }
      break;
    }
  } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
    // "VA": either VGPR or AGPR (AV superclass).
    const unsigned BitWidth = VT.getSizeInBits();
    switch (BitWidth) {
    case 16:
      RC = &AMDGPU::AV_32RegClass;
      break;
    default:
      RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
      if (!RC)
        return std::pair(0U, nullptr);
      break;
    }
  }

  // We actually support i128, i16 and f16 as inline parameters
  // even if they are not reported as legal
  if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
             VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
    return std::pair(0U, RC);

  // Handle explicit physical-register constraints like "{v0}" / "{s[0:3]}".
  auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
  if (Kind != '\0') {
    if (Kind == 'v') {
      RC = &AMDGPU::VGPR_32_Lo256RegClass;
    } else if (Kind == 's') {
      RC = &AMDGPU::SGPR_32RegClass;
    } else if (Kind == 'a') {
      RC = &AMDGPU::AGPR_32RegClass;
    }

    if (RC) {
      if (NumRegs > 1) {
        if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
          return std::pair(0U, nullptr);

        uint32_t Width = NumRegs * 32;
        // Prohibit constraints for register ranges with a width that does not
        // match the required type.
        if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
          return std::pair(0U, nullptr);

        MCRegister Reg = RC->getRegister(Idx);
          RC = TRI->getVGPRClassForBitWidth(Width);
        else if (SIRegisterInfo::isSGPRClass(RC))
          RC = TRI->getSGPRClassForBitWidth(Width);
        else if (SIRegisterInfo::isAGPRClass(RC))
          RC = TRI->getAGPRClassForBitWidth(Width);
        if (RC) {
          Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
          if (!Reg) {
            // The register class does not contain the requested register,
            // e.g., because it is an SGPR pair that would violate alignment
            // requirements.
            return std::pair(0U, nullptr);
          }
          return std::pair(Reg, RC);
        }
      }

      // Check for lossy scalar/vector conversions.
      if (VT.isVector() && VT.getSizeInBits() != 32)
        return std::pair(0U, nullptr);
      if (Idx < RC->getNumRegs())
        return std::pair(RC->getRegister(Idx), RC);
      return std::pair(0U, nullptr);
    }
  }

  // Fall back to the generic handling and fill in the base class of any
  // physical register it picked.
  auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
  if (Ret.first)
    Ret.second = TRI->getPhysRegBaseClass(Ret.first);

  return Ret;
}
17744
17745static bool isImmConstraint(StringRef Constraint) {
17746 if (Constraint.size() == 1) {
17747 switch (Constraint[0]) {
17748 default:
17749 break;
17750 case 'I':
17751 case 'J':
17752 case 'A':
17753 case 'B':
17754 case 'C':
17755 return true;
17756 }
17757 } else if (Constraint == "DA" || Constraint == "DB") {
17758 return true;
17759 }
17760 return false;
17761}
17762
  // 's', 'v', 'a' and "VA" select whole register classes; immediate-style
  // constraints are classified as C_Other; everything else defers to the
  // generic implementation.
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default:
      break;
    case 's':
    case 'v':
    case 'a':
      return C_RegisterClass;
    }
  } else if (Constraint.size() == 2) {
    if (Constraint == "VA")
      return C_RegisterClass;
  }
  if (isImmConstraint(Constraint)) {
    return C_Other;
  }
  return TargetLowering::getConstraintType(Constraint);
}
17783
// Mask \p Val down to its low \p Size bits so stale high bits do not leak
// into the emitted immediate.
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
    Val = Val & maskTrailingOnes<uint64_t>(Size);
  }
  return Val;
}
17790
                                                    StringRef Constraint,
                                                    std::vector<SDValue> &Ops,
                                                    SelectionDAG &DAG) const {
  // For immediate-style constraints, only emit an operand if the constant can
  // be extracted and satisfies the constraint; otherwise Ops is left empty and
  // the operand is rejected.
  if (isImmConstraint(Constraint)) {
    uint64_t Val;
    if (getAsmOperandConstVal(Op, Val) &&
        checkAsmConstraintVal(Op, Constraint, Val)) {
      Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
      Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
    }
  } else {
  }
}
17806
  // Extract a constant value (integer, FP bit pattern, or 16-bit splat vector
  // element) from \p Op into Val; returns false if Op is not such a constant.
  unsigned Size = Op.getScalarValueSizeInBits();
  if (Size > 64)
    return false;

  // 16-bit immediates are only usable when the subtarget has 16-bit insts.
  if (Size == 16 && !Subtarget->has16BitInsts())
    return false;

    Val = C->getSExtValue();
    return true;
  }
    // FP constants are reported via their raw bit pattern.
    Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
    return true;
  }
    // Only 2-element 16-bit splat vectors with no undef elements qualify.
    if (Size != 16 || Op.getNumOperands() != 2)
      return false;
    if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
      return false;
    if (ConstantSDNode *C = V->getConstantSplatNode()) {
      Val = C->getSExtValue();
      return true;
    }
    if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
      Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
      return true;
    }
  }

  return false;
}
17840
                                              uint64_t Val) const {
  // Validate \p Val against the immediate constraint kind:
  //   'J' = signed 16-bit, 'A' = inlinable literal, 'B' = signed 32-bit,
  //   'C' = unsigned 32-bit or inlinable, "DA"/"DB" = 64-bit split forms.
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'I':
    case 'J':
      return isInt<16>(Val);
    case 'A':
      return checkAsmConstraintValA(Op, Val);
    case 'B':
      return isInt<32>(Val);
    case 'C':
      return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
    default:
      break;
    }
  } else if (Constraint.size() == 2) {
    if (Constraint == "DA") {
      // Both 32-bit halves must individually be valid 'A' immediates.
      int64_t HiBits = static_cast<int32_t>(Val >> 32);
      int64_t LoBits = static_cast<int32_t>(Val);
      return checkAsmConstraintValA(Op, HiBits, 32) &&
             checkAsmConstraintValA(Op, LoBits, 32);
    }
    if (Constraint == "DB") {
      return true;
    }
  }
  llvm_unreachable("Invalid asm constraint");
}
17872
                                               unsigned MaxSize) const {
  // Check whether \p Val is an inlinable literal for the (capped) bit width
  // and type of \p Op.
  unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
  bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
  if (Size == 16) {
    // 16-bit inline literal rules differ per element type, so dispatch on the
    // exact MVT.
    MVT VT = Op.getSimpleValueType();
    switch (VT.SimpleTy) {
    default:
      return false;
    case MVT::i16:
      return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
    case MVT::f16:
      return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
    case MVT::bf16:
      return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
    case MVT::v2i16:
      return AMDGPU::getInlineEncodingV2I16(Val).has_value();
    case MVT::v2f16:
      return AMDGPU::getInlineEncodingV2F16(Val).has_value();
    case MVT::v2bf16:
      return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
    }
  }
  if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
      (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
    return true;
  return false;
}
17901
17902static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
17903 switch (UnalignedClassID) {
17904 case AMDGPU::VReg_64RegClassID:
17905 return AMDGPU::VReg_64_Align2RegClassID;
17906 case AMDGPU::VReg_96RegClassID:
17907 return AMDGPU::VReg_96_Align2RegClassID;
17908 case AMDGPU::VReg_128RegClassID:
17909 return AMDGPU::VReg_128_Align2RegClassID;
17910 case AMDGPU::VReg_160RegClassID:
17911 return AMDGPU::VReg_160_Align2RegClassID;
17912 case AMDGPU::VReg_192RegClassID:
17913 return AMDGPU::VReg_192_Align2RegClassID;
17914 case AMDGPU::VReg_224RegClassID:
17915 return AMDGPU::VReg_224_Align2RegClassID;
17916 case AMDGPU::VReg_256RegClassID:
17917 return AMDGPU::VReg_256_Align2RegClassID;
17918 case AMDGPU::VReg_288RegClassID:
17919 return AMDGPU::VReg_288_Align2RegClassID;
17920 case AMDGPU::VReg_320RegClassID:
17921 return AMDGPU::VReg_320_Align2RegClassID;
17922 case AMDGPU::VReg_352RegClassID:
17923 return AMDGPU::VReg_352_Align2RegClassID;
17924 case AMDGPU::VReg_384RegClassID:
17925 return AMDGPU::VReg_384_Align2RegClassID;
17926 case AMDGPU::VReg_512RegClassID:
17927 return AMDGPU::VReg_512_Align2RegClassID;
17928 case AMDGPU::VReg_1024RegClassID:
17929 return AMDGPU::VReg_1024_Align2RegClassID;
17930 case AMDGPU::AReg_64RegClassID:
17931 return AMDGPU::AReg_64_Align2RegClassID;
17932 case AMDGPU::AReg_96RegClassID:
17933 return AMDGPU::AReg_96_Align2RegClassID;
17934 case AMDGPU::AReg_128RegClassID:
17935 return AMDGPU::AReg_128_Align2RegClassID;
17936 case AMDGPU::AReg_160RegClassID:
17937 return AMDGPU::AReg_160_Align2RegClassID;
17938 case AMDGPU::AReg_192RegClassID:
17939 return AMDGPU::AReg_192_Align2RegClassID;
17940 case AMDGPU::AReg_256RegClassID:
17941 return AMDGPU::AReg_256_Align2RegClassID;
17942 case AMDGPU::AReg_512RegClassID:
17943 return AMDGPU::AReg_512_Align2RegClassID;
17944 case AMDGPU::AReg_1024RegClassID:
17945 return AMDGPU::AReg_1024_Align2RegClassID;
17946 default:
17947 return -1;
17948 }
17949}
17950
// Figure out which registers should be reserved for stack access. Only after
// the function is legalized do we know all of the non-spill stack objects or if
// calls are present.
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  if (Info->isEntryFunction()) {
    // Callable functions have fixed registers used for stack access.
  }

  // TODO: Move this logic to getReservedRegs()
  // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
  unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
  Register SReg = ST.isWave32()
                      ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
                      : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
                                                     &AMDGPU::SGPR_64RegClass);
  Info->setSGPRForEXECCopy(SReg);

  assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
                             Info->getStackPtrOffsetReg()));
  // Replace the placeholder frame registers with the actual ones chosen for
  // this function. Guard each replacement so MIR testcases missing the MFI
  // don't replace a register with itself.
  if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
    MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());

  // We need to worry about replacing the default register with itself in case
  // of MIR testcases missing the MFI.
  if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
    MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());

  if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
    MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());

  Info->limitOccupancy(MF);

  if (ST.isWave32() && !MF.empty()) {
    for (auto &MBB : MF) {
      for (auto &MI : MBB) {
        TII->fixImplicitOperands(MI);
      }
    }
  }

  // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
  // classes if required. Ideally the register class constraints would differ
  // per-subtarget, but there's no easy way to achieve that right now. This is
  // not a problem for VGPRs because the correctly aligned VGPR class is implied
  // from using them as the register class for legal types.
  if (ST.needsAlignedVGPRs()) {
    for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
      const Register Reg = Register::index2VirtReg(I);
      const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
      if (!RC)
        continue;
      int NewClassID = getAlignedAGPRClassID(RC->getID());
      if (NewClassID != -1)
        MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
    }
  }

}
18017
                                                       KnownBits &Known,
                                                       const APInt &DemandedElts,
                                                       const SelectionDAG &DAG,
                                                       unsigned Depth) const {
  Known.resetAll();
  unsigned Opc = Op.getOpcode();
  switch (Opc) {
    unsigned IID = Op.getConstantOperandVal(0);
    switch (IID) {
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi: {
      const GCNSubtarget &ST =
      // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
      // most 31 + src1.
      Known.Zero.setBitsFrom(
          IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
      // Fold in the known bits of the src1 addend.
      KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
      Known = KnownBits::add(Known, Known2);
      return;
    }
    }
    break;
  }
  }
      Op, Known, DemandedElts, DAG, Depth);
}
18048
18050 const int FI, KnownBits &Known, const MachineFunction &MF) const {
18052
18053 // Set the high bits to zero based on the maximum allowed scratch size per
18054 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
18055 // calculation won't overflow, so assume the sign bit is never set.
18056 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
18057}
18058
18060 GISelValueTracking &VT, KnownBits &Known,
18061 unsigned Dim) {
18062 unsigned MaxValue =
18063 ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim);
18064 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
18065}
18066
18068 KnownBits &Known, const APInt &DemandedElts,
18069 unsigned BFEWidth, bool SExt, unsigned Depth) {
18071 const MachineOperand &Src1 = MI.getOperand(2);
18072
18073 unsigned Src1Cst = 0;
18074 if (Src1.isImm()) {
18075 Src1Cst = Src1.getImm();
18076 } else if (Src1.isReg()) {
18077 auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI);
18078 if (!Cst)
18079 return;
18080 Src1Cst = Cst->Value.getZExtValue();
18081 } else {
18082 return;
18083 }
18084
18085 // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
18086 // Width is always [22:16].
18087 const unsigned Offset =
18088 Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
18089 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
18090
18091 if (Width >= BFEWidth) // Ill-formed.
18092 return;
18093
18094 VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
18095 Depth + 1);
18096
18097 Known = Known.extractBits(Width, Offset);
18098
18099 if (SExt)
18100 Known = Known.sext(BFEWidth);
18101 else
18102 Known = Known.zext(BFEWidth);
18103}
18104
18106 GISelValueTracking &VT, Register R, KnownBits &Known,
18107 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
18108 unsigned Depth) const {
18109 Known.resetAll();
18110 const MachineInstr *MI = MRI.getVRegDef(R);
18111 switch (MI->getOpcode()) {
18112 case AMDGPU::S_BFE_I32:
18113 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18114 /*SExt=*/true, Depth);
18115 case AMDGPU::S_BFE_U32:
18116 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18117 /*SExt=*/false, Depth);
18118 case AMDGPU::S_BFE_I64:
18119 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18120 /*SExt=*/true, Depth);
18121 case AMDGPU::S_BFE_U64:
18122 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18123 /*SExt=*/false, Depth);
18124 case AMDGPU::G_INTRINSIC:
18125 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18126 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
18127 switch (IID) {
18128 case Intrinsic::amdgcn_workitem_id_x:
18129 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0);
18130 break;
18131 case Intrinsic::amdgcn_workitem_id_y:
18132 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1);
18133 break;
18134 case Intrinsic::amdgcn_workitem_id_z:
18135 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2);
18136 break;
18137 case Intrinsic::amdgcn_mbcnt_lo:
18138 case Intrinsic::amdgcn_mbcnt_hi: {
18139 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18140 // most 31 + src1.
18141 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
18142 ? getSubtarget()->getWavefrontSizeLog2()
18143 : 5);
18144 KnownBits Known2;
18145 VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
18146 Depth + 1);
18147 Known = KnownBits::add(Known, Known2);
18148 break;
18149 }
18150 case Intrinsic::amdgcn_groupstaticsize: {
18151 // We can report everything over the maximum size as 0. We can't report
18152 // based on the actual size because we don't know if it's accurate or not
18153 // at any given point.
18154 Known.Zero.setHighBits(
18155 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
18156 break;
18157 }
18158 }
18159 break;
18160 }
18161 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18162 Known.Zero.setHighBits(24);
18163 break;
18164 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18165 Known.Zero.setHighBits(16);
18166 break;
18167 case AMDGPU::G_AMDGPU_SMED3:
18168 case AMDGPU::G_AMDGPU_UMED3: {
18169 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18170
18171 KnownBits Known2;
18172 VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
18173 if (Known2.isUnknown())
18174 break;
18175
18176 KnownBits Known1;
18177 VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
18178 if (Known1.isUnknown())
18179 break;
18180
18181 KnownBits Known0;
18182 VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
18183 if (Known0.isUnknown())
18184 break;
18185
18186 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
18187 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
18188 Known.One = Known0.One & Known1.One & Known2.One;
18189 break;
18190 }
18191 }
18192}
18193
18196 unsigned Depth) const {
18197 const MachineInstr *MI = MRI.getVRegDef(R);
18198 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
18199 // FIXME: Can this move to generic code? What about the case where the call
18200 // site specifies a lower alignment?
18201 Intrinsic::ID IID = GI->getIntrinsicID();
18203 AttributeList Attrs =
18204 Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
18205 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18206 return *RetAlign;
18207 }
18208 return Align(1);
18209}
18210
18213 const Align CacheLineAlign = Align(64);
18214
18215 // Pre-GFX10 target did not benefit from loop alignment
18216 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
18217 getSubtarget()->hasInstFwdPrefetchBug())
18218 return PrefAlign;
18219
18220 // On GFX10 I$ is 4 x 64 bytes cache lines.
18221 // By default prefetcher keeps one cache line behind and reads two ahead.
18222 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
18223 // behind and one ahead.
18224 // Therefore we can benefit from aligning loop headers if loop fits 192 bytes.
18225 // If loop fits 64 bytes it always spans no more than two cache lines and
18226 // does not need an alignment.
18227 // Else if loop is less or equal 128 bytes we do not need to modify prefetch,
18228 // Else if loop is less or equal 192 bytes we need two lines behind.
18229
18231 const MachineBasicBlock *Header = ML->getHeader();
18232 if (Header->getAlignment() != PrefAlign)
18233 return Header->getAlignment(); // Already processed.
18234
18235 unsigned LoopSize = 0;
18236 for (const MachineBasicBlock *MBB : ML->blocks()) {
18237 // If inner loop block is aligned assume in average half of the alignment
18238 // size to be added as nops.
18239 if (MBB != Header)
18240 LoopSize += MBB->getAlignment().value() / 2;
18241
18242 for (const MachineInstr &MI : *MBB) {
18243 LoopSize += TII->getInstSizeInBytes(MI);
18244 if (LoopSize > 192)
18245 return PrefAlign;
18246 }
18247 }
18248
18249 if (LoopSize <= 64)
18250 return PrefAlign;
18251
18252 if (LoopSize <= 128)
18253 return CacheLineAlign;
18254
18255 // If any of parent loops is surrounded by prefetch instructions do not
18256 // insert new for inner loop, which would reset parent's settings.
18257 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
18258 if (MachineBasicBlock *Exit = P->getExitBlock()) {
18259 auto I = Exit->getFirstNonDebugInstr();
18260 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18261 return CacheLineAlign;
18262 }
18263 }
18264
18265 MachineBasicBlock *Pre = ML->getLoopPreheader();
18266 MachineBasicBlock *Exit = ML->getExitBlock();
18267
18268 if (Pre && Exit) {
18269 auto PreTerm = Pre->getFirstTerminator();
18270 if (PreTerm == Pre->begin() ||
18271 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18272 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18273 .addImm(1); // prefetch 2 lines behind PC
18274
18275 auto ExitHead = Exit->getFirstNonDebugInstr();
18276 if (ExitHead == Exit->end() ||
18277 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18278 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18279 .addImm(2); // prefetch 1 line behind PC
18280 }
18281
18282 return CacheLineAlign;
18283}
18284
18285[[maybe_unused]]
18286static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
18287 assert(N->getOpcode() == ISD::CopyFromReg);
18288 do {
18289 // Follow the chain until we find an INLINEASM node.
18290 N = N->getOperand(0).getNode();
18291 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
18292 return true;
18293 } while (N->getOpcode() == ISD::CopyFromReg);
18294 return false;
18295}
18296
18299 UniformityInfo *UA) const {
18300 switch (N->getOpcode()) {
18301 case ISD::CopyFromReg: {
18302 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
18303 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
18304 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18305 Register Reg = R->getReg();
18306
18307 // FIXME: Why does this need to consider isLiveIn?
18308 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
18309 return !TRI->isSGPRReg(MRI, Reg);
18310
18311 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
18312 return UA->isDivergent(V);
18313
18315 return !TRI->isSGPRReg(MRI, Reg);
18316 }
18317 case ISD::LOAD: {
18318 const LoadSDNode *L = cast<LoadSDNode>(N);
18319 unsigned AS = L->getAddressSpace();
18320 // A flat load may access private memory.
18322 }
18323 case ISD::CALLSEQ_END:
18324 return true;
18326 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
18328 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
18329 case AMDGPUISD::ATOMIC_CMP_SWAP:
18330 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
18331 case AMDGPUISD::BUFFER_ATOMIC_ADD:
18332 case AMDGPUISD::BUFFER_ATOMIC_SUB:
18333 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
18334 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
18335 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
18336 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
18337 case AMDGPUISD::BUFFER_ATOMIC_AND:
18338 case AMDGPUISD::BUFFER_ATOMIC_OR:
18339 case AMDGPUISD::BUFFER_ATOMIC_XOR:
18340 case AMDGPUISD::BUFFER_ATOMIC_INC:
18341 case AMDGPUISD::BUFFER_ATOMIC_DEC:
18342 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
18343 case AMDGPUISD::BUFFER_ATOMIC_CSUB:
18344 case AMDGPUISD::BUFFER_ATOMIC_FADD:
18345 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
18346 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
18347 // Target-specific read-modify-write atomics are sources of divergence.
18348 return true;
18349 default:
18350 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
18351 // Generic read-modify-write atomics are sources of divergence.
18352 return A->readMem() && A->writeMem();
18353 }
18354 return false;
18355 }
18356}
18357
18359 EVT VT) const {
18360 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
18361 case MVT::f32:
18363 case MVT::f64:
18364 case MVT::f16:
18366 default:
18367 return false;
18368 }
18369}
18370
18372 LLT Ty, const MachineFunction &MF) const {
18373 switch (Ty.getScalarSizeInBits()) {
18374 case 32:
18375 return !denormalModeIsFlushAllF32(MF);
18376 case 64:
18377 case 16:
18378 return !denormalModeIsFlushAllF64F16(MF);
18379 default:
18380 return false;
18381 }
18382}
18383
18385 const APInt &DemandedElts,
18386 const SelectionDAG &DAG,
18387 bool SNaN,
18388 unsigned Depth) const {
18389 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
18390 const MachineFunction &MF = DAG.getMachineFunction();
18392
18393 if (Info->getMode().DX10Clamp)
18394 return true; // Clamped to 0.
18395 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
18396 }
18397
18399 DAG, SNaN, Depth);
18400}
18401
18402// On older subtargets, global FP atomic instructions have a hardcoded FP mode
18403// and do not support FP32 denormals, and only support v2f16/f64 denormals.
18405 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
18406 return true;
18407
18408 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
18409 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
18410 if (DenormMode == DenormalMode::getPreserveSign())
18411 return true;
18412
18413 // TODO: Remove this.
18414 return RMW->getFunction()
18415 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
18416 .getValueAsBool();
18417}
18418
18420 LLVMContext &Ctx = RMW->getContext();
18421 StringRef MemScope =
18422 Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system");
18423
18424 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
18425 << "Hardware instruction generated for atomic "
18426 << RMW->getOperationName(RMW->getOperation())
18427 << " operation at memory scope " << MemScope;
18428}
18429
18430static bool isV2F16OrV2BF16(Type *Ty) {
18431 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
18432 Type *EltTy = VT->getElementType();
18433 return VT->getNumElements() == 2 &&
18434 (EltTy->isHalfTy() || EltTy->isBFloatTy());
18435 }
18436
18437 return false;
18438}
18439
18440static bool isV2F16(Type *Ty) {
18442 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
18443}
18444
18445static bool isV2BF16(Type *Ty) {
18447 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
18448}
18449
18450/// \return true if atomicrmw integer ops work for the type.
18451static bool isAtomicRMWLegalIntTy(Type *Ty) {
18452 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
18453 unsigned BW = IT->getBitWidth();
18454 return BW == 32 || BW == 64;
18455 }
18456
18457 return false;
18458}
18459
18460/// \return true if this atomicrmw xchg type can be selected.
18461static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
18462 Type *Ty = RMW->getType();
18463 if (isAtomicRMWLegalIntTy(Ty))
18464 return true;
18465
18466 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
18467 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
18468 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18469 return BW == 32 || BW == 64;
18470 }
18471
18472 if (Ty->isFloatTy() || Ty->isDoubleTy())
18473 return true;
18474
18476 return VT->getNumElements() == 2 &&
18477 VT->getElementType()->getPrimitiveSizeInBits() == 16;
18478 }
18479
18480 return false;
18481}
18482
18483/// \returns true if it's valid to emit a native instruction for \p RMW, based
18484/// on the properties of the target memory.
18485static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
18486 const AtomicRMWInst *RMW,
18487 bool HasSystemScope) {
18488 // The remote/fine-grained access logic is different from the integer
18489 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
18490 // fine-grained access does not work, even for a device local allocation.
18491 //
18492 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
18493 // allocations work.
18494 if (HasSystemScope) {
18496 RMW->hasMetadata("amdgpu.no.remote.memory"))
18497 return true;
18498 if (Subtarget.hasEmulatedSystemScopeAtomics())
18499 return true;
18501 return true;
18502
18503 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18504}
18505
18506/// \return Action to perform on AtomicRMWInsts for integer operations.
18513
18514/// Return if a flat address space atomicrmw can access private memory.
18516 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18517 return !MD ||
18519}
18520
18528
18531 unsigned AS = RMW->getPointerAddressSpace();
18532 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
18534
18535 // 64-bit flat atomics that dynamically reside in private memory will silently
18536 // be dropped.
18537 //
18538 // Note that we will emit a new copy of the original atomic in the expansion,
18539 // which will be incrementally relegalized.
18540 const DataLayout &DL = RMW->getFunction()->getDataLayout();
18541 if (AS == AMDGPUAS::FLAT_ADDRESS &&
18542 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18545
18546 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
18548 ORE.emit([=]() {
18549 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
18550 });
18551 return Kind;
18552 };
18553
18554 auto SSID = RMW->getSyncScopeID();
18555 bool HasSystemScope =
18556 SSID == SyncScope::System ||
18557 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
18558
18559 auto Op = RMW->getOperation();
18560 switch (Op) {
18562 // PCIe supports add and xchg for system atomics.
18563 return isAtomicRMWLegalXChgTy(RMW)
18566 case AtomicRMWInst::Add:
18567 // PCIe supports add and xchg for system atomics.
18569 case AtomicRMWInst::Sub:
18570 case AtomicRMWInst::And:
18571 case AtomicRMWInst::Or:
18572 case AtomicRMWInst::Xor:
18573 case AtomicRMWInst::Max:
18574 case AtomicRMWInst::Min:
18581 if (Subtarget->hasEmulatedSystemScopeAtomics())
18583
18584 // On most subtargets, for atomicrmw operations other than add/xchg,
18585 // whether or not the instructions will behave correctly depends on where
18586 // the address physically resides and what interconnect is used in the
18587 // system configuration. On some targets the instruction will nop,
18588 // and in others synchronization will only occur at degraded device scope.
18589 //
18590 // If the allocation is known local to the device, the instructions should
18591 // work correctly.
18592 if (RMW->hasMetadata("amdgpu.no.remote.memory"))
18594
18595 // If fine-grained remote memory works at device scope, we don't need to
18596 // do anything.
18597 if (!HasSystemScope &&
18598 Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18600
18601 // If we are targeting a remote allocated address, it depends what kind of
18602 // allocation the address belongs to.
18603 //
18604 // If the allocation is fine-grained (in host memory, or in PCIe peer
18605 // device memory), the operation will fail depending on the target.
18606 //
18607 // Note fine-grained host memory access does work on APUs or if XGMI is
18608 // used, but we do not know if we are targeting an APU or the system
18609 // configuration from the ISA version/target-cpu.
18610 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
18612
18615 // Atomic sub/or/xor do not work over PCI express, but atomic add
18616 // does. InstCombine transforms these with 0 to or, so undo that.
18617 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
18618 ConstVal && ConstVal->isNullValue())
18620 }
18621
18622 // If the allocation could be in remote, fine-grained memory, the rmw
18623 // instructions may fail. cmpxchg should work, so emit that. On some
18624 // system configurations, PCIe atomics aren't supported so cmpxchg won't
18625 // even work, so you're out of luck anyway.
18626
18627 // In summary:
18628 //
18629 // Cases that may fail:
18630 // - fine-grained pinned host memory
18631 // - fine-grained migratable host memory
18632 // - fine-grained PCIe peer device
18633 //
18634 // Cases that should work, but may be treated overly conservatively.
18635 // - fine-grained host memory on an APU
18636 // - fine-grained XGMI peer device
18638 }
18639
18641 }
18642 case AtomicRMWInst::FAdd: {
18643 Type *Ty = RMW->getType();
18644
18645 // TODO: Handle REGION_ADDRESS
18646 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18647 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
18648 // is fixed to round-to-nearest-even.
18649 //
18650 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
18651 // round-to-nearest-even.
18652 //
18653 // We ignore the rounding mode problem, even in strictfp. The C++ standard
18654 // suggests it is OK if the floating-point mode may not match the calling
18655 // thread.
18656 if (Ty->isFloatTy()) {
18657 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
18659 }
18660
18661 if (Ty->isDoubleTy()) {
18662 // Ignores denormal mode, but we don't consider flushing mandatory.
18663 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
18665 }
18666
18667 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18669
18671 }
18672
18673 // LDS atomics respect the denormal mode from the mode register.
18674 //
18675 // Traditionally f32 global/buffer memory atomics would unconditionally
18676 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
18677 // flush.
18678 //
18679 // On targets with flat atomic fadd, denormals would flush depending on
18680 // whether the target address resides in LDS or global memory. We consider
18681 // this flat-maybe-flush as will-flush.
18682 if (Ty->isFloatTy() &&
18683 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18686
18687 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
18688 // safe. The message phrasing also should be better.
18689 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18690 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18691 // gfx942, gfx12
18692 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18693 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18694 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
18695 // gfx90a, gfx942, gfx12
18696 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18697 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18698
18699 // gfx942, gfx12
18700 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18701 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18702 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
18703 // gfx90a, gfx942, gfx12
18704 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18705 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18706
18707 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
18708 // buffer. gfx12 does have the buffer version.
18709 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18710 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18711 }
18712
18713 // global and flat atomic fadd f64: gfx90a, gfx942.
18714 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18715 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18716
18717 if (AS != AMDGPUAS::FLAT_ADDRESS) {
18718 if (Ty->isFloatTy()) {
18719 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
18720 // gfx11+.
18721 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18722 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18723 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
18724 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18725 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18726 } else {
18727 // gfx908
18728 if (RMW->use_empty() &&
18729 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18730 isV2F16(Ty))
18731 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18732 }
18733 }
18734
18735 // flat atomic fadd f32: gfx942, gfx11+.
18736 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
18737 if (Subtarget->hasFlatAtomicFaddF32Inst())
18738 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18739
18740 // If it is in flat address space, and the type is float, we will try to
18741 // expand it, if the target supports global and lds atomic fadd. The
18742 // reason we need that is, in the expansion, we emit the check of
18743 // address space. If it is in global address space, we emit the global
18744 // atomic fadd; if it is in shared address space, we emit the LDS atomic
18745 // fadd.
18746 if (Subtarget->hasLDSFPAtomicAddF32()) {
18747 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18749 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18751 }
18752 }
18753 }
18754
18756 }
18758 case AtomicRMWInst::FMax: {
18759 Type *Ty = RMW->getType();
18760
18761 // LDS float and double fmin/fmax were always supported.
18762 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18763 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
18765 }
18766
18767 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18768 // For flat and global cases:
18769 // float, double in gfx7. Manual claims denormal support.
18770 // Removed in gfx8.
18771 // float, double restored in gfx10.
18772 // double removed again in gfx11, so only f32 for gfx11/gfx12.
18773 //
18774 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
18775 // no f32.
18776 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18777 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18778 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18779 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18780 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18781 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
18783 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18784 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18785 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18786 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18787 }
18788 }
18789
18791 }
18794 default:
18796 }
18797
18798 llvm_unreachable("covered atomicrmw op switch");
18799}
18800
18807
18814
18817 unsigned AddrSpace = CmpX->getPointerAddressSpace();
18818 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
18820
18821 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
18823
18824 const DataLayout &DL = CmpX->getDataLayout();
18825
18826 Type *ValTy = CmpX->getNewValOperand()->getType();
18827
18828 // If a 64-bit flat atomic may alias private, we need to avoid using the
18829 // atomic in the private case.
18830 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
18832}
18833
18834const TargetRegisterClass *
18835SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
18837 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18838 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18839 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18840 : &AMDGPU::SReg_32RegClass;
18841 if (!TRI->isSGPRClass(RC) && !isDivergent)
18842 return TRI->getEquivalentSGPRClass(RC);
18843 if (TRI->isSGPRClass(RC) && isDivergent) {
18844 if (Subtarget->hasGFX90AInsts())
18845 return TRI->getEquivalentAVClass(RC);
18846 return TRI->getEquivalentVGPRClass(RC);
18847 }
18848
18849 return RC;
18850}
18851
18852// FIXME: This is a workaround for DivergenceAnalysis not understanding always
18853// uniform values (as produced by the mask results of control flow intrinsics)
18854// used outside of divergent blocks. The phi users need to also be treated as
18855// always uniform.
18856//
18857// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
18858static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
18859 unsigned WaveSize) {
18860 // FIXME: We assume we never cast the mask results of a control flow
18861 // intrinsic.
18862 // Early exit if the type won't be consistent as a compile time hack.
18863 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
18864 if (!IT || IT->getBitWidth() != WaveSize)
18865 return false;
18866
18867 if (!isa<Instruction>(V))
18868 return false;
18869 if (!Visited.insert(V).second)
18870 return false;
18871 bool Result = false;
18872 for (const auto *U : V->users()) {
18874 if (V == U->getOperand(1)) {
18875 switch (Intrinsic->getIntrinsicID()) {
18876 default:
18877 Result = false;
18878 break;
18879 case Intrinsic::amdgcn_if_break:
18880 case Intrinsic::amdgcn_if:
18881 case Intrinsic::amdgcn_else:
18882 Result = true;
18883 break;
18884 }
18885 }
18886 if (V == U->getOperand(0)) {
18887 switch (Intrinsic->getIntrinsicID()) {
18888 default:
18889 Result = false;
18890 break;
18891 case Intrinsic::amdgcn_end_cf:
18892 case Intrinsic::amdgcn_loop:
18893 Result = true;
18894 break;
18895 }
18896 }
18897 } else {
18898 Result = hasCFUser(U, Visited, WaveSize);
18899 }
18900 if (Result)
18901 break;
18902 }
18903 return Result;
18904}
18905
18907 const Value *V) const {
18908 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
18909 if (CI->isInlineAsm()) {
18910 // FIXME: This cannot give a correct answer. This should only trigger in
18911 // the case where inline asm returns mixed SGPR and VGPR results, used
18912 // outside the defining block. We don't have a specific result to
18913 // consider, so this assumes if any value is SGPR, the overall register
18914 // also needs to be SGPR.
18915 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
18917 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
18918 for (auto &TC : TargetConstraints) {
18919 if (TC.Type == InlineAsm::isOutput) {
18921 const TargetRegisterClass *RC =
18922 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
18923 TC.ConstraintVT)
18924 .second;
18925 if (RC && SIRI->isSGPRClass(RC))
18926 return true;
18927 }
18928 }
18929 }
18930 }
18932 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
18933}
18934
18936 for (SDUse &Use : N->uses()) {
18938 if (getBasePtrIndex(M) == Use.getOperandNo())
18939 return true;
18940 }
18941 }
18942 return false;
18943}
18944
18946 SDValue N1) const {
18947 if (!N0.hasOneUse())
18948 return false;
18949 // Take care of the opportunity to keep N0 uniform
18950 if (N0->isDivergent() || !N1->isDivergent())
18951 return true;
18952 // Check if we have a good chance to form the memory access pattern with the
18953 // base and offset
18954 return (DAG.isBaseWithConstantOffset(N0) &&
18956}
18957
18959 Register N0, Register N1) const {
18960 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
18961}
18962
18965 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
18967 if (I.getMetadata("amdgpu.noclobber"))
18968 Flags |= MONoClobber;
18969 if (I.getMetadata("amdgpu.last.use"))
18970 Flags |= MOLastUse;
18971 return Flags;
18972}
18973
18975 Instruction *AI) const {
18976 // Given: atomicrmw fadd ptr %addr, float %val ordering
18977 //
18978 // With this expansion we produce the following code:
18979 // [...]
18980 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
18981 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
18982 //
18983 // atomicrmw.shared:
18984 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
18985 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
18986 // float %val ordering
18987 // br label %atomicrmw.phi
18988 //
18989 // atomicrmw.check.private:
18990 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
18991 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
18992 //
18993 // atomicrmw.private:
18994 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
18995 // %loaded.private = load float, ptr addrspace(5) %cast.private
18996 // %val.new = fadd float %loaded.private, %val
18997 // store float %val.new, ptr addrspace(5) %cast.private
18998 // br label %atomicrmw.phi
18999 //
19000 // atomicrmw.global:
19001 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
19002 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
19003 // float %val ordering
19004 // br label %atomicrmw.phi
19005 //
19006 // atomicrmw.phi:
19007 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
19008 // [ %loaded.private, %atomicrmw.private ],
19009 // [ %loaded.global, %atomicrmw.global ]
19010 // br label %atomicrmw.end
19011 //
19012 // atomicrmw.end:
19013 // [...]
19014 //
19015 //
19016 // For 64-bit atomics which may reside in private memory, we perform a simpler
19017 // version that only inserts the private check, and uses the flat operation.
19018
19019 IRBuilder<> Builder(AI);
19020 LLVMContext &Ctx = Builder.getContext();
19021
19022 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
19023 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
19025 Value *Addr = AI->getOperand(PtrOpIdx);
19026
19027 /// TODO: Only need to check private, then emit flat-known-not private (no
19028 /// need for shared block, or cast to global).
19030
19031 Align Alignment;
19032 if (RMW)
19033 Alignment = RMW->getAlign();
19034 else if (CX)
19035 Alignment = CX->getAlign();
19036 else
19037 llvm_unreachable("unhandled atomic operation");
19038
19039 // FullFlatEmulation is true if we need to issue the private, shared, and
19040 // global cases.
19041 //
19042 // If this is false, we are only dealing with the flat-targeting-private case,
19043 // where we only insert a check for private and still use the flat instruction
19044 // for global and shared.
19045
19046 bool FullFlatEmulation =
19047 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
19048 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
19049 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
19050 RMW->getType()->isDoubleTy()));
19051
19052 // If the return value isn't used, do not introduce a false use in the phi.
19053 bool ReturnValueIsUsed = !AI->use_empty();
19054
19055 BasicBlock *BB = Builder.GetInsertBlock();
19056 Function *F = BB->getParent();
19057 BasicBlock *ExitBB =
19058 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
19059 BasicBlock *SharedBB = nullptr;
19060
19061 BasicBlock *CheckPrivateBB = BB;
19062 if (FullFlatEmulation) {
19063 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
19064 CheckPrivateBB =
19065 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
19066 }
19067
19068 BasicBlock *PrivateBB =
19069 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
19070 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
19071 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
19072
19073 std::prev(BB->end())->eraseFromParent();
19074 Builder.SetInsertPoint(BB);
19075
19076 Value *LoadedShared = nullptr;
19077 if (FullFlatEmulation) {
19078 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
19079 {Addr}, nullptr, "is.shared");
19080 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
19081 Builder.SetInsertPoint(SharedBB);
19082 Value *CastToLocal = Builder.CreateAddrSpaceCast(
19084
19085 Instruction *Clone = AI->clone();
19086 Clone->insertInto(SharedBB, SharedBB->end());
19087 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
19088 LoadedShared = Clone;
19089
19090 Builder.CreateBr(PhiBB);
19091 Builder.SetInsertPoint(CheckPrivateBB);
19092 }
19093
19094 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19095 {Addr}, nullptr, "is.private");
19096 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19097
19098 Builder.SetInsertPoint(PrivateBB);
19099
19100 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19102
19103 Value *LoadedPrivate;
19104 if (RMW) {
19105 LoadedPrivate = Builder.CreateAlignedLoad(
19106 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
19107
19108 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
19109 LoadedPrivate, RMW->getValOperand());
19110
19111 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19112 } else {
19113 auto [ResultLoad, Equal] =
19114 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
19115 CX->getNewValOperand(), CX->getAlign());
19116
19117 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
19118 ResultLoad, 0);
19119 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19120 }
19121
19122 Builder.CreateBr(PhiBB);
19123
19124 Builder.SetInsertPoint(GlobalBB);
19125
19126 // Continue using a flat instruction if we only emitted the check for private.
19127 Instruction *LoadedGlobal = AI;
19128 if (FullFlatEmulation) {
19129 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19131 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
19132 }
19133
19134 AI->removeFromParent();
19135 AI->insertInto(GlobalBB, GlobalBB->end());
19136
19137 // The new atomicrmw may go through another round of legalization later.
19138 if (!FullFlatEmulation) {
19139 // We inserted the runtime check already, make sure we do not try to
19140 // re-expand this.
19141 // TODO: Should union with any existing metadata.
19142 MDBuilder MDB(F->getContext());
19143 MDNode *RangeNotPrivate =
19146 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
19147 RangeNotPrivate);
19148 }
19149
19150 Builder.CreateBr(PhiBB);
19151
19152 Builder.SetInsertPoint(PhiBB);
19153
19154 if (ReturnValueIsUsed) {
19155 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
19156 AI->replaceAllUsesWith(Loaded);
19157 if (FullFlatEmulation)
19158 Loaded->addIncoming(LoadedShared, SharedBB);
19159 Loaded->addIncoming(LoadedPrivate, PrivateBB);
19160 Loaded->addIncoming(LoadedGlobal, GlobalBB);
19161 Loaded->takeName(AI);
19162 }
19163
19164 Builder.CreateBr(ExitBB);
19165}
19166
19168 unsigned PtrOpIdx) {
19169 Value *PtrOp = I->getOperand(PtrOpIdx);
19172
19173 Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
19174 Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
19175 I->getIterator());
19176 I->setOperand(PtrOpIdx, ASCast);
19177}
19178
19181
19184
19187 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
19188 ConstVal && ConstVal->isNullValue()) {
19189 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
19191
19192 // We may still need the private-alias-flat handling below.
19193
19194 // TODO: Skip this for cases where we cannot access remote memory.
19195 }
19196 }
19197
19198 // The non-flat expansions should only perform the de-canonicalization of
19199 // identity values.
19201 return;
19202
19204}
19205
19212
19216
19218 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19219}
19220
19222 if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19223 return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
19224
19226 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19227}
19228
19229LoadInst *
19231 IRBuilder<> Builder(AI);
19232 auto Order = AI->getOrdering();
19233
19234 // The optimization removes store aspect of the atomicrmw. Therefore, cache
19235 // must be flushed if the atomic ordering had a release semantics. This is
19236 // not necessary a fence, a release fence just coincides to do that flush.
19237 // Avoid replacing of an atomicrmw with a release semantics.
19238 if (isReleaseOrStronger(Order))
19239 return nullptr;
19240
19241 LoadInst *LI = Builder.CreateAlignedLoad(
19242 AI->getType(), AI->getPointerOperand(), AI->getAlign());
19243 LI->setAtomic(Order, AI->getSyncScopeID());
19244 LI->copyMetadata(*AI);
19245 LI->takeName(AI);
19246 AI->replaceAllUsesWith(LI);
19247 AI->eraseFromParent();
19248 return LI;
19249}
static bool isMul(MachineInstr *MI)
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
return SDValue()
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
constexpr LLT S32
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
dxil translate DXIL Translate Metadata
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
#define P(N)
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:39
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:57
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:51
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:45
#define FP_DENORM_FLUSH_NONE
Definition SIDefines.h:1258
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition SIDefines.h:1255
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad, TargetLoweringBase::IntrinsicInfo &Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool isCopyFromRegOfInlineAsm(const SDNode *N)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isFloatingPointWaveReduceOperation(unsigned Opc)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
The Input class is used to parse a yaml document into in-memory structs and vectors.
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1102
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:6053
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1479
bool isNegative() const
Definition APFloat.h:1431
bool isNormal() const
Definition APFloat.h:1435
APInt bitcastToAPInt() const
Definition APFloat.h:1335
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1120
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1080
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1061
bool isInfinity() const
Definition APFloat.h:1428
Class for arbitrary precision integers.
Definition APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1392
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1386
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:467
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1640
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
Definition APInt.h:367
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1238
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1222
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition Function.cpp:339
const Function * getParent() const
Definition Argument.h:44
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:361
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:472
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_NE
not equal
Definition InstrTypes.h:698
bool isSigned() const
Definition InstrTypes.h:930
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:770
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:214
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
bool isBigEndian() const
Definition DataLayout.h:208
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLoweringInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
iterator_range< arg_iterator > args()
Definition Function.h:890
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:765
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition Function.cpp:806
Argument * getArg(unsigned i) const
Definition Function.h:884
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMadF16() const
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasPrivateSegmentBuffer() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition MDBuilder.cpp:96
Metadata node.
Definition Metadata.h:1078
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1442
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition ModRef.h:226
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:220
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition ModRef.h:223
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition Register.h:72
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node can be combined with others to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store using a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load using a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to ...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the command line.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:422
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition Type.h:258
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:106
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
LLVM_ABI void set(Value *Val)
Definition Value.h:905
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
const Use & getOperandUse(unsigned i) const
Definition User.h:245
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1099
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:396
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isZero() const
Definition TypeSize.h:153
self_iterator getIterator()
Definition ilist_node.h:123
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:807
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:780
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:593
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:771
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:841
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ GlobalAddress
Definition ISDOpcodes.h:88
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:868
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:577
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:744
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:832
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:662
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:779
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:534
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:541
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:784
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition ISDOpcodes.h:958
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:701
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:762
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:642
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:607
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:569
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:838
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:799
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:876
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:724
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:793
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:914
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:736
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:558
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:947
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:844
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:821
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:527
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:549
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
Offsets
Offsets in bytes from the start of the input buffer.
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
@ BRCOND
X86 conditional branches.
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition Analysis.cpp:241
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:839
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Done
Definition Threading.h:60
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition MathExtras.h:223
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:303
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2136
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
Definition ModRef.h:301
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:273
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:236
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition Analysis.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
unsigned getUndefRegState(bool B)
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition MathExtras.h:232
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:433
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1758
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1897
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition SCCPSolver.h:42
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:113
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
unsigned getOrigArgIndex() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:172
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:225
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:180
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:347
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:248
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs