LLVM 20.0.0git
SIISelLowering.cpp
1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/IRBuilder.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
44#include "llvm/Support/ModRef.h"
45#include <optional>
46
47using namespace llvm;
48
49#define DEBUG_TYPE "si-lower"
50
51STATISTIC(NumTailCalls, "Number of tail calls");
52
54 "amdgpu-disable-loop-alignment",
55 cl::desc("Do not align and prefetch loops"),
56 cl::init(false));
57
59 "amdgpu-use-divergent-register-indexing",
61 cl::desc("Use indirect register addressing for divergent indexes"),
62 cl::init(false));
63
66 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
67}
68
71 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
72}
73
74static unsigned findFirstFreeSGPR(CCState &CCInfo) {
75 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
76 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
77 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
78 return AMDGPU::SGPR0 + Reg;
79 }
80 }
81 llvm_unreachable("Cannot allocate sgpr");
82}
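// Illustrative example (not upstream code): if the calling convention has
// already allocated s0..s3, the scan above returns AMDGPU::SGPR4, i.e. the
// first SGPR that is still free.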
83
85 const GCNSubtarget &STI)
87 Subtarget(&STI) {
88 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
89 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
90
91 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
92 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
93
94 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
95
96 const SIRegisterInfo *TRI = STI.getRegisterInfo();
97 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
98
99 addRegisterClass(MVT::f64, V64RegClass);
100 addRegisterClass(MVT::v2f32, V64RegClass);
101 addRegisterClass(MVT::Untyped, V64RegClass);
102
103 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
104 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
105
106 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
107 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
108
109 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
110 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
111
112 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
113 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
114
115 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
116 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
117
118 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
119 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
120
121 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
122 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
123
124 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
125 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
126
127 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
128 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
129
130 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
131 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
132
133 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
134 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
135
136 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
137 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
138
139 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
140 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
141
142 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
143 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
144
145 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
146 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
147
148 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
149 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
150
151 if (Subtarget->has16BitInsts()) {
152 if (Subtarget->useRealTrue16Insts()) {
153 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
154 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
155 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
156 } else {
157 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
158 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
159 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
160 }
161
162 // Unless there are also VOP3P operations, no operations are really legal.
163 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
164 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
165 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
166 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
167 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
168 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
169 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
170 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
171 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
172 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
173 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
174 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
175 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
176 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
177 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
178 }
179
180 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
181 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
182
184
185 // The boolean content concept here is too inflexible. Compares only ever
186 // really produce a 1-bit result. Any copy/extend from these will turn into a
187 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
188 // it's what most targets use.
191
192 // We need to custom lower vector stores from local memory
194 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
195 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
196 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
197 MVT::i1, MVT::v32i32},
198 Custom);
199
201 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
202 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
203 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
204 MVT::i1, MVT::v32i32},
205 Custom);
206
207 if (isTypeLegal(MVT::bf16)) {
208 for (unsigned Opc :
217 ISD::SETCC}) {
218 // FIXME: The promoted to type shouldn't need to be explicit
219 setOperationAction(Opc, MVT::bf16, Promote);
220 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
221 }
222
224
226 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
227
231
232 // We only need to custom lower because we can't specify an action for bf16
233 // sources.
236 }
237
238 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
239 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
240 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
241 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
242 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
243 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
244 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
245 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
246 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
247 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
248 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
249 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
250 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
251 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
252 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
253 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
254
255 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
256 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
257 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
258 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
259 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
260 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
261 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
262
263 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
264
268 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
269
270 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
271
273 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
274
276 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
277 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
278
280 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
281 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
282 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
283 Expand);
285 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
286 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
287 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
288 Expand);
289
291 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
292 MVT::v3i16, MVT::v4i16, MVT::Other},
293 Custom);
294
297 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
298
300
302
304 Expand);
305
306#if 0
308#endif
309
310 // We only support LOAD/STORE and vector manipulation ops for vectors
311 // with > 4 elements.
312 for (MVT VT :
313 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
314 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
315 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
316 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
317 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
318 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
319 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
320 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
321 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
322 switch (Op) {
323 case ISD::LOAD:
324 case ISD::STORE:
326 case ISD::BITCAST:
327 case ISD::UNDEF:
331 case ISD::IS_FPCLASS:
332 break;
337 break;
338 default:
340 break;
341 }
342 }
343 }
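 // Illustrative example (not upstream code): for a wide type such as v8i32,
 // only the nodes listed in the switch above keep their existing actions;
 // anything else, e.g. an ISD::ADD on v8i32, takes the default branch and is
 // expanded, so the legalizer breaks it into 32-bit pieces.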
344
346
347 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
348 // is expanded to avoid having two separate loops in case the index is a VGPR.
349
350 // Most operations are naturally 32-bit vector operations. We only support
351 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
352 for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
354 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
355
357 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
358
360 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
361
363 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
364 }
365
366 for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
368 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
369
371 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
372
374 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
375
377 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
378 }
379
380 for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
382 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
383
385 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
386
388 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
389
391 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
392 }
393
394 for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
396 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
397
399 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
400
402 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
403
405 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
406 }
407
408 for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
410 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
411
413 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
414
416 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
417
419 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
420 }
421
423 {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
424 Expand);
425
426 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
427 Custom);
428
429 // Avoid stack access for these.
430 // TODO: Generalize to more vector types.
432 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
433 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
434 Custom);
435
436 // Deal with vec3 vector operations when widened to vec4.
438 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
439
440 // Deal with vec5/6/7 vector operations when widened to vec8.
442 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
443 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
444 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
445 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
446 Custom);
447
448 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
449 // and output demarshalling
450 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
451
452 // We can't return success/failure, only the old value,
453 // let LLVM add the comparison
455 Expand);
456
457 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
458
459 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
460
461 // FIXME: This should be narrowed to i32, but that only happens if i64 is
462 // illegal.
463 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
464 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
465
466 // On SI this is s_memtime; on VI it is s_memrealtime.
468
469 if (Subtarget->hasSMemRealTime() ||
473
474 if (Subtarget->has16BitInsts()) {
477 } else {
479 }
480
481 if (Subtarget->hasMadMacF32Insts())
483
484 if (!Subtarget->hasBFI())
485 // fcopysign can be done in a single instruction with BFI.
486 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
487
488 if (!Subtarget->hasBCNT(32))
490
491 if (!Subtarget->hasBCNT(64))
493
494 if (Subtarget->hasFFBH())
496
497 if (Subtarget->hasFFBL())
499
500 // We only really have 32-bit BFE instructions (and 16-bit on VI).
501 //
502 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
503 // effort to match them now. We want this to be false for i64 cases when the
504 // extraction isn't restricted to the upper or lower half. Ideally we would
505 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
506 // span the midpoint are probably relatively rare, so don't worry about them
507 // for now.
508 if (Subtarget->hasBFE())
510
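 // Illustrative example (not upstream code): an unsigned 32-bit bitfield
 // extract with offset 8 and width 16 computes
 //   (src >> 8) & 0xffff
 // in a single v_bfe_u32, which is why advertising sub-i32 extracts is
 // worthwhile while arbitrary 64-bit extracts are left alone, as described
 // above.
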
511 // Clamp modifier on add/sub
512 if (Subtarget->hasIntClamp())
514
515 if (Subtarget->hasAddNoCarry())
516 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
517 Legal);
518
519 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
520 Custom);
521
522 // These are really only legal for ieee_mode functions. We should be avoiding
523 // them for functions that don't have ieee_mode enabled, so just say they are
524 // legal.
526 {MVT::f32, MVT::f64}, Legal);
527
528 if (Subtarget->haveRoundOpsF64())
530 Legal);
531 else
533 MVT::f64, Custom);
534
536 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
537 Legal);
538 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
539
542
543 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
544 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
545
546 // Custom lower these because we can't specify a rule based on an illegal
547 // source bf16.
550
551 if (Subtarget->has16BitInsts()) {
554 MVT::i16, Legal);
555
556 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
557
559 MVT::i16, Expand);
560
564 ISD::CTPOP},
565 MVT::i16, Promote);
566
568
569 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
570
572 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
574 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
575
579
581
582 // F16 - Constant Actions.
585
586 // F16 - Load/Store Actions.
588 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
590 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
591
592 // BF16 - Load/Store Actions.
594 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
596 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
597
598 // F16 - VOP1 Actions.
601 MVT::f16, Custom);
602
605
606 // F16 - VOP2 Actions.
607 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
608 Expand);
612
613 // F16 - VOP3 Actions.
615 if (STI.hasMadF16())
617
618 for (MVT VT :
619 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
620 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
621 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
622 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
623 switch (Op) {
624 case ISD::LOAD:
625 case ISD::STORE:
627 case ISD::BITCAST:
628 case ISD::UNDEF:
634 case ISD::IS_FPCLASS:
635 break;
638 break;
639 default:
641 break;
642 }
643 }
644 }
645
646 // v_perm_b32 can handle either of these.
647 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
649
650 // XXX - Do these do anything? Vector constants turn into build_vector.
651 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
652
653 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
654 Legal);
655
657 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
659 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
660
662 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
664 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
665
666 setOperationAction(ISD::AND, MVT::v2i16, Promote);
667 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
668 setOperationAction(ISD::OR, MVT::v2i16, Promote);
669 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
670 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
671 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
672
674 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
676 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
677 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
678 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
679
681 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
683 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
685 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
686
688 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
690 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
691 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
692 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
693
695 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
697 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
698
700 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
702 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
704 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
705
706 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
707 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
708 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
709 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
710 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
711 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
712
714 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
716 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
717 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
718 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
719
720 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
721 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
722 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
723 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
724 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
725 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
726
728 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
730 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
731 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
732 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
733
735 MVT::v2i32, Expand);
737
739 MVT::v4i32, Expand);
740
742 MVT::v8i32, Expand);
743
744 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
745 Subtarget->hasVOP3PInsts() ? Legal : Custom);
746
747 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
748 // This isn't really legal, but this avoids the legalizer unrolling it (and
749 // allows matching fneg (fabs x) patterns)
750 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
751
754
756 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
757 Custom);
758
760 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
761 Expand);
762
763 for (MVT Vec16 :
764 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
765 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
768 Vec16, Custom);
770 }
771 }
772
773 if (Subtarget->hasVOP3PInsts()) {
777 MVT::v2i16, Legal);
778
781 MVT::v2f16, Legal);
782
783 setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
784 Custom);
785
787 {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
788 MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
789 Custom);
790
791 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
792 // Split vector operations.
797 VT, Custom);
798
799 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
800 // Split vector operations.
802 VT, Custom);
803
804 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
805 Custom);
806
807 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
808 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
809 Custom);
810
811 if (Subtarget->hasPackedFP32Ops()) {
813 MVT::v2f32, Legal);
815 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
816 Custom);
817 }
818 }
819
821
822 if (Subtarget->has16BitInsts()) {
824 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
826 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
827 } else {
828 // Legalization hack.
829 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
830
832 }
833
835 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
836 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
837 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
838 MVT::v32f16, MVT::v32bf16},
839 Custom);
840
842
843 if (Subtarget->hasScalarSMulU64())
845
846 if (Subtarget->hasMad64_32())
848
849 if (Subtarget->hasPrefetch())
851
852 if (Subtarget->hasIEEEMinMax()) {
854 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
856 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
857 Custom);
858 }
859
861 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
862 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
863 MVT::i8},
864 Custom);
865
867 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
868 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
869 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
870 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
871 Custom);
872
874 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
875 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
876 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
877 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
878 Custom);
879
885
886 // TODO: Could move this to custom lowering, could benefit from combines on
887 // extract of relevant bits.
889
891
894 ISD::SUB,
896 ISD::FADD,
897 ISD::FSUB,
898 ISD::FDIV,
905 ISD::FMA,
906 ISD::SMIN,
907 ISD::SMAX,
908 ISD::UMIN,
909 ISD::UMAX,
911 ISD::AND,
912 ISD::OR,
913 ISD::XOR,
914 ISD::FSHR,
924
925 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
927
928 // All memory operations. Some folding on the pointer operand is done to help
929 // match the constant offsets in the addressing modes.
954
955 // FIXME: In other contexts we pretend this is a per-function property.
957
959}
960
962 return Subtarget;
963}
964
966 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
967 return RCRegs;
968}
969
970//===----------------------------------------------------------------------===//
971// TargetLowering queries
972//===----------------------------------------------------------------------===//
973
974// v_mad_mix* support a conversion from f16 to f32.
975//
976 // There is only one special case, when denormals are enabled, that we don't
977 // currently handle, where this would still be OK to use.
978bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
979 EVT DestVT, EVT SrcVT) const {
980 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
981 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
982 DestVT.getScalarType() == MVT::f32 &&
983 SrcVT.getScalarType() == MVT::f16 &&
984 // TODO: This probably only requires no input flushing?
986}
987
989 LLT DestTy, LLT SrcTy) const {
990 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
991 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
992 DestTy.getScalarSizeInBits() == 32 &&
993 SrcTy.getScalarSizeInBits() == 16 &&
994 // TODO: This probably only requires no input flushing?
996}
997
999 // SI has some legal vector types, but no legal vector operations. Say no
1000 // shuffles are legal in order to prefer scalarizing some vector operations.
1001 return false;
1002}
1003
1006 EVT VT) const {
1009
1010 if (VT.isVector()) {
1011 EVT ScalarVT = VT.getScalarType();
1012 unsigned Size = ScalarVT.getSizeInBits();
1013 if (Size == 16) {
1014 if (Subtarget->has16BitInsts()) {
1015 if (VT.isInteger())
1016 return MVT::v2i16;
1017 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1018 }
1019 return VT.isInteger() ? MVT::i32 : MVT::f32;
1020 }
1021
1022 if (Size < 16)
1023 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1024 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1025 }
1026
1027 if (VT.getSizeInBits() > 32)
1028 return MVT::i32;
1029
1031}
1032
1035 EVT VT) const {
1038
1039 if (VT.isVector()) {
1040 unsigned NumElts = VT.getVectorNumElements();
1041 EVT ScalarVT = VT.getScalarType();
1042 unsigned Size = ScalarVT.getSizeInBits();
1043
1044 // FIXME: Should probably promote 8-bit vectors to i16.
1045 if (Size == 16 && Subtarget->has16BitInsts())
1046 return (NumElts + 1) / 2;
1047
1048 if (Size <= 32)
1049 return NumElts;
1050
1051 if (Size > 32)
1052 return NumElts * ((Size + 31) / 32);
1053 } else if (VT.getSizeInBits() > 32)
1054 return (VT.getSizeInBits() + 31) / 32;
1055
1057}
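// Worked example (an illustration, not upstream code): with 16-bit
// instructions available, a v3f16 argument to a non-kernel function is
// described by the two hooks above as (3 + 1) / 2 == 2 registers of type
// v2f16, while a v3i64 argument becomes 3 * ((64 + 31) / 32) == 6 registers
// of type i32.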
1058
1060 LLVMContext &Context, CallingConv::ID CC,
1061 EVT VT, EVT &IntermediateVT,
1062 unsigned &NumIntermediates, MVT &RegisterVT) const {
1063 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1064 unsigned NumElts = VT.getVectorNumElements();
1065 EVT ScalarVT = VT.getScalarType();
1066 unsigned Size = ScalarVT.getSizeInBits();
1067 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1068 // support, but unless we can properly handle 3-vectors, it will still be
1069 // inconsistent.
1070 if (Size == 16 && Subtarget->has16BitInsts()) {
1071 if (ScalarVT == MVT::bf16) {
1072 RegisterVT = MVT::i32;
1073 IntermediateVT = MVT::v2bf16;
1074 } else {
1075 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1076 IntermediateVT = RegisterVT;
1077 }
1078 NumIntermediates = (NumElts + 1) / 2;
1079 return NumIntermediates;
1080 }
1081
1082 if (Size == 32) {
1083 RegisterVT = ScalarVT.getSimpleVT();
1084 IntermediateVT = RegisterVT;
1085 NumIntermediates = NumElts;
1086 return NumIntermediates;
1087 }
1088
1089 if (Size < 16 && Subtarget->has16BitInsts()) {
1090 // FIXME: Should probably form v2i16 pieces
1091 RegisterVT = MVT::i16;
1092 IntermediateVT = ScalarVT;
1093 NumIntermediates = NumElts;
1094 return NumIntermediates;
1095 }
1096
1097
1098 if (Size != 16 && Size <= 32) {
1099 RegisterVT = MVT::i32;
1100 IntermediateVT = ScalarVT;
1101 NumIntermediates = NumElts;
1102 return NumIntermediates;
1103 }
1104
1105 if (Size > 32) {
1106 RegisterVT = MVT::i32;
1107 IntermediateVT = RegisterVT;
1108 NumIntermediates = NumElts * ((Size + 31) / 32);
1109 return NumIntermediates;
1110 }
1111 }
1112
1114 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1115}
1116
1118 const DataLayout &DL, Type *Ty,
1119 unsigned MaxNumLanes) {
1120 assert(MaxNumLanes != 0);
1121
1122 LLVMContext &Ctx = Ty->getContext();
1123 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1124 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1125 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1126 NumElts);
1127 }
1128
1129 return TLI.getValueType(DL, Ty);
1130}
1131
1132// Peek through TFE struct returns to only use the data size.
1134 const DataLayout &DL, Type *Ty,
1135 unsigned MaxNumLanes) {
1136 auto *ST = dyn_cast<StructType>(Ty);
1137 if (!ST)
1138 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1139
1140 // TFE intrinsics return an aggregate type.
1141 assert(ST->getNumContainedTypes() == 2 &&
1142 ST->getContainedType(1)->isIntegerTy(32));
1143 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1144}
1145
1146/// Map address space 7 to MVT::v5i32 because that's its in-memory
1147/// representation. This return value is vector-typed because there is no
1148/// MVT::i160 and it is not clear if one can be added. While this could
1149/// cause issues during codegen, these address space 7 pointers will be
1150/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
1151/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1152/// modeling, to work.
1154 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1155 return MVT::v5i32;
1157 DL.getPointerSizeInBits(AS) == 192)
1158 return MVT::v6i32;
1160}
1161/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1162/// v8i32 when padding is added.
1163/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1164/// also v8i32 with padding.
1166 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1167 DL.getPointerSizeInBits(AS) == 160) ||
1169 DL.getPointerSizeInBits(AS) == 192))
1170 return MVT::v8i32;
1172}
1173
1175 const CallInst &CI,
1176 MachineFunction &MF,
1177 unsigned IntrID) const {
1179 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1181
1182 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1185 (Intrinsic::ID)IntrID);
1186 MemoryEffects ME = Attr.getMemoryEffects();
1187 if (ME.doesNotAccessMemory())
1188 return false;
1189
1190 // TODO: Should images get their own address space?
1191 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1192
1193 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1194 if (RsrcIntr->IsImage) {
1197 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1198 Info.align.reset();
1199 }
1200
1201 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1202 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1203 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1204 // We conservatively set the memory operand of a buffer intrinsic to the
1205 // base resource pointer, so that we can access alias information about
1206 // those pointers. Cases like "this points at the same value
1207 // but with a different offset" are handled in
1208 // areMemAccessesTriviallyDisjoint.
1209 Info.ptrVal = RsrcArg;
1210 }
1211
1212 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1213 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1216 if (ME.onlyReadsMemory()) {
1217 if (RsrcIntr->IsImage) {
1218 unsigned MaxNumLanes = 4;
1219
1220 if (!BaseOpcode->Gather4) {
1221 // If this isn't a gather, we may have excess loaded elements in the
1222 // IR type. Check the dmask for the real number of elements loaded.
1223 unsigned DMask
1224 = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1225 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1226 }
1227
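 // Illustrative example (not upstream code): an image load declared in the
 // IR to return <4 x float> but run with dmask == 0b0011 really loads only
 // two lanes, so MaxNumLanes == 2 and the memVT computed below shrinks to
 // v2f32.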
1228 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1229 CI.getType(), MaxNumLanes);
1230 } else {
1231 Info.memVT =
1233 std::numeric_limits<unsigned>::max());
1234 }
1235
1236 // FIXME: What does alignment mean for an image?
1239 } else if (ME.onlyWritesMemory()) {
1241
1242 Type *DataTy = CI.getArgOperand(0)->getType();
1243 if (RsrcIntr->IsImage) {
1244 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1245 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1246 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1247 DMaskLanes);
1248 } else
1249 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1250
1252 } else {
1253 // Atomic or NoReturn Sampler
1254 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
1259
1260 switch (IntrID) {
1261 default:
1262 if (RsrcIntr->IsImage && BaseOpcode->NoReturn) {
1263 // Fake memory access type for no return sampler intrinsics
1264 Info.memVT = MVT::i32;
1265 } else {
1266 // XXX - Should this be volatile without known ordering?
1268 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1269 }
1270 break;
1271 case Intrinsic::amdgcn_raw_buffer_load_lds:
1272 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1273 case Intrinsic::amdgcn_struct_buffer_load_lds:
1274 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1275 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1276 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1277 Info.ptrVal = CI.getArgOperand(1);
1278 return true;
1279 }
1280 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1281 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1282 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1283 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1284 Info.memVT =
1286 std::numeric_limits<unsigned>::max());
1287 Info.flags &= ~MachineMemOperand::MOStore;
1288 return true;
1289 }
1290 }
1291 }
1292 return true;
1293 }
1294
1295 switch (IntrID) {
1296 case Intrinsic::amdgcn_ds_ordered_add:
1297 case Intrinsic::amdgcn_ds_ordered_swap: {
1299 Info.memVT = MVT::getVT(CI.getType());
1300 Info.ptrVal = CI.getOperand(0);
1301 Info.align.reset();
1303
1304 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1305 if (!Vol->isZero())
1307
1308 return true;
1309 }
1310 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1311 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1313 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1314 Info.ptrVal = nullptr;
1315 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1317 return true;
1318 }
1319 case Intrinsic::amdgcn_ds_append:
1320 case Intrinsic::amdgcn_ds_consume: {
1322 Info.memVT = MVT::getVT(CI.getType());
1323 Info.ptrVal = CI.getOperand(0);
1324 Info.align.reset();
1326
1327 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1328 if (!Vol->isZero())
1330
1331 return true;
1332 }
1333 case Intrinsic::amdgcn_global_atomic_csub: {
1335 Info.memVT = MVT::getVT(CI.getType());
1336 Info.ptrVal = CI.getOperand(0);
1337 Info.align.reset();
1341 return true;
1342 }
1343 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1345 Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1346
1347 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1348 Info.align.reset();
1351 return true;
1352 }
1353 case Intrinsic::amdgcn_global_atomic_fadd:
1354 case Intrinsic::amdgcn_global_atomic_fmin:
1355 case Intrinsic::amdgcn_global_atomic_fmax:
1356 case Intrinsic::amdgcn_global_atomic_fmin_num:
1357 case Intrinsic::amdgcn_global_atomic_fmax_num:
1358 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1359 case Intrinsic::amdgcn_flat_atomic_fadd:
1360 case Intrinsic::amdgcn_flat_atomic_fmin:
1361 case Intrinsic::amdgcn_flat_atomic_fmax:
1362 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1363 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1364 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1365 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1366 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
1368 Info.memVT = MVT::getVT(CI.getType());
1369 Info.ptrVal = CI.getOperand(0);
1370 Info.align.reset();
1375 return true;
1376 }
1377 case Intrinsic::amdgcn_global_load_tr_b64:
1378 case Intrinsic::amdgcn_global_load_tr_b128: {
1380 Info.memVT = MVT::getVT(CI.getType());
1381 Info.ptrVal = CI.getOperand(0);
1382 Info.align.reset();
1384 return true;
1385 }
1386 case Intrinsic::amdgcn_ds_gws_init:
1387 case Intrinsic::amdgcn_ds_gws_barrier:
1388 case Intrinsic::amdgcn_ds_gws_sema_v:
1389 case Intrinsic::amdgcn_ds_gws_sema_br:
1390 case Intrinsic::amdgcn_ds_gws_sema_p:
1391 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1393
1394 const GCNTargetMachine &TM =
1395 static_cast<const GCNTargetMachine &>(getTargetMachine());
1396
1398 Info.ptrVal = MFI->getGWSPSV(TM);
1399
1400 // This is an abstract access, but we need to specify a type and size.
1401 Info.memVT = MVT::i32;
1402 Info.size = 4;
1403 Info.align = Align(4);
1404
1405 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1407 else
1409 return true;
1410 }
1411 case Intrinsic::amdgcn_global_load_lds: {
1413 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1414 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1415 Info.ptrVal = CI.getArgOperand(1);
1417 return true;
1418 }
1419 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1421
1422 const GCNTargetMachine &TM =
1423 static_cast<const GCNTargetMachine &>(getTargetMachine());
1424
1426 Info.ptrVal = MFI->getGWSPSV(TM);
1427
1428 // This is an abstract access, but we need to specify a type and size.
1429 Info.memVT = MVT::i32;
1430 Info.size = 4;
1431 Info.align = Align(4);
1432
1434 return true;
1435 }
1436 default:
1437 return false;
1438 }
1439}
1440
1442 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1443 switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
1444 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1445 // The DAG's ValueType loses the addrspaces.
1446 // Add them as 2 extra Constant operands "from" and "to".
1447 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1448 unsigned DstAS = I.getType()->getPointerAddressSpace();
1449 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1450 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1451 break;
1452 }
1453 default:
1454 break;
1455 }
1456}
1457
1460 Type *&AccessTy) const {
1461 Value *Ptr = nullptr;
1462 switch (II->getIntrinsicID()) {
1463 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1464 case Intrinsic::amdgcn_ds_append:
1465 case Intrinsic::amdgcn_ds_consume:
1466 case Intrinsic::amdgcn_ds_ordered_add:
1467 case Intrinsic::amdgcn_ds_ordered_swap:
1468 case Intrinsic::amdgcn_flat_atomic_fadd:
1469 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
1470 case Intrinsic::amdgcn_flat_atomic_fmax:
1471 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1472 case Intrinsic::amdgcn_flat_atomic_fmin:
1473 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1474 case Intrinsic::amdgcn_global_atomic_csub:
1475 case Intrinsic::amdgcn_global_atomic_fadd:
1476 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1477 case Intrinsic::amdgcn_global_atomic_fmax:
1478 case Intrinsic::amdgcn_global_atomic_fmax_num:
1479 case Intrinsic::amdgcn_global_atomic_fmin:
1480 case Intrinsic::amdgcn_global_atomic_fmin_num:
1481 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1482 case Intrinsic::amdgcn_global_load_tr_b64:
1483 case Intrinsic::amdgcn_global_load_tr_b128:
1484 Ptr = II->getArgOperand(0);
1485 break;
1486 case Intrinsic::amdgcn_global_load_lds:
1487 Ptr = II->getArgOperand(1);
1488 break;
1489 default:
1490 return false;
1491 }
1492 AccessTy = II->getType();
1493 Ops.push_back(Ptr);
1494 return true;
1495}
1496
1498 unsigned AddrSpace) const {
1499 if (!Subtarget->hasFlatInstOffsets()) {
1500 // Flat instructions do not have offsets, and only have the register
1501 // address.
1502 return AM.BaseOffs == 0 && AM.Scale == 0;
1503 }
1504
1505 decltype(SIInstrFlags::FLAT) FlatVariant =
1509
1510 return AM.Scale == 0 &&
1511 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1512 AM.BaseOffs, AddrSpace, FlatVariant));
1513}
1514
1516 if (Subtarget->hasFlatGlobalInsts())
1518
1519 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1520 // Assume that we will use FLAT for all global memory accesses
1521 // on VI.
1522 // FIXME: This assumption is currently wrong. On VI we still use
1523 // MUBUF instructions for the r + i addressing mode. As currently
1524 // implemented, the MUBUF instructions only work on buffers < 4GB.
1525 // It may be possible to support > 4GB buffers with MUBUF instructions,
1526 // by setting the stride value in the resource descriptor which would
1527 // increase the size limit to (stride * 4GB). However, this is risky,
1528 // because it has never been validated.
1530 }
1531
1532 return isLegalMUBUFAddressingMode(AM);
1533}
1534
1535bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1536 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1537 // additionally can do r + r + i with addr64. 32-bit has more addressing
1538 // mode options. Depending on the resource constant, it can also do
1539 // (i64 r0) + (i32 r1) * (i14 i).
1540 //
1541 // Private arrays end up using a scratch buffer most of the time, so also
1542 // assume those use MUBUF instructions. Scratch loads / stores are currently
1543 // implemented as mubuf instructions with offen bit set, so slightly
1544 // different than the normal addr64.
1545 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1546 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1547 return false;
1548
1549 // FIXME: Since we can split immediate into soffset and immediate offset,
1550 // would it make sense to allow any immediate?
1551
1552 switch (AM.Scale) {
1553 case 0: // r + i or just i, depending on HasBaseReg.
1554 return true;
1555 case 1:
1556 return true; // We have r + r or r + i.
1557 case 2:
1558 if (AM.HasBaseReg) {
1559 // Reject 2 * r + r.
1560 return false;
1561 }
1562
1563 // Allow 2 * r as r + r
1564 // Or 2 * r + i is allowed as r + r + i.
1565 return true;
1566 default: // Don't allow n * r
1567 return false;
1568 }
1569}
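// Illustrative example (not upstream code): an access of the form
// base + index + 16 (AM.HasBaseReg, AM.Scale == 1, AM.BaseOffs == 16) is
// accepted above as r + r + i, while a scaled index such as 4 * r
// (AM.Scale == 4) falls into the default case and is rejected, since MUBUF
// addressing cannot apply a stride to the register operand.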
1570
1572 const AddrMode &AM, Type *Ty,
1573 unsigned AS, Instruction *I) const {
1574 // No global is ever allowed as a base.
1575 if (AM.BaseGV)
1576 return false;
1577
1578 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1579 return isLegalGlobalAddressingMode(AM);
1580
1581 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1585 // If the offset isn't a multiple of 4, it probably isn't going to be
1586 // correctly aligned.
1587 // FIXME: Can we get the real alignment here?
1588 if (AM.BaseOffs % 4 != 0)
1589 return isLegalMUBUFAddressingMode(AM);
1590
1591 if (!Subtarget->hasScalarSubwordLoads()) {
1592 // There are no SMRD extloads, so if we have to do a small type access we
1593 // will use a MUBUF load.
1594 // FIXME?: We also need to do this if unaligned, but we don't know the
1595 // alignment here.
1596 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1597 return isLegalGlobalAddressingMode(AM);
1598 }
1599
1601 // SMRD instructions have an 8-bit, dword offset on SI.
1602 if (!isUInt<8>(AM.BaseOffs / 4))
1603 return false;
1604 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1605 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1606 // in 8-bits, it can use a smaller encoding.
1607 if (!isUInt<32>(AM.BaseOffs / 4))
1608 return false;
1609 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1610 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1611 if (!isUInt<20>(AM.BaseOffs))
1612 return false;
1613 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1614 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1615 // for S_BUFFER_* instructions).
1616 if (!isInt<21>(AM.BaseOffs))
1617 return false;
1618 } else {
1619 // On GFX12, all offsets are signed 24-bit in bytes.
1620 if (!isInt<24>(AM.BaseOffs))
1621 return false;
1622 }
1623
1624 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1626 AM.BaseOffs < 0) {
1627 // Scalar (non-buffer) loads can only use a negative offset if
1628 // soffset+offset is non-negative. Since the compiler can only prove that
1629 // in a few special cases, it is safer to claim that negative offsets are
1630 // not supported.
1631 return false;
1632 }
1633
1634 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1635 return true;
1636
1637 if (AM.Scale == 1 && AM.HasBaseReg)
1638 return true;
1639
1640 return false;
1641 }
1642
1643 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1644 return Subtarget->enableFlatScratch()
1646 : isLegalMUBUFAddressingMode(AM);
1647
1648 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1649 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1650 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1651 // field.
1652 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1653 // an 8-bit dword offset but we don't know the alignment here.
1654 if (!isUInt<16>(AM.BaseOffs))
1655 return false;
1656
1657 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1658 return true;
1659
1660 if (AM.Scale == 1 && AM.HasBaseReg)
1661 return true;
1662
1663 return false;
1664 }
1665
1667 // For an unknown address space, this usually means that this is for some
1668 // reason being used for pure arithmetic, and not based on some addressing
1669 // computation. We don't have instructions that compute pointers with any
1670 // addressing modes, so treat them as having no offset like flat
1671 // instructions.
1673 }
1674
1675 // Assume a user alias of global for unknown address spaces.
1676 return isLegalGlobalAddressingMode(AM);
1677}
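// Worked example (an illustration, not upstream code): for a constant-address
// (SMRD) dword load at base + 4000 bytes with AM.Scale == 0, the checks above
// accept the offset on GFX9 (4000 fits the signed 21-bit byte offset) but
// reject it on SI, where the dword offset 4000 / 4 == 1000 does not fit in
// 8 bits, so the offset is not folded into the addressing mode there.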
1678
1680 const MachineFunction &MF) const {
1682 return (MemVT.getSizeInBits() <= 4 * 32);
1683 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1684 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1685 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1686 }
1688 return (MemVT.getSizeInBits() <= 2 * 32);
1689 return true;
1690}
1691
1693 unsigned Size, unsigned AddrSpace, Align Alignment,
1694 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1695 if (IsFast)
1696 *IsFast = 0;
1697
1698 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1699 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1700 // Check if alignment requirements for ds_read/write instructions are
1701 // disabled.
1702 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1703 return false;
1704
1705 Align RequiredAlignment(PowerOf2Ceil(Size/8)); // Natural alignment.
1706 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1707 Alignment < RequiredAlignment)
1708 return false;
1709
1710 // Either the alignment requirements are "enabled", or there is an unaligned
1711 // LDS access related hardware bug even though the alignment requirements
1712 // are "disabled". In either case, we need to check for proper alignment
1713 // requirements.
1714 //
1715 switch (Size) {
1716 case 64:
1717 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1718 // address is negative, then the instruction is incorrectly treated as
1719 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1720 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1721 // load later in the SILoadStoreOptimizer.
1722 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1723 return false;
1724
1725 // 8 byte accessing via ds_read/write_b64 requires 8-byte alignment, but we
1726 // can do a 4 byte aligned, 8 byte access in a single operation using
1727 // ds_read2/write2_b32 with adjacent offsets.
1728 RequiredAlignment = Align(4);
1729
1730 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1731 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1732 // ds_write2_b32 depending on the alignment. In either case with either
1733 // alignment there is no faster way of doing this.
1734
1735 // The numbers returned here and below are not additive; they form a 'speed
1736 // rank'. They are just meant to be compared to decide if a certain way
1737 // of lowering an operation is faster than another. For that purpose a
1738 // naturally aligned operation gets its bitsize to indicate that "it
1739 // operates with a speed comparable to an N-bit wide load". With the full
1740 // alignment ds128 is slower than ds96 for example. If underaligned it
1741 // is comparable to the speed of a single dword access, which would then
1742 // mean 32 < 128 and it is faster to issue a wide load regardless.
1743 // 1 is simply "slow, don't do it". I.e. when comparing an aligned load to a
1744 // wider load which will no longer be aligned, the latter is slower.
1745 if (IsFast)
1746 *IsFast = (Alignment >= RequiredAlignment) ? 64
1747 : (Alignment < Align(4)) ? 32
1748 : 1;
1749 return true;
1750 }
1751
1752 break;
1753 case 96:
1754 if (!Subtarget->hasDS96AndDS128())
1755 return false;
1756
1757 // 12 byte accessing via ds_read/write_b96 requires 16-byte alignment on
1758 // gfx8 and older.
1759
1760 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1761 // Naturally aligned access is fastest. However, also report it is Fast
1762 // if memory is aligned to less than a DWORD. A narrow load or store will
1763 // be as slow as a single ds_read_b96/ds_write_b96, but there will
1764 // be more of them, so overall we will pay less penalty issuing a single
1765 // instruction.
1766
1767 // See comment on the values above.
1768 if (IsFast)
1769 *IsFast = (Alignment >= RequiredAlignment) ? 96
1770 : (Alignment < Align(4)) ? 32
1771 : 1;
1772 return true;
1773 }
1774
1775 break;
1776 case 128:
1777 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1778 return false;
1779
1780 // 16 byte accessing via ds_read/write_b128 requires 16-byte alignment on
1781 // gfx8 and older, but we can do an 8 byte aligned, 16 byte access in a
1782 // single operation using ds_read2/write2_b64.
1783 RequiredAlignment = Align(8);
1784
1785 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1786 // Naturally aligned access is fastest. However, also report it is Fast
1787 // if memory is aligned to less than a DWORD. A narrow load or store will
1788 // be as slow as a single ds_read_b128/ds_write_b128, but there
1789 // will be more of them, so overall we will pay less penalty issuing a
1790 // single instruction.
1791
1792 // See comment on the values above.
1793 if (IsFast)
1794 *IsFast = (Alignment >= RequiredAlignment) ? 128
1795 : (Alignment < Align(4)) ? 32
1796 : 1;
1797 return true;
1798 }
1799
1800 break;
1801 default:
1802 if (Size > 32)
1803 return false;
1804
1805 break;
1806 }
1807
1808 // See comment on the values above.
1809 // Note that we have a single-dword or sub-dword here, so if underaligned
1810 // it is the slowest possible access, hence the returned value is 0.
1811 if (IsFast)
1812 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1813
1814 return Alignment >= RequiredAlignment ||
1815 Subtarget->hasUnalignedDSAccessEnabled();
1816 }
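 // Worked example (an illustration, not upstream code): with unaligned DS
 // access enabled, a 128-bit LDS access aligned to 8 bytes reports
 // *IsFast = 128 (it can use ds_read2/write2_b64), aligned to 4 bytes it
 // reports 1 ("slow, don't do it"), and aligned to 2 bytes it reports 32
 // (comparable to issuing individual dword accesses).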
1817
1818 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
1819 bool AlignedBy4 = Alignment >= Align(4);
1820 if (IsFast)
1821 *IsFast = AlignedBy4;
1822
1823 return AlignedBy4 ||
1824 Subtarget->enableFlatScratch() ||
1825 Subtarget->hasUnalignedScratchAccess();
1826 }
1827
1828 // FIXME: We have to be conservative here and assume that flat operations
1829 // will access scratch. If we had access to the IR function, then we
1830 // could determine if any private memory was used in the function.
1831 if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
1832 !Subtarget->hasUnalignedScratchAccess()) {
1833 bool AlignedBy4 = Alignment >= Align(4);
1834 if (IsFast)
1835 *IsFast = AlignedBy4;
1836
1837 return AlignedBy4;
1838 }
1839
1840 // So long as they are correct, wide global memory operations perform better
1841 // than multiple smaller memory ops -- even when misaligned
1842 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
1843 if (IsFast)
1844 *IsFast = Size;
1845
1846 return Alignment >= Align(4) ||
1848 }
1849
1850 // Smaller than dword value must be aligned.
1851 if (Size < 32)
1852 return false;
1853
1854 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1855 // byte-address are ignored, thus forcing Dword alignment.
1856 // This applies to private, global, and constant memory.
1857 if (IsFast)
1858 *IsFast = 1;
1859
1860 return Size >= 32 && Alignment >= Align(4);
1861}
1862
1864 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1865 unsigned *IsFast) const {
1867 Alignment, Flags, IsFast);
1868}
1869
1871 const MemOp &Op, const AttributeList &FuncAttributes) const {
1872 // FIXME: Should account for address space here.
1873
1874 // The default fallback uses the private pointer size as a guess for a type to
1875 // use. Make sure we switch these to 64-bit accesses.
1876
1877 if (Op.size() >= 16 &&
1878 Op.isDstAligned(Align(4))) // XXX: Should only do for global
1879 return MVT::v4i32;
1880
1881 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1882 return MVT::v2i32;
1883
1884 // Use the default.
1885 return MVT::Other;
1886}
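// Illustrative example (not upstream code): a 32-byte memcpy whose destination
// is known to be 4-byte aligned gets v4i32 from the hook above, so it is
// emitted as two 16-byte accesses rather than the default pointer-sized
// guess.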
1887
1889 const MemSDNode *MemNode = cast<MemSDNode>(N);
1890 return MemNode->getMemOperand()->getFlags() & MONoClobber;
1891}
1892
1894 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1896}
1897
1899 unsigned DestAS) const {
1900 // Flat -> private/local is a simple truncate.
1901 // Flat -> global is no-op
1902 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1903 return true;
1904
1905 const GCNTargetMachine &TM =
1906 static_cast<const GCNTargetMachine &>(getTargetMachine());
1907 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1908}
1909
1911 const MemSDNode *MemNode = cast<MemSDNode>(N);
1912
1914}
1915
1918 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1919 VT.getScalarType().bitsLE(MVT::i16))
1922}
1923
1925 Type *Ty) const {
1926 // FIXME: Could be smarter if called for vector constants.
1927 return true;
1928}
1929
1931 unsigned Index) const {
1933 return false;
1934
1935 // TODO: Add more cases that are cheap.
1936 return Index == 0;
1937}
1938
1940 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1941 switch (Op) {
1942 case ISD::LOAD:
1943 case ISD::STORE:
1944
1945 // These operations are done with 32-bit instructions anyway.
1946 case ISD::AND:
1947 case ISD::OR:
1948 case ISD::XOR:
1949 case ISD::SELECT:
1950 // TODO: Extensions?
1951 return true;
1952 default:
1953 return false;
1954 }
1955 }
1956
1957 // SimplifySetCC uses this function to determine whether or not it should
1958 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1959 if (VT == MVT::i1 && Op == ISD::SETCC)
1960 return false;
1961
1963}
1964
1965SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1966 const SDLoc &SL,
1967 SDValue Chain,
1968 uint64_t Offset) const {
1969 const DataLayout &DL = DAG.getDataLayout();
1972
1973 const ArgDescriptor *InputPtrReg;
1974 const TargetRegisterClass *RC;
1975 LLT ArgTy;
1977
1978 std::tie(InputPtrReg, RC, ArgTy) =
1980
1981 // We may not have the kernarg segment argument if we have no kernel
1982 // arguments.
1983 if (!InputPtrReg)
1984 return DAG.getConstant(Offset, SL, PtrVT);
1985
1987 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1988 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1989
1990 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
1991}
1992
1993SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1994 const SDLoc &SL) const {
1997 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1998}
1999
2000SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2001 const SDLoc &SL) const {
2002
2004 std::optional<uint32_t> KnownSize =
2006 if (KnownSize.has_value())
2007 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2008 return SDValue();
2009}
2010
2011SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2012 const SDLoc &SL, SDValue Val,
2013 bool Signed,
2014 const ISD::InputArg *Arg) const {
2015 // First, if it is a widened vector, narrow it.
2016 if (VT.isVector() &&
2018 EVT NarrowedVT =
2021 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2022 DAG.getConstant(0, SL, MVT::i32));
2023 }
2024
2025 // Then convert the vector elements or scalar value.
2026 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
2027 VT.bitsLT(MemVT)) {
2028 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2029 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2030 }
2031
2032 if (MemVT.isFloatingPoint())
2033 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2034 else if (Signed)
2035 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2036 else
2037 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2038
2039 return Val;
2040}
2041
2042SDValue SITargetLowering::lowerKernargMemParameter(
2043 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2044 uint64_t Offset, Align Alignment, bool Signed,
2045 const ISD::InputArg *Arg) const {
2047
2048 // Try to avoid using an extload by loading earlier than the argument address,
2049 // and extracting the relevant bits. The load should hopefully be merged with
2050 // the previous argument.
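  // For example, an i16 argument at byte offset 6 is loaded as the dword at
  // offset 4, shifted right by 16 bits and truncated, so the 4-byte load can
  // be merged with the one emitted for the preceding argument.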
2051 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2052 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2053 int64_t AlignDownOffset = alignDown(Offset, 4);
2054 int64_t OffsetDiff = Offset - AlignDownOffset;
2055
2056 EVT IntVT = MemVT.changeTypeToInteger();
2057
2058 // TODO: If we passed in the base kernel offset we could have a better
2059 // alignment than 4, but we don't really need it.
2060 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2061 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2064
2065 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2066 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2067
2068 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2069 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2070 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2071
2072
2073 return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
2074 }
2075
2076 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2077 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2080
2081 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2082 return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
2083}
2084
2085SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
2086 const SDLoc &SL, SDValue Chain,
2087 const ISD::InputArg &Arg) const {
2089 MachineFrameInfo &MFI = MF.getFrameInfo();
2090
2091 if (Arg.Flags.isByVal()) {
2092 unsigned Size = Arg.Flags.getByValSize();
2093 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2094 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2095 }
2096
2097 unsigned ArgOffset = VA.getLocMemOffset();
2098 unsigned ArgSize = VA.getValVT().getStoreSize();
2099
2100 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2101
2102 // Create load nodes to retrieve arguments from the stack.
2103 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2104 SDValue ArgValue;
2105
 2106 // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
2108 MVT MemVT = VA.getValVT();
2109
2110 switch (VA.getLocInfo()) {
2111 default:
2112 break;
2113 case CCValAssign::BCvt:
2114 MemVT = VA.getLocVT();
2115 break;
2116 case CCValAssign::SExt:
2117 ExtType = ISD::SEXTLOAD;
2118 break;
2119 case CCValAssign::ZExt:
2120 ExtType = ISD::ZEXTLOAD;
2121 break;
2122 case CCValAssign::AExt:
2123 ExtType = ISD::EXTLOAD;
2124 break;
2125 }
2126
2127 ArgValue = DAG.getExtLoad(
2128 ExtType, SL, VA.getLocVT(), Chain, FIN,
2130 MemVT);
2131 return ArgValue;
2132}
2133
2134SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
2135 const SIMachineFunctionInfo &MFI,
2136 EVT VT,
2138 const ArgDescriptor *Reg = nullptr;
2139 const TargetRegisterClass *RC;
2140 LLT Ty;
2141
2143 const ArgDescriptor WorkGroupIDX =
2144 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2145 // If GridZ is not programmed in an entry function then the hardware will set
2146 // it to all zeros, so there is no need to mask the GridY value in the low
2147 // order bits.
2148 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2149 AMDGPU::TTMP7,
2150 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2151 const ArgDescriptor WorkGroupIDZ =
2152 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
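  // In other words, with architected SGPRs the workgroup IDs are read from
  // TTMP registers: X from TTMP9, and Y/Z packed into TTMP7 as Z[31:16] and
  // Y[15:0], which is why WorkGroupIDY only needs the 0xFFFF mask when GridZ
  // may also be programmed.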
2153 if (Subtarget->hasArchitectedSGPRs() &&
2155 switch (PVID) {
2157 Reg = &WorkGroupIDX;
2158 RC = &AMDGPU::SReg_32RegClass;
2159 Ty = LLT::scalar(32);
2160 break;
2162 Reg = &WorkGroupIDY;
2163 RC = &AMDGPU::SReg_32RegClass;
2164 Ty = LLT::scalar(32);
2165 break;
2167 Reg = &WorkGroupIDZ;
2168 RC = &AMDGPU::SReg_32RegClass;
2169 Ty = LLT::scalar(32);
2170 break;
2171 default:
2172 break;
2173 }
2174 }
2175
2176 if (!Reg)
2177 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2178 if (!Reg) {
2180 // It's possible for a kernarg intrinsic call to appear in a kernel with
2181 // no allocated segment, in which case we do not add the user sgpr
2182 // argument, so just return null.
2183 return DAG.getConstant(0, SDLoc(), VT);
2184 }
2185
2186 // It's undefined behavior if a function marked with the amdgpu-no-*
2187 // attributes uses the corresponding intrinsic.
2188 return DAG.getUNDEF(VT);
2189 }
2190
2191 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2192}
2193
2195 CallingConv::ID CallConv,
2196 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2197 FunctionType *FType,
2198 SIMachineFunctionInfo *Info) {
2199 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2200 const ISD::InputArg *Arg = &Ins[I];
2201
2202 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2203 "vector type argument should have been split");
2204
2205 // First check if it's a PS input addr.
2206 if (CallConv == CallingConv::AMDGPU_PS &&
2207 !Arg->Flags.isInReg() && PSInputNum <= 15) {
2208 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2209
2210 // Inconveniently only the first part of the split is marked as isSplit,
2211 // so skip to the end. We only want to increment PSInputNum once for the
2212 // entire split argument.
2213 if (Arg->Flags.isSplit()) {
2214 while (!Arg->Flags.isSplitEnd()) {
2215 assert((!Arg->VT.isVector() ||
2216 Arg->VT.getScalarSizeInBits() == 16) &&
2217 "unexpected vector split in ps argument type");
2218 if (!SkipArg)
2219 Splits.push_back(*Arg);
2220 Arg = &Ins[++I];
2221 }
2222 }
2223
2224 if (SkipArg) {
2225 // We can safely skip PS inputs.
2226 Skipped.set(Arg->getOrigArgIndex());
2227 ++PSInputNum;
2228 continue;
2229 }
2230
2231 Info->markPSInputAllocated(PSInputNum);
2232 if (Arg->Used)
2233 Info->markPSInputEnabled(PSInputNum);
2234
2235 ++PSInputNum;
2236 }
2237
2238 Splits.push_back(*Arg);
2239 }
2240}
2241
2242// Allocate special inputs passed in VGPRs.
2244 MachineFunction &MF,
2245 const SIRegisterInfo &TRI,
2246 SIMachineFunctionInfo &Info) const {
2247 const LLT S32 = LLT::scalar(32);
2249
2250 if (Info.hasWorkItemIDX()) {
2251 Register Reg = AMDGPU::VGPR0;
2252 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2253
2254 CCInfo.AllocateReg(Reg);
2255 unsigned Mask = (Subtarget->hasPackedTID() &&
2256 Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2257 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2258 }
2259
2260 if (Info.hasWorkItemIDY()) {
2261 assert(Info.hasWorkItemIDX());
2262 if (Subtarget->hasPackedTID()) {
2263 Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2264 0x3ff << 10));
2265 } else {
2266 unsigned Reg = AMDGPU::VGPR1;
2267 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2268
2269 CCInfo.AllocateReg(Reg);
2270 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2271 }
2272 }
2273
2274 if (Info.hasWorkItemIDZ()) {
2275 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2276 if (Subtarget->hasPackedTID()) {
2277 Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2278 0x3ff << 20));
2279 } else {
2280 unsigned Reg = AMDGPU::VGPR2;
2281 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2282
2283 CCInfo.AllocateReg(Reg);
2284 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2285 }
2286 }
2287}
2288
 2289// Try to allocate a VGPR at the end of the argument list, or, if no argument
 2290// VGPRs are left, allocate a stack slot instead.
 2291// If \p Mask is given it indicates the bitfield position in the register.
 2292// If \p Arg is given, reuse it with the new \p Mask instead of allocating a new one.
2293static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2294 ArgDescriptor Arg = ArgDescriptor()) {
2295 if (Arg.isSet())
2296 return ArgDescriptor::createArg(Arg, Mask);
2297
2298 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2299 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2300 if (RegIdx == ArgVGPRs.size()) {
2301 // Spill to stack required.
2302 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2303
2304 return ArgDescriptor::createStack(Offset, Mask);
2305 }
2306
2307 unsigned Reg = ArgVGPRs[RegIdx];
2308 Reg = CCInfo.AllocateReg(Reg);
2309 assert(Reg != AMDGPU::NoRegister);
2310
2311 MachineFunction &MF = CCInfo.getMachineFunction();
2312 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2313 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2314 return ArgDescriptor::createRegister(Reg, Mask);
2315}
2316
2318 const TargetRegisterClass *RC,
2319 unsigned NumArgRegs) {
2320 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2321 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2322 if (RegIdx == ArgSGPRs.size())
2323 report_fatal_error("ran out of SGPRs for arguments");
2324
2325 unsigned Reg = ArgSGPRs[RegIdx];
2326 Reg = CCInfo.AllocateReg(Reg);
2327 assert(Reg != AMDGPU::NoRegister);
2328
2329 MachineFunction &MF = CCInfo.getMachineFunction();
2330 MF.addLiveIn(Reg, RC);
2332}
2333
 2334// If this has a fixed position, we should still allocate the register in the
2335// CCInfo state. Technically we could get away with this for values passed
2336// outside of the normal argument range.
2338 const TargetRegisterClass *RC,
2339 MCRegister Reg) {
2340 Reg = CCInfo.AllocateReg(Reg);
2341 assert(Reg != AMDGPU::NoRegister);
2342 MachineFunction &MF = CCInfo.getMachineFunction();
2343 MF.addLiveIn(Reg, RC);
2344}
2345
2346static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2347 if (Arg) {
2348 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2349 Arg.getRegister());
2350 } else
2351 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2352}
2353
2354static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2355 if (Arg) {
2356 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2357 Arg.getRegister());
2358 } else
2359 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2360}
2361
2362/// Allocate implicit function VGPR arguments at the end of allocated user
2363/// arguments.
2365 CCState &CCInfo, MachineFunction &MF,
2366 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2367 const unsigned Mask = 0x3ff;
2368 ArgDescriptor Arg;
2369
2370 if (Info.hasWorkItemIDX()) {
2371 Arg = allocateVGPR32Input(CCInfo, Mask);
2372 Info.setWorkItemIDX(Arg);
2373 }
2374
2375 if (Info.hasWorkItemIDY()) {
2376 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2377 Info.setWorkItemIDY(Arg);
2378 }
2379
2380 if (Info.hasWorkItemIDZ())
2381 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2382}
2383
2384/// Allocate implicit function VGPR arguments in fixed registers.
2386 CCState &CCInfo, MachineFunction &MF,
2387 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2388 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2389 if (!Reg)
 2390 report_fatal_error("failed to allocate VGPR for implicit arguments");
2391
2392 const unsigned Mask = 0x3ff;
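  // All three IDs share this VGPR: X lands in bits [9:0], Y in [19:10] and Z
  // in [29:20], mirroring the packed-TID layout used for kernels above.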
2393 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2394 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2395 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2396}
2397
2399 CCState &CCInfo,
2400 MachineFunction &MF,
2401 const SIRegisterInfo &TRI,
2402 SIMachineFunctionInfo &Info) const {
2403 auto &ArgInfo = Info.getArgInfo();
2404 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2405
2406 // TODO: Unify handling with private memory pointers.
2407 if (UserSGPRInfo.hasDispatchPtr())
2408 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2409
2410 const Module *M = MF.getFunction().getParent();
2411 if (UserSGPRInfo.hasQueuePtr() &&
2413 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2414
2415 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2416 // constant offset from the kernarg segment.
2417 if (Info.hasImplicitArgPtr())
2418 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2419
2420 if (UserSGPRInfo.hasDispatchID())
2421 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2422
2423 // flat_scratch_init is not applicable for non-kernel functions.
2424
2425 if (Info.hasWorkGroupIDX())
2426 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2427
2428 if (Info.hasWorkGroupIDY())
2429 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2430
2431 if (Info.hasWorkGroupIDZ())
2432 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2433
2434 if (Info.hasLDSKernelId())
2435 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2436}
2437
2438// Allocate special inputs passed in user SGPRs.
2440 MachineFunction &MF,
2441 const SIRegisterInfo &TRI,
2442 SIMachineFunctionInfo &Info) const {
2443 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2444 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2445 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2446 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2447 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2448 }
2449
2450 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2451 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2452 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2453 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2454 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2455 }
2456
2457 if (UserSGPRInfo.hasDispatchPtr()) {
2458 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2459 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2460 CCInfo.AllocateReg(DispatchPtrReg);
2461 }
2462
2463 const Module *M = MF.getFunction().getParent();
2464 if (UserSGPRInfo.hasQueuePtr() &&
2466 Register QueuePtrReg = Info.addQueuePtr(TRI);
2467 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2468 CCInfo.AllocateReg(QueuePtrReg);
2469 }
2470
2471 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2473 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2474 CCInfo.AllocateReg(InputPtrReg);
2475
2476 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2477 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2478 }
2479
2480 if (UserSGPRInfo.hasDispatchID()) {
2481 Register DispatchIDReg = Info.addDispatchID(TRI);
2482 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2483 CCInfo.AllocateReg(DispatchIDReg);
2484 }
2485
2486 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2487 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2488 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2489 CCInfo.AllocateReg(FlatScratchInitReg);
2490 }
2491
2492 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2493 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2494 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2495 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2496 }
2497
2498 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2499 // these from the dispatch pointer.
2500}
2501
 2502// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2503// sequential starting from the first argument.
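// For example, preloading an i64 argument at byte offset 8 when the previous
// argument ended at offset 4 needs one padding SGPR for the 4-byte gap plus
// two SGPRs for the value itself, and the sequence stops as soon as the free
// user SGPR budget (including the one synthetic SGPR) cannot cover that.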
2505 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2507 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2508 Function &F = MF.getFunction();
2509 unsigned LastExplicitArgOffset =
2510 MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset();
2511 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2512 bool InPreloadSequence = true;
2513 unsigned InIdx = 0;
2514 for (auto &Arg : F.args()) {
2515 if (!InPreloadSequence || !Arg.hasInRegAttr())
2516 break;
2517
2518 int ArgIdx = Arg.getArgNo();
2519 // Don't preload non-original args or parts not in the current preload
2520 // sequence.
2521 if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
2522 (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
2523 break;
2524
2525 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2526 (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
2527 InIdx++) {
2528 assert(ArgLocs[ArgIdx].isMemLoc());
2529 auto &ArgLoc = ArgLocs[InIdx];
2530 const Align KernelArgBaseAlign = Align(16);
2531 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2532 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2533 unsigned NumAllocSGPRs =
2534 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2535
2536 // Arg is preloaded into the previous SGPR.
2537 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2538 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2539 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2540 continue;
2541 }
2542
2543 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2544 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2545 // Check for free user SGPRs for preloading.
2546 if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
2547 SGPRInfo.getNumFreeUserSGPRs()) {
2548 InPreloadSequence = false;
2549 break;
2550 }
2551
2552 // Preload this argument.
2553 const TargetRegisterClass *RC =
2554 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2555 SmallVectorImpl<MCRegister> *PreloadRegs =
2556 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2557
2558 if (PreloadRegs->size() > 1)
2559 RC = &AMDGPU::SGPR_32RegClass;
2560 for (auto &Reg : *PreloadRegs) {
2561 assert(Reg);
2562 MF.addLiveIn(Reg, RC);
2563 CCInfo.AllocateReg(Reg);
2564 }
2565
2566 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2567 }
2568 }
2569}
2570
2572 const SIRegisterInfo &TRI,
2573 SIMachineFunctionInfo &Info) const {
2574 // Always allocate this last since it is a synthetic preload.
2575 if (Info.hasLDSKernelId()) {
2576 Register Reg = Info.addLDSKernelId();
2577 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2578 CCInfo.AllocateReg(Reg);
2579 }
2580}
2581
2582// Allocate special input registers that are initialized per-wave.
2584 MachineFunction &MF,
2586 CallingConv::ID CallConv,
2587 bool IsShader) const {
2588 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2589 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
 2590 // Note: user SGPRs are handled by the front-end for graphics shaders.
 2591 // Pad up the used user SGPRs with dead inputs.
2592
2593 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2594 // before enabling architected SGPRs for workgroup IDs.
2595 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2596
2597 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2598 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2599 // rely on it to reach 16 since if we end up having no stack usage, it will
2600 // not really be added.
2601 unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() +
2602 Info.hasWorkGroupIDY() +
2603 Info.hasWorkGroupIDZ() +
2604 Info.hasWorkGroupInfo();
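    // For example, a kernel that already has 6 user SGPRs and needs 3 system
    // SGPRs gets 7 dead user SGPRs added here, so the preloaded SGPR count
    // still reaches 16.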
2605 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2606 Register Reg = Info.addReservedUserSGPR();
2607 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2608 CCInfo.AllocateReg(Reg);
2609 }
2610 }
2611
2612 if (!HasArchitectedSGPRs) {
2613 if (Info.hasWorkGroupIDX()) {
2614 Register Reg = Info.addWorkGroupIDX();
2615 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2616 CCInfo.AllocateReg(Reg);
2617 }
2618
2619 if (Info.hasWorkGroupIDY()) {
2620 Register Reg = Info.addWorkGroupIDY();
2621 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2622 CCInfo.AllocateReg(Reg);
2623 }
2624
2625 if (Info.hasWorkGroupIDZ()) {
2626 Register Reg = Info.addWorkGroupIDZ();
2627 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2628 CCInfo.AllocateReg(Reg);
2629 }
2630 }
2631
2632 if (Info.hasWorkGroupInfo()) {
2633 Register Reg = Info.addWorkGroupInfo();
2634 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2635 CCInfo.AllocateReg(Reg);
2636 }
2637
2638 if (Info.hasPrivateSegmentWaveByteOffset()) {
2639 // Scratch wave offset passed in system SGPR.
2640 unsigned PrivateSegmentWaveByteOffsetReg;
2641
2642 if (IsShader) {
2643 PrivateSegmentWaveByteOffsetReg =
2644 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2645
2646 // This is true if the scratch wave byte offset doesn't have a fixed
2647 // location.
2648 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2649 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2650 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2651 }
2652 } else
2653 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2654
2655 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2656 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2657 }
2658
2659 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2660 Info.getNumPreloadedSGPRs() >= 16);
2661}
2662
2664 MachineFunction &MF,
2665 const SIRegisterInfo &TRI,
2666 SIMachineFunctionInfo &Info) {
2667 // Now that we've figured out where the scratch register inputs are, see if
 2668 // we should reserve the arguments and use them directly.
2669 MachineFrameInfo &MFI = MF.getFrameInfo();
2670 bool HasStackObjects = MFI.hasStackObjects();
2671 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2672
2673 // Record that we know we have non-spill stack objects so we don't need to
2674 // check all stack objects later.
2675 if (HasStackObjects)
2676 Info.setHasNonSpillStackObjects(true);
2677
2678 // Everything live out of a block is spilled with fast regalloc, so it's
2679 // almost certain that spilling will be required.
2680 if (TM.getOptLevel() == CodeGenOptLevel::None)
2681 HasStackObjects = true;
2682
 2683 // For now assume stack access is needed in any callee functions, so we need
 2684 // to pass in the scratch registers.
2685 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2686
2687 if (!ST.enableFlatScratch()) {
2688 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2689 // If we have stack objects, we unquestionably need the private buffer
2690 // resource. For the Code Object V2 ABI, this will be the first 4 user
2691 // SGPR inputs. We can reserve those and use them directly.
2692
2693 Register PrivateSegmentBufferReg =
2695 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2696 } else {
2697 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
 2698 // We tentatively reserve the last registers (skipping those that
 2699 // may contain VCC, FLAT_SCR, and XNACK). After register allocation,
2700 // we'll replace these with the ones immediately after those which were
2701 // really allocated. In the prologue copies will be inserted from the
2702 // argument to these reserved registers.
2703
2704 // Without HSA, relocations are used for the scratch pointer and the
2705 // buffer resource setup is always inserted in the prologue. Scratch wave
2706 // offset is still in an input SGPR.
2707 Info.setScratchRSrcReg(ReservedBufferReg);
2708 }
2709 }
2710
2712
2713 // For entry functions we have to set up the stack pointer if we use it,
2714 // whereas non-entry functions get this "for free". This means there is no
2715 // intrinsic advantage to using S32 over S34 in cases where we do not have
2716 // calls but do need a frame pointer (i.e. if we are requested to have one
2717 // because frame pointer elimination is disabled). To keep things simple we
2718 // only ever use S32 as the call ABI stack pointer, and so using it does not
2719 // imply we need a separate frame pointer.
2720 //
2721 // Try to use s32 as the SP, but move it if it would interfere with input
2722 // arguments. This won't work with calls though.
2723 //
2724 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2725 // registers.
2726 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2727 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2728 } else {
2730
2731 if (MFI.hasCalls())
2732 report_fatal_error("call in graphics shader with too many input SGPRs");
2733
2734 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2735 if (!MRI.isLiveIn(Reg)) {
2736 Info.setStackPtrOffsetReg(Reg);
2737 break;
2738 }
2739 }
2740
2741 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2742 report_fatal_error("failed to find register for SP");
2743 }
2744
2745 // hasFP should be accurate for entry functions even before the frame is
2746 // finalized, because it does not rely on the known stack size, only
2747 // properties like whether variable sized objects are present.
2748 if (ST.getFrameLowering()->hasFP(MF)) {
2749 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2750 }
2751}
2752
2755 return !Info->isEntryFunction();
2756}
2757
2759
2760}
2761
2763 MachineBasicBlock *Entry,
2764 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2766
2767 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2768 if (!IStart)
2769 return;
2770
2771 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2772 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2773 MachineBasicBlock::iterator MBBI = Entry->begin();
2774 for (const MCPhysReg *I = IStart; *I; ++I) {
2775 const TargetRegisterClass *RC = nullptr;
2776 if (AMDGPU::SReg_64RegClass.contains(*I))
2777 RC = &AMDGPU::SGPR_64RegClass;
2778 else if (AMDGPU::SReg_32RegClass.contains(*I))
2779 RC = &AMDGPU::SGPR_32RegClass;
2780 else
2781 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2782
2783 Register NewVR = MRI->createVirtualRegister(RC);
2784 // Create copy from CSR to a virtual register.
2785 Entry->addLiveIn(*I);
2786 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2787 .addReg(*I);
2788
2789 // Insert the copy-back instructions right before the terminator.
2790 for (auto *Exit : Exits)
2791 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2792 TII->get(TargetOpcode::COPY), *I)
2793 .addReg(NewVR);
2794 }
2795}
2796
2798 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2799 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2800 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2802
2804 const Function &Fn = MF.getFunction();
2807
2808 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2809 DiagnosticInfoUnsupported NoGraphicsHSA(
2810 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2811 DAG.getContext()->diagnose(NoGraphicsHSA);
2812 return DAG.getEntryNode();
2813 }
2814
2817 BitVector Skipped(Ins.size());
2818 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2819 *DAG.getContext());
2820
2821 bool IsGraphics = AMDGPU::isGraphics(CallConv);
2822 bool IsKernel = AMDGPU::isKernel(CallConv);
2823 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2824
2825 if (IsGraphics) {
2826 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2827 assert(!UserSGPRInfo.hasDispatchPtr() &&
2828 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2829 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2830 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2831 (void)UserSGPRInfo;
2832 if (!Subtarget->enableFlatScratch())
2833 assert(!UserSGPRInfo.hasFlatScratchInit());
2834 if ((CallConv != CallingConv::AMDGPU_CS &&
2835 CallConv != CallingConv::AMDGPU_Gfx) ||
2836 !Subtarget->hasArchitectedSGPRs())
2837 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2838 !Info->hasWorkGroupIDZ());
2839 }
2840
2841 if (CallConv == CallingConv::AMDGPU_PS) {
2842 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2843
2844 // At least one interpolation mode must be enabled or else the GPU will
2845 // hang.
2846 //
2847 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2848 // set PSInputAddr, the user wants to enable some bits after the compilation
2849 // based on run-time states. Since we can't know what the final PSInputEna
 2850 // will look like, we shouldn't do anything here and the user should take
2851 // responsibility for the correct programming.
2852 //
2853 // Otherwise, the following restrictions apply:
2854 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2855 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2856 // enabled too.
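    // For example, a shader whose only live input is POS_W_FLOAT (bit 11) has
    // no PERSP_* bit set, so the first PERSP_* input (bit 0) is force-enabled
    // below and VGPR0/VGPR1 are reserved for it.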
2857 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2858 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2859 CCInfo.AllocateReg(AMDGPU::VGPR0);
2860 CCInfo.AllocateReg(AMDGPU::VGPR1);
2861 Info->markPSInputAllocated(0);
2862 Info->markPSInputEnabled(0);
2863 }
2864 if (Subtarget->isAmdPalOS()) {
2865 // For isAmdPalOS, the user does not enable some bits after compilation
2866 // based on run-time states; the register values being generated here are
2867 // the final ones set in hardware. Therefore we need to apply the
2868 // workaround to PSInputAddr and PSInputEnable together. (The case where
2869 // a bit is set in PSInputAddr but not PSInputEnable is where the
2870 // frontend set up an input arg for a particular interpolation mode, but
2871 // nothing uses that input arg. Really we should have an earlier pass
2872 // that removes such an arg.)
2873 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2874 if ((PsInputBits & 0x7F) == 0 ||
2875 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2876 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
2877 }
2878 } else if (IsKernel) {
2879 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2880 } else {
2881 Splits.append(Ins.begin(), Ins.end());
2882 }
2883
2884 if (IsKernel)
2885 analyzeFormalArgumentsCompute(CCInfo, Ins);
2886
2887 if (IsEntryFunc) {
2888 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2889 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2890 if (IsKernel && Subtarget->hasKernargPreload())
2891 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
2892
2893 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
2894 } else if (!IsGraphics) {
2895 // For the fixed ABI, pass workitem IDs in the last argument register.
2896 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2897
2898 // FIXME: Sink this into allocateSpecialInputSGPRs
2899 if (!Subtarget->enableFlatScratch())
2900 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2901
2902 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2903 }
2904
2905 if (!IsKernel) {
2906 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2907 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2908 }
2909
2911
2912 // FIXME: This is the minimum kernel argument alignment. We should improve
2913 // this to the maximum alignment of the arguments.
2914 //
2915 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2916 // kern arg offset.
2917 const Align KernelArgBaseAlign = Align(16);
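  // commonAlignment(Align(16), Offset) gives, e.g., 16 for an argument at
  // offset 32 but only 2 for one at offset 6, which is what sends small
  // arguments at unaligned offsets through the dword-aligned load-and-shift
  // path below.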
2918
2919 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2920 const ISD::InputArg &Arg = Ins[i];
2921 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2922 InVals.push_back(DAG.getUNDEF(Arg.VT));
2923 continue;
2924 }
2925
2926 CCValAssign &VA = ArgLocs[ArgIdx++];
2927 MVT VT = VA.getLocVT();
2928
2929 if (IsEntryFunc && VA.isMemLoc()) {
2930 VT = Ins[i].VT;
2931 EVT MemVT = VA.getLocVT();
2932
2933 const uint64_t Offset = VA.getLocMemOffset();
2934 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2935
2936 if (Arg.Flags.isByRef()) {
2937 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2938
2939 const GCNTargetMachine &TM =
2940 static_cast<const GCNTargetMachine &>(getTargetMachine());
2941 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2942 Arg.Flags.getPointerAddrSpace())) {
2945 }
2946
2947 InVals.push_back(Ptr);
2948 continue;
2949 }
2950
2951 SDValue NewArg;
2952 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
2953 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2954 // In this case the argument is packed into the previous preload SGPR.
2955 int64_t AlignDownOffset = alignDown(Offset, 4);
2956 int64_t OffsetDiff = Offset - AlignDownOffset;
2957 EVT IntVT = MemVT.changeTypeToInteger();
2958
2962 Register Reg =
2963 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2964
2965 assert(Reg);
2966 Register VReg = MRI.getLiveInVirtReg(Reg);
2967 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2968
2969 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
2970 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
2971
2972 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
2973 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
2974 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
2975 Ins[i].Flags.isSExt(), &Ins[i]);
2976
2977 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
2978 } else {
2982 const SmallVectorImpl<MCRegister> &PreloadRegs =
2983 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
2984
2985 SDValue Copy;
2986 if (PreloadRegs.size() == 1) {
2987 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
2988 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
2989 NewArg = DAG.getCopyFromReg(
2990 Chain, DL, VReg,
2992 TRI->getRegSizeInBits(*RC)));
2993
2994 } else {
2995 // If the kernarg alignment does not match the alignment of the SGPR
2996 // tuple RC that can accommodate this argument, it will be built up
 2997 // via copies from the individual SGPRs that the argument was
2998 // preloaded to.
3000 for (auto Reg : PreloadRegs) {
3001 Register VReg = MRI.getLiveInVirtReg(Reg);
3002 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3003 Elts.push_back(Copy);
3004 }
3005 NewArg =
3006 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3007 PreloadRegs.size()),
3008 DL, Elts);
3009 }
3010
3011 // If the argument was preloaded to multiple consecutive 32-bit
3012 // registers because of misalignment between addressable SGPR tuples
 3013 // and the argument size, we can still assume, because of kernarg
 3014 // segment alignment restrictions, that NewArg's size is the same as
 3015 // MemVT and just do a bitcast. If MemVT is less than 32 bits we add a
3016 // truncate since we cannot preload to less than a single SGPR and the
3017 // MemVT may be smaller.
3018 EVT MemVTInt =
3020 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3021 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3022
3023 NewArg = DAG.getBitcast(MemVT, NewArg);
3024 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3025 Ins[i].Flags.isSExt(), &Ins[i]);
3026 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3027 }
3028 } else {
3029 NewArg =
3030 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3031 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3032 }
3033 Chains.push_back(NewArg.getValue(1));
3034
3035 auto *ParamTy =
3036 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3038 ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3039 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3040 // On SI local pointers are just offsets into LDS, so they are always
 3041 // less than 16 bits. On CI and newer they could potentially be
3042 // real pointers, so we can't guarantee their size.
3043 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3044 DAG.getValueType(MVT::i16));
3045 }
3046
3047 InVals.push_back(NewArg);
3048 continue;
3049 }
3050 if (!IsEntryFunc && VA.isMemLoc()) {
3051 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3052 InVals.push_back(Val);
3053 if (!Arg.Flags.isByVal())
3054 Chains.push_back(Val.getValue(1));
3055 continue;
3056 }
3057
3058 assert(VA.isRegLoc() && "Parameter must be in a register!");
3059
3060 Register Reg = VA.getLocReg();
3061 const TargetRegisterClass *RC = nullptr;
3062 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3063 RC = &AMDGPU::VGPR_32RegClass;
3064 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3065 RC = &AMDGPU::SGPR_32RegClass;
3066 else
3067 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3068 EVT ValVT = VA.getValVT();
3069
3070 Reg = MF.addLiveIn(Reg, RC);
3071 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3072
3073 if (Arg.Flags.isSRet()) {
3074 // The return object should be reasonably addressable.
3075
 3076 // FIXME: This helps when the return is a real sret. If it is an
 3077 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3078 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3079 unsigned NumBits
3081 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
3082 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3083 }
3084
3085 // If this is an 8 or 16-bit value, it is really passed promoted
3086 // to 32 bits. Insert an assert[sz]ext to capture this, then
3087 // truncate to the right size.
3088 switch (VA.getLocInfo()) {
3089 case CCValAssign::Full:
3090 break;
3091 case CCValAssign::BCvt:
3092 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3093 break;
3094 case CCValAssign::SExt:
3095 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
3096 DAG.getValueType(ValVT));
3097 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3098 break;
3099 case CCValAssign::ZExt:
3100 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
3101 DAG.getValueType(ValVT));
3102 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3103 break;
3104 case CCValAssign::AExt:
3105 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3106 break;
3107 default:
3108 llvm_unreachable("Unknown loc info!");
3109 }
3110
3111 InVals.push_back(Val);
3112 }
3113
3114 // Start adding system SGPRs.
3115 if (IsEntryFunc)
3116 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3117
3118 // DAG.getPass() returns nullptr when using new pass manager.
3119 // TODO: Use DAG.getMFAM() to access analysis result.
3120 if (DAG.getPass()) {
3121 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3122 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3123 }
3124
3125 unsigned StackArgSize = CCInfo.getStackSize();
3126 Info->setBytesInStackArgArea(StackArgSize);
3127
3128 return Chains.empty() ? Chain :
3129 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3130}
3131
3132// TODO: If return values can't fit in registers, we should return as many as
3133// possible in registers before passing on stack.
3135 CallingConv::ID CallConv,
3136 MachineFunction &MF, bool IsVarArg,
3138 LLVMContext &Context) const {
3139 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3140 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3141 // for shaders. Vector types should be explicitly handled by CC.
3142 if (AMDGPU::isEntryFunctionCC(CallConv))
3143 return true;
3144
3146 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3147 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3148 return false;
3149
3150 // We must use the stack if return would require unavailable registers.
3151 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3152 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3153 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3154 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3155 return false;
3156
3157 return true;
3158}
3159
3160SDValue
3162 bool isVarArg,
3164 const SmallVectorImpl<SDValue> &OutVals,
3165 const SDLoc &DL, SelectionDAG &DAG) const {
3168
3169 if (AMDGPU::isKernel(CallConv)) {
3170 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3171 OutVals, DL, DAG);
3172 }
3173
3174 bool IsShader = AMDGPU::isShader(CallConv);
3175
3176 Info->setIfReturnsVoid(Outs.empty());
3177 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3178
3179 // CCValAssign - represent the assignment of the return value to a location.
3182
3183 // CCState - Info about the registers and stack slots.
3184 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3185 *DAG.getContext());
3186
3187 // Analyze outgoing return values.
3188 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3189
3190 SDValue Glue;
3192 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3193
3194 // Copy the result values into the output registers.
3195 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3196 ++I, ++RealRVLocIdx) {
3197 CCValAssign &VA = RVLocs[I];
3198 assert(VA.isRegLoc() && "Can only return in registers!");
3199 // TODO: Partially return in registers if return values don't fit.
3200 SDValue Arg = OutVals[RealRVLocIdx];
3201
3202 // Copied from other backends.
3203 switch (VA.getLocInfo()) {
3204 case CCValAssign::Full:
3205 break;
3206 case CCValAssign::BCvt:
3207 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3208 break;
3209 case CCValAssign::SExt:
3210 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3211 break;
3212 case CCValAssign::ZExt:
3213 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3214 break;
3215 case CCValAssign::AExt:
3216 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3217 break;
3218 default:
3219 llvm_unreachable("Unknown loc info!");
3220 }
3221
3222 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3223 Glue = Chain.getValue(1);
3224 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3225 }
3226
3227 // FIXME: Does sret work properly?
3228 if (!Info->isEntryFunction()) {
3229 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3230 const MCPhysReg *I =
3231 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3232 if (I) {
3233 for (; *I; ++I) {
3234 if (AMDGPU::SReg_64RegClass.contains(*I))
3235 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3236 else if (AMDGPU::SReg_32RegClass.contains(*I))
3237 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3238 else
3239 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3240 }
3241 }
3242 }
3243
3244 // Update chain and glue.
3245 RetOps[0] = Chain;
3246 if (Glue.getNode())
3247 RetOps.push_back(Glue);
3248
3249 unsigned Opc = AMDGPUISD::ENDPGM;
3250 if (!IsWaveEnd)
3252 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3253}
3254
3256 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3257 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3258 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3259 SDValue ThisVal) const {
3260 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3261
3262 // Assign locations to each value returned by this call.
3264 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3265 *DAG.getContext());
3266 CCInfo.AnalyzeCallResult(Ins, RetCC);
3267
3268 // Copy all of the result registers out of their specified physreg.
3269 for (CCValAssign VA : RVLocs) {
3270 SDValue Val;
3271
3272 if (VA.isRegLoc()) {
3273 Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3274 Chain = Val.getValue(1);
3275 InGlue = Val.getValue(2);
3276 } else if (VA.isMemLoc()) {
3277 report_fatal_error("TODO: return values in memory");
3278 } else
3279 llvm_unreachable("unknown argument location type");
3280
3281 switch (VA.getLocInfo()) {
3282 case CCValAssign::Full:
3283 break;
3284 case CCValAssign::BCvt:
3285 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3286 break;
3287 case CCValAssign::ZExt:
3288 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3289 DAG.getValueType(VA.getValVT()));
3290 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3291 break;
3292 case CCValAssign::SExt:
3293 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3294 DAG.getValueType(VA.getValVT()));
3295 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3296 break;
3297 case CCValAssign::AExt:
3298 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3299 break;
3300 default:
3301 llvm_unreachable("Unknown loc info!");
3302 }
3303
3304 InVals.push_back(Val);
3305 }
3306
3307 return Chain;
3308}
3309
 3310// Add code to pass the special inputs required by the features used, separate
 3311// from the explicit user arguments present in the IR.
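// For example, if the call site is not marked "amdgpu-no-dispatch-ptr" the
// caller re-passes its dispatch pointer in the SGPRs the callee expects, or
// stores it to a reserved stack slot if no register can be allocated for it.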
3313 CallLoweringInfo &CLI,
3314 CCState &CCInfo,
3315 const SIMachineFunctionInfo &Info,
3316 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3317 SmallVectorImpl<SDValue> &MemOpChains,
3318 SDValue Chain) const {
3319 // If we don't have a call site, this was a call inserted by
3320 // legalization. These can never use special inputs.
3321 if (!CLI.CB)
3322 return;
3323
3324 SelectionDAG &DAG = CLI.DAG;
3325 const SDLoc &DL = CLI.DL;
3326 const Function &F = DAG.getMachineFunction().getFunction();
3327
3328 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3329 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3330
3331 const AMDGPUFunctionArgInfo *CalleeArgInfo
3333 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3334 // DAG.getPass() returns nullptr when using new pass manager.
3335 // TODO: Use DAG.getMFAM() to access analysis result.
3336 if (DAG.getPass()) {
3337 auto &ArgUsageInfo =
3339 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3340 }
3341 }
3342
3343 // TODO: Unify with private memory register handling. This is complicated by
3344 // the fact that at least in kernels, the input argument is not necessarily
3345 // in the same location as the input.
3346 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3348 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3349 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
3350 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3351 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3352 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3353 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
3354 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
3355 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
3356 };
3357
3358 for (auto Attr : ImplicitAttrs) {
3359 const ArgDescriptor *OutgoingArg;
3360 const TargetRegisterClass *ArgRC;
3361 LLT ArgTy;
3362
3363 AMDGPUFunctionArgInfo::PreloadedValue InputID = Attr.first;
3364
 3365 // If the call site carries the matching amdgpu-no-* attribute, the callee does not need the value, so skip copying it.
3366 if (CLI.CB->hasFnAttr(Attr.second))
3367 continue;
3368
3369 std::tie(OutgoingArg, ArgRC, ArgTy) =
3370 CalleeArgInfo->getPreloadedValue(InputID);
3371 if (!OutgoingArg)
3372 continue;
3373
3374 const ArgDescriptor *IncomingArg;
3375 const TargetRegisterClass *IncomingArgRC;
3376 LLT Ty;
3377 std::tie(IncomingArg, IncomingArgRC, Ty) =
3378 CallerArgInfo.getPreloadedValue(InputID);
3379 assert(IncomingArgRC == ArgRC);
3380
3381 // All special arguments are ints for now.
3382 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3383 SDValue InputReg;
3384
3385 if (IncomingArg) {
3386 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3387 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3388 // The implicit arg ptr is special because it doesn't have a corresponding
3389 // input for kernels, and is computed from the kernarg segment pointer.
3390 InputReg = getImplicitArgPtr(DAG, DL);
3391 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3392 std::optional<uint32_t> Id =
3394 if (Id.has_value()) {
3395 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3396 } else {
3397 InputReg = DAG.getUNDEF(ArgVT);
3398 }
3399 } else {
 3400 // We may have proven the input wasn't needed, although the ABI still
 3401 // requires it. We just need to allocate the register appropriately.
3402 InputReg = DAG.getUNDEF(ArgVT);
3403 }
3404
3405 if (OutgoingArg->isRegister()) {
3406 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3407 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3408 report_fatal_error("failed to allocate implicit input argument");
3409 } else {
3410 unsigned SpecialArgOffset =
3411 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3412 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
3413 SpecialArgOffset);
3414 MemOpChains.push_back(ArgStore);
3415 }
3416 }
3417
 3418 // Pack workitem IDs into a single register, or pass them as-is if already
3419 // packed.
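  // When packing is needed the IDs end up in one i32 as Z in bits [29:20],
  // Y in [19:10] and X in [9:0], matching the 0x3ff masks used when the IDs
  // are received.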
3420 const ArgDescriptor *OutgoingArg;
3421 const TargetRegisterClass *ArgRC;
3422 LLT Ty;
3423
3424 std::tie(OutgoingArg, ArgRC, Ty) =
3426 if (!OutgoingArg)
3427 std::tie(OutgoingArg, ArgRC, Ty) =
3429 if (!OutgoingArg)
3430 std::tie(OutgoingArg, ArgRC, Ty) =
3432 if (!OutgoingArg)
3433 return;
3434
3435 const ArgDescriptor *IncomingArgX = std::get<0>(
3437 const ArgDescriptor *IncomingArgY = std::get<0>(
3439 const ArgDescriptor *IncomingArgZ = std::get<0>(
3441
3442 SDValue InputReg;
3443 SDLoc SL;
3444
3445 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3446 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3447 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3448
3449 // If incoming ids are not packed we need to pack them.
3450 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3451 NeedWorkItemIDX) {
3452 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3453 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3454 } else {
3455 InputReg = DAG.getConstant(0, DL, MVT::i32);
3456 }
3457 }
3458
3459 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3460 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3461 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3462 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3463 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3464 InputReg = InputReg.getNode() ?
3465 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
3466 }
3467
3468 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3469 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3470 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3471 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3472 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3473 InputReg = InputReg.getNode() ?
3474 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
3475 }
3476
3477 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3478 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3479 // We're in a situation where the outgoing function requires the workitem
3480 // ID, but the calling function does not have it (e.g a graphics function
3481 // calling a C calling convention function). This is illegal, but we need
3482 // to produce something.
3483 InputReg = DAG.getUNDEF(MVT::i32);
3484 } else {
 3485 // Workitem IDs are already packed; any of the present incoming arguments
3486 // will carry all required fields.
3488 IncomingArgX ? *IncomingArgX :
3489 IncomingArgY ? *IncomingArgY :
3490 *IncomingArgZ, ~0u);
3491 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3492 }
3493 }
3494
3495 if (OutgoingArg->isRegister()) {
3496 if (InputReg)
3497 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3498
3499 CCInfo.AllocateReg(OutgoingArg->getRegister());
3500 } else {
3501 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3502 if (InputReg) {
3503 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
3504 SpecialArgOffset);
3505 MemOpChains.push_back(ArgStore);
3506 }
3507 }
3508}
3509
3511 return CC == CallingConv::Fast;
3512}
3513
3514/// Return true if we might ever do TCO for calls with this calling convention.
3516 switch (CC) {
3517 case CallingConv::C:
3519 return true;
3520 default:
3521 return canGuaranteeTCO(CC);
3522 }
3523}
3524
3526 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3528 const SmallVectorImpl<SDValue> &OutVals,
3529 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3530 if (AMDGPU::isChainCC(CalleeCC))
3531 return true;
3532
3533 if (!mayTailCallThisCC(CalleeCC))
3534 return false;
3535
3536 // For a divergent call target, we need to do a waterfall loop over the
3537 // possible callees which precludes us from using a simple jump.
3538 if (Callee->isDivergent())
3539 return false;
3540
3542 const Function &CallerF = MF.getFunction();
3543 CallingConv::ID CallerCC = CallerF.getCallingConv();
3545 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3546
 3547 // Kernels aren't callable, and don't have a live-in return address, so it
3548 // doesn't make sense to do a tail call with entry functions.
3549 if (!CallerPreserved)
3550 return false;
3551
3552 bool CCMatch = CallerCC == CalleeCC;
3553
3555 if (canGuaranteeTCO(CalleeCC) && CCMatch)
3556 return true;
3557 return false;
3558 }
3559
3560 // TODO: Can we handle var args?
3561 if (IsVarArg)
3562 return false;
3563
3564 for (const Argument &Arg : CallerF.args()) {
3565 if (Arg.hasByValAttr())
3566 return false;
3567 }
3568
3569 LLVMContext &Ctx = *DAG.getContext();
3570
3571 // Check that the call results are passed in the same way.
3572 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3573 CCAssignFnForCall(CalleeCC, IsVarArg),
3574 CCAssignFnForCall(CallerCC, IsVarArg)))
3575 return false;
3576
3577 // The callee has to preserve all registers the caller needs to preserve.
3578 if (!CCMatch) {
3579 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3580 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3581 return false;
3582 }
3583
3584 // Nothing more to check if the callee is taking no arguments.
3585 if (Outs.empty())
3586 return true;
3587
3589 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3590
3591 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3592
3593 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3594 // If the stack arguments for this call do not fit into our own save area then
3595 // the call cannot be made tail.
3596 // TODO: Is this really necessary?
3597 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3598 return false;
3599
3600 const MachineRegisterInfo &MRI = MF.getRegInfo();
3601 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3602}
3603
3605 if (!CI->isTailCall())
3606 return false;
3607
3608 const Function *ParentFn = CI->getParent()->getParent();
3610 return false;
3611 return true;
3612}
3613
3614// The wave scratch offset register is used as the global base pointer.
3616 SmallVectorImpl<SDValue> &InVals) const {
3617 CallingConv::ID CallConv = CLI.CallConv;
3618 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3619
3620 SelectionDAG &DAG = CLI.DAG;
3621
3622 TargetLowering::ArgListEntry RequestedExec;
3623 if (IsChainCallConv) {
3624 // The last argument should be the value that we need to put in EXEC.
3625 // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
3626 // don't treat it like the rest of the arguments.
3627 RequestedExec = CLI.Args.back();
3628 assert(RequestedExec.Node && "No node for EXEC");
3629
3630 if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3631 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3632
3633 assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
3634 CLI.Outs.pop_back();
3635 CLI.OutVals.pop_back();
3636
3637 if (RequestedExec.Ty->isIntegerTy(64)) {
3638 assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
3639 CLI.Outs.pop_back();
3640 CLI.OutVals.pop_back();
3641 }
3642
3643 assert(CLI.Outs.back().OrigArgIndex != 2 &&
3644 "Haven't popped all the pieces of the EXEC mask");
3645 }
3646
3647 const SDLoc &DL = CLI.DL;
3649 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3651 SDValue Chain = CLI.Chain;
3652 SDValue Callee = CLI.Callee;
3653 bool &IsTailCall = CLI.IsTailCall;
3654 bool IsVarArg = CLI.IsVarArg;
3655 bool IsSibCall = false;
3657
3658 if (Callee.isUndef() || isNullConstant(Callee)) {
3659 if (!CLI.IsTailCall) {
3660 for (ISD::InputArg &Arg : CLI.Ins)
3661 InVals.push_back(DAG.getUNDEF(Arg.VT));
3662 }
3663
3664 return Chain;
3665 }
3666
3667 if (IsVarArg) {
3668 return lowerUnhandledCall(CLI, InVals,
3669 "unsupported call to variadic function ");
3670 }
3671
3672 if (!CLI.CB)
3673 report_fatal_error("unsupported libcall legalization");
3674
3675 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3676 return lowerUnhandledCall(CLI, InVals,
3677 "unsupported required tail call to function ");
3678 }
3679
3680 if (IsTailCall) {
3681 IsTailCall = isEligibleForTailCallOptimization(
3682 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3683 if (!IsTailCall &&
3684 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3685 report_fatal_error("failed to perform tail call elimination on a call "
3686 "site marked musttail or on llvm.amdgcn.cs.chain");
3687 }
3688
3689 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3690
3691 // A sibling call is one where we're under the usual C ABI and not planning
3692 // to change that but can still do a tail call:
3693 if (!TailCallOpt && IsTailCall)
3694 IsSibCall = true;
3695
3696 if (IsTailCall)
3697 ++NumTailCalls;
3698 }
3699
3702 SmallVector<SDValue, 8> MemOpChains;
3703
3704 // Analyze operands of the call, assigning locations to each operand.
3706 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3707 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3708
3709 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
3710 // With a fixed ABI, allocate fixed registers before user arguments.
3711 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3712 }
3713
3714 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3715
3716 // Get a count of how many bytes are to be pushed on the stack.
3717 unsigned NumBytes = CCInfo.getStackSize();
3718
3719 if (IsSibCall) {
3720 // Since we're not changing the ABI to make this a tail call, the memory
3721 // operands are already available in the caller's incoming argument space.
3722 NumBytes = 0;
3723 }
3724
3725 // FPDiff is the byte offset of the call's argument area from the callee's.
3726 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3727 // by this amount for a tail call. In a sibling call it must be 0 because the
3728 // caller will deallocate the entire stack and the callee still expects its
3729 // arguments to begin at SP+0. Completely unused for non-tail calls.
3730 int32_t FPDiff = 0;
3731 MachineFrameInfo &MFI = MF.getFrameInfo();
3732
3733 // Adjust the stack pointer for the new arguments...
3734 // These operations are automatically eliminated by the prolog/epilog pass
3735 if (!IsSibCall)
3736 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3737
3738 if (!IsSibCall || IsChainCallConv) {
3739 if (!Subtarget->enableFlatScratch()) {
3740 SmallVector<SDValue, 4> CopyFromChains;
3741
3742 // In the HSA case, this should be an identity copy.
3743 SDValue ScratchRSrcReg
3744 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3745 RegsToPass.emplace_back(IsChainCallConv
3746 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3747 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3748 ScratchRSrcReg);
3749 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3750 Chain = DAG.getTokenFactor(DL, CopyFromChains);
3751 }
3752 }
3753
3754 MVT PtrVT = MVT::i32;
3755
3756 // Walk the register/memloc assignments, inserting copies/loads.
3757 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3758 CCValAssign &VA = ArgLocs[i];
3759 SDValue Arg = OutVals[i];
3760
3761 // Promote the value if needed.
3762 switch (VA.getLocInfo()) {
3763 case CCValAssign::Full:
3764 break;
3765 case CCValAssign::BCvt:
3766 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3767 break;
3768 case CCValAssign::ZExt:
3769 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3770 break;
3771 case CCValAssign::SExt:
3772 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3773 break;
3774 case CCValAssign::AExt:
3775 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3776 break;
3777 case CCValAssign::FPExt:
3778 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3779 break;
3780 default:
3781 llvm_unreachable("Unknown loc info!");
3782 }
3783
3784 if (VA.isRegLoc()) {
3785 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
3786 } else {
3787 assert(VA.isMemLoc());
3788
3789 SDValue DstAddr;
3790 MachinePointerInfo DstInfo;
3791
3792 unsigned LocMemOffset = VA.getLocMemOffset();
3793 int32_t Offset = LocMemOffset;
3794
3795 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3796 MaybeAlign Alignment;
3797
3798 if (IsTailCall) {
3799 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3800 unsigned OpSize = Flags.isByVal() ?
3801 Flags.getByValSize() : VA.getValVT().getStoreSize();
3802
3803 // FIXME: We can have better than the minimum byval required alignment.
3804 Alignment =
3805 Flags.isByVal()
3806 ? Flags.getNonZeroByValAlign()
3807 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3808
3809 Offset = Offset + FPDiff;
3810 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3811
3812 DstAddr = DAG.getFrameIndex(FI, PtrVT);
3813 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3814
3815 // Make sure any stack arguments overlapping with where we're storing
3816 // are loaded before this eventual operation. Otherwise they'll be
3817 // clobbered.
3818
3819 // FIXME: Why is this really necessary? This seems to just result in a
3820 // lot of code to copy the stack arguments and write them back to the
3821 // same locations, which are supposed to be immutable?
3822 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3823 } else {
3824 // Stores to the argument stack area are relative to the stack pointer.
3825 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3826 MVT::i32);
3827 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3828 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3829 Alignment =
3830 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3831 }
3832
3833 if (Outs[i].Flags.isByVal()) {
3834 SDValue SizeNode =
3835 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3836 SDValue Cpy =
3837 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3838 Outs[i].Flags.getNonZeroByValAlign(),
3839 /*isVol = */ false, /*AlwaysInline = */ true,
3840 /*CI=*/nullptr, std::nullopt, DstInfo,
3842
3843 MemOpChains.push_back(Cpy);
3844 } else {
3845 SDValue Store =
3846 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3847 MemOpChains.push_back(Store);
3848 }
3849 }
3850 }
3851
3852 if (!MemOpChains.empty())
3853 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3854
3855 // Build a sequence of copy-to-reg nodes chained together with token chain
3856 // and flag operands which copy the outgoing args into the appropriate regs.
3857 SDValue InGlue;
3858 for (auto &RegToPass : RegsToPass) {
3859 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3860 RegToPass.second, InGlue);
3861 InGlue = Chain.getValue(1);
3862 }
3863
3864
3865 // We don't usually want to end the call-sequence here because we would tidy
3866 // the frame up *after* the call, however in the ABI-changing tail-call case
3867 // we've carefully laid out the parameters so that when sp is reset they'll be
3868 // in the correct location.
3869 if (IsTailCall && !IsSibCall) {
3870 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
3871 InGlue = Chain.getValue(1);
3872 }
3873
3874 std::vector<SDValue> Ops;
3875 Ops.push_back(Chain);
3876 Ops.push_back(Callee);
3877 // Add a redundant copy of the callee global which will not be legalized, as
3878 // we need direct access to the callee later.
3879 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3880 const GlobalValue *GV = GSD->getGlobal();
3881 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3882 } else {
3883 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3884 }
3885
3886 if (IsTailCall) {
3887 // Each tail call may have to adjust the stack by a different amount, so
3888 // this information must travel along with the operation for eventual
3889 // consumption by emitEpilogue.
3890 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3891 }
3892
3893 if (IsChainCallConv)
3894 Ops.push_back(RequestedExec.Node);
3895
3896 // Add argument registers to the end of the list so that they are known live
3897 // into the call.
3898 for (auto &RegToPass : RegsToPass) {
3899 Ops.push_back(DAG.getRegister(RegToPass.first,
3900 RegToPass.second.getValueType()));
3901 }
3902
3903 // Add a register mask operand representing the call-preserved registers.
3904 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3905 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3906 assert(Mask && "Missing call preserved mask for calling convention");
3907 Ops.push_back(DAG.getRegisterMask(Mask));
3908
3909 if (SDValue Token = CLI.ConvergenceControlToken) {
3911 GlueOps.push_back(Token);
3912 if (InGlue)
3913 GlueOps.push_back(InGlue);
3914
3915 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
3916 MVT::Glue, GlueOps),
3917 0);
3918 }
3919
3920 if (InGlue)
3921 Ops.push_back(InGlue);
3922
3923 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3924
3925 // If we're doing a tail call, use a TC_RETURN here rather than an
3926 // actual call instruction.
3927 if (IsTailCall) {
3928 MFI.setHasTailCall();
3929 unsigned OPC = AMDGPUISD::TC_RETURN;
3930 switch (CallConv) {
3933 break;
3937 break;
3938 }
3939
3940 return DAG.getNode(OPC, DL, NodeTys, Ops);
3941 }
3942
3943 // Returns a chain and a flag for retval copy to use.
3944 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
3945 Chain = Call.getValue(0);
3946 InGlue = Call.getValue(1);
3947
3948 uint64_t CalleePopBytes = NumBytes;
3949 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
3950 if (!Ins.empty())
3951 InGlue = Chain.getValue(1);
3952
3953 // Handle result values, copying them out of physregs into vregs that we
3954 // return.
3955 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
3956 InVals, /*IsThisReturn=*/false, SDValue());
3957}
3958
3959// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
3960// except for applying the wave size scale to the increment amount.
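// A short worked example of that scaling (illustrative only, assuming a
// wave64 target where getWavefrontSizeLog2() == 6): a request for 16 bytes
// per lane becomes
//
//   ScaledSize = Size << WavefrontSizeLog2   // 16 << 6 == 1024 bytes
//
// because the stack pointer tracks the combined, swizzled scratch allocation
// of every lane in the wave, so SP is adjusted by 1024 rather than 16.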
3962 SDValue Op, SelectionDAG &DAG) const {
3963 const MachineFunction &MF = DAG.getMachineFunction();
3965
3966 SDLoc dl(Op);
3967 EVT VT = Op.getValueType();
3968 SDValue Tmp1 = Op;
3969 SDValue Tmp2 = Op.getValue(1);
3970 SDValue Tmp3 = Op.getOperand(2);
3971 SDValue Chain = Tmp1.getOperand(0);
3972
3973 Register SPReg = Info->getStackPtrOffsetReg();
3974
3975 // Chain the dynamic stack allocation so that it doesn't modify the stack
3976 // pointer when other instructions are using the stack.
3977 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
3978
3979 SDValue Size = Tmp2.getOperand(1);
3980 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
3981 Chain = SP.getValue(1);
3982 MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
3983 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
3984 unsigned Opc =
3987
3988 SDValue ScaledSize = DAG.getNode(
3989 ISD::SHL, dl, VT, Size,
3990 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
3991
3992 Align StackAlign = TFL->getStackAlign();
3993 Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
3994 if (Alignment && *Alignment > StackAlign) {
3995 Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
3996 DAG.getConstant(-(uint64_t)Alignment->value()
3997 << Subtarget->getWavefrontSizeLog2(),
3998 dl, VT));
3999 }
4000
4001 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
4002 Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4003
4004 return DAG.getMergeValues({Tmp1, Tmp2}, dl);
4005}
4006
4008 SelectionDAG &DAG) const {
4009 // We only handle constant sizes here to allow non-entry block, static sized
4010 // allocas. A truly dynamic value is more difficult to support because we
4011 // don't know if the size value is uniform or not. If the size isn't uniform,
4012 // we would need to do a wave reduction to get the maximum size to know how
4013 // much to increment the uniform stack pointer.
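  // For example, if the lanes of a wave were to request { 8, 64, 16, ... }
  // bytes, the single wave-uniform stack pointer would have to advance by the
  // wave-wide maximum (64 here, then wave-size scaled) so that every lane's
  // slot fits; computing that maximum is the missing wave reduction.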
4014 SDValue Size = Op.getOperand(1);
4015 if (isa<ConstantSDNode>(Size))
4016 return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
4017
4019}
4020
4022 if (Op.getValueType() != MVT::i32)
4023 return Op; // Defer to cannot select error.
4024
4026 SDLoc SL(Op);
4027
4028 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4029
4030 // Convert from wave uniform to swizzled vector address. This should protect
4031 // from any edge cases where the stacksave result isn't directly used with
4032 // stackrestore.
4033 SDValue VectorAddress =
4034 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4035 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4036}
4037
4039 SelectionDAG &DAG) const {
4040 SDLoc SL(Op);
4041 assert(Op.getValueType() == MVT::i32);
4042
4043 uint32_t BothRoundHwReg =
4045 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4046
4047 SDValue IntrinID =
4048 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4049 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4050 Op.getOperand(0), IntrinID, GetRoundBothImm);
4051
4052 // There are two rounding modes, one for f32 and one for f64/f16. We only
4053 // report in the standard value range if both are the same.
4054 //
4055 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4056 // ties away from zero is not supported, and the other values are rotated by
4057 // 1.
4058 //
4059 // If the two rounding modes are not the same, report a target defined value.
4060
4061 // Mode register rounding mode fields:
4062 //
4063 // [1:0] Single-precision round mode.
4064 // [3:2] Double/Half-precision round mode.
4065 //
4066 // 0 = nearest even, 1 = +infinity, 2 = -infinity, 3 = toward zero.
4067 //
4068 //                 Hardware   Spec
4069 //  Toward-0          3        0
4070 //  Nearest Even      0        1
4071 //  +Inf              1        2
4072 //  -Inf              2        3
4073 //  NearestAway0     N/A       4
4074 //
4075 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4076 // table we can index by the raw hardware mode.
4077 //
4078 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
4079
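  // As a concrete example of the lookup (the entry values themselves come
  // from FltRoundConversionTable): if MODE reads back 0b0101, i.e. both the
  // f32 and f64/f16 fields are set to +infinity (hardware value 1), the code
  // below computes
  //
  //   (FltRoundConversionTable >> (5 * 4)) & 0xf
  //
  // and that 4-bit entry holds the standard FLT_ROUNDS value for +infinity
  // (2, per the Spec column above). Entries >= 4 denote mixed-mode cases and
  // are shifted up by 4 into the target-defined range further below.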
4080 SDValue BitTable =
4082
4083 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4084 SDValue RoundModeTimesNumBits =
4085 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4086
4087 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4088 // knew only one mode was demanded.
4089 SDValue TableValue =
4090 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4091 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4092
4093 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4094 SDValue TableEntry =
4095 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4096
4097 // There's a gap between the 4-bit encoded table values and the actual enum
4098 // values, so offset if it's an extended value.
4099 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4100 SDValue IsStandardValue =
4101 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4102 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4103 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4104 TableEntry, EnumOffset);
4105
4106 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4107}
4108
4110 SelectionDAG &DAG) const {
4111 SDLoc SL(Op);
4112
4113 SDValue NewMode = Op.getOperand(1);
4114 assert(NewMode.getValueType() == MVT::i32);
4115
4116 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4117 // hardware MODE.fp_round values.
4118 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4119 uint32_t ClampedVal = std::min(
4120 static_cast<uint32_t>(ConstMode->getZExtValue()),
4122 NewMode = DAG.getConstant(
4123 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4124 } else {
4125 // If we know the input can only be one of the supported standard modes in
4126 // the range 0-3, we can use a simplified mapping to hardware values.
4127 KnownBits KB = DAG.computeKnownBits(NewMode);
4128 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4129 // The supported standard values are 0-3. The extended values start at 8. We
4130 // need to offset by 4 if the value is in the extended range.
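  // For example, if NewMode is known to be of the form (x & 3), KnownBits
  // reports at least 30 leading zeroes, so the value is provably in [0, 3]
  // and only the low 16 bits of the table (4 entries x 4 bits) can ever be
  // selected.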
4131
4132 if (UseReducedTable) {
4133 // Only the low 16 bits (4 entries x 4 bits) are needed; use a 32-bit table.
4134 SDValue BitTable = DAG.getConstant(
4135 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4136
4137 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4138 SDValue RoundModeTimesNumBits =
4139 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4140
4141 NewMode =
4142 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4143
4144 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4145 // the table extracted bits into inline immediates.
4146 } else {
4147 // table_index = umin(value, value - 4)
4148 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
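      // For example, a standard value of 2 gives 2 - 4 == 0xfffffffe, so
      // umin(2, 0xfffffffe) keeps index 2, while an extended value of 9 gives
      // umin(9, 5) == 5, folding the extended range (which starts at 8) onto
      // table indices 4 and up.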
4149 SDValue BitTable =
4151
4152 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4153 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4154 SDValue IndexVal =
4155 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4156
4157 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4158 SDValue RoundModeTimesNumBits =
4159 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4160
4161 SDValue TableValue =
4162 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4163 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4164
4165 // No need to mask out the high bits since the setreg will ignore them
4166 // anyway.
4167 NewMode = TruncTable;
4168 }
4169
4170 // Insert a readfirstlane in case the value is a VGPR. We could do this
4171 // earlier and keep more operations scalar, but that interferes with
4172 // combining the source.
4173 SDValue ReadFirstLaneID =
4174 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4175 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4176 ReadFirstLaneID, NewMode);
4177 }
4178
4179 // N.B. The setreg will be later folded into s_round_mode on supported
4180 // targets.
4181 SDValue IntrinID =
4182 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4183 uint32_t BothRoundHwReg =
4185 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4186
4187 SDValue SetReg =
4188 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4189 IntrinID, RoundBothImm, NewMode);
4190
4191 return SetReg;
4192}
4193
4195 if (Op->isDivergent())
4196 return SDValue();
4197
4198 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4203 break;
4204 default:
4205 return SDValue();
4206 }
4207
4208 return Op;
4209}
4210
4211// Work around DAG legality rules only based on the result type.
4213 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4214 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4215 EVT SrcVT = Src.getValueType();
4216
4217 if (SrcVT.getScalarType() != MVT::bf16)
4218 return Op;
4219
4220 SDLoc SL(Op);
4221 SDValue BitCast =
4222 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4223
4224 EVT DstVT = Op.getValueType();
4225 if (IsStrict)
4226 llvm_unreachable("Need STRICT_BF16_TO_FP");
4227
4228 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4229}
4230
4232 SDLoc SL(Op);
4233 if (Op.getValueType() != MVT::i64)
4234 return Op;
4235
4236 uint32_t ModeHwReg =
4238 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4239 uint32_t TrapHwReg =
4241 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4242
4243 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4244 SDValue IntrinID =
4245 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4246 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4247 Op.getOperand(0), IntrinID, ModeHwRegImm);
4248 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4249 Op.getOperand(0), IntrinID, TrapHwRegImm);
4250 SDValue TokenReg =
4251 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4252 GetTrapReg.getValue(1));
4253
4254 SDValue CvtPtr =
4255 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4256 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4257
4258 return DAG.getMergeValues({Result, TokenReg}, SL);
4259}
4260
4262 SDLoc SL(Op);
4263 if (Op.getOperand(1).getValueType() != MVT::i64)
4264 return Op;
4265
4266 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4267 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4268 DAG.getConstant(0, SL, MVT::i32));
4269 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4270 DAG.getConstant(1, SL, MVT::i32));
4271
4272 SDValue ReadFirstLaneID =
4273 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4274 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4275 ReadFirstLaneID, NewModeReg);
4276 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4277 ReadFirstLaneID, NewTrapReg);
4278
4279 unsigned ModeHwReg =
4281 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4282 unsigned TrapHwReg =
4284 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4285
4286 SDValue IntrinID =
4287 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4288 SDValue SetModeReg =
4289 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4290 IntrinID, ModeHwRegImm, NewModeReg);
4291 SDValue SetTrapReg =
4292 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4293 IntrinID, TrapHwRegImm, NewTrapReg);
4294 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4295}
4296
4298 const MachineFunction &MF) const {
4300 .Case("m0", AMDGPU::M0)
4301 .Case("exec", AMDGPU::EXEC)
4302 .Case("exec_lo", AMDGPU::EXEC_LO)
4303 .Case("exec_hi", AMDGPU::EXEC_HI)
4304 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4305 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4306 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4307 .Default(Register());
4308
4309 if (Reg == AMDGPU::NoRegister) {
4310 report_fatal_error(Twine("invalid register name \""
4311 + StringRef(RegName) + "\"."));
4312
4313 }
4314
4315 if (!Subtarget->hasFlatScrRegister() &&
4316 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4317 report_fatal_error(Twine("invalid register \""
4318 + StringRef(RegName) + "\" for subtarget."));
4319 }
4320
4321 switch (Reg) {
4322 case AMDGPU::M0:
4323 case AMDGPU::EXEC_LO:
4324 case AMDGPU::EXEC_HI:
4325 case AMDGPU::FLAT_SCR_LO:
4326 case AMDGPU::FLAT_SCR_HI:
4327 if (VT.getSizeInBits() == 32)
4328 return Reg;
4329 break;
4330 case AMDGPU::EXEC:
4331 case AMDGPU::FLAT_SCR:
4332 if (VT.getSizeInBits() == 64)
4333 return Reg;
4334 break;
4335 default:
4336 llvm_unreachable("missing register type checking");
4337 }
4338
4339 report_fatal_error(Twine("invalid type for register \""
4340 + StringRef(RegName) + "\"."));
4341}
4342
4343// If kill is not the last instruction, split the block so kill is always a
4344// proper terminator.
4347 MachineBasicBlock *BB) const {
4348 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
4350 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4351 return SplitBB;
4352}
4353
4354// Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
4355// \p MI will be the only instruction in the loop body block. Otherwise, it will
4356// be the first instruction in the remainder block.
4357//
4358/// \returns { LoopBody, Remainder }
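// In terms of control flow, with \p InstInLoop set the original block
//
//   MBB:  ...[before]... ; MI ; ...[rest]...
//
// is rewritten (sketch only) as
//
//   MBB:          ...[before]...   -> LoopBB
//   LoopBB:       MI               -> LoopBB, RemainderBB
//   RemainderBB:  ...[rest]...     -> MBB's old successors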
4359static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4363
4364 // To insert the loop we need to split the block. Move everything after this
4365 // point to a new block, and insert a new empty block between the two.
4367 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4369 ++MBBI;
4370
4371 MF->insert(MBBI, LoopBB);
4372 MF->insert(MBBI, RemainderBB);
4373
4374 LoopBB->addSuccessor(LoopBB);
4375 LoopBB->addSuccessor(RemainderBB);
4376
4377 // Move the rest of the block into a new block.
4378 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4379
4380 if (InstInLoop) {
4381 auto Next = std::next(I);
4382
4383 // Move instruction to loop body.
4384 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4385
4386 // Move the rest of the block.
4387 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4388 } else {
4389 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4390 }
4391
4392 MBB.addSuccessor(LoopBB);
4393
4394 return std::pair(LoopBB, RemainderBB);
4395}
4396
4397/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4399 MachineBasicBlock *MBB = MI.getParent();
4401 auto I = MI.getIterator();
4402 auto E = std::next(I);
4403
4404 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4405 .addImm(0);
4406
4407 MIBundleBuilder Bundler(*MBB, I, E);
4408 finalizeBundle(*MBB, Bundler.begin());
4409}
4410
4413 MachineBasicBlock *BB) const {
4414 const DebugLoc &DL = MI.getDebugLoc();
4415
4417
4418 MachineBasicBlock *LoopBB;
4419 MachineBasicBlock *RemainderBB;
4421
4422 // Apparently kill flags are only valid if the def is in the same block?
4423 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4424 Src->setIsKill(false);
4425
4426 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
4427
4428 MachineBasicBlock::iterator I = LoopBB->end();
4429
4430 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4432
4433 // Clear TRAP_STS.MEM_VIOL
4434 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4435 .addImm(0)
4436 .addImm(EncodedReg);
4437
4439
4440 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4441
4442 // Load and check TRAP_STS.MEM_VIOL
4443 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4444 .addImm(EncodedReg);
4445
4446 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4447 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4448 .addReg(Reg, RegState::Kill)
4449 .addImm(0);
4450 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4451 .addMBB(LoopBB);
4452
4453 return RemainderBB;
4454}
4455
4456// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4457// wavefront. If the value is uniform and just happens to be in a VGPR, this
4458// will only do one iteration. In the worst case, this will loop 64 times.
4459//
4460// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
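// In rough pseudocode (for the !UseGPRIdxMode case with Offset == 0), the
// emitted loop is:
//
//   loop:
//     CurrentIdx = v_readfirstlane_b32 Idx
//     Cond       = v_cmp_eq_u32 CurrentIdx, Idx   ; lanes sharing this index
//     NewExec    = s_and_saveexec Cond            ; restrict exec to them
//     m0         = CurrentIdx
//     ...indexed move inserted here by the caller...
//     exec       = s_xor exec, NewExec            ; retire the handled lanes
//     s_cbranch_execnz loop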
4463 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4464 const DebugLoc &DL, const MachineOperand &Idx,
4465 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4466 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4467 Register &SGPRIdxReg) {
4468
4469 MachineFunction *MF = OrigBB.getParent();
4470 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4471 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4473
4474 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4475 Register PhiExec = MRI.createVirtualRegister(BoolRC);
4476 Register NewExec = MRI.createVirtualRegister(BoolRC);
4477 Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4478 Register CondReg = MRI.createVirtualRegister(BoolRC);
4479
4480 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4481 .addReg(InitReg)
4482 .addMBB(&OrigBB)
4483 .addReg(ResultReg)
4484 .addMBB(&LoopBB);
4485
4486 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4487 .addReg(InitSaveExecReg)
4488 .addMBB(&OrigBB)
4489 .addReg(NewExec)
4490 .addMBB(&LoopBB);
4491
4492 // Read the next variant <- also loop target.
4493 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4494 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
4495
4496 // Compare the just read M0 value to all possible Idx values.
4497 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4498 .addReg(CurrentIdxReg)
4499 .addReg(Idx.getReg(), 0, Idx.getSubReg());
4500
4501 // Update EXEC, save the original EXEC value to VCC.
4502 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4503 : AMDGPU::S_AND_SAVEEXEC_B64),
4504 NewExec)
4505 .addReg(CondReg, RegState::Kill);
4506
4507 MRI.setSimpleHint(NewExec, CondReg);
4508
4509 if (UseGPRIdxMode) {
4510 if (Offset == 0) {
4511 SGPRIdxReg = CurrentIdxReg;
4512 } else {
4513 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4514 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4515 .addReg(CurrentIdxReg, RegState::Kill)
4516 .addImm(Offset);
4517 }
4518 } else {
4519 // Move index from VCC into M0
4520 if (Offset == 0) {
4521 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4522 .addReg(CurrentIdxReg, RegState::Kill);
4523 } else {
4524 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4525 .addReg(CurrentIdxReg, RegState::Kill)
4526 .addImm(Offset);
4527 }
4528 }
4529
4530 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4531 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4532 MachineInstr *InsertPt =
4533 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4534 : AMDGPU::S_XOR_B64_term), Exec)
4535 .addReg(Exec)
4536 .addReg(NewExec);
4537
4538 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4539 // s_cbranch_scc0?
4540
4541 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4542 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4543 .addMBB(&LoopBB);
4544
4545 return InsertPt->getIterator();
4546}
4547
4548// This has slightly sub-optimal regalloc when the source vector is killed by
4549// the read. The register allocator does not understand that the kill is
4550// per-workitem, so the vector is kept alive for the whole loop and we end up
4551// not re-using a subregister from it, using 1 more VGPR than necessary. That
4552// VGPR was saved back when this was expanded after register allocation.
4555 unsigned InitResultReg, unsigned PhiReg, int Offset,
4556 bool UseGPRIdxMode, Register &SGPRIdxReg) {
4558 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4559 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4561 const DebugLoc &DL = MI.getDebugLoc();
4563
4564 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4565 Register DstReg = MI.getOperand(0).getReg();
4566 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4567 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
4568 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4569 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4570
4571 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4572
4573 // Save the EXEC mask
4574 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
4575 .addReg(Exec);
4576
4577 MachineBasicBlock *LoopBB;
4578 MachineBasicBlock *RemainderBB;
4579 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
4580
4581 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4582
4583 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
4584 InitResultReg, DstReg, PhiReg, TmpExec,
4585 Offset, UseGPRIdxMode, SGPRIdxReg);
4586
4587 MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock();
4589 ++MBBI;
4590 MF->insert(MBBI, LandingPad);
4591 LoopBB->removeSuccessor(RemainderBB);
4592 LandingPad->addSuccessor(RemainderBB);
4593 LoopBB->addSuccessor(LandingPad);
4594 MachineBasicBlock::iterator First = LandingPad->begin();
4595 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
4596 .addReg(SaveExec);
4597
4598 return InsPt;
4599}
4600
4601// Returns subreg index, offset
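// For example, when indexing a 128-bit (4 x 32-bit) vector register:
//   Offset  2 -> { sub2, 0 }   (constant part folded into the subregister)
//   Offset  5 -> { sub0, 5 }   (out of bounds, left to the dynamic index)
//   Offset -1 -> { sub0, -1 }  (likewise)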
4602static std::pair<unsigned, int>
4604 const TargetRegisterClass *SuperRC,
4605 unsigned VecReg,
4606 int Offset) {
4607 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
4608
4609 // Skip out of bounds offsets, or else we would end up using an undefined
4610 // register.
4611 if (Offset >= NumElts || Offset < 0)
4612 return std::pair(AMDGPU::sub0, Offset);
4613
4614 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
4615}
4616
4619 int Offset) {
4620 MachineBasicBlock *MBB = MI.getParent();
4621 const DebugLoc &DL = MI.getDebugLoc();
4623
4624 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4625
4626 assert(Idx->getReg() != AMDGPU::NoRegister);
4627
4628 if (Offset == 0) {
4629 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx);
4630 } else {
4631 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4632 .add(*Idx)
4633 .addImm(Offset);
4634 }
4635}
4636
4639 int Offset) {
4640 MachineBasicBlock *MBB = MI.getParent();
4641 const DebugLoc &DL = MI.getDebugLoc();
4643
4644 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4645
4646 if (Offset == 0)
4647 return Idx->getReg();
4648
4649 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4650 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
4651 .add(*Idx)
4652 .addImm(Offset);
4653 return Tmp;
4654}
4655
4658 const GCNSubtarget &ST) {
4659 const SIInstrInfo *TII = ST.getInstrInfo();
4660 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4663
4664 Register Dst = MI.getOperand(0).getReg();
4665 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4666 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
4667 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4668
4669 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
4670 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4671
4672 unsigned SubReg;
4673 std::tie(SubReg, Offset)
4674 = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
4675
4676 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4677
4678 // Check for a SGPR index.
4679 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4681 const DebugLoc &DL = MI.getDebugLoc();
4682
4683 if (UseGPRIdxMode) {
4684 // TODO: Look at the uses to avoid the copy. This may require rescheduling
4685 // to avoid interfering with other uses, so probably requires a new
4686 // optimization pass.
4688
4689 const MCInstrDesc &GPRIDXDesc =
4690 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4691 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4692 .addReg(SrcReg)
4693 .addReg(Idx)
4694 .addImm(SubReg);
4695 } else {
4697
4698 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4699 .addReg(SrcReg, 0, SubReg)
4700 .addReg(SrcReg, RegState::Implicit);
4701 }
4702
4703 MI.eraseFromParent();
4704
4705 return &MBB;
4706 }
4707
4708 // Control flow needs to be inserted if indexing with a VGPR.
4709 const DebugLoc &DL = MI.getDebugLoc();
4711
4712 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4713 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4714
4715 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
4716
4717 Register SGPRIdxReg;
4718 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
4719 UseGPRIdxMode, SGPRIdxReg);
4720
4721 MachineBasicBlock *LoopBB = InsPt->getParent();
4722
4723 if (UseGPRIdxMode) {
4724 const MCInstrDesc &GPRIDXDesc =
4725 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4726
4727 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4728 .addReg(SrcReg)
4729 .addReg(SGPRIdxReg)
4730 .addImm(SubReg);
4731 } else {
4732 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4733 .addReg(SrcReg, 0, SubReg)
4734 .addReg(SrcReg, RegState::Implicit);
4735 }
4736
4737 MI.eraseFromParent();
4738
4739 return LoopBB;
4740}
4741
4744 const GCNSubtarget &ST) {
4745 const SIInstrInfo *TII = ST.getInstrInfo();
4746 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4749
4750 Register Dst = MI.getOperand(0).getReg();
4751 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
4752 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4753 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
4754 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4755 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
4756 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4757
4758 // This can be an immediate, but will be folded later.
4759 assert(Val->getReg());
4760
4761 unsigned SubReg;
4762 std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
4763 SrcVec->getReg(),
4764 Offset);
4765 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4766
4767 if (Idx->getReg() == AMDGPU::NoRegister) {
4769 const DebugLoc &DL = MI.getDebugLoc();
4770
4771 assert(Offset == 0);
4772
4773 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
4774 .add(*SrcVec)
4775 .add(*Val)
4776 .addImm(SubReg);
4777
4778 MI.eraseFromParent();
4779 return &MBB;
4780 }
4781
4782 // Check for a SGPR index.
4783 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4785 const DebugLoc &DL = MI.getDebugLoc();
4786
4787 if (UseGPRIdxMode) {
4789
4790 const MCInstrDesc &GPRIDXDesc =
4791 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4792 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4793 .addReg(SrcVec->getReg())
4794 .add(*Val)
4795 .addReg(Idx)
4796 .addImm(SubReg);
4797 } else {
4799
4800 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4801 TRI.getRegSizeInBits(*VecRC), 32, false);
4802 BuildMI(MBB, I, DL, MovRelDesc, Dst)
4803 .addReg(SrcVec->getReg())
4804 .add(*Val)
4805 .addImm(SubReg);
4806 }
4807 MI.eraseFromParent();
4808 return &MBB;
4809 }
4810
4811 // Control flow needs to be inserted if indexing with a VGPR.
4812 if (Val->isReg())
4813 MRI.clearKillFlags(Val->getReg());
4814
4815 const DebugLoc &DL = MI.getDebugLoc();
4816
4817 Register PhiReg = MRI.createVirtualRegister(VecRC);
4818
4819 Register SGPRIdxReg;
4820 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
4821 UseGPRIdxMode, SGPRIdxReg);
4822 MachineBasicBlock *LoopBB = InsPt->getParent();
4823
4824 if (UseGPRIdxMode) {
4825 const MCInstrDesc &GPRIDXDesc =
4826 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4827
4828 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4829 .addReg(PhiReg)
4830 .add(*Val)
4831 .addReg(SGPRIdxReg)
4832 .addImm(SubReg);
4833 } else {
4834 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4835 TRI.getRegSizeInBits(*VecRC), 32, false);
4836 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
4837 .addReg(PhiReg)
4838 .add(*Val)
4839 .addImm(SubReg);
4840 }
4841
4842 MI.eraseFromParent();
4843 return LoopBB;
4844}
4845
4848 const GCNSubtarget &ST,
4849 unsigned Opc) {
4851 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4852 const DebugLoc &DL = MI.getDebugLoc();
4853 const SIInstrInfo *TII = ST.getInstrInfo();
4854
4855 // Reduction operations depend on whether the input operand is SGPR or VGPR.
4856 Register SrcReg = MI.getOperand(1).getReg();
4857 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
4858 Register DstReg = MI.getOperand(0).getReg();
4859 MachineBasicBlock *RetBB = nullptr;
4860 if (isSGPR) {
4861 // These operations with a uniform value (i.e. an SGPR input) are idempotent:
4862 // the reduced value is the same as the given SGPR.
4863 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
4864 RetBB = &BB;
4865 } else {
4866 // TODO: Implement DPP Strategy and switch based on immediate strategy
4867 // operand. For now, for all the cases (default, Iterative and DPP) we use
4868 // the iterative approach by default.
4869
4870 // To reduce the VGPR using the iterative approach, we need to iterate
4871 // over all the active lanes. Lowering consists of ComputeLoop,
4872 // which iterates over only the active lanes. We use a copy of the EXEC
4873 // register as the induction variable, and every active lane clears its bit
4874 // using bitset0 so that we get the next active lane on the next iteration.
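    // In pseudocode:
    //
    //   ActiveBits  = EXEC
    //   Accumulator = identity                    ; UINT32_MAX for umin, 0 for umax
    //   do {
    //     Lane        = s_ff1 ActiveBits          ; lowest remaining active lane
    //     LaneValue   = v_readlane_b32 Src, Lane
    //     Accumulator = op Accumulator, LaneValue ; s_min_u32 / s_max_u32
    //     ActiveBits  = s_bitset0 ActiveBits, Lane
    //   } while (ActiveBits != 0);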
4876 Register SrcReg = MI.getOperand(1).getReg();
4877
4878 // Create Control flow for loop
4879 // Split MI's Machine Basic block into For loop
4880 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
4881
4882 // Create virtual registers required for lowering.
4883 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4884 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
4885 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
4886 Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
4887
4888 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
4889 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4890 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4891
4892 Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
4893 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
4894
4895 bool IsWave32 = ST.isWave32();
4896 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4897 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4898
4899 // Create initial values of the induction variable from Exec and the
4900 // Accumulator, and insert a branch to the newly created ComputeLoop block.
4901 uint32_t InitalValue =
4902 (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
4903 auto TmpSReg =
4904 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
4905 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
4906 .addImm(InitalValue);
4907 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
4908
4909 // Start constructing ComputeLoop
4910 I = ComputeLoop->end();
4911 auto Accumulator =
4912 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
4913 .addReg(InitalValReg)
4914 .addMBB(&BB);
4915 auto ActiveBits =
4916 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
4917 .addReg(TmpSReg->getOperand(0).getReg())
4918 .addMBB(&BB);
4919
4920 // Perform the computations
4921 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
4922 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
4923 .addReg(ActiveBits->getOperand(0).getReg());
4924 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
4925 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
4926 .addReg(SrcReg)
4927 .addReg(FF1->getOperand(0).getReg());
4928 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
4929 .addReg(Accumulator->getOperand(0).getReg())
4930 .addReg(LaneValue->getOperand(0).getReg());
4931
4932 // Manipulate the iterator to get the next active lane
4933 unsigned BITSETOpc =
4934 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
4935 auto NewActiveBits =
4936 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
4937 .addReg(FF1->getOperand(0).getReg())
4938 .addReg(ActiveBits->getOperand(0).getReg());
4939
4940 // Add phi nodes
4941 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
4942 .addMBB(ComputeLoop);
4943 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
4944 .addMBB(ComputeLoop);
4945
4946 // Create the compare and the conditional branch back to ComputeLoop.
4947 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
4948 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
4949 .addReg(NewActiveBits->getOperand(0).getReg())
4950 .addImm(0);
4951 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4952 .addMBB(ComputeLoop);
4953
4954 RetBB = ComputeEnd;
4955 }
4956 MI.eraseFromParent();
4957 return RetBB;
4958}
4959
4961 MachineInstr &MI, MachineBasicBlock *BB) const {
4962
4964 MachineFunction *MF = BB->getParent();
4966
4967 switch (MI.getOpcode()) {
4968 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
4969 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
4970 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
4971 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
4972 case AMDGPU::S_UADDO_PSEUDO:
4973 case AMDGPU::S_USUBO_PSEUDO: {
4974 const DebugLoc &DL = MI.getDebugLoc();
4975 MachineOperand &Dest0 = MI.getOperand(0);
4976 MachineOperand &Dest1 = MI.getOperand(1);
4977 MachineOperand &Src0 = MI.getOperand(2);
4978 MachineOperand &Src1 = MI.getOperand(3);
4979
4980 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
4981 ? AMDGPU::S_ADD_I32
4982 : AMDGPU::S_SUB_I32;
4983 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()).add(Src0).add(Src1);
4984
4985 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
4986 .addImm(1)
4987 .addImm(0);
4988
4989 MI.eraseFromParent();
4990 return BB;
4991 }
4992 case AMDGPU::S_ADD_U64_PSEUDO:
4993 case AMDGPU::S_SUB_U64_PSEUDO: {
4994 // For targets older than GFX12, we emit a sequence of 32-bit operations.
4995 // For GFX12, we emit s_add_u64 and s_sub_u64.
4996 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4998 const DebugLoc &DL = MI.getDebugLoc();
4999 MachineOperand &Dest = MI.getOperand(0);
5000 MachineOperand &Src0 = MI.getOperand(1);
5001 MachineOperand &Src1 = MI.getOperand(2);
5002 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5003 if (Subtarget->hasScalarAddSub64()) {
5004 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5005 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5006 .add(Src0)
5007 .add(Src1);
5008 } else {
5009 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5010 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5011
5012 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5013 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5014
5015 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5016 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5017 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5018 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5019
5020 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5021 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5022 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5023 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5024
5025 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5026 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5027 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5028 .add(Src0Sub0)
5029 .add(Src1Sub0);
5030 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5031 .add(Src0Sub1)
5032 .add(Src1Sub1);
5033 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5034 .addReg(DestSub0)
5035 .addImm(AMDGPU::sub0)
5036 .addReg(DestSub1)
5037 .addImm(AMDGPU::sub1);
5038 }
5039 MI.eraseFromParent();
5040 return BB;
5041 }
5042 case AMDGPU::V_ADD_U64_PSEUDO:
5043 case AMDGPU::V_SUB_U64_PSEUDO: {
5045 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5046 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5047 const DebugLoc &DL = MI.getDebugLoc();
5048
5049 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5050
5051 MachineOperand &Dest = MI.getOperand(0);
5052 MachineOperand &Src0 = MI.getOperand(1);
5053 MachineOperand &Src1 = MI.getOperand(2);
5054
5055 if (IsAdd && ST.hasLshlAddB64()) {
5056 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
5057 Dest.getReg())
5058 .add(Src0)
5059 .addImm(0)
5060 .add(Src1);
5061 TII->legalizeOperands(*Add);
5062 MI.eraseFromParent();
5063 return BB;
5064 }
5065
5066 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5067
5068 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5069 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5070
5071 Register CarryReg = MRI.createVirtualRegister(CarryRC);
5072 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
5073
5074 const TargetRegisterClass *Src0RC = Src0.isReg()
5075 ? MRI.getRegClass(Src0.getReg())
5076 : &AMDGPU::VReg_64RegClass;
5077 const TargetRegisterClass *Src1RC = Src1.isReg()
5078 ? MRI.getRegClass(Src1.getReg())
5079 : &AMDGPU::VReg_64RegClass;
5080
5081 const TargetRegisterClass *Src0SubRC =
5082 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5083 const TargetRegisterClass *Src1SubRC =
5084 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5085
5086 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
5087 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5088 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
5089 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5090
5091 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
5092 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5093 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
5094 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5095
5096 unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5097 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5098 .addReg(CarryReg, RegState::Define)
5099 .add(SrcReg0Sub0)
5100 .add(SrcReg1Sub0)
5101 .addImm(0); // clamp bit
5102
5103 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5104 MachineInstr *HiHalf =
5105 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5106 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
5107 .add(SrcReg0Sub1)
5108 .add(SrcReg1Sub1)
5109 .addReg(CarryReg, RegState::Kill)
5110 .addImm(0); // clamp bit
5111
5112 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5113 .addReg(DestSub0)
5114 .addImm(AMDGPU::sub0)
5115 .addReg(DestSub1)
5116 .addImm(AMDGPU::sub1);
5117 TII->legalizeOperands(*LoHalf);
5118 TII->legalizeOperands(*HiHalf);
5119 MI.eraseFromParent();
5120 return BB;
5121 }
5122 case AMDGPU::S_ADD_CO_PSEUDO:
5123 case AMDGPU::S_SUB_CO_PSEUDO: {
5124 // This pseudo has a chance to be selected
5125 // only from a uniform add/subcarry node. All the VGPR operands
5126 // are therefore assumed to be splat vectors.
5128 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5129 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5131 const DebugLoc &DL = MI.getDebugLoc();
5132 MachineOperand &Dest = MI.getOperand(0);
5133 MachineOperand &CarryDest = MI.getOperand(1);
5134 MachineOperand &Src0 = MI.getOperand(2);
5135 MachineOperand &Src1 = MI.getOperand(3);
5136 MachineOperand &Src2 = MI.getOperand(4);
5137 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5138 ? AMDGPU::S_ADDC_U32
5139 : AMDGPU::S_SUBB_U32;
5140 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5141 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5142 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5143 .addReg(Src0.getReg());
5144 Src0.setReg(RegOp0);
5145 }
5146 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5147 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5148 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5149 .addReg(Src1.getReg());
5150 Src1.setReg(RegOp1);
5151 }
5152 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5153 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
5154 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5155 .addReg(Src2.getReg());
5156 Src2.setReg(RegOp2);
5157 }
5158
5159 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
5160 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
5161 assert(WaveSize == 64 || WaveSize == 32);
5162
5163 if (WaveSize == 64) {
5164 if (ST.hasScalarCompareEq64()) {
5165 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
5166 .addReg(Src2.getReg())
5167 .addImm(0);
5168 } else {
5169 const TargetRegisterClass *SubRC =
5170 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5171 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5172 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5173 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5174 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5175 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5176
5177 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
5178 .add(Src2Sub0)
5179 .add(Src2Sub1);
5180
5181 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5182 .addReg(Src2_32, RegState::Kill)
5183 .addImm(0);
5184 }
5185 } else {
5186 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5187 .addReg(Src2.getReg())
5188 .addImm(0);
5189 }
5190
5191 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
5192
5193 unsigned SelOpc =
5194 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5195
5196 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
5197 .addImm(-1)
5198 .addImm(0);
5199
5200 MI.eraseFromParent();
5201 return BB;
5202 }
5203 case AMDGPU::SI_INIT_M0: {
5204 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5205 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5206 .add(MI.getOperand(0));
5207 MI.eraseFromParent();
5208 return BB;
5209 }
5210 case AMDGPU::GET_GROUPSTATICSIZE: {
5211 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
5212 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
5213 DebugLoc DL = MI.getDebugLoc();
5214 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
5215 .add(MI.getOperand(0))
5216 .addImm(MFI->getLDSSize());
5217 MI.eraseFromParent();
5218 return BB;
5219 }
5220 case AMDGPU::GET_SHADERCYCLESHILO: {
5223 const DebugLoc &DL = MI.getDebugLoc();
5224 // The algorithm is:
5225 //
5226 // hi1 = getreg(SHADER_CYCLES_HI)
5227 // lo1 = getreg(SHADER_CYCLES_LO)
5228 // hi2 = getreg(SHADER_CYCLES_HI)
5229 //
5230 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
5231 // Otherwise there was overflow and the result is hi2:0. In both cases the
5232 // result should represent the actual time at some point during the sequence
5233 // of three getregs.
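    // For example, if the 64-bit counter rolls over from 0x00000001_ffffffff
    // to 0x00000002_00000000 between the first and third reads, then
    // hi1 (1) != hi2 (2) and the result is 0x00000002_00000000, a value the
    // counter genuinely held at the instant of the rollover.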
5234 using namespace AMDGPU::Hwreg;
5235 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5236 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
5237 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5238 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5239 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
5240 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
5241 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5242 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
5243 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5244 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
5245 .addReg(RegHi1)
5246 .addReg(RegHi2);
5247 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5248 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
5249 .addReg(RegLo1)
5250 .addImm(0);
5251 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
5252 .add(MI.getOperand(0))
5253 .addReg(RegLo)
5254 .addImm(AMDGPU::sub0)
5255 .addReg(RegHi2)
5256 .addImm(AMDGPU::sub1);
5257 MI.eraseFromParent();
5258 return BB;
5259 }
5260 case AMDGPU::SI_INDIRECT_SRC_V1:
5261 case AMDGPU::SI_INDIRECT_SRC_V2:
5262 case AMDGPU::SI_INDIRECT_SRC_V4:
5263 case AMDGPU::SI_INDIRECT_SRC_V8:
5264 case AMDGPU::SI_INDIRECT_SRC_V9:
5265 case AMDGPU::SI_INDIRECT_SRC_V10:
5266 case AMDGPU::SI_INDIRECT_SRC_V11:
5267 case AMDGPU::SI_INDIRECT_SRC_V12:
5268 case AMDGPU::SI_INDIRECT_SRC_V16:
5269 case AMDGPU::SI_INDIRECT_SRC_V32:
5270 return emitIndirectSrc(MI, *BB, *getSubtarget());
5271 case AMDGPU::SI_INDIRECT_DST_V1:
5272 case AMDGPU::SI_INDIRECT_DST_V2:
5273 case AMDGPU::SI_INDIRECT_DST_V4:
5274 case AMDGPU::SI_INDIRECT_DST_V8:
5275 case AMDGPU::SI_INDIRECT_DST_V9:
5276 case AMDGPU::SI_INDIRECT_DST_V10:
5277 case AMDGPU::SI_INDIRECT_DST_V11:
5278 case AMDGPU::SI_INDIRECT_DST_V12:
5279 case AMDGPU::SI_INDIRECT_DST_V16:
5280 case AMDGPU::SI_INDIRECT_DST_V32:
5281 return emitIndirectDst(MI, *BB, *getSubtarget());
5282 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5283 case AMDGPU::SI_KILL_I1_PSEUDO:
5284 return splitKillBlock(MI, BB);
5285 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5287 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5288 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5289
5290 Register Dst = MI.getOperand(0).getReg();
5291 const MachineOperand &Src0 = MI.getOperand(1);
5292 const MachineOperand &Src1 = MI.getOperand(2);
5293 const DebugLoc &DL = MI.getDebugLoc();
5294 Register SrcCond = MI.getOperand(3).getReg();
5295
5296 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5297 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5298 const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5299 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5300
5301 const TargetRegisterClass *Src0RC = Src0.isReg()
5302 ? MRI.getRegClass(Src0.getReg())
5303 : &AMDGPU::VReg_64RegClass;
5304 const TargetRegisterClass *Src1RC = Src1.isReg()
5305 ? MRI.getRegClass(Src1.getReg())
5306 : &AMDGPU::VReg_64RegClass;
5307
5308 const TargetRegisterClass *Src0SubRC =
5309 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5310 const TargetRegisterClass *Src1SubRC =
5311 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5312
5313 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5314 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5315 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5316 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5317
5318 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5319 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5320 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5321 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5322
5323 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
5324 .addReg(SrcCond);
5325 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5326 .addImm(0)
5327 .add(Src0Sub0)
5328 .addImm(0)
5329 .add(Src1Sub0)
5330 .addReg(SrcCondCopy);
5331 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5332 .addImm(0)
5333 .add(Src0Sub1)
5334 .addImm(0)
5335 .add(Src1Sub1)
5336 .addReg(SrcCondCopy);
5337
5338 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5339 .addReg(DstLo)
5340 .addImm(AMDGPU::sub0)
5341 .addReg(DstHi)
5342 .addImm(AMDGPU::sub1);
5343 MI.eraseFromParent();
5344 return BB;
5345 }
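  // For illustration, the V_CNDMASK_B64_PSEUDO expansion above computes, per
  // lane (a sketch of the semantics, not additional code):
  //   DstLo = SrcCond ? Src1.sub0 : Src0.sub0
  //   DstHi = SrcCond ? Src1.sub1 : Src0.sub1
  // and then recombines DstLo/DstHi into the 64-bit result with REG_SEQUENCE.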
5346 case AMDGPU::SI_BR_UNDEF: {
5348 const DebugLoc &DL = MI.getDebugLoc();
5349 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5350 .add(MI.getOperand(0));
5351 Br->getOperand(1).setIsUndef(); // read undef SCC
5352 MI.eraseFromParent();
5353 return BB;
5354 }
5355 case AMDGPU::ADJCALLSTACKUP:
5356 case AMDGPU::ADJCALLSTACKDOWN: {
5357    const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
5358    MachineInstrBuilder MIB(*MF, &MI);
5359 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
5360 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
5361 return BB;
5362 }
5363 case AMDGPU::SI_CALL_ISEL: {
5365 const DebugLoc &DL = MI.getDebugLoc();
5366
5367 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
5368
5369    MachineInstrBuilder MIB;
5370    MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5371
5372 for (const MachineOperand &MO : MI.operands())
5373 MIB.add(MO);
5374
5375 MIB.cloneMemRefs(MI);
5376 MI.eraseFromParent();
5377 return BB;
5378 }
5379 case AMDGPU::V_ADD_CO_U32_e32:
5380 case AMDGPU::V_SUB_CO_U32_e32:
5381 case AMDGPU::V_SUBREV_CO_U32_e32: {
5382 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
5383 const DebugLoc &DL = MI.getDebugLoc();
5384 unsigned Opc = MI.getOpcode();
5385
5386 bool NeedClampOperand = false;
5387 if (TII->pseudoToMCOpcode(Opc) == -1) {
5388 Opc = AMDGPU::getVOPe64(Opc);
5389 NeedClampOperand = true;
5390 }
5391
5392 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
5393 if (TII->isVOP3(*I)) {
5394 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5395 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5396 I.addReg(TRI->getVCC(), RegState::Define);
5397 }
5398 I.add(MI.getOperand(1))
5399 .add(MI.getOperand(2));
5400 if (NeedClampOperand)
5401 I.addImm(0); // clamp bit for e64 encoding
5402
5403 TII->legalizeOperands(*I);
5404
5405 MI.eraseFromParent();
5406 return BB;
5407 }
5408 case AMDGPU::V_ADDC_U32_e32:
5409 case AMDGPU::V_SUBB_U32_e32:
5410 case AMDGPU::V_SUBBREV_U32_e32:
5411 // These instructions have an implicit use of vcc which counts towards the
5412 // constant bus limit.
5413 TII->legalizeOperands(MI);
5414 return BB;
5415 case AMDGPU::DS_GWS_INIT:
5416 case AMDGPU::DS_GWS_SEMA_BR:
5417 case AMDGPU::DS_GWS_BARRIER:
5418 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
5419 [[fallthrough]];
5420 case AMDGPU::DS_GWS_SEMA_V:
5421 case AMDGPU::DS_GWS_SEMA_P:
5422 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5423 // A s_waitcnt 0 is required to be the instruction immediately following.
5424 if (getSubtarget()->hasGWSAutoReplay()) {
5425      bundleInstWithWaitcnt(MI);
5426      return BB;
5427 }
5428
5429 return emitGWSMemViolTestLoop(MI, BB);
5430 case AMDGPU::S_SETREG_B32: {
5431 // Try to optimize cases that only set the denormal mode or rounding mode.
5432 //
5433 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
5434 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
5435 // instead.
5436 //
5437 // FIXME: This could be predicates on the immediate, but tablegen doesn't
5438 // allow you to have a no side effect instruction in the output of a
5439 // sideeffecting pattern.
5440 auto [ID, Offset, Width] =
5441 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
5442    if (ID != AMDGPU::Hwreg::ID_MODE)
5443      return BB;
5444
5445 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5446 const unsigned SetMask = WidthMask << Offset;
5447
5448 if (getSubtarget()->hasDenormModeInst()) {
5449 unsigned SetDenormOp = 0;
5450 unsigned SetRoundOp = 0;
5451
5452 // The dedicated instructions can only set the whole denorm or round mode
5453 // at once, not a subset of bits in either.
5454      if (SetMask ==
5455          (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
5456 // If this fully sets both the round and denorm mode, emit the two
5457 // dedicated instructions for these.
5458 SetRoundOp = AMDGPU::S_ROUND_MODE;
5459 SetDenormOp = AMDGPU::S_DENORM_MODE;
5460 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
5461 SetRoundOp = AMDGPU::S_ROUND_MODE;
5462 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
5463 SetDenormOp = AMDGPU::S_DENORM_MODE;
5464 }
5465
5466 if (SetRoundOp || SetDenormOp) {
5468 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
5469 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5470 unsigned ImmVal = Def->getOperand(1).getImm();
5471 if (SetRoundOp) {
5472 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
5473 .addImm(ImmVal & 0xf);
5474
5475 // If we also have the denorm mode, get just the denorm mode bits.
5476 ImmVal >>= 4;
5477 }
5478
5479 if (SetDenormOp) {
5480 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
5481 .addImm(ImmVal & 0xf);
5482 }
5483
5484 MI.eraseFromParent();
5485 return BB;
5486 }
5487 }
5488 }
5489
5490    // If only FP bits are touched, use the no-side-effects pseudo.
5491 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
5492 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
5493 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
5494
5495 return BB;
5496 }
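  // For illustration, when the s_setreg_b32 immediate writes only the 4-bit FP
  // rounding field of the MODE register and the source is a constant, the case
  // above can emit a single "s_round_mode <imm>"; writing only the FP denormal
  // field becomes "s_denorm_mode <imm>" (a rough sketch of the transformation,
  // assuming a subtarget with hasDenormModeInst()).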
5497 case AMDGPU::S_INVERSE_BALLOT_U32:
5498 case AMDGPU::S_INVERSE_BALLOT_U64:
5499 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
5500 // necessary. After that they are equivalent to a COPY.
5501 MI.setDesc(TII->get(AMDGPU::COPY));
5502 return BB;
5503 case AMDGPU::ENDPGM_TRAP: {
5504 const DebugLoc &DL = MI.getDebugLoc();
5505 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
5506 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
5507 MI.addOperand(MachineOperand::CreateImm(0));
5508 return BB;
5509 }
5510
5511 // We need a block split to make the real endpgm a terminator. We also don't
5512 // want to break phis in successor blocks, so we can't just delete to the
5513 // end of the block.
5514
5515 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
5516    MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
5517    MF->push_back(TrapBB);
5518 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
5519 .addImm(0);
5520 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5521 .addMBB(TrapBB);
5522
5523 BB->addSuccessor(TrapBB);
5524 MI.eraseFromParent();
5525 return SplitBB;
5526 }
5527 case AMDGPU::SIMULATED_TRAP: {
5528 assert(Subtarget->hasPrivEnabledTrap2NopBug());
5530 MachineBasicBlock *SplitBB =
5531 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
5532 MI.eraseFromParent();
5533 return SplitBB;
5534 }
5535 default:
5536    if (TII->isImage(MI) || TII->isMUBUF(MI)) {
5537      if (!MI.mayStore())
5538        AddMemOpInit(MI);
5539      return BB;
5540    }
5541    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
5542  }
5543}
5544
5545bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
5546  // This currently forces unfolding various combinations of fsub into fma with
5547 // free fneg'd operands. As long as we have fast FMA (controlled by
5548 // isFMAFasterThanFMulAndFAdd), we should perform these.
5549
5550 // When fma is quarter rate, for f64 where add / sub are at best half rate,
5551 // most of these combines appear to be cycle neutral but save on instruction
5552 // count / code size.
5553 return true;
5554}
5555
5556bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; }
5557
5558EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
5559                                         EVT VT) const {
5560 if (!VT.isVector()) {
5561 return MVT::i1;
5562 }
5563 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
5564}
5565
5566MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
5567  // TODO: Should i16 be used always if legal? For now it would force VALU
5568 // shifts.
5569 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5570}
5571
5572LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const {
5573  return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
5574 ? Ty.changeElementSize(16)
5575 : Ty.changeElementSize(32);
5576}
5577
5578// Answering this is somewhat tricky and depends on the specific device, which
5579// may have different rates for fma or for all f64 operations.
5580//
5581// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
5582// regardless of which device (although the number of cycles differs between
5583// devices), so it is always profitable for f64.
5584//
5585// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
5586// only on full rate devices. Normally, we should prefer selecting v_mad_f32
5587// which we can always do even without fused FP ops since it returns the same
5588// result as the separate operations and since it is always full
5589// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
5590// however does not support denormals, so we do report fma as faster if we have
5591// a fast fma device and require denormals.
5592//
5593bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
5594                                                  EVT VT) const {
5595 VT = VT.getScalarType();
5596
5597 switch (VT.getSimpleVT().SimpleTy) {
5598 case MVT::f32: {
5599 // If mad is not available this depends only on if f32 fma is full rate.
5600 if (!Subtarget->hasMadMacF32Insts())
5601 return Subtarget->hasFastFMAF32();
5602
5603 // Otherwise f32 mad is always full rate and returns the same result as
5604 // the separate operations so should be preferred over fma.
5605    // However, it does not support denormals.
5606    if (!denormalModeIsFlushAllF32(MF))
5607      return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5608
5609 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
5610 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5611 }
5612 case MVT::f64:
5613 return true;
5614 case MVT::f16:
5615 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
5616 default:
5617 break;
5618 }
5619
5620 return false;
5621}
5622
5623bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
5624                                                  LLT Ty) const {
5625 switch (Ty.getScalarSizeInBits()) {
5626 case 16:
5627 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
5628 case 32:
5629 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
5630 case 64:
5631 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
5632 default:
5633 break;
5634 }
5635
5636 return false;
5637}
5638
5639bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const {
5640  if (!Ty.isScalar())
5641 return false;
5642
5643 if (Ty.getScalarSizeInBits() == 16)
5644 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
5645 if (Ty.getScalarSizeInBits() == 32)
5646 return Subtarget->hasMadMacF32Insts() &&
5647 denormalModeIsFlushAllF32(*MI.getMF());
5648
5649 return false;
5650}
5651
5652bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
5653                                   const SDNode *N) const {
5654 // TODO: Check future ftz flag
5655 // v_mad_f32/v_mac_f32 do not support denormals.
5656 EVT VT = N->getValueType(0);
5657 if (VT == MVT::f32)
5658    return Subtarget->hasMadMacF32Insts() &&
5659           denormalModeIsFlushAllF32(DAG.getMachineFunction());
5660  if (VT == MVT::f16) {
5661    return Subtarget->hasMadF16() &&
5662           denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
5663  }
5664
5665 return false;
5666}
5667
5668//===----------------------------------------------------------------------===//
5669// Custom DAG Lowering Operations
5670//===----------------------------------------------------------------------===//
5671
5672// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5673// wider vector type is legal.
5674SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
5675                                             SelectionDAG &DAG) const {
5676 unsigned Opc = Op.getOpcode();
5677 EVT VT = Op.getValueType();
5678 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5679 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5680 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5681 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5682
5683 SDValue Lo, Hi;
5684 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
5685
5686 SDLoc SL(Op);
5687 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
5688 Op->getFlags());
5689 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
5690 Op->getFlags());
5691
5692 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5693}
5694
5695// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5696// wider vector type is legal.
5697SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
5698                                              SelectionDAG &DAG) const {
5699 unsigned Opc = Op.getOpcode();
5700 EVT VT = Op.getValueType();
5701 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5702 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5703 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5704 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5705
5706 SDValue Lo0, Hi0;
5707 std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
5708 SDValue Lo1, Hi1;
5709 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
5710
5711 SDLoc SL(Op);
5712
5713 SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
5714 Op->getFlags());
5715 SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
5716 Op->getFlags());
5717
5718 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5719}
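// For illustration, a binary operation on a wide vector handled here, e.g.
//   (fadd v8f32 %a, %b)
// is rewritten as roughly
//   (concat_vectors (fadd v4f32 %a.lo, %b.lo), (fadd v4f32 %a.hi, %b.hi))
// instead of letting LegalizeDAG scalarize it into eight f32 adds.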
5720
5721SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
5722                                               SelectionDAG &DAG) const {
5723 unsigned Opc = Op.getOpcode();
5724 EVT VT = Op.getValueType();
5725 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
5726 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
5727 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5728 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
5729 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
5730 VT == MVT::v32bf16);
5731
5732 SDValue Lo0, Hi0;
5733 SDValue Op0 = Op.getOperand(0);
5734 std::tie(Lo0, Hi0) = Op0.getValueType().isVector()
5735 ? DAG.SplitVectorOperand(Op.getNode(), 0)
5736 : std::pair(Op0, Op0);
5737 SDValue Lo1, Hi1;
5738 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
5739 SDValue Lo2, Hi2;
5740 std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2);
5741
5742 SDLoc SL(Op);
5743 auto ResVT = DAG.GetSplitDestVTs(VT);
5744
5745 SDValue OpLo = DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2,
5746 Op->getFlags());
5747 SDValue OpHi = DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2,
5748 Op->getFlags());
5749
5750 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5751}
5752
5753
5754SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
5755  switch (Op.getOpcode()) {
5756 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
5757 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
5758 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
5759 case ISD::LOAD: {
5760 SDValue Result = LowerLOAD(Op, DAG);
5761 assert((!Result.getNode() ||
5762 Result.getNode()->getNumValues() == 2) &&
5763 "Load should return a value and a chain");
5764 return Result;
5765 }
5766 case ISD::FSQRT: {
5767 EVT VT = Op.getValueType();
5768 if (VT == MVT::f32)
5769 return lowerFSQRTF32(Op, DAG);
5770 if (VT == MVT::f64)
5771 return lowerFSQRTF64(Op, DAG);
5772 return SDValue();
5773 }
5774 case ISD::FSIN:
5775 case ISD::FCOS:
5776 return LowerTrig(Op, DAG);
5777 case ISD::SELECT: return LowerSELECT(Op, DAG);
5778 case ISD::FDIV: return LowerFDIV(Op, DAG);
5779 case ISD::FFREXP: return LowerFFREXP(Op, DAG);
5780 case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
5781 case ISD::STORE: return LowerSTORE(Op, DAG);
5782  case ISD::GlobalAddress: {
5783    MachineFunction &MF = DAG.getMachineFunction();
5784    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
5785    return LowerGlobalAddress(MFI, Op, DAG);
5786 }
5787 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
5788 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
5789 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
5790 case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
5791  case ISD::INSERT_SUBVECTOR:
5792    return lowerINSERT_SUBVECTOR(Op, DAG);
5793  case ISD::INSERT_VECTOR_ELT:
5794    return lowerINSERT_VECTOR_ELT(Op, DAG);
5795  case ISD::EXTRACT_VECTOR_ELT:
5796    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
5797  case ISD::VECTOR_SHUFFLE:
5798    return lowerVECTOR_SHUFFLE(Op, DAG);
5799  case ISD::SCALAR_TO_VECTOR:
5800    return lowerSCALAR_TO_VECTOR(Op, DAG);
5801 case ISD::BUILD_VECTOR:
5802 return lowerBUILD_VECTOR(Op, DAG);
5803  case ISD::FP_ROUND:
5804  case ISD::STRICT_FP_ROUND:
5805    return lowerFP_ROUND(Op, DAG);
5806 case ISD::FPTRUNC_ROUND: {
5807 unsigned Opc;
5808 SDLoc DL(Op);
5809
5810 if (Op.getOperand(0)->getValueType(0) != MVT::f32)
5811 return SDValue();
5812
5813 // Get the rounding mode from the last operand
5814 int RoundMode = Op.getConstantOperandVal(1);
5815    if (RoundMode == (int)RoundingMode::TowardPositive)
5816      Opc = AMDGPUISD::FPTRUNC_ROUND_UPWARD;
5817    else if (RoundMode == (int)RoundingMode::TowardNegative)
5818      Opc = AMDGPUISD::FPTRUNC_ROUND_DOWNWARD;
5819    else
5820 return SDValue();
5821
5822 return DAG.getNode(Opc, DL, Op.getNode()->getVTList(), Op->getOperand(0));
5823 }
5824 case ISD::TRAP:
5825 return lowerTRAP(Op, DAG);
5826 case ISD::DEBUGTRAP:
5827 return lowerDEBUGTRAP(Op, DAG);
5828 case ISD::ABS:
5829 case ISD::FABS:
5830 case ISD::FNEG:
5831 case ISD::FCANONICALIZE:
5832 case ISD::BSWAP:
5833 return splitUnaryVectorOp(Op, DAG);
5834 case ISD::FMINNUM:
5835 case ISD::FMAXNUM:
5836 return lowerFMINNUM_FMAXNUM(Op, DAG);
5837 case ISD::FLDEXP:
5838 case ISD::STRICT_FLDEXP:
5839 return lowerFLDEXP(Op, DAG);
5840 case ISD::FMA:
5841 return splitTernaryVectorOp(Op, DAG);
5842 case ISD::FP_TO_SINT:
5843 case ISD::FP_TO_UINT:
5844 return LowerFP_TO_INT(Op, DAG);
5845 case ISD::SHL:
5846 case ISD::SRA:
5847 case ISD::SRL:
5848 case ISD::ADD:
5849 case ISD::SUB:
5850 case ISD::SMIN:
5851 case ISD::SMAX:
5852 case ISD::UMIN:
5853 case ISD::UMAX:
5854 case ISD::FADD:
5855 case ISD::FMUL:
5856 case ISD::FMINNUM_IEEE:
5857 case ISD::FMAXNUM_IEEE:
5858 case ISD::FMINIMUM:
5859 case ISD::FMAXIMUM:
5860 case ISD::UADDSAT:
5861 case ISD::USUBSAT:
5862 case ISD::SADDSAT:
5863 case ISD::SSUBSAT:
5864 return splitBinaryVectorOp(Op, DAG);
5865 case ISD::MUL:
5866 return lowerMUL(Op, DAG);
5867 case ISD::SMULO:
5868 case ISD::UMULO:
5869 return lowerXMULO(Op, DAG);
5870 case ISD::SMUL_LOHI:
5871 case ISD::UMUL_LOHI:
5872    return lowerXMUL_LOHI(Op, DAG);
5873  case ISD::DYNAMIC_STACKALLOC:
5874    return LowerDYNAMIC_STACKALLOC(Op, DAG);
5875 case ISD::STACKSAVE:
5876 return LowerSTACKSAVE(Op, DAG);
5877 case ISD::GET_ROUNDING:
5878 return lowerGET_ROUNDING(Op, DAG);
5879 case ISD::SET_ROUNDING:
5880 return lowerSET_ROUNDING(Op, DAG);
5881 case ISD::PREFETCH:
5882 return lowerPREFETCH(Op, DAG);
5883  case ISD::FP_EXTEND:
5884  case ISD::STRICT_FP_EXTEND:
5885    return lowerFP_EXTEND(Op, DAG);
5886 case ISD::GET_FPENV:
5887 return lowerGET_FPENV(Op, DAG);
5888 case ISD::SET_FPENV:
5889 return lowerSET_FPENV(Op, DAG);
5890 }
5891 return SDValue();
5892}
5893
5894// Used for D16: Casts the result of an instruction into the right vector,
5895// packs values if loads return unpacked values.
5896static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
5897                                       const SDLoc &DL,
5898 SelectionDAG &DAG, bool Unpacked) {
5899 if (!LoadVT.isVector())
5900 return Result;
5901
5902 // Cast back to the original packed type or to a larger type that is a
5903  // multiple of 32 bits for D16. Widening the return type is required for
5904 // legalization.
5905 EVT FittingLoadVT = LoadVT;
5906 if ((LoadVT.getVectorNumElements() % 2) == 1) {
5907    FittingLoadVT =
5908        EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
5909                         LoadVT.getVectorNumElements() + 1);
5910 }
5911
5912 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
5913 // Truncate to v2i16/v4i16.
5914 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
5915
5916 // Workaround legalizer not scalarizing truncate after vector op
5917 // legalization but not creating intermediate vector trunc.
5918    SmallVector<SDValue, 4> Elts;
5919    DAG.ExtractVectorElements(Result, Elts);
5920 for (SDValue &Elt : Elts)
5921 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
5922
5923    // Pad illegal v1i16/v3f16 to v4i16
5924 if ((LoadVT.getVectorNumElements() % 2) == 1)
5925 Elts.push_back(DAG.getUNDEF(MVT::i16));
5926
5927 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
5928
5929 // Bitcast to original type (v2f16/v4f16).
5930 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
5931 }
5932
5933 // Cast back to the original packed type.
5934 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
5935}
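// For illustration (a sketch of the Unpacked path above): on a subtarget with
// unpacked D16 memory instructions a v4f16 load is returned as v4i32, one
// half-precision value per 32-bit lane; the helper truncates each lane to i16,
// rebuilds a v4i16, and bitcasts back to v4f16. On packed subtargets only the
// final bitcast is needed.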
5936
5937SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
5938 MemSDNode *M,
5939 SelectionDAG &DAG,
5940                                              ArrayRef<SDValue> Ops,
5941                                              bool IsIntrinsic) const {
5942 SDLoc DL(M);
5943
5944 bool Unpacked = Subtarget->hasUnpackedD16VMem();
5945 EVT LoadVT = M->getValueType(0);
5946
5947 EVT EquivLoadVT = LoadVT;
5948 if (LoadVT.isVector()) {
5949 if (Unpacked) {
5950 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
5951 LoadVT.getVectorNumElements());
5952 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
5953 // Widen v3f16 to legal type
5954      EquivLoadVT =
5955          EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
5956                           LoadVT.getVectorNumElements() + 1);
5957 }
5958 }
5959
5960 // Change from v4f16/v2f16 to EquivLoadVT.
5961 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
5962
5963  SDValue Load
5964      = DAG.getMemIntrinsicNode(
5965 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
5966 VTList, Ops, M->getMemoryVT(),
5967 M->getMemOperand());
5968
5969 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
5970
5971 return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
5972}
5973
5974SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
5975 SelectionDAG &DAG,
5976 ArrayRef<SDValue> Ops) const {
5977 SDLoc DL(M);
5978 EVT LoadVT = M->getValueType(0);
5979 EVT EltType = LoadVT.getScalarType();
5980 EVT IntVT = LoadVT.changeTypeToInteger();
5981
5982 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
5983
5984 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
5985 bool IsTFE = M->getNumValues() == 3;
5986
5987  unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
5988                                   : AMDGPUISD::BUFFER_LOAD_FORMAT)
5989                          : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
5990                                  : AMDGPUISD::BUFFER_LOAD;
5991
5992 if (IsD16) {
5993 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
5994 }
5995
5996 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
5997 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
5998 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
5999 IsTFE);
6000
6001 if (isTypeLegal(LoadVT)) {
6002 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6003 M->getMemOperand(), DAG);
6004 }
6005
6006 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
6007 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6008 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
6009 M->getMemOperand(), DAG);
6010 return DAG.getMergeValues(
6011 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
6012 DL);
6013}
6014
6015static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
6016                                  SDNode *N, SelectionDAG &DAG) {
6017 EVT VT = N->getValueType(0);
6018 unsigned CondCode = N->getConstantOperandVal(3);
6019 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
6020 return DAG.getUNDEF(VT);
6021
6022 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6023
6024 SDValue LHS = N->getOperand(1);
6025 SDValue RHS = N->getOperand(2);
6026
6027 SDLoc DL(N);
6028
6029 EVT CmpVT = LHS.getValueType();
6030 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
6031 unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
6032      ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6033    LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
6034 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
6035 }
6036
6037 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
6038
6039 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6040 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6041
6042 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
6043 DAG.getCondCode(CCOpcode));
6044 if (VT.bitsEq(CCVT))
6045 return SetCC;
6046 return DAG.getZExtOrTrunc(SetCC, DL, VT);
6047}
6048
6049static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
6050                                  SDNode *N, SelectionDAG &DAG) {
6051 EVT VT = N->getValueType(0);
6052
6053 unsigned CondCode = N->getConstantOperandVal(3);
6054 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
6055 return DAG.getUNDEF(VT);
6056
6057 SDValue Src0 = N->getOperand(1);
6058 SDValue Src1 = N->getOperand(2);
6059 EVT CmpVT = Src0.getValueType();
6060 SDLoc SL(N);
6061
6062 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
6063 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6064 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6065 }
6066
6067 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
6068 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
6069 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6070 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6071 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0,
6072 Src1, DAG.getCondCode(CCOpcode));
6073 if (VT.bitsEq(CCVT))
6074 return SetCC;
6075 return DAG.getZExtOrTrunc(SetCC, SL, VT);
6076}
6077
6079 SelectionDAG &DAG) {
6080 EVT VT = N->getValueType(0);
6081 SDValue Src = N->getOperand(1);
6082 SDLoc SL(N);
6083
6084 if (Src.getOpcode() == ISD::SETCC) {
6085 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
6086 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
6087 Src.getOperand(1), Src.getOperand(2));
6088 }
6089 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
6090 // (ballot 0) -> 0
6091 if (Arg->isZero())
6092 return DAG.getConstant(0, SL, VT);
6093
6094 // (ballot 1) -> EXEC/EXEC_LO
6095 if (Arg->isOne()) {
6096 Register Exec;
6097 if (VT.getScalarSizeInBits() == 32)
6098 Exec = AMDGPU::EXEC_LO;
6099 else if (VT.getScalarSizeInBits() == 64)
6100 Exec = AMDGPU::EXEC;
6101 else
6102 return SDValue();
6103
6104 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
6105 }
6106 }
6107
6108 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
6109 // ISD::SETNE)
6110 return DAG.getNode(
6111 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
6112 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
6113}
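// For illustration, on a wave64 target a ballot such as
//   %mask = call i64 @llvm.amdgcn.ballot.i64(i1 %cond)
// is lowered above to an AMDGPUISD::SETCC that yields the 64-bit lane mask,
// while the constant inputs fold to 0 or to a copy from EXEC/EXEC_LO.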
6114
6116 SelectionDAG &DAG) {
6117 EVT VT = N->getValueType(0);
6118 unsigned ValSize = VT.getSizeInBits();
6119 unsigned IID = N->getConstantOperandVal(0);
6120 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6121 IID == Intrinsic::amdgcn_permlanex16;
6122 SDLoc SL(N);
6123 MVT IntVT = MVT::getIntegerVT(ValSize);
6124
6125 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
6126 SDValue Src2, MVT ValT) -> SDValue {
6127    SmallVector<SDValue, 4> Operands;
6128    switch (IID) {
6129 case Intrinsic::amdgcn_permlane16:
6130 case Intrinsic::amdgcn_permlanex16:
6131 Operands.push_back(N->getOperand(6));
6132 Operands.push_back(N->getOperand(5));
6133 Operands.push_back(N->getOperand(4));
6134 [[fallthrough]];
6135 case Intrinsic::amdgcn_writelane:
6136 Operands.push_back(Src2);
6137 [[fallthrough]];
6138 case Intrinsic::amdgcn_readlane:
6139 Operands.push_back(Src1);
6140 [[fallthrough]];
6141 case Intrinsic::amdgcn_readfirstlane:
6142 case Intrinsic::amdgcn_permlane64:
6143 Operands.push_back(Src0);
6144 break;
6145 default:
6146 llvm_unreachable("unhandled lane op");
6147 }
6148
6149 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
6150 std::reverse(Operands.begin(), Operands.end());
6151
6152 if (SDNode *GL = N->getGluedNode()) {
6153 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6154 GL = GL->getOperand(0).getNode();
6155 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6156 SDValue(GL, 0)));
6157 }
6158
6159 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
6160 };
6161
6162 SDValue Src0 = N->getOperand(1);
6163 SDValue Src1, Src2;
6164 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6165 IsPermLane16) {
6166 Src1 = N->getOperand(2);
6167 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
6168 Src2 = N->getOperand(3);
6169 }
6170
6171 if (ValSize == 32) {
6172 // Already legal
6173 return SDValue();
6174 }
6175
6176 if (ValSize < 32) {
6177 bool IsFloat = VT.isFloatingPoint();
6178 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
6179 SL, MVT::i32);
6180
6181 if (IsPermLane16) {
6182 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
6183 SL, MVT::i32);
6184 }
6185
6186 if (IID == Intrinsic::amdgcn_writelane) {
6187 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
6188 SL, MVT::i32);
6189 }
6190
6191 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
6192 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
6193 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
6194 }
6195
6196 if (ValSize % 32 != 0)
6197 return SDValue();
6198
6199 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
6200 EVT VT = N->getValueType(0);
6201 unsigned NE = VT.getVectorNumElements();
6202 EVT EltVT = VT.getVectorElementType();
6203    SmallVector<SDValue, 8> Scalars;
6204    unsigned NumOperands = N->getNumOperands();
6205 SmallVector<SDValue, 4> Operands(NumOperands);
6206 SDNode *GL = N->getGluedNode();
6207
6208 // only handle convergencectrl_glue
6209    assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6210
6211 for (unsigned i = 0; i != NE; ++i) {
6212 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
6213 ++j) {
6214 SDValue Operand = N->getOperand(j);
6215 EVT OperandVT = Operand.getValueType();
6216 if (OperandVT.isVector()) {
6217 // A vector operand; extract a single element.
6218 EVT OperandEltVT = OperandVT.getVectorElementType();
6219 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
6220 Operand, DAG.getVectorIdxConstant(i, SL));
6221 } else {
6222 // A scalar operand; just use it as is.
6223 Operands[j] = Operand;
6224 }
6225 }
6226
6227 if (GL)
6228 Operands[NumOperands - 1] =
6229 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6230 SDValue(GL->getOperand(0).getNode(), 0));
6231
6232 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
6233 }
6234
6235 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
6236 return DAG.getBuildVector(VecVT, SL, Scalars);
6237 };
6238
6239 if (VT.isVector()) {
6240 switch (MVT::SimpleValueType EltTy =
6241                VT.getVectorElementType().getSimpleVT().SimpleTy) {
6242    case MVT::i32:
6243 case MVT::f32: {
6244 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
6245 return unrollLaneOp(LaneOp.getNode());
6246 }
6247 case MVT::i16:
6248 case MVT::f16:
6249 case MVT::bf16: {
6250 MVT SubVecVT = MVT::getVectorVT(EltTy, 2);
6251      SmallVector<SDValue, 4> Pieces;
6252      SDValue Src0SubVec, Src1SubVec, Src2SubVec;
6253 for (unsigned i = 0, EltIdx = 0; i < ValSize / 32; i++) {
6254 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
6255 DAG.getConstant(EltIdx, SL, MVT::i32));
6256
6257 if (IsPermLane16)
6258 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
6259 DAG.getConstant(EltIdx, SL, MVT::i32));
6260
6261 if (IID == Intrinsic::amdgcn_writelane)
6262 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
6263 DAG.getConstant(EltIdx, SL, MVT::i32));
6264
6265 Pieces.push_back(
6266 IsPermLane16
6267 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
6268 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
6269 EltIdx += 2;
6270 }
6271 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
6272 }
6273 default:
6274 // Handle all other cases by bitcasting to i32 vectors
6275 break;
6276 }
6277 }
6278
6279 MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
6280 Src0 = DAG.getBitcast(VecVT, Src0);
6281
6282 if (IsPermLane16)
6283 Src1 = DAG.getBitcast(VecVT, Src1);
6284
6285 if (IID == Intrinsic::amdgcn_writelane)
6286 Src2 = DAG.getBitcast(VecVT, Src2);
6287
6288 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
6289 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
6290 return DAG.getBitcast(VT, UnrolledLaneOp);
6291}
6292
6293void SITargetLowering::ReplaceNodeResults(SDNode *N,
6294                                          SmallVectorImpl<SDValue> &Results,
6295                                          SelectionDAG &DAG) const {
6296 switch (N->getOpcode()) {
6297  case ISD::INSERT_VECTOR_ELT: {
6298    if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
6299 Results.push_back(Res);
6300 return;
6301 }
6302  case ISD::EXTRACT_VECTOR_ELT: {
6303    if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
6304 Results.push_back(Res);
6305 return;
6306 }
6307  case ISD::INTRINSIC_WO_CHAIN: {
6308    unsigned IID = N->getConstantOperandVal(0);
6309 switch (IID) {
6310 case Intrinsic::amdgcn_make_buffer_rsrc:
6311 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
6312 return;
6313 case Intrinsic::amdgcn_cvt_pkrtz: {
6314 SDValue Src0 = N->getOperand(1);
6315 SDValue Src1 = N->getOperand(2);
6316 SDLoc SL(N);
6317 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
6318 Src0, Src1);
6319 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
6320 return;
6321 }
6322 case Intrinsic::amdgcn_cvt_pknorm_i16:
6323 case Intrinsic::amdgcn_cvt_pknorm_u16:
6324 case Intrinsic::amdgcn_cvt_pk_i16:
6325 case Intrinsic::amdgcn_cvt_pk_u16: {
6326 SDValue Src0 = N->getOperand(1);
6327 SDValue Src1 = N->getOperand(2);
6328 SDLoc SL(N);
6329 unsigned Opcode;
6330
6331      if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6332        Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
6333      else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6334        Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
6335      else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6336        Opcode = AMDGPUISD::CVT_PK_I16_I32;
6337      else
6338        Opcode = AMDGPUISD::CVT_PK_U16_U32;
6339
6340 EVT VT = N->getValueType(0);
6341 if (isTypeLegal(VT))
6342 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
6343 else {
6344 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
6345 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
6346 }
6347 return;
6348 }
6349 case Intrinsic::amdgcn_s_buffer_load: {
6350 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
6351 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
6352 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
6353 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
6354 // s_buffer_load_i8.
6355 if (!Subtarget->hasScalarSubwordLoads())
6356 return;
6357 SDValue Op = SDValue(N, 0);
6358 SDValue Rsrc = Op.getOperand(1);
6359 SDValue Offset = Op.getOperand(2);
6360 SDValue CachePolicy = Op.getOperand(3);
6361 EVT VT = Op.getValueType();
6362 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
6363 SDLoc DL(Op);
6365 const DataLayout &DataLayout = DAG.getDataLayout();
6366 Align Alignment =
6372 VT.getStoreSize(), Alignment);
6373 SDValue LoadVal;
6374 if (!Offset->isDivergent()) {
6375 SDValue Ops[] = {Rsrc, // source register
6376 Offset, CachePolicy};
6377 SDValue BufferLoad =
6379 DAG.getVTList(MVT::i32), Ops, VT, MMO);
6380 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
6381 } else {
6382 SDValue Ops[] = {
6383 DAG.getEntryNode(), // Chain
6384 Rsrc, // rsrc
6385 DAG.getConstant(0, DL, MVT::i32), // vindex
6386 {}, // voffset
6387 {}, // soffset
6388 {}, // offset
6389 CachePolicy, // cachepolicy
6390 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
6391 };
6392 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
6393 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
6394 }
6395 Results.push_back(LoadVal);
6396 return;
6397 }
6398 }
6399 break;
6400 }
6401  case ISD::INTRINSIC_W_CHAIN: {
6402    if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
6403 if (Res.getOpcode() == ISD::MERGE_VALUES) {
6404 // FIXME: Hacky
6405 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
6406 Results.push_back(Res.getOperand(I));
6407 }
6408 } else {
6409 Results.push_back(Res);
6410 Results.push_back(Res.getValue(1));
6411 }
6412 return;
6413 }
6414
6415 break;
6416 }
6417 case ISD::SELECT: {
6418 SDLoc SL(N);
6419 EVT VT = N->getValueType(0);
6420 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
6421 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
6422 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
6423
6424 EVT SelectVT = NewVT;
6425 if (NewVT.bitsLT(MVT::i32)) {
6426 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
6427 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
6428 SelectVT = MVT::i32;
6429 }
6430
6431 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
6432 N->getOperand(0), LHS, RHS);
6433
6434 if (NewVT != SelectVT)
6435 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
6436 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
6437 return;
6438 }
6439 case ISD::FNEG: {
6440 if (N->getValueType(0) != MVT::v2f16)
6441 break;
6442
6443 SDLoc SL(N);
6444 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6445
6446 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
6447 BC,
6448 DAG.getConstant(0x80008000, SL, MVT::i32));
6449 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6450 return;
6451 }
6452 case ISD::FABS: {
6453 if (N->getValueType(0) != MVT::v2f16)
6454 break;
6455
6456 SDLoc SL(N);
6457 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6458
6459 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
6460 BC,
6461 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
6462 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6463 return;
6464 }
6465 case ISD::FSQRT: {
6466 if (N->getValueType(0) != MVT::f16)
6467 break;
6468 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
6469 break;
6470 }
6471 default:
6472    AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
6473    break;
6474 }
6475}
6476
6477/// Helper function for LowerBRCOND
6478static SDNode *findUser(SDValue Value, unsigned Opcode) {
6479
6480 SDNode *Parent = Value.getNode();
6481 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
6482 I != E; ++I) {
6483
6484 if (I.getUse().get() != Value)
6485 continue;
6486
6487 if (I->getOpcode() == Opcode)
6488 return *I;
6489 }
6490 return nullptr;
6491}
6492
6493unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
6494 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
6495 switch (Intr->getConstantOperandVal(1)) {
6496 case Intrinsic::amdgcn_if:
6497 return AMDGPUISD::IF;
6498 case Intrinsic::amdgcn_else:
6499 return AMDGPUISD::ELSE;
6500 case Intrinsic::amdgcn_loop:
6501 return AMDGPUISD::LOOP;
6502 case Intrinsic::amdgcn_end_cf:
6503 llvm_unreachable("should not occur");
6504 default:
6505 return 0;
6506 }
6507 }
6508
6509 // break, if_break, else_break are all only used as inputs to loop, not
6510 // directly as branch conditions.
6511 return 0;
6512}
6513
6514bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
6515  const Triple &TT = getTargetMachine().getTargetTriple();
6516  return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
6517          GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
6518         AMDGPU::shouldEmitConstantsToTextSection(TT);
6519}
6520
6521bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
6522  if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
6523 return false;
6524
6525 // FIXME: Either avoid relying on address space here or change the default
6526 // address space for functions to avoid the explicit check.
6527 return (GV->getValueType()->isFunctionTy() ||
6530}
6531
6532bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
6533  return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
6534}
6535bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
6537 if (!GV->hasExternalLinkage())
6538 return true;
6539
6540 const auto OS = getTargetMachine().getTargetTriple().getOS();
6541 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
6542}
6543
6544/// This transforms the control flow intrinsics to get the branch destination
6545/// as the last parameter; it also switches the branch target with BR if needed.
6546SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
6547 SelectionDAG &DAG) const {
6548 SDLoc DL(BRCOND);
6549
6550 SDNode *Intr = BRCOND.getOperand(1).getNode();
6551 SDValue Target = BRCOND.getOperand(2);
6552 SDNode *BR = nullptr;
6553 SDNode *SetCC = nullptr;
6554
6555 if (Intr->getOpcode() == ISD::SETCC) {
6556 // As long as we negate the condition everything is fine
6557 SetCC = Intr;
6558 Intr = SetCC->getOperand(0).getNode();
6559
6560 } else {
6561 // Get the target from BR if we don't negate the condition
6562 BR = findUser(BRCOND, ISD::BR);
6563 assert(BR && "brcond missing unconditional branch user");
6564 Target = BR->getOperand(1);
6565 }
6566
6567 unsigned CFNode = isCFIntrinsic(Intr);
6568 if (CFNode == 0) {
6569 // This is a uniform branch so we don't need to legalize.
6570 return BRCOND;
6571 }
6572
6573 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
6574 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
6575
6576 assert(!SetCC ||
6577 (SetCC->getConstantOperandVal(1) == 1 &&
6578 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
6579 ISD::SETNE));
6580
6581 // operands of the new intrinsic call
6583 if (HaveChain)
6584 Ops.push_back(BRCOND.getOperand(0));
6585
6586 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
6587 Ops.push_back(Target);
6588
6589 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
6590
6591 // build the new intrinsic call
6592 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
6593
6594 if (!HaveChain) {
6595 SDValue Ops[] = {
6596 SDValue(Result, 0),
6597 BRCOND.getOperand(0)
6598 };
6599
6600 Result = DAG.getMergeValues(Ops, DL).getNode();
6601 }
6602
6603 if (BR) {
6604 // Give the branch instruction our target
6605 SDValue Ops[] = {
6606 BR->getOperand(0),
6607 BRCOND.getOperand(2)
6608 };
6609 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
6610 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
6611 }
6612
6613 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
6614
6615 // Copy the intrinsic results to registers
6616 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
6617    SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
6618    if (!CopyToReg)
6619 continue;
6620
6621 Chain = DAG.getCopyToReg(
6622 Chain, DL,
6623 CopyToReg->getOperand(1),
6624 SDValue(Result, i - 1),
6625 SDValue());
6626
6627 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
6628 }
6629
6630 // Remove the old intrinsic from the chain
6631  DAG.ReplaceAllUsesOfValueWith(
6632      SDValue(Intr, Intr->getNumValues() - 1),
6633 Intr->getOperand(0));
6634
6635 return Chain;
6636}
6637
6638SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
6639 SelectionDAG &DAG) const {
6640 MVT VT = Op.getSimpleValueType();
6641 SDLoc DL(Op);
6642 // Checking the depth
6643 if (Op.getConstantOperandVal(0) != 0)
6644 return DAG.getConstant(0, DL, VT);
6645
6646  MachineFunction &MF = DAG.getMachineFunction();
6647  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6648  // Check for kernel and shader functions
6649 if (Info->isEntryFunction())
6650 return DAG.getConstant(0, DL, VT);
6651
6652 MachineFrameInfo &MFI = MF.getFrameInfo();
6653 // There is a call to @llvm.returnaddress in this function
6654 MFI.setReturnAddressIsTaken(true);
6655
6656  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
6657  // Get the return address reg and mark it as an implicit live-in
6658 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF), getRegClassFor(VT, Op.getNode()->isDivergent()));
6659
6660 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
6661}
6662
6663SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG,
6664 SDValue Op,
6665 const SDLoc &DL,
6666 EVT VT) const {
6667 return Op.getValueType().bitsLE(VT) ?
6668 DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
6669 DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
6670 DAG.getTargetConstant(0, DL, MVT::i32));
6671}
6672
6673SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
6674 assert(Op.getValueType() == MVT::f16 &&
6675 "Do not know how to custom lower FP_ROUND for non-f16 type");
6676
6677 SDValue Src = Op.getOperand(0);
6678 EVT SrcVT = Src.getValueType();
6679 if (SrcVT != MVT::f64)
6680 return Op;
6681
6682 // TODO: Handle strictfp
6683 if (Op.getOpcode() != ISD::FP_ROUND)
6684 return Op;
6685
6686 SDLoc DL(Op);
6687
6688 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
6689 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
6690 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
6691}
6692
6693SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
6694 SelectionDAG &DAG) const {
6695 EVT VT = Op.getValueType();
6696 const MachineFunction &MF = DAG.getMachineFunction();
6697  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6698  bool IsIEEEMode = Info->getMode().IEEE;
6699
6700 // FIXME: Assert during selection that this is only selected for
6701 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
6702 // mode functions, but this happens to be OK since it's only done in cases
6703  // where it is known that no sNaN is present.
6704 if (IsIEEEMode)
6705 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
6706
6707 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
6708 VT == MVT::v16bf16)
6709 return splitBinaryVectorOp(Op, DAG);
6710 return Op;
6711}
6712
6713SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
6714 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
6715 EVT VT = Op.getValueType();
6716 assert(VT == MVT::f16);
6717
6718 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
6719 EVT ExpVT = Exp.getValueType();
6720 if (ExpVT == MVT::i16)
6721 return Op;
6722
6723 SDLoc DL(Op);
6724
6725 // Correct the exponent type for f16 to i16.
6726 // Clamp the range of the exponent to the instruction's range.
6727
6728  // TODO: This should be a generic narrowing legalization, and can easily be
6729  // done for GlobalISel as well.
6730
6731 SDValue MinExp = DAG.getConstant(minIntN(16), DL, ExpVT);
6732 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
6733
6734 SDValue MaxExp = DAG.getConstant(maxIntN(16), DL, ExpVT);
6735 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
6736
6737 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
6738
6739 if (IsStrict) {
6740 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
6741 {Op.getOperand(0), Op.getOperand(1), TruncExp});
6742 }
6743
6744 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
6745}
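// For illustration (a sketch of the clamp above): an f16 ldexp with an i32
// exponent becomes roughly
//   exp16 = trunc i16 (smin (smax exp, -32768), 32767)
// so out-of-range exponents still saturate to the correct extreme result
// before the i16-exponent instruction is used.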
6746
6747// Custom lowering for vector multiplications and s_mul_u64.
6748SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
6749 EVT VT = Op.getValueType();
6750
6751 // Split vector operands.
6752 if (VT.isVector())
6753 return splitBinaryVectorOp(Op, DAG);
6754
6755 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
6756
6757 // There are four ways to lower s_mul_u64:
6758 //
6759 // 1. If all the operands are uniform, then we lower it as it is.
6760 //
6761 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
6762 // multiplications because there is not a vector equivalent of s_mul_u64.
6763 //
6764 // 3. If the cost model decides that it is more efficient to use vector
6765 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
6766 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
6767 //
6768 // 4. If the cost model decides to use vector registers and both of the
6769 // operands are zero-extended/sign-extended from 32-bits, then we split the
6770 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
6771 // possible to check if the operands are zero-extended or sign-extended in
6772 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
6773 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
6774 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
6775 // If the cost model decides that we have to use vector registers, then
6776  // splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits s_mul_u64_u32/
6777 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
6778 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
6779 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
6780 // SIInstrInfo.cpp .
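  // For reference, the 32-bit split mentioned in (2)-(4) follows the usual
  // decomposition (a sketch, not the code emitted here):
  //   lo32(a * b) = lo32( lo(a) * lo(b) )
  //   hi32(a * b) = mulhi_u32(lo(a), lo(b)) + lo(a) * hi(b) + hi(a) * lo(b)
  // For operands known to be zero-extended from 32 bits the last two terms
  // vanish, which is what the zero-extended pseudo relies on; the sign-extended
  // case is analogous with a signed high multiply.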
6781
6782 if (Op->isDivergent())
6783 return SDValue();
6784
6785 SDValue Op0 = Op.getOperand(0);
6786 SDValue Op1 = Op.getOperand(1);
6787  // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64
6788 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
6789 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
6790 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
6791 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
6792 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
6793 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
6794 SDLoc SL(Op);
6795 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
6796 return SDValue(
6797 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
6798 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
6799 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
6800 if (Op0SignBits >= 33 && Op1SignBits >= 33)
6801 return SDValue(
6802 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
6803 // If all the operands are uniform, then we lower s_mul_u64 as it is.
6804 return Op;
6805}
6806
6807SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
6808 EVT VT = Op.getValueType();
6809 SDLoc SL(Op);
6810 SDValue LHS = Op.getOperand(0);
6811 SDValue RHS = Op.getOperand(1);
6812 bool isSigned = Op.getOpcode() == ISD::SMULO;
6813
6814 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
6815 const APInt &C = RHSC->getAPIntValue();
6816 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
6817 if (C.isPowerOf2()) {
6818 // smulo(x, signed_min) is same as umulo(x, signed_min).
6819 bool UseArithShift = isSigned && !C.isMinSignedValue();
6820 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
6821 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
6822 SDValue Overflow = DAG.getSetCC(SL, MVT::i1,
6823 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL,
6824 SL, VT, Result, ShiftAmt),
6825 LHS, ISD::SETNE);
6826 return DAG.getMergeValues({ Result, Overflow }, SL);
6827 }
6828 }
6829
6830 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
6831  SDValue Top = DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU,
6832                            SL, VT, LHS, RHS);
6833
6834 SDValue Sign = isSigned
6835 ? DAG.getNode(ISD::SRA, SL, VT, Result,
6836 DAG.getConstant(VT.getScalarSizeInBits() - 1, SL, MVT::i32))
6837 : DAG.getConstant(0, SL, VT);
6838 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
6839
6840 return DAG.getMergeValues({ Result, Overflow }, SL);
6841}
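// For illustration, with a power-of-two constant the fast path above turns
//   {i32, i1} @llvm.umul.with.overflow.i32(i32 %x, i32 8)
// into roughly
//   %r   = shl i32 %x, 3
//   %ovf = icmp ne i32 (lshr i32 %r, 3), %x
// (an arithmetic shift is used for the signed form), avoiding a real multiply.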
6842
6843SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
6844 if (Op->isDivergent()) {
6845 // Select to V_MAD_[IU]64_[IU]32.
6846 return Op;
6847 }
6848 if (Subtarget->hasSMulHi()) {
6849 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
6850 return SDValue();
6851 }
6852 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
6853 // calculate the high part, so we might as well do the whole thing with
6854 // V_MAD_[IU]64_[IU]32.
6855 return Op;
6856}
6857
6858SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
6859 if (!Subtarget->isTrapHandlerEnabled() ||
6860      Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6861    return lowerTrapEndpgm(Op, DAG);
6862
6863 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) :
6864 lowerTrapHsaQueuePtr(Op, DAG);
6865}
6866
6867SDValue SITargetLowering::lowerTrapEndpgm(
6868 SDValue Op, SelectionDAG &DAG) const {
6869 SDLoc SL(Op);
6870 SDValue Chain = Op.getOperand(0);
6871 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
6872}
6873
6874SDValue SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
6875 const SDLoc &DL, Align Alignment, ImplicitParameter Param) const {
6878 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
6880 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
6883}
6884
6885SDValue SITargetLowering::lowerTrapHsaQueuePtr(
6886 SDValue Op, SelectionDAG &DAG) const {
6887 SDLoc SL(Op);
6888 SDValue Chain = Op.getOperand(0);
6889
6890 SDValue QueuePtr;
6891 // For code object version 5, QueuePtr is passed through implicit kernarg.
6892 const Module *M = DAG.getMachineFunction().getFunction().getParent();
6894 QueuePtr =
6895 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
6896 } else {
6899 Register UserSGPR = Info->getQueuePtrUserSGPR();
6900
6901 if (UserSGPR == AMDGPU::NoRegister) {
6902 // We probably are in a function incorrectly marked with
6903 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
6904 // trap, so just use a null pointer.
6905 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
6906 } else {
6907 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
6908 MVT::i64);
6909 }
6910 }
6911
6912 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
6913 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
6914 QueuePtr, SDValue());
6915
6917 SDValue Ops[] = {
6918 ToReg,
6919 DAG.getTargetConstant(TrapID, SL, MVT::i16),
6920 SGPR01,
6921 ToReg.getValue(1)
6922 };
6923 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6924}
6925
6926SDValue SITargetLowering::lowerTrapHsa(
6927 SDValue Op, SelectionDAG &DAG) const {
6928 SDLoc SL(Op);
6929 SDValue Chain = Op.getOperand(0);
6930
6931 // We need to simulate the 's_trap 2' instruction on targets that run in
6932 // PRIV=1 (where it is treated as a nop).
6933 if (Subtarget->hasPrivEnabledTrap2NopBug())
6934 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
6935
6937 SDValue Ops[] = {
6938 Chain,
6939 DAG.getTargetConstant(TrapID, SL, MVT::i16)
6940 };
6941 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6942}
6943
6944SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
6945 SDLoc SL(Op);
6946 SDValue Chain = Op.getOperand(0);
6948
6949 if (!Subtarget->isTrapHandlerEnabled() ||
6952 "debugtrap handler not supported",
6953 Op.getDebugLoc(),
6954 DS_Warning);
6955 LLVMContext &Ctx = MF.getFunction().getContext();
6956 Ctx.diagnose(NoTrap);
6957 return Chain;
6958 }
6959
6961 SDValue Ops[] = {
6962 Chain,
6963 DAG.getTargetConstant(TrapID, SL, MVT::i16)
6964 };
6965 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6966}
6967
6968SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
6969 SelectionDAG &DAG) const {
6970 if (Subtarget->hasApertureRegs()) {
6971 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
6972 ? AMDGPU::SRC_SHARED_BASE
6973 : AMDGPU::SRC_PRIVATE_BASE;
6974 // Note: this feature (register) is broken. When used as a 32-bit operand,
6975 // it returns a wrong value (all zeroes?). The real value is in the upper 32
6976 // bits.
6977 //
6978 // To work around the issue, directly emit a 64 bit mov from this register
6979 // then extract the high bits. Note that this shouldn't even result in a
6980 // shift being emitted and simply become a pair of registers (e.g.):
6981 // s_mov_b64 s[6:7], src_shared_base
6982 // v_mov_b32_e32 v1, s7
6983 //
6984 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
6985 // coalescing would kick in and it would think it's okay to use the "HI"
6986 // subregister directly (instead of extracting the HI 32 bits) which is an
6987 // artificial (unusable) register.
6988 // Register TableGen definitions would need an overhaul to get rid of the
6989 // artificial "HI" aperture registers and prevent this kind of issue from
6990 // happening.
6991 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
6992 DAG.getRegister(ApertureRegNo, MVT::i64));
6993 return DAG.getNode(
6994 ISD::TRUNCATE, DL, MVT::i32,
6995 DAG.getNode(ISD::SRL, DL, MVT::i64,
6996 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
6997 }
6998
6999 // For code object version 5, private_base and shared_base are passed through
7000 // implicit kernargs.
7001 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7005 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
7006 }
7007
7010 Register UserSGPR = Info->getQueuePtrUserSGPR();
7011 if (UserSGPR == AMDGPU::NoRegister) {
7012 // We probably are in a function incorrectly marked with
7013 // amdgpu-no-queue-ptr. This is undefined.
7014 return DAG.getUNDEF(MVT::i32);
7015 }
7016
7017 SDValue QueuePtr = CreateLiveInRegister(
7018 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
7019
7020 // Offset into amd_queue_t for group_segment_aperture_base_hi /
7021 // private_segment_aperture_base_hi.
7022 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
7023
7024 SDValue Ptr =
7025 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
7026
7027 // TODO: Use custom target PseudoSourceValue.
7028 // TODO: We should use the value from the IR intrinsic call, but it might not
7029 // be available and how do we get it?
7031 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
7032 commonAlignment(Align(64), StructOffset),
7035}
7036
7037/// Return true if the value is a known valid address, such that a null check is
7038/// not necessary.
7040 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
7041 if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
7042 isa<BasicBlockSDNode>(Val))
7043 return true;
7044
7045 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
7046 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
7047
7048 // TODO: Search through arithmetic, handle arguments and loads
7049 // marked nonnull.
7050 return false;
7051}
7052
7053SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
7054 SelectionDAG &DAG) const {
7055 SDLoc SL(Op);
7056
7057 const AMDGPUTargetMachine &TM =
7058 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
7059
7060 unsigned DestAS, SrcAS;
7061 SDValue Src;
7062 bool IsNonNull = false;
7063 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
7064 SrcAS = ASC->getSrcAddressSpace();
7065 Src = ASC->getOperand(0);
7066 DestAS = ASC->getDestAddressSpace();
7067 } else {
7068 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
7069 Op.getConstantOperandVal(0) ==
7070 Intrinsic::amdgcn_addrspacecast_nonnull);
7071 Src = Op->getOperand(1);
7072 SrcAS = Op->getConstantOperandVal(2);
7073 DestAS = Op->getConstantOperandVal(3);
7074 IsNonNull = true;
7075 }
7076
7077 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
7078
7079 // flat -> local/private
7080 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
7081 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
7082 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
7083 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7084
7085 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7086 return Ptr;
7087
7088 unsigned NullVal = TM.getNullPointerValue(DestAS);
7089 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7090 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
7091
7092 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
7093 SegmentNullPtr);
7094 }
7095 }
7096
7097 // local/private -> flat
7098 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
7099 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
7100 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
7101
7102 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
7103 SDValue CvtPtr =
7104 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
7105 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
7106
7107 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7108 return CvtPtr;
7109
7110 unsigned NullVal = TM.getNullPointerValue(SrcAS);
7111 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7112
7113 SDValue NonNull
7114 = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
7115
7116 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
7117 FlatNullPtr);
7118 }
7119 }
7120
7121 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7122 Op.getValueType() == MVT::i64) {
7123 const SIMachineFunctionInfo *Info =
7124 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
7125 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
7126 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
7127 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
7128 }
7129
7130 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7131 Src.getValueType() == MVT::i64)
7132 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7133
7134 // global <-> flat are no-ops and never emitted.
7135
7136 const MachineFunction &MF = DAG.getMachineFunction();
7137 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
7138 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
7139 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
7140
7141 return DAG.getUNDEF(Op->getValueType(0));
7142}
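// Example (illustrative): a generic cast such as
//   %flat = addrspacecast ptr addrspace(3) %lds to ptr
// lowers to the packed {ptr, aperture} pair selected against the 64-bit flat
// null value, while the nonnull form (the amdgcn.addrspacecast.nonnull
// intrinsic handled above) skips the SETCC/SELECT and returns the packed
// pointer directly.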
7143
7144// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
7145// the small vector and inserting them into the big vector. That is better than
7146// the default expansion of doing it via a stack slot. Even though the use of
7147// the stack slot would be optimized away afterwards, the stack slot itself
7148// remains.
7149SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
7150 SelectionDAG &DAG) const {
7151 SDValue Vec = Op.getOperand(0);
7152 SDValue Ins = Op.getOperand(1);
7153 SDValue Idx = Op.getOperand(2);
7154 EVT VecVT = Vec.getValueType();
7155 EVT InsVT = Ins.getValueType();
7156 EVT EltVT = VecVT.getVectorElementType();
7157 unsigned InsNumElts = InsVT.getVectorNumElements();
7158 unsigned IdxVal = Idx->getAsZExtVal();
7159 SDLoc SL(Op);
7160
7161 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
7162 // Insert 32-bit registers at a time.
7163 assert(InsNumElts % 2 == 0 && "expect legal vector types");
7164
7165 unsigned VecNumElts = VecVT.getVectorNumElements();
7166 EVT NewVecVT =
7167 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
7168 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
7169 : EVT::getVectorVT(*DAG.getContext(),
7170 MVT::i32, InsNumElts / 2);
7171
7172 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
7173 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
7174
7175 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
7176 SDValue Elt;
7177 if (InsNumElts == 2) {
7178 Elt = Ins;
7179 } else {
7180 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
7181 DAG.getConstant(I, SL, MVT::i32));
7182 }
7183 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
7184 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
7185 }
7186
7187 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
7188 }
7189
7190 for (unsigned I = 0; I != InsNumElts; ++I) {
7191 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
7192 DAG.getConstant(I, SL, MVT::i32));
7193 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
7194 DAG.getConstant(IdxVal + I, SL, MVT::i32));
7195 }
7196 return Vec;
7197}
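// Worked example (illustrative): inserting a v2i16 subvector into v4i16 at
// index 2 takes the 16-bit path above: the vectors are bitcast to v2i32 and
// i32 respectively, and a single INSERT_VECTOR_ELT at dword index 1 replaces
// the high half, so no stack temporary is created.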
7198
7199SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
7200 SelectionDAG &DAG) const {
7201 SDValue Vec = Op.getOperand(0);
7202 SDValue InsVal = Op.getOperand(1);
7203 SDValue Idx = Op.getOperand(2);
7204 EVT VecVT = Vec.getValueType();
7205 EVT EltVT = VecVT.getVectorElementType();
7206 unsigned VecSize = VecVT.getSizeInBits();
7207 unsigned EltSize = EltVT.getSizeInBits();
7208 SDLoc SL(Op);
7209
7210 // Specially handle the case of v4i16 with static indexing.
7211 unsigned NumElts = VecVT.getVectorNumElements();
7212 auto KIdx = dyn_cast<ConstantSDNode>(Idx);
7213 if (NumElts == 4 && EltSize == 16 && KIdx) {
7214 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
7215
7216 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7217 DAG.getConstant(0, SL, MVT::i32));
7218 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7219 DAG.getConstant(1, SL, MVT::i32));
7220
7221 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
7222 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
7223
7224 unsigned Idx = KIdx->getZExtValue();
7225 bool InsertLo = Idx < 2;
7226 SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
7227 InsertLo ? LoVec : HiVec,
7228 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
7229 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
7230
7231 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
7232
7233 SDValue Concat = InsertLo ?
7234 DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
7235 DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
7236
7237 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
7238 }
7239
7240 // Static indexing does not lower to stack access, and hence there is no need
7241 // for special custom lowering to avoid stack access.
7242 if (isa<ConstantSDNode>(Idx))
7243 return SDValue();
7244
7245 // Avoid stack access for dynamic indexing by custom lowering to
7246 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
7247
7248 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
7249
7250 MVT IntVT = MVT::getIntegerVT(VecSize);
7251
7252 // Convert vector index to bit-index and get the required bit mask.
7253 assert(isPowerOf2_32(EltSize));
7254 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
7255 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7256 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7257 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
7258 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
7259
7260 // 1. Create a congruent vector with the target value in each element.
7261 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
7262 DAG.getSplatBuildVector(VecVT, SL, InsVal));
7263
7264 // 2. Mask off all other indices except the required index within (1).
7265 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
7266
7267 // 3. Mask off the required index within the target vector.
7268 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7269 SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
7270 DAG.getNOT(SL, BFM, IntVT), BCVec);
7271
7272 // 4. Get (2) and (3) ORed into the target vector.
7273 SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
7274
7275 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
7276}
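// Worked example (illustrative) for a dynamic insert into v4i16 (VecSize 64,
// EltSize 16): with idx = 2, ScaledIdx is 32 and BFM is 0xffff << 32. The
// splatted value is ANDed with BFM, the original vector with ~BFM, and the
// OR of the two is bitcast back to the vector type; for 32-bit vectors this
// is the v_bfm_b32/v_bfi_b32 pattern described in the comment above.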
7277
7278SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
7279 SelectionDAG &DAG) const {
7280 SDLoc SL(Op);
7281
7282 EVT ResultVT = Op.getValueType();
7283 SDValue Vec = Op.getOperand(0);
7284 SDValue Idx = Op.getOperand(1);
7285 EVT VecVT = Vec.getValueType();
7286 unsigned VecSize = VecVT.getSizeInBits();
7287 EVT EltVT = VecVT.getVectorElementType();
7288
7289 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
7290
7291 // Make sure we do any optimizations that will make it easier to fold
7292 // source modifiers before obscuring it with bit operations.
7293
7294 // XXX - Why doesn't this get called when vector_shuffle is expanded?
7295 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
7296 return Combined;
7297
7298 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
7299 SDValue Lo, Hi;
7300 EVT LoVT, HiVT;
7301 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
7302
7303 if (VecSize == 128) {
7304 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
7305 Lo = DAG.getBitcast(LoVT,
7306 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7307 DAG.getConstant(0, SL, MVT::i32)));
7308 Hi = DAG.getBitcast(HiVT,
7309 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7310 DAG.getConstant(1, SL, MVT::i32)));
7311 } else if (VecSize == 256) {
7312 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
7313 SDValue Parts[4];
7314 for (unsigned P = 0; P < 4; ++P) {
7315 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7316 DAG.getConstant(P, SL, MVT::i32));
7317 }
7318
7319 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7320 Parts[0], Parts[1]));
7321 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7322 Parts[2], Parts[3]));
7323 } else {
7324 assert(VecSize == 512);
7325
7326 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
7327 SDValue Parts[8];
7328 for (unsigned P = 0; P < 8; ++P) {
7329 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7330 DAG.getConstant(P, SL, MVT::i32));
7331 }
7332
7333 Lo = DAG.getBitcast(LoVT,
7334 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7335 Parts[0], Parts[1], Parts[2], Parts[3]));
7336 Hi = DAG.getBitcast(HiVT,
7337 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7338 Parts[4], Parts[5],Parts[6], Parts[7]));
7339 }
7340
7341 EVT IdxVT = Idx.getValueType();
7342 unsigned NElem = VecVT.getVectorNumElements();
7343 assert(isPowerOf2_32(NElem));
7344 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
7345 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
7346 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
7347 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
7348 }
7349
7350 assert(VecSize <= 64);
7351
7352 MVT IntVT = MVT::getIntegerVT(VecSize);
7353
7354 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
7355 SDValue VecBC = peekThroughBitcasts(Vec);
7356 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
7357 SDValue Src = VecBC.getOperand(0);
7358 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
7359 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
7360 }
7361
7362 unsigned EltSize = EltVT.getSizeInBits();
7363 assert(isPowerOf2_32(EltSize));
7364
7365 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7366
7367 // Convert vector index to bit-index (* EltSize)
7368 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7369
7370 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7371 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
7372
7373 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
7374 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
7375 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
7376 }
7377
7378 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
7379}
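// Worked example (illustrative): a dynamic extract from v4f16 bitcasts the
// vector to i64, shifts right by idx * 16 and truncates to i16 before the
// final bitcast to f16. 128/256/512-bit vectors are first split in half and
// the half is chosen by comparing the index against half the element count,
// so at most a 64-bit shift remains.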
7380
7381static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
7382 assert(Elt % 2 == 0);
7383 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7384}
7385
7386SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
7387 SelectionDAG &DAG) const {
7388 SDLoc SL(Op);
7389 EVT ResultVT = Op.getValueType();
7390 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
7391
7392 EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
7393 EVT EltVT = PackVT.getVectorElementType();
7394 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
7395
7396 // vector_shuffle <0,1,6,7> lhs, rhs
7397 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
7398 //
7399 // vector_shuffle <6,7,2,3> lhs, rhs
7400 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
7401 //
7402 // vector_shuffle <6,7,0,1> lhs, rhs
7403 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
7404
7405 // Avoid scalarizing when both halves are reading from consecutive elements.
7406 SmallVector<SDValue, 16> Pieces;
7407 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
7408 if (elementPairIsContiguous(SVN->getMask(), I)) {
7409 const int Idx = SVN->getMaskElt(I);
7410 int VecIdx = Idx < SrcNumElts ? 0 : 1;
7411 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
7412 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL,
7413 PackVT, SVN->getOperand(VecIdx),
7414 DAG.getConstant(EltIdx, SL, MVT::i32));
7415 Pieces.push_back(SubVec);
7416 } else {
7417 const int Idx0 = SVN->getMaskElt(I);
7418 const int Idx1 = SVN->getMaskElt(I + 1);
7419 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
7420 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
7421 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
7422 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
7423
7424 SDValue Vec0 = SVN->getOperand(VecIdx0);
7425 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7426 Vec0, DAG.getConstant(EltIdx0, SL, MVT::i32));
7427
7428 SDValue Vec1 = SVN->getOperand(VecIdx1);
7429 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7430 Vec1, DAG.getConstant(EltIdx1, SL, MVT::i32));
7431 Pieces.push_back(DAG.getBuildVector(PackVT, SL, { Elt0, Elt1 }));
7432 }
7433 }
7434
7435 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
7436}
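// Example (illustrative) for v4i16 inputs: mask <0,1,6,7> hits the
// contiguous path and becomes a concat of (extract_subvector lhs, 0) and
// (extract_subvector rhs, 2); a pair such as <1,2> starts at an odd source
// element, so that piece is rebuilt from two scalar extracts instead.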
7437
7438SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
7439 SelectionDAG &DAG) const {
7440 SDValue SVal = Op.getOperand(0);
7441 EVT ResultVT = Op.getValueType();
7442 EVT SValVT = SVal.getValueType();
7443 SDValue UndefVal = DAG.getUNDEF(SValVT);
7444 SDLoc SL(Op);
7445
7446 SmallVector<SDValue, 8> VElts;
7447 VElts.push_back(SVal);
7448 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
7449 VElts.push_back(UndefVal);
7450
7451 return DAG.getBuildVector(ResultVT, SL, VElts);
7452}
7453
7454SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
7455 SelectionDAG &DAG) const {
7456 SDLoc SL(Op);
7457 EVT VT = Op.getValueType();
7458
7459 if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
7460 VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
7461 EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
7462 VT.getVectorNumElements() / 2);
7463 MVT HalfIntVT = MVT::getIntegerVT(HalfVT.getSizeInBits());
7464
7465 // Turn into pair of packed build_vectors.
7466 // TODO: Special case for constants that can be materialized with s_mov_b64.
7467 SmallVector<SDValue, 4> LoOps, HiOps;
7468 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I != E; ++I) {
7469 LoOps.push_back(Op.getOperand(I));
7470 HiOps.push_back(Op.getOperand(I + E));
7471 }
7472 SDValue Lo = DAG.getBuildVector(HalfVT, SL, LoOps);
7473 SDValue Hi = DAG.getBuildVector(HalfVT, SL, HiOps);
7474
7475 SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Lo);
7476 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Hi);
7477
7478 SDValue Blend = DAG.getBuildVector(MVT::getVectorVT(HalfIntVT, 2), SL,
7479 { CastLo, CastHi });
7480 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7481 }
7482
7483 if (VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16) {
7484 EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
7485 VT.getVectorNumElements() / 4);
7486 MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
7487
7488 SmallVector<SDValue, 4> Parts[4];
7489 for (unsigned I = 0, E = VT.getVectorNumElements() / 4; I != E; ++I) {
7490 for (unsigned P = 0; P < 4; ++P)
7491 Parts[P].push_back(Op.getOperand(I + P * E));
7492 }
7493 SDValue Casts[4];
7494 for (unsigned P = 0; P < 4; ++P) {
7495 SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
7496 Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
7497 }
7498
7499 SDValue Blend =
7500 DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 4), SL, Casts);
7501 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7502 }
7503
7504 if (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v32bf16) {
7505 EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
7506 VT.getVectorNumElements() / 8);
7507 MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
7508
7509 SmallVector<SDValue, 8> Parts[8];
7510 for (unsigned I = 0, E = VT.getVectorNumElements() / 8; I != E; ++I) {
7511 for (unsigned P = 0; P < 8; ++P)
7512 Parts[P].push_back(Op.getOperand(I + P * E));
7513 }
7514 SDValue Casts[8];
7515 for (unsigned P = 0; P < 8; ++P) {
7516 SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
7517 Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
7518 }
7519
7520 SDValue Blend =
7521 DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 8), SL, Casts);
7522 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7523 }
7524
7525 assert(VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16);
7526 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
7527
7528 SDValue Lo = Op.getOperand(0);
7529 SDValue Hi = Op.getOperand(1);
7530
7531 // Avoid adding defined bits with the zero_extend.
7532 if (Hi.isUndef()) {
7533 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7534 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
7535 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
7536 }
7537
7538 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
7539 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
7540
7541 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
7542 DAG.getConstant(16, SL, MVT::i32));
7543 if (Lo.isUndef())
7544 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
7545
7546 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7547 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
7548
7549 SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
7550 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
7551}
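// Worked example (illustrative): without VOP3P, build_vector v2i16 {lo, hi}
// becomes (or (zext i16 lo to i32), (shl (zext i16 hi to i32), 16)) followed
// by a bitcast; when either half is undef the corresponding zero_extend/or
// is dropped so no extra defined bits are introduced.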
7552
7553bool
7554SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
7555 // OSes that use ELF REL relocations (instead of RELA) can only store a
7556 // 32-bit addend in the instruction, so it is not safe to allow offset folding
7557 // which can create arbitrary 64-bit addends. (This is only a problem for
7558 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
7559 // the high 32 bits of the addend.)
7560 //
7561 // This should be kept in sync with how HasRelocationAddend is initialized in
7562 // the constructor of ELFAMDGPUAsmBackend.
7563 if (!Subtarget->isAmdHsaOS())
7564 return false;
7565
7566 // We can fold offsets for anything that doesn't require a GOT relocation.
7567 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
7568 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
7569 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
7570 !shouldEmitGOTReloc(GA->getGlobal());
7571}
7572
7573static SDValue
7574buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
7575 const SDLoc &DL, int64_t Offset, EVT PtrVT,
7576 unsigned GAFlags = SIInstrInfo::MO_NONE) {
7577 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
7578 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
7579 // lowered to the following code sequence:
7580 //
7581 // For constant address space:
7582 // s_getpc_b64 s[0:1]
7583 // s_add_u32 s0, s0, $symbol
7584 // s_addc_u32 s1, s1, 0
7585 //
7586 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7587 // a fixup or relocation is emitted to replace $symbol with a literal
7588 // constant, which is a pc-relative offset from the encoding of the $symbol
7589 // operand to the global variable.
7590 //
7591 // For global address space:
7592 // s_getpc_b64 s[0:1]
7593 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
7594 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
7595 //
7596 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7597 // fixups or relocations are emitted to replace $symbol@*@lo and
7598 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
7599 // which is a 64-bit pc-relative offset from the encoding of the $symbol
7600 // operand to the global variable.
7601 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
7602 SDValue PtrHi;
7603 if (GAFlags == SIInstrInfo::MO_NONE)
7604 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
7605 else
7606 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
7607 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
7608}
7609
7610SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
7611 SDValue Op,
7612 SelectionDAG &DAG) const {
7613 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
7614 SDLoc DL(GSD);
7615 EVT PtrVT = Op.getValueType();
7616
7617 const GlobalValue *GV = GSD->getGlobal();
7618 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
7619 shouldUseLDSConstAddress(GV)) ||
7620 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
7621 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
7622 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
7623 GV->hasExternalLinkage()) {
7624 Type *Ty = GV->getValueType();
7625 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
7626 // zero-sized type in other languages to declare dynamic shared
7627 // memory whose size is not known at compile time. Such arrays are
7628 // allocated by the runtime and placed directly after the statically
7629 // allocated ones. They all share the same offset.
7630 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
7631 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
7632 // Adjust alignment for that dynamic shared memory array.
7633 const Function &F = DAG.getMachineFunction().getFunction();
7634 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
7635 MFI->setUsesDynamicLDS(true);
7636 return SDValue(
7637 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
7638 }
7639 }
7640 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
7641 }
7642
7643 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
7644 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
7645 SIInstrInfo::MO_ABS32_LO);
7646 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
7647 }
7648
7649 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
7650 SDValue AddrLo = DAG.getTargetGlobalAddress(
7651 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
7652 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
7653
7654 SDValue AddrHi = DAG.getTargetGlobalAddress(
7655 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
7656 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
7657
7658 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
7659 }
7660
7661 if (shouldEmitFixup(GV))
7662 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
7663
7664 if (shouldEmitPCReloc(GV))
7665 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
7666 SIInstrInfo::MO_REL32);
7667
7668 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
7669 SIInstrInfo::MO_GOTPCREL32);
7670
7671 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
7672 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
7673 const DataLayout &DataLayout = DAG.getDataLayout();
7674 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
7675 MachinePointerInfo PtrInfo
7676 = MachinePointerInfo::getGOT(DAG.getMachineFunction());
7677
7678 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
7679 MachineMemOperand::MODereferenceable |
7680 MachineMemOperand::MOInvariant);
7681}
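// Example (illustrative): a global that needs a GOT access is reached via a
// PC-relative address of its GOT slot (MO_GOTPCREL32) followed by an
// invariant, dereferenceable load of the real address, whereas a global that
// can be fixed up directly takes the MO_REL32 path above and needs no load.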
7682
7683SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
7684 const SDLoc &DL, SDValue V) const {
7685 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
7686 // the destination register.
7687 //
7688 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
7689 // so we will end up with redundant moves to m0.
7690 //
7691 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
7692
7693 // A Null SDValue creates a glue result.
7694 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
7695 V, Chain);
7696 return SDValue(M0, 0);
7697}
7698
7699SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
7700 SDValue Op,
7701 MVT VT,
7702 unsigned Offset) const {
7703 SDLoc SL(Op);
7704 SDValue Param = lowerKernargMemParameter(
7705 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
7706 // The local size values will have the hi 16-bits as zero.
7707 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
7708 DAG.getValueType(VT));
7709}
7710
7711static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
7712 EVT VT) {
7713 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
7714 "non-hsa intrinsic with hsa target",
7715 DL.getDebugLoc());
7716 DAG.getContext()->diagnose(BadIntrin);
7717 return DAG.getUNDEF(VT);
7718}
7719
7720static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
7721 EVT VT) {
7722 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
7723 "intrinsic not supported on subtarget",
7724 DL.getDebugLoc());
7725 DAG.getContext()->diagnose(BadIntrin);
7726 return DAG.getUNDEF(VT);
7727}
7728
7729static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
7730 ArrayRef<SDValue> Elts) {
7731 assert(!Elts.empty());
7732 MVT Type;
7733 unsigned NumElts = Elts.size();
7734
7735 if (NumElts <= 12) {
7736 Type = MVT::getVectorVT(MVT::f32, NumElts);
7737 } else {
7738 assert(Elts.size() <= 16);
7739 Type = MVT::v16f32;
7740 NumElts = 16;
7741 }
7742
7743 SmallVector<SDValue, 16> VecElts(NumElts);
7744 for (unsigned i = 0; i < Elts.size(); ++i) {
7745 SDValue Elt = Elts[i];
7746 if (Elt.getValueType() != MVT::f32)
7747 Elt = DAG.getBitcast(MVT::f32, Elt);
7748 VecElts[i] = Elt;
7749 }
7750 for (unsigned i = Elts.size(); i < NumElts; ++i)
7751 VecElts[i] = DAG.getUNDEF(MVT::f32);
7752
7753 if (NumElts == 1)
7754 return VecElts[0];
7755 return DAG.getBuildVector(Type, DL, VecElts);
7756}
7757
7758static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
7759 SDValue Src, int ExtraElts) {
7760 EVT SrcVT = Src.getValueType();
7761
7762 SmallVector<SDValue, 8> Elts;
7763
7764 if (SrcVT.isVector())
7765 DAG.ExtractVectorElements(Src, Elts);
7766 else
7767 Elts.push_back(Src);
7768
7769 SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
7770 while (ExtraElts--)
7771 Elts.push_back(Undef);
7772
7773 return DAG.getBuildVector(CastVT, DL, Elts);
7774}
7775
7776// Reconstruct the required return value for an image load intrinsic.
7777// This is more complicated than usual because the optional TexFailCtrl
7778// operand means the required return type can be an aggregate.
7779static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
7780 ArrayRef<EVT> ResultTypes, bool IsTexFail,
7781 bool Unpacked, bool IsD16, int DMaskPop,
7782 int NumVDataDwords, bool IsAtomicPacked16Bit,
7783 const SDLoc &DL) {
7784 // Determine the required return type. This is the same regardless of IsTexFail flag
7785 EVT ReqRetVT = ResultTypes[0];
7786 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
7787 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
7788 ? (ReqRetNumElts + 1) / 2
7789 : ReqRetNumElts;
7790
7791 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
7792
7793 MVT DataDwordVT = NumDataDwords == 1 ?
7794 MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
7795
7796 MVT MaskPopVT = MaskPopDwords == 1 ?
7797 MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
7798
7799 SDValue Data(Result, 0);
7800 SDValue TexFail;
7801
7802 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
7803 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
7804 if (MaskPopVT.isVector()) {
7805 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
7806 SDValue(Result, 0), ZeroIdx);
7807 } else {
7808 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
7809 SDValue(Result, 0), ZeroIdx);
7810 }
7811 }
7812
7813 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
7814 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
7815 NumDataDwords - MaskPopDwords);
7816
7817 if (IsD16)
7818 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
7819
7820 EVT LegalReqRetVT = ReqRetVT;
7821 if (!ReqRetVT.isVector()) {
7822 if (!Data.getValueType().isInteger())
7823 Data = DAG.getNode(ISD::BITCAST, DL,
7824 Data.getValueType().changeTypeToInteger(), Data);
7825 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
7826 } else {
7827 // We need to widen the return vector to a legal type
7828 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
7829 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
7830 LegalReqRetVT =
7831 EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
7832 ReqRetVT.getVectorNumElements() + 1);
7833 }
7834 }
7835 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
7836
7837 if (IsTexFail) {
7838 TexFail =
7839 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
7840 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
7841
7842 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
7843 }
7844
7845 if (Result->getNumValues() == 1)
7846 return Data;
7847
7848 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
7849}
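// Example (illustrative): an image load with dmask = 0b0101 and TFE enabled
// yields three dwords from the instruction (two data dwords plus the TexFail
// status dword). The data dwords are widened back to the requested result
// type with undef padding, and the status dword becomes the second member of
// the returned aggregate.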
7850
7851static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
7852 SDValue *LWE, bool &IsTexFail) {
7853 auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
7854
7855 uint64_t Value = TexFailCtrlConst->getZExtValue();
7856 if (Value) {
7857 IsTexFail = true;
7858 }
7859
7860 SDLoc DL(TexFailCtrlConst);
7861 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
7862 Value &= ~(uint64_t)0x1;
7863 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
7864 Value &= ~(uint64_t)0x2;
7865
7866 return Value == 0;
7867}
7868
7869static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
7870 MVT PackVectorVT,
7871 SmallVectorImpl<SDValue> &PackedAddrs,
7872 unsigned DimIdx, unsigned EndIdx,
7873 unsigned NumGradients) {
7874 SDLoc DL(Op);
7875 for (unsigned I = DimIdx; I < EndIdx; I++) {
7876 SDValue Addr = Op.getOperand(I);
7877
7878 // Gradients are packed with undef for each coordinate.
7879 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
7880 // 1D: undef,dx/dh; undef,dx/dv
7881 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
7882 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
7883 if (((I + 1) >= EndIdx) ||
7884 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
7885 I == DimIdx + NumGradients - 1))) {
7886 if (Addr.getValueType() != MVT::i16)
7887 Addr = DAG.getBitcast(MVT::i16, Addr);
7888 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
7889 } else {
7890 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
7891 I++;
7892 }
7893 Addr = DAG.getBitcast(MVT::f32, Addr);
7894 PackedAddrs.push_back(Addr);
7895 }
7896}
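// Example (illustrative): for f16 2D gradients {dx/dh, dy/dh, dx/dv, dy/dv}
// the loop above emits two v2f16 build_vectors, one per derivative
// direction; for 1D, or the odd dz component of 3D, the scalar is
// any-extended into its own dword with an undefined high half, matching the
// register layout shown in the comment above.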
7897
7898SDValue SITargetLowering::lowerImage(SDValue Op,
7899 const AMDGPU::ImageDimIntrinsicInfo *Intr,
7900 SelectionDAG &DAG, bool WithChain) const {
7901 SDLoc DL(Op);
7902 MachineFunction &MF = DAG.getMachineFunction();
7903 const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
7904 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
7905 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
7906 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
7907 unsigned IntrOpcode = Intr->BaseOpcode;
7908 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
7909 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
7910 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
7911
7912 SmallVector<EVT, 3> ResultTypes(Op->values());
7913 SmallVector<EVT, 3> OrigResultTypes(Op->values());
7914 bool IsD16 = false;
7915 bool IsG16 = false;
7916 bool IsA16 = false;
7917 SDValue VData;
7918 int NumVDataDwords = 0;
7919 bool AdjustRetType = false;
7920 bool IsAtomicPacked16Bit = false;
7921
7922 // Offset of intrinsic arguments
7923 const unsigned ArgOffset = WithChain ? 2 : 1;
7924
7925 unsigned DMask;
7926 unsigned DMaskLanes = 0;
7927
7928 if (BaseOpcode->Atomic) {
7929 VData = Op.getOperand(2);
7930
7931 IsAtomicPacked16Bit =
7932 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7933 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7934
7935 bool Is64Bit = VData.getValueSizeInBits() == 64;
7936 if (BaseOpcode->AtomicX2) {
7937 SDValue VData2 = Op.getOperand(3);
7938 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
7939 {VData, VData2});
7940 if (Is64Bit)
7941 VData = DAG.getBitcast(MVT::v4i32, VData);
7942
7943 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
7944 DMask = Is64Bit ? 0xf : 0x3;
7945 NumVDataDwords = Is64Bit ? 4 : 2;
7946 } else {
7947 DMask = Is64Bit ? 0x3 : 0x1;
7948 NumVDataDwords = Is64Bit ? 2 : 1;
7949 }
7950 } else {
7951 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
7952 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
7953
7954 if (BaseOpcode->Store) {
7955 VData = Op.getOperand(2);
7956
7957 MVT StoreVT = VData.getSimpleValueType();
7958 if (StoreVT.getScalarType() == MVT::f16) {
7959 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
7960 return Op; // D16 is unsupported for this instruction
7961
7962 IsD16 = true;
7963 VData = handleD16VData(VData, DAG, true);
7964 }
7965
7966 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
7967 } else if (!BaseOpcode->NoReturn) {
7968 // Work out the num dwords based on the dmask popcount and underlying type
7969 // and whether packing is supported.
7970 MVT LoadVT = ResultTypes[0].getSimpleVT();
7971 if (LoadVT.getScalarType() == MVT::f16) {
7972 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
7973 return Op; // D16 is unsupported for this instruction
7974
7975 IsD16 = true;
7976 }
7977
7978 // Confirm that the return type is large enough for the dmask specified
7979 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
7980 (!LoadVT.isVector() && DMaskLanes > 1))
7981 return Op;
7982
7983 // The sq block of gfx8 and gfx9 do not estimate register use correctly
7984 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
7985 // instructions.
7986 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
7987 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
7988 NumVDataDwords = (DMaskLanes + 1) / 2;
7989 else
7990 NumVDataDwords = DMaskLanes;
7991
7992 AdjustRetType = true;
7993 }
7994 }
7995
7996 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
7997 SmallVector<SDValue, 4> VAddrs;
7998
7999 // Check for 16 bit addresses or derivatives and pack if true.
8000 MVT VAddrVT =
8001 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
8002 MVT VAddrScalarVT = VAddrVT.getScalarType();
8003 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8004 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8005
8006 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
8007 VAddrScalarVT = VAddrVT.getScalarType();
8008 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8009 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8010
8011 // Push back extra arguments.
8012 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
8013 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
8014 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
8015 // Special handling of bias when A16 is on. Bias is of type half but
8016 // occupies full 32-bit.
8017 SDValue Bias = DAG.getBuildVector(
8018 MVT::v2f16, DL,
8019 {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
8020 VAddrs.push_back(Bias);
8021 } else {
8022 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
8023 "Bias needs to be converted to 16 bit in A16 mode");
8024 VAddrs.push_back(Op.getOperand(ArgOffset + I));
8025 }
8026 }
8027
8028 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
8029 // 16 bit gradients are supported, but are tied to the A16 control
8030 // so both gradients and addresses must be 16 bit
8031 LLVM_DEBUG(
8032 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
8033 "require 16 bit args for both gradients and addresses");
8034 return Op;
8035 }
8036
8037 if (IsA16) {
8038 if (!ST->hasA16()) {
8039 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
8040 "support 16 bit addresses\n");
8041 return Op;
8042 }
8043 }
8044
8045 // We've dealt with incorrect input so we know that if IsA16, IsG16
8046 // are set then we have to compress/pack operands (either address,
8047 // gradient or both)
8048 // In the case where a16 and gradients are tied (no G16 support) then we
8049 // have already verified that both IsA16 and IsG16 are true
8050 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
8051 // Activate g16
8052 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
8053 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
8054 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
8055 }
8056
8057 // Add gradients (packed or unpacked)
8058 if (IsG16) {
8059 // Pack the gradients
8060 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
8061 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
8062 ArgOffset + Intr->GradientStart,
8063 ArgOffset + Intr->CoordStart, Intr->NumGradients);
8064 } else {
8065 for (unsigned I = ArgOffset + Intr->GradientStart;
8066 I < ArgOffset + Intr->CoordStart; I++)
8067 VAddrs.push_back(Op.getOperand(I));
8068 }
8069
8070 // Add addresses (packed or unpacked)
8071 if (IsA16) {
8072 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
8073 ArgOffset + Intr->CoordStart, VAddrEnd,
8074 0 /* No gradients */);
8075 } else {
8076 // Add uncompressed address
8077 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
8078 VAddrs.push_back(Op.getOperand(I));
8079 }
8080
8081 // If the register allocator cannot place the address registers contiguously
8082 // without introducing moves, then using the non-sequential address encoding
8083 // is always preferable, since it saves VALU instructions and is usually a
8084 // wash in terms of code size or even better.
8085 //
8086 // However, we currently have no way of hinting to the register allocator that
8087 // MIMG addresses should be placed contiguously when it is possible to do so,
8088 // so force non-NSA for the common 2-address case as a heuristic.
8089 //
8090 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
8091 // allocation when possible.
8092 //
8093 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
8094 // set of the remaining addresses.
8095 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
8096 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
8097 const bool UseNSA = ST->hasNSAEncoding() &&
8098 VAddrs.size() >= ST->getNSAThreshold(MF) &&
8099 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
8100 const bool UsePartialNSA =
8101 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
8102
8103 SDValue VAddr;
8104 if (UsePartialNSA) {
8105 VAddr = getBuildDwordsVector(DAG, DL,
8106 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
8107 }
8108 else if (!UseNSA) {
8109 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
8110 }
8111
8112 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
8113 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
8114 SDValue Unorm;
8115 if (!BaseOpcode->Sampler) {
8116 Unorm = True;
8117 } else {
8118 uint64_t UnormConst =
8119 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
8120
8121 Unorm = UnormConst ? True : False;
8122 }
8123
8124 SDValue TFE;
8125 SDValue LWE;
8126 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
8127 bool IsTexFail = false;
8128 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
8129 return Op;
8130
8131 if (IsTexFail) {
8132 if (!DMaskLanes) {
8133 // Expecting to get an error flag since TFC is on - and dmask is 0
8134 // Force dmask to be at least 1 otherwise the instruction will fail
8135 DMask = 0x1;
8136 DMaskLanes = 1;
8137 NumVDataDwords = 1;
8138 }
8139 NumVDataDwords += 1;
8140 AdjustRetType = true;
8141 }
8142
8143 // Has something earlier tagged that the return type needs adjusting
8144 // This happens if the instruction is a load or has set TexFailCtrl flags
8145 if (AdjustRetType) {
8146 // NumVDataDwords reflects the true number of dwords required in the return type
8147 if (DMaskLanes == 0 && !BaseOpcode->Store) {
8148 // This is a no-op load. This can be eliminated
8149 SDValue Undef = DAG.getUNDEF(Op.getValueType());
8150 if (isa<MemSDNode>(Op))
8151 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
8152 return Undef;
8153 }
8154
8155 EVT NewVT = NumVDataDwords > 1 ?
8156 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumVDataDwords)
8157 : MVT::i32;
8158
8159 ResultTypes[0] = NewVT;
8160 if (ResultTypes.size() == 3) {
8161 // Original result was aggregate type used for TexFailCtrl results
8162 // The actual instruction returns as a vector type which has now been
8163 // created. Remove the aggregate result.
8164 ResultTypes.erase(&ResultTypes[1]);
8165 }
8166 }
8167
8168 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
8169 if (BaseOpcode->Atomic)
8170 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
8171 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
8172 AMDGPU::CPol::VOLATILE))
8173 return Op;
8174
8175 SmallVector<SDValue, 26> Ops;
8176 if (BaseOpcode->Store || BaseOpcode->Atomic)
8177 Ops.push_back(VData); // vdata
8178 if (UsePartialNSA) {
8179 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
8180 Ops.push_back(VAddr);
8181 }
8182 else if (UseNSA)
8183 append_range(Ops, VAddrs);
8184 else
8185 Ops.push_back(VAddr);
8186 Ops.push_back(Op.getOperand(ArgOffset + Intr->RsrcIndex));
8187 if (BaseOpcode->Sampler)
8188 Ops.push_back(Op.getOperand(ArgOffset + Intr->SampIndex));
8189 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
8190 if (IsGFX10Plus)
8191 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
8192 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8193 Ops.push_back(Unorm);
8194 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
8195 Ops.push_back(IsA16 && // r128, a16 for gfx9
8196 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
8197 if (IsGFX10Plus)
8198 Ops.push_back(IsA16 ? True : False);
8199 if (!Subtarget->hasGFX90AInsts()) {
8200 Ops.push_back(TFE); //tfe
8201 } else if (TFE->getAsZExtVal()) {
8202 report_fatal_error("TFE is not supported on this GPU");
8203 }
8204 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8205 Ops.push_back(LWE); // lwe
8206 if (!IsGFX10Plus)
8207 Ops.push_back(DimInfo->DA ? True : False);
8208 if (BaseOpcode->HasD16)
8209 Ops.push_back(IsD16 ? True : False);
8210 if (isa<MemSDNode>(Op))
8211 Ops.push_back(Op.getOperand(0)); // chain
8212
8213 int NumVAddrDwords =
8214 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
8215 int Opcode = -1;
8216
8217 if (IsGFX12Plus) {
8218 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
8219 NumVDataDwords, NumVAddrDwords);
8220 } else if (IsGFX11Plus) {
8221 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8222 UseNSA ? AMDGPU::MIMGEncGfx11NSA
8223 : AMDGPU::MIMGEncGfx11Default,
8224 NumVDataDwords, NumVAddrDwords);
8225 } else if (IsGFX10Plus) {
8226 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8227 UseNSA ? AMDGPU::MIMGEncGfx10NSA
8228 : AMDGPU::MIMGEncGfx10Default,
8229 NumVDataDwords, NumVAddrDwords);
8230 } else {
8231 if (Subtarget->hasGFX90AInsts()) {
8232 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
8233 NumVDataDwords, NumVAddrDwords);
8234 if (Opcode == -1)
8236 "requested image instruction is not supported on this GPU");
8237 }
8238 if (Opcode == -1 &&
8240 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
8241 NumVDataDwords, NumVAddrDwords);
8242 if (Opcode == -1)
8243 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
8244 NumVDataDwords, NumVAddrDwords);
8245 }
8246 if (Opcode == -1)
8247 return Op;
8248
8249 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
8250 if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
8251 MachineMemOperand *MemRef = MemOp->getMemOperand();
8252 DAG.setNodeMemRefs(NewNode, {MemRef});
8253 }
8254
8255 if (BaseOpcode->AtomicX2) {
8256 SmallVector<SDValue, 1> Elt;
8257 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
8258 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
8259 }
8260 if (BaseOpcode->NoReturn)
8261 return SDValue(NewNode, 0);
8262 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
8263 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
8264 NumVDataDwords, IsAtomicPacked16Bit, DL);
8265}
8266
8267SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
8268 SDValue Offset, SDValue CachePolicy,
8269 SelectionDAG &DAG) const {
8270 MachineFunction &MF = DAG.getMachineFunction();
8271
8272 const DataLayout &DataLayout = DAG.getDataLayout();
8273 Align Alignment =
8274 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
8275
8276 MachineMemOperand *MMO = MF.getMachineMemOperand(
8277 MachinePointerInfo(),
8278 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
8279 MachineMemOperand::MOInvariant,
8280 VT.getStoreSize(), Alignment);
8281
8282 if (!Offset->isDivergent()) {
8283 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
8284
8285 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
8286 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
8287 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
8288 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
8289 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8290 SDValue BufferLoad =
8291 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
8292 DAG.getVTList(MVT::i32), Ops, VT, MMO);
8293 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
8294 }
8295
8296 // Widen vec3 load to vec4.
8297 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
8298 !Subtarget->hasScalarDwordx3Loads()) {
8299 EVT WidenedVT =
8300 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
8301 auto WidenedOp = DAG.getMemIntrinsicNode(
8302 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
8303 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
8304 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
8305 DAG.getVectorIdxConstant(0, DL));
8306 return Subvector;
8307 }
8308
8309 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
8310 DAG.getVTList(VT), Ops, VT, MMO);
8311 }
8312
8313 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
8314 // assume that the buffer is unswizzled.
8315 SDValue Ops[] = {
8316 DAG.getEntryNode(), // Chain
8317 Rsrc, // rsrc
8318 DAG.getConstant(0, DL, MVT::i32), // vindex
8319 {}, // voffset
8320 {}, // soffset
8321 {}, // offset
8322 CachePolicy, // cachepolicy
8323 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8324 };
8325 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8326 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
8327 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
8328 }
8329
8330 SmallVector<SDValue, 4> Loads;
8331 unsigned NumLoads = 1;
8332 MVT LoadVT = VT.getSimpleVT();
8333 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
8334 assert((LoadVT.getScalarType() == MVT::i32 ||
8335 LoadVT.getScalarType() == MVT::f32));
8336
8337 if (NumElts == 8 || NumElts == 16) {
8338 NumLoads = NumElts / 4;
8339 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
8340 }
8341
8342 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
8343
8344 // Use the alignment to ensure that the required offsets will fit into the
8345 // immediate offsets.
8346 setBufferOffsets(Offset, DAG, &Ops[3],
8347 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
8348
8349 uint64_t InstOffset = Ops[5]->getAsZExtVal();
8350 for (unsigned i = 0; i < NumLoads; ++i) {
8351 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
8352 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
8353 LoadVT, MMO, DAG));
8354 }
8355
8356 if (NumElts == 8 || NumElts == 16)
8357 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
8358
8359 return Loads[0];
8360}
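// Example (illustrative): a v8f32 s.buffer.load with a uniform offset stays
// a single SBUFFER_LOAD node, but with a divergent offset it is split into
// two 16-byte BUFFER_LOADs whose immediate offsets differ by 16 and whose
// results are concatenated; the Align(16 * NumLoads) request above keeps
// both immediate offsets representable.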
8361
8362SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
8363 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
8364 if (!Subtarget->hasArchitectedSGPRs())
8365 return {};
8366 SDLoc SL(Op);
8367 MVT VT = MVT::i32;
8368 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
8369 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
8370 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
8371}
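// Example (illustrative): with architected SGPRs the wave ID within the
// workgroup is (ttmp8 >> 25) & 0x1f, i.e. BFE_U32(TTMP8, 25, 5); without the
// feature this returns an empty SDValue and no wave ID is materialized here.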
8372
8373SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
8374 unsigned Dim,
8375 const ArgDescriptor &Arg) const {
8376 SDLoc SL(Op);
8377 MachineFunction &MF = DAG.getMachineFunction();
8378 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
8379 if (MaxID == 0)
8380 return DAG.getConstant(0, SL, MVT::i32);
8381
8382 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
8383 SDLoc(DAG.getEntryNode()), Arg);
8384
8385 // Don't bother inserting AssertZext for packed IDs since we're emitting the
8386 // masking operations anyway.
8387 //
8388 // TODO: We could assert the top bit is 0 for the source copy.
8389 if (Arg.isMasked())
8390 return Val;
8391
8392 // Preserve the known bits after expansion to a copy.
8394 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
8395 DAG.getValueType(SmallVT));
8396}
8397
8398SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
8399 SelectionDAG &DAG) const {
8400 MachineFunction &MF = DAG.getMachineFunction();
8401 auto MFI = MF.getInfo<SIMachineFunctionInfo>();
8402
8403 EVT VT = Op.getValueType();
8404 SDLoc DL(Op);
8405 unsigned IntrinsicID = Op.getConstantOperandVal(0);
8406
8407 // TODO: Should this propagate fast-math-flags?
8408
8409 switch (IntrinsicID) {
8410 case Intrinsic::amdgcn_implicit_buffer_ptr: {
8411 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
8412 return emitNonHSAIntrinsicError(DAG, DL, VT);
8413 return getPreloadedValue(DAG, *MFI, VT,
8415 }
8416 case Intrinsic::amdgcn_dispatch_ptr:
8417 case Intrinsic::amdgcn_queue_ptr: {
8418 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
8419 DiagnosticInfoUnsupported BadIntrin(
8420 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
8421 DL.getDebugLoc());
8422 DAG.getContext()->diagnose(BadIntrin);
8423 return DAG.getUNDEF(VT);
8424 }
8425
8426 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
8428 return getPreloadedValue(DAG, *MFI, VT, RegID);
8429 }
8430 case Intrinsic::amdgcn_implicitarg_ptr: {
8431 if (MFI->isEntryFunction())
8432 return getImplicitArgPtr(DAG, DL);
8433 return getPreloadedValue(DAG, *MFI, VT,
8435 }
8436 case Intrinsic::amdgcn_kernarg_segment_ptr: {
8438 // This only makes sense to call in a kernel, so just lower to null.
8439 return DAG.getConstant(0, DL, VT);
8440 }
8441
8442 return getPreloadedValue(DAG, *MFI, VT,
8444 }
8445 case Intrinsic::amdgcn_dispatch_id: {
8446 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
8447 }
8448 case Intrinsic::amdgcn_rcp:
8449 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
8450 case Intrinsic::amdgcn_rsq:
8451 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8452 case Intrinsic::amdgcn_rsq_legacy:
8454 return emitRemovedIntrinsicError(DAG, DL, VT);
8455 return SDValue();
8456 case Intrinsic::amdgcn_rcp_legacy:
8458 return emitRemovedIntrinsicError(DAG, DL, VT);
8459 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
8460 case Intrinsic::amdgcn_rsq_clamp: {
8462 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
8463
8464 Type *Type = VT.getTypeForEVT(*DAG.getContext());
8467
8468 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8469 SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
8470 DAG.getConstantFP(Max, DL, VT));
8471 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
8472 DAG.getConstantFP(Min, DL, VT));
8473 }
8474 case Intrinsic::r600_read_ngroups_x:
8475 if (Subtarget->isAmdHsaOS())
8476 return emitNonHSAIntrinsicError(DAG, DL, VT);
8477
8478 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8480 false);
8481 case Intrinsic::r600_read_ngroups_y:
8482 if (Subtarget->isAmdHsaOS())
8483 return emitNonHSAIntrinsicError(DAG, DL, VT);
8484
8485 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8487 false);
8488 case Intrinsic::r600_read_ngroups_z:
8489 if (Subtarget->isAmdHsaOS())
8490 return emitNonHSAIntrinsicError(DAG, DL, VT);
8491
8492 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8494 false);
8495 case Intrinsic::r600_read_global_size_x:
8496 if (Subtarget->isAmdHsaOS())
8497 return emitNonHSAIntrinsicError(DAG, DL, VT);
8498
8499 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8501 Align(4), false);
8502 case Intrinsic::r600_read_global_size_y:
8503 if (Subtarget->isAmdHsaOS())
8504 return emitNonHSAIntrinsicError(DAG, DL, VT);
8505
8506 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8508 Align(4), false);
8509 case Intrinsic::r600_read_global_size_z:
8510 if (Subtarget->isAmdHsaOS())
8511 return emitNonHSAIntrinsicError(DAG, DL, VT);
8512
8513 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8515 Align(4), false);
8516 case Intrinsic::r600_read_local_size_x:
8517 if (Subtarget->isAmdHsaOS())
8518 return emitNonHSAIntrinsicError(DAG, DL, VT);
8519
8520 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8522 case Intrinsic::r600_read_local_size_y:
8523 if (Subtarget->isAmdHsaOS())
8524 return emitNonHSAIntrinsicError(DAG, DL, VT);
8525
8526 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8528 case Intrinsic::r600_read_local_size_z:
8529 if (Subtarget->isAmdHsaOS())
8530 return emitNonHSAIntrinsicError(DAG, DL, VT);
8531
8532 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8534 case Intrinsic::amdgcn_workgroup_id_x:
8535 return getPreloadedValue(DAG, *MFI, VT,
8536 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
8537 case Intrinsic::amdgcn_workgroup_id_y:
8538 return getPreloadedValue(DAG, *MFI, VT,
8539 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
8540 case Intrinsic::amdgcn_workgroup_id_z:
8541 return getPreloadedValue(DAG, *MFI, VT,
8542 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
8543 case Intrinsic::amdgcn_wave_id:
8544 return lowerWaveID(DAG, Op);
8545 case Intrinsic::amdgcn_lds_kernel_id: {
8546 if (MFI->isEntryFunction())
8547 return getLDSKernelId(DAG, DL);
8548 return getPreloadedValue(DAG, *MFI, VT,
8550 }
8551 case Intrinsic::amdgcn_workitem_id_x:
8552 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
8553 case Intrinsic::amdgcn_workitem_id_y:
8554 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
8555 case Intrinsic::amdgcn_workitem_id_z:
8556 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
8557 case Intrinsic::amdgcn_wavefrontsize:
8559 SDLoc(Op), MVT::i32);
8560 case Intrinsic::amdgcn_s_buffer_load: {
8561 unsigned CPol = Op.getConstantOperandVal(3);
8562 // s_buffer_load, because of how it's optimized, can't be volatile
8563 // so reject ones with the volatile bit set.
8564 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
8567 return Op;
8568 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8569 DAG);
8570 }
8571 case Intrinsic::amdgcn_fdiv_fast:
8572 return lowerFDIV_FAST(Op, DAG);
8573 case Intrinsic::amdgcn_sin:
8574 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
8575
8576 case Intrinsic::amdgcn_cos:
8577 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
8578
8579 case Intrinsic::amdgcn_mul_u24:
8580 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1), Op.getOperand(2));
8581 case Intrinsic::amdgcn_mul_i24:
8582 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1), Op.getOperand(2));
8583
8584 case Intrinsic::amdgcn_log_clamp: {
8586 return SDValue();
8587
8588 return emitRemovedIntrinsicError(DAG, DL, VT);
8589 }
8590 case Intrinsic::amdgcn_fract:
8591 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
8592
8593 case Intrinsic::amdgcn_class:
8594 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
8595 Op.getOperand(1), Op.getOperand(2));
8596 case Intrinsic::amdgcn_div_fmas:
8597 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
8598 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8599 Op.getOperand(4));
8600
8601 case Intrinsic::amdgcn_div_fixup:
8602 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
8603 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8604
8605 case Intrinsic::amdgcn_div_scale: {
8606 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
8607
8608 // Translate to the operands expected by the machine instruction. The
8609 // first parameter must be the same as the first instruction.
8610 SDValue Numerator = Op.getOperand(1);
8611 SDValue Denominator = Op.getOperand(2);
8612
8613 // Note this order is opposite of the machine instruction's operations,
8614 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
8615 // intrinsic has the numerator as the first operand to match a normal
8616 // division operation.
8617
8618 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
8619
8620 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
8621 Denominator, Numerator);
8622 }
8623 case Intrinsic::amdgcn_icmp: {
8624 // There is a Pat that handles this variant, so return it as-is.
8625 if (Op.getOperand(1).getValueType() == MVT::i1 &&
8626 Op.getConstantOperandVal(2) == 0 &&
8627 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
8628 return Op;
8629 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
8630 }
8631 case Intrinsic::amdgcn_fcmp: {
8632 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
8633 }
8634 case Intrinsic::amdgcn_ballot:
8635 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
8636 case Intrinsic::amdgcn_fmed3:
8637 return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
8638 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8639 case Intrinsic::amdgcn_fdot2:
8640 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
8641 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8642 Op.getOperand(4));
8643 case Intrinsic::amdgcn_fmul_legacy:
8644 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
8645 Op.getOperand(1), Op.getOperand(2));
8646 case Intrinsic::amdgcn_sffbh:
8647 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
8648 case Intrinsic::amdgcn_sbfe:
8649 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
8650 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8651 case Intrinsic::amdgcn_ubfe:
8652 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
8653 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8654 case Intrinsic::amdgcn_cvt_pkrtz:
8655 case Intrinsic::amdgcn_cvt_pknorm_i16:
8656 case Intrinsic::amdgcn_cvt_pknorm_u16:
8657 case Intrinsic::amdgcn_cvt_pk_i16:
8658 case Intrinsic::amdgcn_cvt_pk_u16: {
8659 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
8660 EVT VT = Op.getValueType();
8661 unsigned Opcode;
8662
8663 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8664 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
8665 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8666 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
8667 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8668 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
8669 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8670 Opcode = AMDGPUISD::CVT_PK_I16_I32;
8671 else
8672 Opcode = AMDGPUISD::CVT_PK_U16_U32;
8673
8674 if (isTypeLegal(VT))
8675 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
8676
8677 SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
8678 Op.getOperand(1), Op.getOperand(2));
8679 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
8680 }
8681 case Intrinsic::amdgcn_fmad_ftz:
8682 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
8683 Op.getOperand(2), Op.getOperand(3));
8684
8685 case Intrinsic::amdgcn_if_break:
8686 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
8687 Op->getOperand(1), Op->getOperand(2)), 0);
8688
8689 case Intrinsic::amdgcn_groupstaticsize: {
8690 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
8691 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
8692 return Op;
8693
8694 const Module *M = MF.getFunction().getParent();
8695 const GlobalValue *GV =
8696 M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
8697 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
8698 SIInstrInfo::MO_ABS32_LO);
8699 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8700 }
8701 case Intrinsic::amdgcn_is_shared:
8702 case Intrinsic::amdgcn_is_private: {
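// Both queries are lowered by comparing the high 32 bits of the flat pointer
// against the aperture base of the corresponding segment (LOCAL for
// is_shared, PRIVATE for is_private); equality means the flat address maps
// into that segment.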
8703 SDLoc SL(Op);
8704 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ?
8705 AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
8706 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
8707 SDValue SrcVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32,
8708 Op.getOperand(1));
8709
8710 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
8711 DAG.getConstant(1, SL, MVT::i32));
8712 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
8713 }
8714 case Intrinsic::amdgcn_perm:
8715 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
8716 Op.getOperand(2), Op.getOperand(3));
8717 case Intrinsic::amdgcn_reloc_constant: {
8718 Module *M = const_cast<Module *>(MF.getFunction().getParent());
8719 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
8720 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
8721 auto RelocSymbol = cast<GlobalVariable>(
8722 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
8723 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
8724 SIInstrInfo::MO_ABS32_LO);
8725 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8726 }
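// For the SWMMAC intrinsics below, an index key operand that is not already
// i32 is any-extended or truncated to i32 and the intrinsic node is rebuilt
// with the canonical operand list.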
8727 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8728 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8729 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8730 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8731 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8732 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8733 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8734 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8735 if (Op.getOperand(4).getValueType() == MVT::i32)
8736 return SDValue();
8737
8738 SDLoc SL(Op);
8739 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
8740 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8741 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8742 Op.getOperand(3), IndexKeyi32);
8743 }
8744 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8745 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8746 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8747 if (Op.getOperand(6).getValueType() == MVT::i32)
8748 return SDValue();
8749
8750 SDLoc SL(Op);
8751 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
8752 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8753 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8754 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
8755 IndexKeyi32, Op.getOperand(7)});
8756 }
8757 case Intrinsic::amdgcn_addrspacecast_nonnull:
8758 return lowerADDRSPACECAST(Op, DAG);
8759 case Intrinsic::amdgcn_readlane:
8760 case Intrinsic::amdgcn_readfirstlane:
8761 case Intrinsic::amdgcn_writelane:
8762 case Intrinsic::amdgcn_permlane16:
8763 case Intrinsic::amdgcn_permlanex16:
8764 case Intrinsic::amdgcn_permlane64:
8765 return lowerLaneOp(*this, Op.getNode(), DAG);
8766 default:
8767 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8768 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
8769 return lowerImage(Op, ImageDimIntr, DAG, false);
8770
8771 return Op;
8772 }
8773}
8774
8775 // On targets that do not support a constant in the soffset field, turn a
8776 // zero soffset into SGPR_NULL to avoid generating an extra s_mov of zero.
8777 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
8778 const GCNSubtarget *Subtarget) {
8779 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
8780 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
8781 return SOffset;
8782}
8783
8784SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
8785 SelectionDAG &DAG,
8786 unsigned NewOpcode) const {
8787 SDLoc DL(Op);
8788
8789 SDValue VData = Op.getOperand(2);
8790 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8791 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8792 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8793 SDValue Ops[] = {
8794 Op.getOperand(0), // Chain
8795 VData, // vdata
8796 Rsrc, // rsrc
8797 DAG.getConstant(0, DL, MVT::i32), // vindex
8798 Offsets.first, // voffset
8799 SOffset, // soffset
8800 Offsets.second, // offset
8801 Op.getOperand(6), // cachepolicy
8802 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8803 };
8804
8805 auto *M = cast<MemSDNode>(Op);
8806
8807 EVT MemVT = VData.getValueType();
8808 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8809 M->getMemOperand());
8810}
8811
8812SDValue
8813SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
8814 unsigned NewOpcode) const {
8815 SDLoc DL(Op);
8816
8817 SDValue VData = Op.getOperand(2);
8818 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8819 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
8820 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
8821 SDValue Ops[] = {
8822 Op.getOperand(0), // Chain
8823 VData, // vdata
8824 Rsrc, // rsrc
8825 Op.getOperand(4), // vindex
8826 Offsets.first, // voffset
8827 SOffset, // soffset
8828 Offsets.second, // offset
8829 Op.getOperand(7), // cachepolicy
8830 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8831 };
8832
8833 auto *M = cast<MemSDNode>(Op);
8834
8835 EVT MemVT = VData.getValueType();
8836 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8837 M->getMemOperand());
8838}
8839
8840SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
8841 SelectionDAG &DAG) const {
8842 unsigned IntrID = Op.getConstantOperandVal(1);
8843 SDLoc DL(Op);
8844
8845 switch (IntrID) {
8846 case Intrinsic::amdgcn_ds_ordered_add:
8847 case Intrinsic::amdgcn_ds_ordered_swap: {
8848 MemSDNode *M = cast<MemSDNode>(Op);
8849 SDValue Chain = M->getOperand(0);
8850 SDValue M0 = M->getOperand(2);
8851 SDValue Value = M->getOperand(3);
8852 unsigned IndexOperand = M->getConstantOperandVal(7);
8853 unsigned WaveRelease = M->getConstantOperandVal(8);
8854 unsigned WaveDone = M->getConstantOperandVal(9);
8855
8856 unsigned OrderedCountIndex = IndexOperand & 0x3f;
8857 IndexOperand &= ~0x3f;
8858 unsigned CountDw = 0;
8859
8860 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
8861 CountDw = (IndexOperand >> 24) & 0xf;
8862 IndexOperand &= ~(0xf << 24);
8863
8864 if (CountDw < 1 || CountDw > 4) {
8865 report_fatal_error(
8866 "ds_ordered_count: dword count must be between 1 and 4");
8867 }
8868 }
8869
8870 if (IndexOperand)
8871 report_fatal_error("ds_ordered_count: bad index operand");
8872
8873 if (WaveDone && !WaveRelease)
8874 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
8875
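// Pack the ds_ordered_count immediate: offset0 is the ordered-count index
// scaled to a byte offset; offset1 packs wave_release, wave_done, the shader
// type (pre-GFX11), the add/swap selector and, on GFX10+, the dword count
// minus one.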
8876 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
8877 unsigned ShaderType =
8878 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
8879 unsigned Offset0 = OrderedCountIndex << 2;
8880 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
8881
8882 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
8883 Offset1 |= (CountDw - 1) << 6;
8884
8885 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
8886 Offset1 |= ShaderType << 2;
8887
8888 unsigned Offset = Offset0 | (Offset1 << 8);
8889
8890 SDValue Ops[] = {
8891 Chain,
8892 Value,
8893 DAG.getTargetConstant(Offset, DL, MVT::i16),
8894 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
8895 };
8896 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
8897 M->getVTList(), Ops, M->getMemoryVT(),
8898 M->getMemOperand());
8899 }
8900 case Intrinsic::amdgcn_raw_buffer_load:
8901 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8902 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8903 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8904 case Intrinsic::amdgcn_raw_buffer_load_format:
8905 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
8906 const bool IsFormat =
8907 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
8908 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
8909
8910 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8911 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
8912 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
8913 SDValue Ops[] = {
8914 Op.getOperand(0), // Chain
8915 Rsrc, // rsrc
8916 DAG.getConstant(0, DL, MVT::i32), // vindex
8917 Offsets.first, // voffset
8918 SOffset, // soffset
8919 Offsets.second, // offset
8920 Op.getOperand(5), // cachepolicy, swizzled buffer
8921 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8922 };
8923
8924 auto *M = cast<MemSDNode>(Op);
8925 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
8926 }
8927 case Intrinsic::amdgcn_struct_buffer_load:
8928 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8929 case Intrinsic::amdgcn_struct_buffer_load_format:
8930 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
8931 case Intrinsic::amdgcn_struct_atomic_buffer_load:
8932 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
8933 const bool IsFormat =
8934 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
8935 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
8936
8937 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8938 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8939 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8940 SDValue Ops[] = {
8941 Op.getOperand(0), // Chain
8942 Rsrc, // rsrc
8943 Op.getOperand(3), // vindex
8944 Offsets.first, // voffset
8945 SOffset, // soffset
8946 Offsets.second, // offset
8947 Op.getOperand(6), // cachepolicy, swizzled buffer
8948 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8949 };
8950
8951 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
8952 }
8953 case Intrinsic::amdgcn_raw_tbuffer_load:
8954 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
8955 MemSDNode *M = cast<MemSDNode>(Op);
8956 EVT LoadVT = Op.getValueType();
8957 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8958 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
8959 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
8960
8961 SDValue Ops[] = {
8962 Op.getOperand(0), // Chain
8963 Rsrc, // rsrc
8964 DAG.getConstant(0, DL, MVT::i32), // vindex
8965 Offsets.first, // voffset
8966 SOffset, // soffset
8967 Offsets.second, // offset
8968 Op.getOperand(5), // format
8969 Op.getOperand(6), // cachepolicy, swizzled buffer
8970 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8971 };
8972
8973 if (LoadVT.getScalarType() == MVT::f16)
8974 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8975 M, DAG, Ops);
8976 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
8977 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
8978 DAG);
8979 }
8980 case Intrinsic::amdgcn_struct_tbuffer_load:
8981 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
8982 MemSDNode *M = cast<MemSDNode>(Op);
8983 EVT LoadVT = Op.getValueType();
8984 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8985 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8986 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8987
8988 SDValue Ops[] = {
8989 Op.getOperand(0), // Chain
8990 Rsrc, // rsrc
8991 Op.getOperand(3), // vindex
8992 Offsets.first, // voffset
8993 SOffset, // soffset
8994 Offsets.second, // offset
8995 Op.getOperand(6), // format
8996 Op.getOperand(7), // cachepolicy, swizzled buffer
8997 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8998 };
8999
9000 if (LoadVT.getScalarType() == MVT::f16)
9001 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
9002 M, DAG, Ops);
9003 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9004 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9005 DAG);
9006 }
9007 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9008 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9009 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
9010 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9011 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9012 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
9013 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9014 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9015 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
9016 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9017 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9018 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
9019 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9020 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9021 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
9022 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9023 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9024 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
9025 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9026 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9027 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
9028 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9029 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9030 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9031 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9032 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9033 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9034 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9035 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9036 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
9037 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9038 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9039 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
9040 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9041 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9042 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
9043 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9044 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9045 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
9046 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9047 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9048 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9049 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9050 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9051 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9052 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9053 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9054 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9055 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9056 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9057 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9058 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9059 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9060 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9061 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9062 return lowerRawBufferAtomicIntrin(Op, DAG,
9063 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9064 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9065 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9066 return lowerStructBufferAtomicIntrin(Op, DAG,
9067 AMDGPUISD::BUFFER_ATOMIC_SWAP);
9068 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9069 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9070 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9071 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9072 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9073 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9074 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9075 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9076 return lowerStructBufferAtomicIntrin(Op, DAG,
9077 AMDGPUISD::BUFFER_ATOMIC_SMIN);
9078 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9079 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9080 return lowerStructBufferAtomicIntrin(Op, DAG,
9081 AMDGPUISD::BUFFER_ATOMIC_UMIN);
9082 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9083 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9084 return lowerStructBufferAtomicIntrin(Op, DAG,
9085 AMDGPUISD::BUFFER_ATOMIC_SMAX);
9086 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9087 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9088 return lowerStructBufferAtomicIntrin(Op, DAG,
9089 AMDGPUISD::BUFFER_ATOMIC_UMAX);
9090 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9091 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9092 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9093 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9094 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9095 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9096 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9097 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9098 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9099 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9100 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9101 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9102 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9103 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9104 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9105 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9106 return lowerStructBufferAtomicIntrin(Op, DAG,
9107 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9108
9109 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9110 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9111 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
9112 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9113 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9114 SDValue Ops[] = {
9115 Op.getOperand(0), // Chain
9116 Op.getOperand(2), // src
9117 Op.getOperand(3), // cmp
9118 Rsrc, // rsrc
9119 DAG.getConstant(0, DL, MVT::i32), // vindex
9120 Offsets.first, // voffset
9121 SOffset, // soffset
9122 Offsets.second, // offset
9123 Op.getOperand(7), // cachepolicy
9124 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9125 };
9126 EVT VT = Op.getValueType();
9127 auto *M = cast<MemSDNode>(Op);
9128
9129 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9130 Op->getVTList(), Ops, VT, M->getMemOperand());
9131 }
9132 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9133 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9134 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
9135 auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
9136 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
9137 SDValue Ops[] = {
9138 Op.getOperand(0), // Chain
9139 Op.getOperand(2), // src
9140 Op.getOperand(3), // cmp
9141 Rsrc, // rsrc
9142 Op.getOperand(5), // vindex
9143 Offsets.first, // voffset
9144 SOffset, // soffset
9145 Offsets.second, // offset
9146 Op.getOperand(8), // cachepolicy
9147 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9148 };
9149 EVT VT = Op.getValueType();
9150 auto *M = cast<MemSDNode>(Op);
9151
9152 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9153 Op->getVTList(), Ops, VT, M->getMemOperand());
9154 }
9155 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9156 MemSDNode *M = cast<MemSDNode>(Op);
9157 SDValue NodePtr = M->getOperand(2);
9158 SDValue RayExtent = M->getOperand(3);
9159 SDValue RayOrigin = M->getOperand(4);
9160 SDValue RayDir = M->getOperand(5);
9161 SDValue RayInvDir = M->getOperand(6);
9162 SDValue TDescr = M->getOperand(7);
9163
9164 assert(NodePtr.getValueType() == MVT::i32 ||
9165 NodePtr.getValueType() == MVT::i64);
9166 assert(RayDir.getValueType() == MVT::v3f16 ||
9167 RayDir.getValueType() == MVT::v3f32);
9168
9169 if (!Subtarget->hasGFX10_AEncoding()) {
9170 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
9171 return SDValue();
9172 }
9173
9174 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
9175 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9176 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9177 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
9178 const bool Is64 = NodePtr.getValueType() == MVT::i64;
9179 const unsigned NumVDataDwords = 4;
9180 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9181 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9182 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
9183 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
9184 IsGFX12Plus;
9185 const unsigned BaseOpcodes[2][2] = {
9186 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9187 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9188 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9189 int Opcode;
9190 if (UseNSA) {
9191 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9192 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9193 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9194 : AMDGPU::MIMGEncGfx10NSA,
9195 NumVDataDwords, NumVAddrDwords);
9196 } else {
9197 assert(!IsGFX12Plus);
9198 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9199 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9200 : AMDGPU::MIMGEncGfx10Default,
9201 NumVDataDwords, NumVAddrDwords);
9202 }
9203 assert(Opcode != -1);
9204
9205 SmallVector<SDValue, 16> Ops;
9206
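// packLanes (below) appends the three components of a vector operand to Ops:
// 32-bit lanes are pushed as individual dwords, while f16 lanes are packed in
// pairs into dwords, sharing a dword with the previous operand when the
// element count is not dword-aligned.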
9207 auto packLanes = [&DAG, &Ops, &DL] (SDValue Op, bool IsAligned) {
9208 SmallVector<SDValue, 3> Lanes;
9209 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
9210 if (Lanes[0].getValueSizeInBits() == 32) {
9211 for (unsigned I = 0; I < 3; ++I)
9212 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
9213 } else {
9214 if (IsAligned) {
9215 Ops.push_back(
9216 DAG.getBitcast(MVT::i32,
9217 DAG.getBuildVector(MVT::v2f16, DL,
9218 { Lanes[0], Lanes[1] })));
9219 Ops.push_back(Lanes[2]);
9220 } else {
9221 SDValue Elt0 = Ops.pop_back_val();
9222 Ops.push_back(
9223 DAG.getBitcast(MVT::i32,
9224 DAG.getBuildVector(MVT::v2f16, DL,
9225 { Elt0, Lanes[0] })));
9226 Ops.push_back(
9227 DAG.getBitcast(MVT::i32,
9228 DAG.getBuildVector(MVT::v2f16, DL,
9229 { Lanes[1], Lanes[2] })));
9230 }
9231 }
9232 };
9233
9234 if (UseNSA && IsGFX11Plus) {
9235 Ops.push_back(NodePtr);
9236 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9237 Ops.push_back(RayOrigin);
9238 if (IsA16) {
9239 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
9240 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
9241 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
9242 for (unsigned I = 0; I < 3; ++I) {
9243 MergedLanes.push_back(DAG.getBitcast(
9244 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
9245 {DirLanes[I], InvDirLanes[I]})));
9246 }
9247 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
9248 } else {
9249 Ops.push_back(RayDir);
9250 Ops.push_back(RayInvDir);
9251 }
9252 } else {
9253 if (Is64)
9254 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
9255 2);
9256 else
9257 Ops.push_back(NodePtr);
9258
9259 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9260 packLanes(RayOrigin, true);
9261 packLanes(RayDir, true);
9262 packLanes(RayInvDir, false);
9263 }
9264
9265 if (!UseNSA) {
9266 // Build a single vector containing all the operands so far prepared.
9267 if (NumVAddrDwords > 12) {
9268 SDValue Undef = DAG.getUNDEF(MVT::i32);
9269 Ops.append(16 - Ops.size(), Undef);
9270 }
9271 assert(Ops.size() >= 8 && Ops.size() <= 12);
9272 SDValue MergedOps = DAG.getBuildVector(
9273 MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
9274 Ops.clear();
9275 Ops.push_back(MergedOps);
9276 }
9277
9278 Ops.push_back(TDescr);
9279 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
9280 Ops.push_back(M->getChain());
9281
9282 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
9283 MachineMemOperand *MemRef = M->getMemOperand();
9284 DAG.setNodeMemRefs(NewNode, {MemRef});
9285 return SDValue(NewNode, 0);
9286 }
9287 case Intrinsic::amdgcn_global_atomic_fmin:
9288 case Intrinsic::amdgcn_global_atomic_fmax:
9289 case Intrinsic::amdgcn_global_atomic_fmin_num:
9290 case Intrinsic::amdgcn_global_atomic_fmax_num:
9291 case Intrinsic::amdgcn_flat_atomic_fmin:
9292 case Intrinsic::amdgcn_flat_atomic_fmax:
9293 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9294 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9295 MemSDNode *M = cast<MemSDNode>(Op);
9296 SDValue Ops[] = {
9297 M->getOperand(0), // Chain
9298 M->getOperand(2), // Ptr
9299 M->getOperand(3) // Value
9300 };
9301 unsigned Opcode = 0;
9302 switch (IntrID) {
9303 case Intrinsic::amdgcn_global_atomic_fmin:
9304 case Intrinsic::amdgcn_global_atomic_fmin_num:
9305 case Intrinsic::amdgcn_flat_atomic_fmin:
9306 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9307 Opcode = ISD::ATOMIC_LOAD_FMIN;
9308 break;
9309 }
9310 case Intrinsic::amdgcn_global_atomic_fmax:
9311 case Intrinsic::amdgcn_global_atomic_fmax_num:
9312 case Intrinsic::amdgcn_flat_atomic_fmax:
9313 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9314 Opcode = ISD::ATOMIC_LOAD_FMAX;
9315 break;
9316 }
9317 default:
9318 llvm_unreachable("unhandled atomic opcode");
9319 }
9320 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
9321 Ops, M->getMemOperand());
9322 }
9323 case Intrinsic::amdgcn_s_get_barrier_state: {
9324 SDValue Chain = Op->getOperand(0);
9325 SmallVector<SDValue, 2> Ops;
9326 unsigned Opc;
9327 bool IsInlinableBarID = false;
9328 int64_t BarID;
9329
9330 if (isa<ConstantSDNode>(Op->getOperand(2))) {
9331 BarID = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue();
9332 IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarID);
9333 }
9334
9335 if (IsInlinableBarID) {
9336 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9337 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
9338 Ops.push_back(K);
9339 } else {
9340 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9341 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(2));
9342 Ops.push_back(M0Val.getValue(0));
9343 }
9344
9345 auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9346 return SDValue(NewMI, 0);
9347 }
9348 default:
9349
9350 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9351 AMDGPU::getImageDimIntrinsicInfo(IntrID))
9352 return lowerImage(Op, ImageDimIntr, DAG, true);
9353
9354 return SDValue();
9355 }
9356}
9357
9358// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
9359// dwordx4 if on SI and handle TFE loads.
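// With TFE the node has an extra status result, so the value and status are
// fetched as one wider vector of dwords and split back apart below.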
9360SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
9361 SDVTList VTList,
9362 ArrayRef<SDValue> Ops, EVT MemVT,
9363 MachineMemOperand *MMO,
9364 SelectionDAG &DAG) const {
9365 LLVMContext &C = *DAG.getContext();
9366 MachineFunction &MF = DAG.getMachineFunction();
9367 EVT VT = VTList.VTs[0];
9368
9369 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
9370 bool IsTFE = VTList.NumVTs == 3;
9371 if (IsTFE) {
9372 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
9373 unsigned NumOpDWords = NumValueDWords + 1;
9374 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
9375 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
9376 MachineMemOperand *OpDWordsMMO =
9377 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
9378 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
9379 OpDWordsVT, OpDWordsMMO, DAG);
9380 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
9381 DAG.getVectorIdxConstant(NumValueDWords, DL));
9382 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
9383 SDValue ValueDWords =
9384 NumValueDWords == 1
9385 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
9386 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
9387 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
9388 ZeroIdx);
9389 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
9390 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
9391 }
9392
9393 if (!Subtarget->hasDwordx3LoadStores() &&
9394 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9395 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
9396 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
9397 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
9398 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
9399 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
9400 WidenedMemVT, WidenedMMO);
9401 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
9402 DAG.getVectorIdxConstant(0, DL));
9403 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
9404 }
9405
9406 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
9407}
9408
9409SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
9410 bool ImageStore) const {
9411 EVT StoreVT = VData.getValueType();
9412
9413 // No change for f16 and legal vector D16 types.
9414 if (!StoreVT.isVector())
9415 return VData;
9416
9417 SDLoc DL(VData);
9418 unsigned NumElements = StoreVT.getVectorNumElements();
9419
9420 if (Subtarget->hasUnpackedD16VMem()) {
9421 // We need to unpack the packed data to store.
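// For example, a v2f16 value becomes two zero-extended i32 components on
// targets with unpacked D16 memory instructions.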
9422 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9423 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9424
9425 EVT EquivStoreVT =
9426 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
9427 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
9428 return DAG.UnrollVectorOp(ZExt.getNode());
9429 }
9430
9431 // The sq block of gfx8.1 does not estimate register use correctly for d16
9432 // image store instructions. The data operand is computed as if it were not a
9433 // d16 image instruction.
9434 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
9435 // Bitcast to i16
9436 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9437 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9438
9439 // Decompose into scalars
9440 SmallVector<SDValue, 4> Elts;
9441 DAG.ExtractVectorElements(IntVData, Elts);
9442
9443 // Group pairs of i16 into v2i16 and bitcast to i32
9444 SmallVector<SDValue, 4> PackedElts;
9445 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
9446 SDValue Pair =
9447 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
9448 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9449 PackedElts.push_back(IntPair);
9450 }
9451 if ((NumElements % 2) == 1) {
9452 // Handle v3i16
9453 unsigned I = Elts.size() / 2;
9454 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
9455 {Elts[I * 2], DAG.getUNDEF(MVT::i16)});
9456 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9457 PackedElts.push_back(IntPair);
9458 }
9459
9460 // Pad using UNDEF
9461 PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32));
9462
9463 // Build final vector
9464 EVT VecVT =
9465 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
9466 return DAG.getBuildVector(VecVT, DL, PackedElts);
9467 }
9468
9469 if (NumElements == 3) {
9470 EVT IntStoreVT =
9471 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
9472 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9473
9474 EVT WidenedStoreVT = EVT::getVectorVT(
9475 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
9476 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
9477 WidenedStoreVT.getStoreSizeInBits());
9478 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
9479 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
9480 }
9481
9482 assert(isTypeLegal(StoreVT));
9483 return VData;
9484}
9485
9486SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
9487 SelectionDAG &DAG) const {
9488 SDLoc DL(Op);
9489 SDValue Chain = Op.getOperand(0);
9490 unsigned IntrinsicID = Op.getConstantOperandVal(1);
9491 MachineFunction &MF = DAG.getMachineFunction();
9492
9493 switch (IntrinsicID) {
9494 case Intrinsic::amdgcn_exp_compr: {
9495 if (!Subtarget->hasCompressedExport()) {
9496 DiagnosticInfoUnsupported BadIntrin(
9497 DAG.getMachineFunction().getFunction(),
9498 "intrinsic not supported on subtarget", DL.getDebugLoc());
9499 DAG.getContext()->diagnose(BadIntrin);
9500 }
9501 SDValue Src0 = Op.getOperand(4);
9502 SDValue Src1 = Op.getOperand(5);
9503 // Hack around illegal type on SI by directly selecting it.
9504 if (isTypeLegal(Src0.getValueType()))
9505 return SDValue();
9506
9507 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
9508 SDValue Undef = DAG.getUNDEF(MVT::f32);
9509 const SDValue Ops[] = {
9510 Op.getOperand(2), // tgt
9511 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
9512 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
9513 Undef, // src2
9514 Undef, // src3
9515 Op.getOperand(7), // vm
9516 DAG.getTargetConstant(1, DL, MVT::i1), // compr
9517 Op.getOperand(3), // en
9518 Op.getOperand(0) // Chain
9519 };
9520
9521 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9522 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
9523 }
9524 case Intrinsic::amdgcn_s_barrier: {
9525 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
9526 if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
9527 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
9528 if (WGSize <= ST.getWavefrontSize())
9529 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
9530 Op.getOperand(0)), 0);
9531 }
9532
9533 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
9534 if (ST.hasSplitBarriers()) {
9535 SDValue K =
9536 DAG.getTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
9537 SDValue BarSignal =
9538 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
9539 MVT::Other, K, Op.getOperand(0)),
9540 0);
9541 SDValue BarWait =
9542 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
9543 BarSignal.getValue(0)),
9544 0);
9545 return BarWait;
9546 }
9547
9548 return SDValue();
9549 };
9550
9551 case Intrinsic::amdgcn_struct_tbuffer_store:
9552 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9553 SDValue VData = Op.getOperand(2);
9554 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9555 if (IsD16)
9556 VData = handleD16VData(VData, DAG);
9557 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9558 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9559 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9560 SDValue Ops[] = {
9561 Chain,
9562 VData, // vdata
9563 Rsrc, // rsrc
9564 Op.getOperand(4), // vindex
9565 Offsets.first, // voffset
9566 SOffset, // soffset
9567 Offsets.second, // offset
9568 Op.getOperand(7), // format
9569 Op.getOperand(8), // cachepolicy, swizzled buffer
9570 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9571 };
9572 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9573 AMDGPUISD::TBUFFER_STORE_FORMAT;
9574 MemSDNode *M = cast<MemSDNode>(Op);
9575 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9576 M->getMemoryVT(), M->getMemOperand());
9577 }
9578
9579 case Intrinsic::amdgcn_raw_tbuffer_store:
9580 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9581 SDValue VData = Op.getOperand(2);
9582 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9583 if (IsD16)
9584 VData = handleD16VData(VData, DAG);
9585 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9586 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
9587 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9588 SDValue Ops[] = {
9589 Chain,
9590 VData, // vdata
9591 Rsrc, // rsrc
9592 DAG.getConstant(0, DL, MVT::i32), // vindex
9593 Offsets.first, // voffset
9594 SOffset, // soffset
9595 Offsets.second, // offset
9596 Op.getOperand(6), // format
9597 Op.getOperand(7), // cachepolicy, swizzled buffer
9598 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9599 };
9600 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9601 AMDGPUISD::TBUFFER_STORE_FORMAT;
9602 MemSDNode *M = cast<MemSDNode>(Op);
9603 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9604 M->getMemoryVT(), M->getMemOperand());
9605 }
9606
9607 case Intrinsic::amdgcn_raw_buffer_store:
9608 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9609 case Intrinsic::amdgcn_raw_buffer_store_format:
9610 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9611 const bool IsFormat =
9612 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9613 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9614
9615 SDValue VData = Op.getOperand(2);
9616 EVT VDataVT = VData.getValueType();
9617 EVT EltType = VDataVT.getScalarType();
9618 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9619 if (IsD16) {
9620 VData = handleD16VData(VData, DAG);
9621 VDataVT = VData.getValueType();
9622 }
9623
9624 if (!isTypeLegal(VDataVT)) {
9625 VData =
9626 DAG.getNode(ISD::BITCAST, DL,
9627 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9628 }
9629
9630 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9631 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
9632 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9633 SDValue Ops[] = {
9634 Chain,
9635 VData,
9636 Rsrc,
9637 DAG.getConstant(0, DL, MVT::i32), // vindex
9638 Offsets.first, // voffset
9639 SOffset, // soffset
9640 Offsets.second, // offset
9641 Op.getOperand(6), // cachepolicy, swizzled buffer
9642 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9643 };
9644 unsigned Opc =
9645 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9646 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9647 MemSDNode *M = cast<MemSDNode>(Op);
9648
9649 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9650 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9651 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
9652
9653 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9654 M->getMemoryVT(), M->getMemOperand());
9655 }
9656
9657 case Intrinsic::amdgcn_struct_buffer_store:
9658 case Intrinsic::amdgcn_struct_ptr_buffer_store:
9659 case Intrinsic::amdgcn_struct_buffer_store_format:
9660 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9661 const bool IsFormat =
9662 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9663 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9664
9665 SDValue VData = Op.getOperand(2);
9666 EVT VDataVT = VData.getValueType();
9667 EVT EltType = VDataVT.getScalarType();
9668 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9669
9670 if (IsD16) {
9671 VData = handleD16VData(VData, DAG);
9672 VDataVT = VData.getValueType();
9673 }
9674
9675 if (!isTypeLegal(VDataVT)) {
9676 VData =
9677 DAG.getNode(ISD::BITCAST, DL,
9678 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9679 }
9680
9681 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9682 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9683 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9684 SDValue Ops[] = {
9685 Chain,
9686 VData,
9687 Rsrc,
9688 Op.getOperand(4), // vindex
9689 Offsets.first, // voffset
9690 SOffset, // soffset
9691 Offsets.second, // offset
9692 Op.getOperand(7), // cachepolicy, swizzled buffer
9693 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9694 };
9695 unsigned Opc =
9696 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9697 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9698 MemSDNode *M = cast<MemSDNode>(Op);
9699
9700 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9701 EVT VDataType = VData.getValueType().getScalarType();
9702 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9703 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9704
9705 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9706 M->getMemoryVT(), M->getMemOperand());
9707 }
9708 case Intrinsic::amdgcn_raw_buffer_load_lds:
9709 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9710 case Intrinsic::amdgcn_struct_buffer_load_lds:
9711 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9712 assert(!AMDGPU::isGFX12Plus(*Subtarget));
9713 unsigned Opc;
9714 bool HasVIndex =
9715 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9716 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9717 unsigned OpOffset = HasVIndex ? 1 : 0;
9718 SDValue VOffset = Op.getOperand(5 + OpOffset);
9719 bool HasVOffset = !isNullConstant(VOffset);
9720 unsigned Size = Op->getConstantOperandVal(4);
9721
9722 switch (Size) {
9723 default:
9724 return SDValue();
9725 case 1:
9726 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9727 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9728 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9729 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9730 break;
9731 case 2:
9732 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9733 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9734 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9735 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9736 break;
9737 case 4:
9738 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9739 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9740 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9741 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
9742 break;
9743 }
9744
9745 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9746
9747 SmallVector<SDValue, 8> Ops;
9748
9749 if (HasVIndex && HasVOffset)
9750 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
9751 { Op.getOperand(5), // VIndex
9752 VOffset }));
9753 else if (HasVIndex)
9754 Ops.push_back(Op.getOperand(5));
9755 else if (HasVOffset)
9756 Ops.push_back(VOffset);
9757
9758 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9759 Ops.push_back(Rsrc);
9760 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
9761 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
9762 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
9763 Ops.push_back(
9764 DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol
9765 Ops.push_back(DAG.getTargetConstant(
9766 Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0, DL, MVT::i8)); // swz
9767 Ops.push_back(M0Val.getValue(0)); // Chain
9768 Ops.push_back(M0Val.getValue(1)); // Glue
9769
9770 auto *M = cast<MemSDNode>(Op);
9771 MachineMemOperand *LoadMMO = M->getMemOperand();
9772 // Don't set the offset value here because the pointer points to the base of
9773 // the buffer.
9774 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9775
9776 MachinePointerInfo StorePtrI = LoadPtrI;
9777 LoadPtrI.V = PoisonValue::get(
9778 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
9779 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
9780 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
9781
9782 auto F = LoadMMO->getFlags() &
9783 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9784 LoadMMO =
9785 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
9786 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9787
9788 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9789 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
9790 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9791
9792 auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
9793 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9794
9795 return SDValue(Load, 0);
9796 }
9797 case Intrinsic::amdgcn_global_load_lds: {
9798 unsigned Opc;
9799 unsigned Size = Op->getConstantOperandVal(4);
9800 switch (Size) {
9801 default:
9802 return SDValue();
9803 case 1:
9804 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
9805 break;
9806 case 2:
9807 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
9808 break;
9809 case 4:
9810 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
9811 break;
9812 }
9813
9814 auto *M = cast<MemSDNode>(Op);
9815 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9816
9817 SmallVector<SDValue, 6> Ops;
9818
9819 SDValue Addr = Op.getOperand(2); // Global ptr
9820 SDValue VOffset;
9821 // Try to split SAddr and VOffset. Global and LDS pointers share the same
9822 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
9823 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
9824 SDValue LHS = Addr.getOperand(0);
9825 SDValue RHS = Addr.getOperand(1);
9826
9827 if (LHS->isDivergent())
9828 std::swap(LHS, RHS);
9829
9830 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
9831 RHS.getOperand(0).getValueType() == MVT::i32) {
9832 // add (i64 sgpr), (zero_extend (i32 vgpr))
9833 Addr = LHS;
9834 VOffset = RHS.getOperand(0);
9835 }
9836 }
9837
9838 Ops.push_back(Addr);
9839 if (!Addr->isDivergent()) {
9840 Opc = AMDGPU::getGlobalSaddrOp(Opc);
9841 if (!VOffset)
9842 VOffset = SDValue(
9843 DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
9844 DAG.getTargetConstant(0, DL, MVT::i32)), 0);
9845 Ops.push_back(VOffset);
9846 }
9847
9848 Ops.push_back(Op.getOperand(5)); // Offset
9849 Ops.push_back(Op.getOperand(6)); // CPol
9850 Ops.push_back(M0Val.getValue(0)); // Chain
9851 Ops.push_back(M0Val.getValue(1)); // Glue
9852
9853 MachineMemOperand *LoadMMO = M->getMemOperand();
9854 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9855 LoadPtrI.Offset = Op->getConstantOperandVal(5);
9856 MachinePointerInfo StorePtrI = LoadPtrI;
9857 LoadPtrI.V = PoisonValue::get(
9858 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
9859 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
9860 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
9861 auto F = LoadMMO->getFlags() &
9862 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9863 LoadMMO =
9864 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
9865 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9866 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9867 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
9868 LoadMMO->getAAInfo());
9869
9870 auto Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9871 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9872
9873 return SDValue(Load, 0);
9874 }
9875 case Intrinsic::amdgcn_end_cf:
9876 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
9877 Op->getOperand(2), Chain), 0);
9878 case Intrinsic::amdgcn_s_barrier_init:
9879 case Intrinsic::amdgcn_s_barrier_join:
9880 case Intrinsic::amdgcn_s_wakeup_barrier: {
9881 SDValue Chain = Op->getOperand(0);
9882 SmallVector<SDValue, 2> Ops;
9883 SDValue BarOp = Op->getOperand(2);
9884 unsigned Opc;
9885 bool IsInlinableBarID = false;
9886 int64_t BarVal;
9887
9888 if (isa<ConstantSDNode>(BarOp)) {
9889 BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue();
9890 IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarVal);
9891 }
9892
9893 if (IsInlinableBarID) {
9894 switch (IntrinsicID) {
9895 default:
9896 return SDValue();
9897 case Intrinsic::amdgcn_s_barrier_init:
9898 Opc = AMDGPU::S_BARRIER_INIT_IMM;
9899 break;
9900 case Intrinsic::amdgcn_s_barrier_join:
9901 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
9902 break;
9903 case Intrinsic::amdgcn_s_wakeup_barrier:
9904 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
9905 break;
9906 }
9907
9908 SDValue K = DAG.getTargetConstant(BarVal, DL, MVT::i32);
9909 Ops.push_back(K);
9910 } else {
9911 switch (IntrinsicID) {
9912 default:
9913 return SDValue();
9914 case Intrinsic::amdgcn_s_barrier_init:
9915 Opc = AMDGPU::S_BARRIER_INIT_M0;
9916 break;
9917 case Intrinsic::amdgcn_s_barrier_join:
9918 Opc = AMDGPU::S_BARRIER_JOIN_M0;
9919 break;
9920 case Intrinsic::amdgcn_s_wakeup_barrier:
9921 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
9922 break;
9923 }
9924 }
9925
9926 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) {
9927 SDValue M0Val;
9928 // Member count will be read from M0[16:22]
9929 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, Op.getOperand(3),
9930 DAG.getShiftAmountConstant(16, MVT::i32, DL));
9931
9932 if (!IsInlinableBarID) {
9933 // If reference to barrier id is not an inline constant then it must be
9934 // referenced with M0[4:0]. Perform an OR with the member count to
9935 // include it in M0.
9936 M0Val = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32,
9937 Op.getOperand(2), M0Val),
9938 0);
9939 }
9940 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
9941 } else if (!IsInlinableBarID) {
9942 Ops.push_back(copyToM0(DAG, Chain, DL, BarOp).getValue(0));
9943 }
9944
9945 auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9946 return SDValue(NewMI, 0);
9947 }
9948 default: {
9949 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9950 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
9951 return lowerImage(Op, ImageDimIntr, DAG, true);
9952
9953 return Op;
9954 }
9955 }
9956}
9957
9958// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
9959// offset (the offset that is included in bounds checking and swizzling, to be
9960// split between the instruction's voffset and immoffset fields) and soffset
9961// (the offset that is excluded from bounds checking and swizzling, to go in
9962// the instruction's soffset field). This function takes the first kind of
9963// offset and figures out how to split it between voffset and immoffset.
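// For example, with a maximum immediate of 4095, a combined constant offset
// of 4100 is split into an immoffset of 4 and a voffset add of 4096.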
9964std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
9965 SDValue Offset, SelectionDAG &DAG) const {
9966 SDLoc DL(Offset);
9967 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
9968 SDValue N0 = Offset;
9969 ConstantSDNode *C1 = nullptr;
9970
9971 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
9972 N0 = SDValue();
9973 else if (DAG.isBaseWithConstantOffset(N0)) {
9974 C1 = cast<ConstantSDNode>(N0.getOperand(1));
9975 N0 = N0.getOperand(0);
9976 }
9977
9978 if (C1) {
9979 unsigned ImmOffset = C1->getZExtValue();
9980 // If the immediate value is too big for the immoffset field, put only bits
9981 // that would normally fit in the immoffset field. The remaining value that
9982 // is copied/added for the voffset field is a large power of 2, and it
9983 // stands more chance of being CSEd with the copy/add for another similar
9984 // load/store.
9985 // However, do not do that rounding down if that is a negative
9986 // number, as it appears to be illegal to have a negative offset in the
9987 // vgpr, even if adding the immediate offset makes it positive.
9988 unsigned Overflow = ImmOffset & ~MaxImm;
9989 ImmOffset -= Overflow;
9990 if ((int32_t)Overflow < 0) {
9991 Overflow += ImmOffset;
9992 ImmOffset = 0;
9993 }
9994 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
9995 if (Overflow) {
9996 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
9997 if (!N0)
9998 N0 = OverflowVal;
9999 else {
10000 SDValue Ops[] = { N0, OverflowVal };
10001 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
10002 }
10003 }
10004 }
10005 if (!N0)
10006 N0 = DAG.getConstant(0, DL, MVT::i32);
10007 if (!C1)
10008 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
10009 return {N0, SDValue(C1, 0)};
10010}
10011
10012// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
10013// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
10014// pointed to by Offsets.
10015void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
10016 SelectionDAG &DAG, SDValue *Offsets,
10017 Align Alignment) const {
10018 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10019 SDLoc DL(CombinedOffset);
10020 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
10021 uint32_t Imm = C->getZExtValue();
10022 uint32_t SOffset, ImmOffset;
10023 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10024 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
10025 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10026 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10027 return;
10028 }
10029 }
10030 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
10031 SDValue N0 = CombinedOffset.getOperand(0);
10032 SDValue N1 = CombinedOffset.getOperand(1);
10033 uint32_t SOffset, ImmOffset;
10034 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10035 if (Offset >= 0 &&
10036 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
10037 Offsets[0] = N0;
10038 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10039 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10040 return;
10041 }
10042 }
10043
10044 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
10045 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
10046 : DAG.getConstant(0, DL, MVT::i32);
10047
10048 Offsets[0] = CombinedOffset;
10049 Offsets[1] = SOffsetZero;
10050 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
10051}
10052
10053SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
10054 SelectionDAG &DAG) const {
10055 if (!MaybePointer.getValueType().isScalarInteger())
10056 return MaybePointer;
10057
10058 SDLoc DL(MaybePointer);
10059
10060 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
10061 return Rsrc;
10062}
10063
10064// Wrap a global or flat pointer into a buffer intrinsic using the flags
10065// specified in the intrinsic.
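// The resulting v4i32 descriptor holds the pointer's low dword in word 0,
// the masked high bits of the pointer combined with the stride in the upper
// half of word 1, and the caller-supplied num_records and flags in words 2
// and 3.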
10066SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
10067 SelectionDAG &DAG) const {
10068 SDLoc Loc(Op);
10069
10070 SDValue Pointer = Op->getOperand(1);
10071 SDValue Stride = Op->getOperand(2);
10072 SDValue NumRecords = Op->getOperand(3);
10073 SDValue Flags = Op->getOperand(4);
10074
10075 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
10076 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
10077 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
10078 std::optional<uint32_t> ConstStride = std::nullopt;
10079 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
10080 ConstStride = ConstNode->getZExtValue();
10081
10082 SDValue NewHighHalf = Masked;
10083 if (!ConstStride || *ConstStride != 0) {
10084 SDValue ShiftedStride;
10085 if (ConstStride) {
10086 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
10087 } else {
10088 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
10089 ShiftedStride =
10090 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
10091 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
10092 }
10093 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
10094 }
10095
10096 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
10097 NewHighHalf, NumRecords, Flags);
10098 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
10099 return RsrcPtr;
10100}
10101
10102// Handle 8 bit and 16 bit buffer loads
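// The TFE variants return an extra status dword, so they load a v2i32
// (data + status) and split the pieces apart afterwards.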
10103SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
10104 EVT LoadVT, SDLoc DL,
10105 ArrayRef<SDValue> Ops,
10106 MachineMemOperand *MMO,
10107 bool IsTFE) const {
10108 EVT IntVT = LoadVT.changeTypeToInteger();
10109
10110 if (IsTFE) {
10111 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
10112 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
10113 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
10114 MachineFunction &MF = DAG.getMachineFunction();
10115 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
10116 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
10117 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
10118 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10119 DAG.getConstant(1, DL, MVT::i32));
10120 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10121 DAG.getConstant(0, DL, MVT::i32));
10122 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
10123 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
10124 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10125 }
10126
10127 unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
10128 AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;
10129
10130 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
10131 SDValue BufferLoad =
10132 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
10133 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
10134 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
10135
10136 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
10137}
10138
10139// Handle 8 bit and 16 bit buffer stores
10140SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
10141 EVT VDataType, SDLoc DL,
10142 SDValue Ops[],
10143 MemSDNode *M) const {
10144 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
10145 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
10146
10147 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
10148 Ops[1] = BufferStoreExt;
10149 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
10150                                          AMDGPUISD::BUFFER_STORE_SHORT;
10151  ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
10152 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
10153 M->getMemOperand());
10154}
10155
10156static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
10157                                 ISD::LoadExtType ExtType, SDValue Op,
10158 const SDLoc &SL, EVT VT) {
10159 if (VT.bitsLT(Op.getValueType()))
10160 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
10161
10162 switch (ExtType) {
10163 case ISD::SEXTLOAD:
10164 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
10165 case ISD::ZEXTLOAD:
10166 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
10167 case ISD::EXTLOAD:
10168 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
10169 case ISD::NON_EXTLOAD:
10170 return Op;
10171 }
10172
10173 llvm_unreachable("invalid ext type");
10174}
10175
10176// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
10177// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
10178SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
10179 SelectionDAG &DAG = DCI.DAG;
10180 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
10181 return SDValue();
10182
10183 // FIXME: Constant loads should all be marked invariant.
10184 unsigned AS = Ld->getAddressSpace();
10185 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
10186      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
10187      (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
10188 return SDValue();
10189
10190 // Don't do this early, since it may interfere with adjacent load merging for
10191 // illegal types. We can avoid losing alignment information for exotic types
10192 // pre-legalize.
10193 EVT MemVT = Ld->getMemoryVT();
10194 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
10195 MemVT.getSizeInBits() >= 32)
10196 return SDValue();
10197
10198 SDLoc SL(Ld);
10199
10200 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
10201 "unexpected vector extload");
10202
10203 // TODO: Drop only high part of range.
10204 SDValue Ptr = Ld->getBasePtr();
10205 SDValue NewLoad = DAG.getLoad(
10206 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
10207 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
10208 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
10209 nullptr); // Drop ranges
10210
10211 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
10212 if (MemVT.isFloatingPoint()) {
10213    assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
10214           "unexpected fp extload");
10215 TruncVT = MemVT.changeTypeToInteger();
10216 }
10217
10218 SDValue Cvt = NewLoad;
10219 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
10220 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
10221 DAG.getValueType(TruncVT));
10222  } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
10223             Ld->getExtensionType() == ISD::NON_EXTLOAD) {
10224    Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
10225  } else {
10226    assert(Ld->getExtensionType() == ISD::EXTLOAD);
10227  }
10228
10229 EVT VT = Ld->getValueType(0);
10230 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
10231
10232 DCI.AddToWorklist(Cvt.getNode());
10233
10234 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
10235 // the appropriate extension from the 32-bit load.
10236 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
10237 DCI.AddToWorklist(Cvt.getNode());
10238
10239 // Handle conversion back to floating point if necessary.
10240 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
10241
10242 return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
10243}
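// For example, a 4-byte aligned, uniform zextload i8 from constant address space
// is rewritten here as a full i32 load followed by a zero-extend of the low
// 8 bits, which can then select to an SMEM s_load_dword rather than a buffer or
// global byte load.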
10244
10245static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
10246                                          const SIMachineFunctionInfo &Info) {
10247 // TODO: Should check if the address can definitely not access stack.
10248 if (Info.isEntryFunction())
10249 return Info.getUserSGPRInfo().hasFlatScratchInit();
10250 return true;
10251}
10252
10253SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
10254 SDLoc DL(Op);
10255 LoadSDNode *Load = cast<LoadSDNode>(Op);
10256 ISD::LoadExtType ExtType = Load->getExtensionType();
10257 EVT MemVT = Load->getMemoryVT();
10258
10259 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
10260 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
10261 return SDValue();
10262
10263 // FIXME: Copied from PPC
10264 // First, load into 32 bits, then truncate to 1 bit.
10265
10266 SDValue Chain = Load->getChain();
10267 SDValue BasePtr = Load->getBasePtr();
10268 MachineMemOperand *MMO = Load->getMemOperand();
10269
10270 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10271
10272 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
10273 BasePtr, RealMemVT, MMO);
10274
10275 if (!MemVT.isVector()) {
10276 SDValue Ops[] = {
10277 DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
10278 NewLD.getValue(1)
10279 };
10280
10281 return DAG.getMergeValues(Ops, DL);
10282 }
10283
10284    SmallVector<SDValue, 3> Elts;
10285    for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
10286 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
10287 DAG.getConstant(I, DL, MVT::i32));
10288
10289 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
10290 }
10291
10292 SDValue Ops[] = {
10293 DAG.getBuildVector(MemVT, DL, Elts),
10294 NewLD.getValue(1)
10295 };
10296
10297 return DAG.getMergeValues(Ops, DL);
10298 }
10299
10300 if (!MemVT.isVector())
10301 return SDValue();
10302
10303 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
10304 "Custom lowering for non-i32 vectors hasn't been implemented.");
10305
10306 Align Alignment = Load->getAlign();
10307 unsigned AS = Load->getAddressSpace();
10308 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
10309 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
10310 return SplitVectorLoad(Op, DAG);
10311 }
10312
10313  MachineFunction &MF = DAG.getMachineFunction();
10314  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10315  // If there is a possibility that flat instructions access scratch memory,
10316  // then we need to use the same legalization rules we use for private.
10317  if (AS == AMDGPUAS::FLAT_ADDRESS &&
10318      !Subtarget->hasMultiDwordFlatScratchAddressing())
10319    AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI) ?
10320         AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
10321
10322 unsigned NumElements = MemVT.getVectorNumElements();
10323
10324 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10325      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
10326    if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
10327 if (MemVT.isPow2VectorType() ||
10328 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10329 return SDValue();
10330 return WidenOrSplitVectorLoad(Op, DAG);
10331 }
10332 // Non-uniform loads will be selected to MUBUF instructions, so they
10333 // have the same legalization requirements as global and private
10334 // loads.
10335 //
10336 }
10337
10338 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10339      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10340      AS == AMDGPUAS::GLOBAL_ADDRESS) {
10341    if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
10342 Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) &&
10343 Alignment >= Align(4) && NumElements < 32) {
10344 if (MemVT.isPow2VectorType() ||
10345 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10346 return SDValue();
10347 return WidenOrSplitVectorLoad(Op, DAG);
10348 }
10349 // Non-uniform loads will be selected to MUBUF instructions, so they
10350 // have the same legalization requirements as global and private
10351 // loads.
10352 //
10353 }
10354 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10355      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10356      AS == AMDGPUAS::GLOBAL_ADDRESS ||
10357      AS == AMDGPUAS::FLAT_ADDRESS) {
10358 if (NumElements > 4)
10359 return SplitVectorLoad(Op, DAG);
10360 // v3 loads not supported on SI.
10361 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10362 return WidenOrSplitVectorLoad(Op, DAG);
10363
10364 // v3 and v4 loads are supported for private and global memory.
10365 return SDValue();
10366 }
10367 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10368 // Depending on the setting of the private_element_size field in the
10369 // resource descriptor, we can only make private accesses up to a certain
10370 // size.
10371 switch (Subtarget->getMaxPrivateElementSize()) {
10372 case 4: {
10373 SDValue Ops[2];
10374 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
10375 return DAG.getMergeValues(Ops, DL);
10376 }
10377 case 8:
10378 if (NumElements > 2)
10379 return SplitVectorLoad(Op, DAG);
10380 return SDValue();
10381 case 16:
10382 // Same as global/flat
10383 if (NumElements > 4)
10384 return SplitVectorLoad(Op, DAG);
10385 // v3 loads not supported on SI.
10386 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10387 return WidenOrSplitVectorLoad(Op, DAG);
10388
10389 return SDValue();
10390 default:
10391 llvm_unreachable("unsupported private_element_size");
10392 }
10393 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10394 unsigned Fast = 0;
10395 auto Flags = Load->getMemOperand()->getFlags();
10396    if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
10397                                           Load->getAlign(), Flags, &Fast) &&
10398 Fast > 1)
10399 return SDValue();
10400
10401 if (MemVT.isVector())
10402 return SplitVectorLoad(Op, DAG);
10403 }
10404
10405  if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10406                                      MemVT, *Load->getMemOperand())) {
10407 SDValue Ops[2];
10408 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
10409 return DAG.getMergeValues(Ops, DL);
10410 }
10411
10412 return SDValue();
10413}
10414
10415SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
10416 EVT VT = Op.getValueType();
10417 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
10418 VT.getSizeInBits() == 512)
10419 return splitTernaryVectorOp(Op, DAG);
10420
10421 assert(VT.getSizeInBits() == 64);
10422
10423 SDLoc DL(Op);
10424 SDValue Cond = Op.getOperand(0);
10425
10426 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
10427 SDValue One = DAG.getConstant(1, DL, MVT::i32);
10428
10429 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10430 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
10431
10432 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
10433 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
10434
10435 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
10436
10437 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
10438 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
10439
10440 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
10441
10442 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
10443 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
10444}
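// Roughly, a 64-bit select is decomposed into two 32-bit selects over the
// bitcast halves:
//   lo     = select cond, x[31:0],  y[31:0]
//   hi     = select cond, x[63:32], y[63:32]
//   result = bitcast (build_vector lo, hi)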
10445
10446// Catch division cases where we can use shortcuts with rcp and rsq
10447// instructions.
10448SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
10449 SelectionDAG &DAG) const {
10450 SDLoc SL(Op);
10451 SDValue LHS = Op.getOperand(0);
10452 SDValue RHS = Op.getOperand(1);
10453 EVT VT = Op.getValueType();
10454 const SDNodeFlags Flags = Op->getFlags();
10455
10456 bool AllowInaccurateRcp = Flags.hasApproximateFuncs() ||
10457                            DAG.getTarget().Options.UnsafeFPMath;
10458
10459 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
10460 // Without !fpmath accuracy information, we can't do more because we don't
10461 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
10462 // f16 is always accurate enough
10463 if (!AllowInaccurateRcp && VT != MVT::f16)
10464 return SDValue();
10465
10466 if (CLHS->isExactlyValue(1.0)) {
10467      // v_rcp_f32 and v_rsq_f32 do not support denormals and, according to
10468      // the CI documentation, have a worst-case error of 1 ulp.
10469 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
10470 // use it as long as we aren't trying to use denormals.
10471 //
10472 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
10473
10474 // 1.0 / sqrt(x) -> rsq(x)
10475
10476 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
10477 // error seems really high at 2^29 ULP.
10478 // 1.0 / x -> rcp(x)
10479 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10480 }
10481
10482 // Same as for 1.0, but expand the sign out of the constant.
10483 if (CLHS->isExactlyValue(-1.0)) {
10484 // -1.0 / x -> rcp (fneg x)
10485 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
10486 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
10487 }
10488 }
10489
10490 // For f16 require afn or arcp.
10491 // For f32 require afn.
10492 if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
10493 return SDValue();
10494
10495 // Turn into multiply by the reciprocal.
10496 // x / y -> x * (1.0 / y)
10497 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10498 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
10499}
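// Summary of the shortcuts above (taken only when afn/arcp or the f16 exception
// permits an inaccurate reciprocal):
//    1.0 / x  ->  rcp(x)
//   -1.0 / x  ->  rcp(-x)
//    x / y    ->  x * rcp(y)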
10500
10501SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
10502 SelectionDAG &DAG) const {
10503 SDLoc SL(Op);
10504 SDValue X = Op.getOperand(0);
10505 SDValue Y = Op.getOperand(1);
10506 EVT VT = Op.getValueType();
10507 const SDNodeFlags Flags = Op->getFlags();
10508
10509 bool AllowInaccurateDiv = Flags.hasApproximateFuncs() ||
10510                            DAG.getTarget().Options.UnsafeFPMath;
10511  if (!AllowInaccurateDiv)
10512 return SDValue();
10513
10514 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
10515 SDValue One = DAG.getConstantFP(1.0, SL, VT);
10516
10517 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
10518 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10519
10520 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
10521 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10522 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
10523 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
10524 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
10525 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
10526}
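// Roughly, the sequence above performs two Newton-Raphson refinements of the
// approximate reciprocal followed by one residual correction of the quotient:
//   r  = rcp(y)
//   r  = fma(fma(-y, r, 1), r, r)    // error roughly squared per step
//   r  = fma(fma(-y, r, 1), r, r)
//   q  = x * r
//   q' = fma(fma(-y, q, x), r, q)    // correct q using the residual x - y*q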
10527
10528static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10529 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
10530 SDNodeFlags Flags) {
10531 if (GlueChain->getNumValues() <= 1) {
10532 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
10533 }
10534
10535 assert(GlueChain->getNumValues() == 3);
10536
10537 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10538 switch (Opcode) {
10539 default: llvm_unreachable("no chain equivalent for opcode");
10540 case ISD::FMUL:
10541 Opcode = AMDGPUISD::FMUL_W_CHAIN;
10542 break;
10543 }
10544
10545 return DAG.getNode(Opcode, SL, VTList,
10546 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
10547 Flags);
10548}
10549
10550static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10551 EVT VT, SDValue A, SDValue B, SDValue C,
10552 SDValue GlueChain, SDNodeFlags Flags) {
10553 if (GlueChain->getNumValues() <= 1) {
10554 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
10555 }
10556
10557 assert(GlueChain->getNumValues() == 3);
10558
10559 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10560 switch (Opcode) {
10561 default: llvm_unreachable("no chain equivalent for opcode");
10562 case ISD::FMA:
10563 Opcode = AMDGPUISD::FMA_W_CHAIN;
10564 break;
10565 }
10566
10567 return DAG.getNode(Opcode, SL, VTList,
10568 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
10569 Flags);
10570}
10571
10572SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
10573 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10574 return FastLowered;
10575
10576 SDLoc SL(Op);
10577 SDValue Src0 = Op.getOperand(0);
10578 SDValue Src1 = Op.getOperand(1);
10579
10580 SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
10581 SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
10582
10583 SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
10584 SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
10585
10586 SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
10587 SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
10588
10589 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
10590}
10591
10592// Faster 2.5 ULP division that does not support denormals.
10593SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
10594 SDNodeFlags Flags = Op->getFlags();
10595 SDLoc SL(Op);
10596 SDValue LHS = Op.getOperand(1);
10597 SDValue RHS = Op.getOperand(2);
10598
10599 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
10600
10601 const APFloat K0Val(0x1p+96f);
10602 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
10603
10604 const APFloat K1Val(0x1p-32f);
10605 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
10606
10607 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10608
10609 EVT SetCCVT =
10610 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
10611
10612 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
10613
10614 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
10615
10616 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
10617
10618 // rcp does not support denormals.
10619 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
10620
10621 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
10622
10623 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
10624}
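// The scaling above keeps the reciprocal in range: when |rhs| > 2^96 the
// denominator is pre-multiplied by 2^-32 before rcp, and the final product is
// multiplied by the same 2^-32, so the scale cancels:
//   2^-32 * (lhs * rcp(rhs * 2^-32)) ~= lhs / rhs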
10625
10626// Returns immediate value for setting the F32 denorm mode when using the
10627// S_DENORM_MODE instruction.
10628static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
10629                                    const SIMachineFunctionInfo *Info,
10630 const GCNSubtarget *ST) {
10631 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
10632 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
10633 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
10634 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
10635}
10636
10637SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
10638 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10639 return FastLowered;
10640
10641  // The selection matcher assumes that anything with a chain selects to a
10642  // mayRaiseFPException machine instruction. Since we're introducing a chain
10643 // here, we need to explicitly report nofpexcept for the regular fdiv
10644 // lowering.
10645 SDNodeFlags Flags = Op->getFlags();
10646 Flags.setNoFPExcept(true);
10647
10648 SDLoc SL(Op);
10649 SDValue LHS = Op.getOperand(0);
10650 SDValue RHS = Op.getOperand(1);
10651
10652 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10653
10654 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
10655
10656 SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
10657 {RHS, RHS, LHS}, Flags);
10658 SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
10659 {LHS, RHS, LHS}, Flags);
10660
10661 // Denominator is scaled to not be denormal, so using rcp is ok.
10662 SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
10663 DenominatorScaled, Flags);
10664 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
10665 DenominatorScaled, Flags);
10666
10667 using namespace AMDGPU::Hwreg;
10668 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
10669 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
10670
10671 const MachineFunction &MF = DAG.getMachineFunction();
10672  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
10673  const DenormalMode DenormMode = Info->getMode().FP32Denormals;
10674
10675 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
10676 const bool HasDynamicDenormals =
10677 (DenormMode.Input == DenormalMode::Dynamic) ||
10678 (DenormMode.Output == DenormalMode::Dynamic);
10679
10680 SDValue SavedDenormMode;
10681
10682 if (!PreservesDenormals) {
10683 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
10684 // lowering. The chain dependence is insufficient, and we need glue. We do
10685 // not need the glue variants in a strictfp function.
10686
10687 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
10688
10689 SDValue Glue = DAG.getEntryNode();
10690 if (HasDynamicDenormals) {
10691 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
10692 DAG.getVTList(MVT::i32, MVT::Glue),
10693 {BitField, Glue});
10694 SavedDenormMode = SDValue(GetReg, 0);
10695
10696 Glue = DAG.getMergeValues(
10697 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
10698 }
10699
10700 SDNode *EnableDenorm;
10701 if (Subtarget->hasDenormModeInst()) {
10702 const SDValue EnableDenormValue =
10703 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
10704
10705 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
10706 EnableDenormValue)
10707 .getNode();
10708 } else {
10709 const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
10710 SL, MVT::i32);
10711 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
10712 {EnableDenormValue, BitField, Glue});
10713 }
10714
10715 SDValue Ops[3] = {
10716 NegDivScale0,
10717 SDValue(EnableDenorm, 0),
10718 SDValue(EnableDenorm, 1)
10719 };
10720
10721 NegDivScale0 = DAG.getMergeValues(Ops, SL);
10722 }
10723
10724 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
10725 ApproxRcp, One, NegDivScale0, Flags);
10726
10727 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
10728 ApproxRcp, Fma0, Flags);
10729
10730 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
10731 Fma1, Fma1, Flags);
10732
10733 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
10734 NumeratorScaled, Mul, Flags);
10735
10736 SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32,
10737 Fma2, Fma1, Mul, Fma2, Flags);
10738
10739 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
10740 NumeratorScaled, Fma3, Flags);
10741
10742 if (!PreservesDenormals) {
10743 SDNode *DisableDenorm;
10744 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
10745 const SDValue DisableDenormValue = getSPDenormModeValue(
10746 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
10747
10748 DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other,
10749 Fma4.getValue(1), DisableDenormValue,
10750 Fma4.getValue(2)).getNode();
10751 } else {
10752 assert(HasDynamicDenormals == (bool)SavedDenormMode);
10753 const SDValue DisableDenormValue =
10754 HasDynamicDenormals
10755 ? SavedDenormMode
10756 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
10757
10758 DisableDenorm = DAG.getMachineNode(
10759 AMDGPU::S_SETREG_B32, SL, MVT::Other,
10760 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
10761 }
10762
10763 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
10764 SDValue(DisableDenorm, 0), DAG.getRoot());
10765 DAG.setRoot(OutputChain);
10766 }
10767
10768 SDValue Scale = NumeratorScaled.getValue(1);
10769 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
10770 {Fma4, Fma1, Fma3, Scale}, Flags);
10771
10772 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
10773}
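// In outline: div_scale moves both operands out of the denormal range, rcp plus
// the FMA chain above refines the scaled quotient, div_fmas folds in the scale
// decision, and div_fixup restores signs and special cases. When the function's
// FP32 denormal mode is not IEEE, denormal support is temporarily enabled around
// the FMA sequence via S_DENORM_MODE or S_SETREG.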
10774
10775SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
10776 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
10777 return FastLowered;
10778
10779 SDLoc SL(Op);
10780 SDValue X = Op.getOperand(0);
10781 SDValue Y = Op.getOperand(1);
10782
10783 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
10784
10785 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
10786
10787 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
10788
10789 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
10790
10791 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
10792
10793 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
10794
10795 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
10796
10797 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
10798
10799 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
10800
10801 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
10802 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
10803
10804 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
10805 NegDivScale0, Mul, DivScale1);
10806
10807 SDValue Scale;
10808
10809 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
10810 // Workaround a hardware bug on SI where the condition output from div_scale
10811 // is not usable.
10812
10813 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
10814
10815    // Figure out which scale to use for div_fmas.
10816 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
10817 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
10818 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
10819 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
10820
10821 SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
10822 SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
10823
10824 SDValue Scale0Hi
10825 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
10826 SDValue Scale1Hi
10827 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
10828
10829 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
10830 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
10831 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
10832 } else {
10833 Scale = DivScale1.getValue(1);
10834 }
10835
10836 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
10837 Fma4, Fma3, Mul, Scale);
10838
10839 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
10840}
10841
10842SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
10843 EVT VT = Op.getValueType();
10844
10845 if (VT == MVT::f32)
10846 return LowerFDIV32(Op, DAG);
10847
10848 if (VT == MVT::f64)
10849 return LowerFDIV64(Op, DAG);
10850
10851 if (VT == MVT::f16)
10852 return LowerFDIV16(Op, DAG);
10853
10854 llvm_unreachable("Unexpected type for fdiv");
10855}
10856
10857SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
10858 SDLoc dl(Op);
10859 SDValue Val = Op.getOperand(0);
10860 EVT VT = Val.getValueType();
10861 EVT ResultExpVT = Op->getValueType(1);
10862 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
10863
10864 SDValue Mant = DAG.getNode(
10865      ISD::INTRINSIC_WO_CHAIN, dl, VT,
10866      DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
10867
10868 SDValue Exp = DAG.getNode(
10869 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
10870 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
10871
10872 if (Subtarget->hasFractBug()) {
10873 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
10874 SDValue Inf = DAG.getConstantFP(
10875        APFloat::getInf(SelectionDAG::EVTToAPFloatSemantics(VT)), dl, VT);
10876
10877 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
10878 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
10879 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
10880 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
10881 }
10882
10883 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
10884 return DAG.getMergeValues({Mant, CastExp}, dl);
10885}
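// For example, frexp(8.0) yields mantissa 0.5 and exponent 4 (8.0 == 0.5 * 2^4).
// On subtargets with the fract bug, non-finite inputs bypass the intrinsic
// results above: the mantissa becomes the input itself and the exponent 0.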
10886
10887SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
10888 SDLoc DL(Op);
10889 StoreSDNode *Store = cast<StoreSDNode>(Op);
10890 EVT VT = Store->getMemoryVT();
10891
10892 if (VT == MVT::i1) {
10893 return DAG.getTruncStore(Store->getChain(), DL,
10894 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
10895 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
10896 }
10897
10898 assert(VT.isVector() &&
10899 Store->getValue().getValueType().getScalarType() == MVT::i32);
10900
10901 unsigned AS = Store->getAddressSpace();
10902 if (Subtarget->hasLDSMisalignedBug() &&
10903 AS == AMDGPUAS::FLAT_ADDRESS &&
10904 Store->getAlign().value() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
10905 return SplitVectorStore(Op, DAG);
10906 }
10907
10908  MachineFunction &MF = DAG.getMachineFunction();
10909  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10910  // If there is a possibility that flat instructions access scratch memory,
10911  // then we need to use the same legalization rules we use for private.
10912  if (AS == AMDGPUAS::FLAT_ADDRESS &&
10913      !Subtarget->hasMultiDwordFlatScratchAddressing())
10914    AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI) ?
10915         AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
10916
10917 unsigned NumElements = VT.getVectorNumElements();
10918 if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
10919 AS == AMDGPUAS::FLAT_ADDRESS) {
10920 if (NumElements > 4)
10921 return SplitVectorStore(Op, DAG);
10922 // v3 stores not supported on SI.
10923 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10924 return SplitVectorStore(Op, DAG);
10925
10926    if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10927                                        VT, *Store->getMemOperand()))
10928 return expandUnalignedStore(Store, DAG);
10929
10930 return SDValue();
10931 }
10932 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10933 switch (Subtarget->getMaxPrivateElementSize()) {
10934 case 4:
10935 return scalarizeVectorStore(Store, DAG);
10936 case 8:
10937 if (NumElements > 2)
10938 return SplitVectorStore(Op, DAG);
10939 return SDValue();
10940 case 16:
10941 if (NumElements > 4 ||
10942 (NumElements == 3 && !Subtarget->enableFlatScratch()))
10943 return SplitVectorStore(Op, DAG);
10944 return SDValue();
10945 default:
10946 llvm_unreachable("unsupported private_element_size");
10947 }
10948 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10949 unsigned Fast = 0;
10950 auto Flags = Store->getMemOperand()->getFlags();
10951    if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
10952                                           Store->getAlign(), Flags, &Fast) &&
10953 Fast > 1)
10954 return SDValue();
10955
10956 if (VT.isVector())
10957 return SplitVectorStore(Op, DAG);
10958
10959 return expandUnalignedStore(Store, DAG);
10960 }
10961
10962 // Probably an invalid store. If so we'll end up emitting a selection error.
10963 return SDValue();
10964}
10965
10966// Avoid the full correct expansion for f32 sqrt when promoting from f16.
10967SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
10968 SDLoc SL(Op);
10969 assert(!Subtarget->has16BitInsts());
10970 SDNodeFlags Flags = Op->getFlags();
10971 SDValue Ext =
10972 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
10973
10974 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
10975 SDValue Sqrt =
10976 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
10977
10978 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
10979 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
10980}
10981
10982SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
10983 SDLoc DL(Op);
10984 SDNodeFlags Flags = Op->getFlags();
10985 MVT VT = Op.getValueType().getSimpleVT();
10986 const SDValue X = Op.getOperand(0);
10987
10988 if (allowApproxFunc(DAG, Flags)) {
10989 // Instruction is 1ulp but ignores denormals.
10990 return DAG.getNode(
10991        ISD::INTRINSIC_WO_CHAIN, DL, VT,
10992        DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
10993 }
10994
10995 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
10996 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
10997
10998 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
10999
11000 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
11001
11002 SDValue SqrtX =
11003 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
11004
11005 SDValue SqrtS;
11006 if (needsDenormHandlingF32(DAG, X, Flags)) {
11007 SDValue SqrtID =
11008 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
11009 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
11010
11011 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
11012 SDValue SqrtSNextDownInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11013 DAG.getConstant(-1, DL, MVT::i32));
11014 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
11015
11016 SDValue NegSqrtSNextDown =
11017 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
11018
11019 SDValue SqrtVP =
11020 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
11021
11022 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11023 DAG.getConstant(1, DL, MVT::i32));
11024 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
11025
11026 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
11027 SDValue SqrtVS =
11028 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
11029
11030 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
11031 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
11032
11033 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
11034 Flags);
11035
11036 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
11037 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
11038 Flags);
11039 } else {
11040 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
11041
11042 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
11043
11044 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
11045 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
11046 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
11047
11048 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
11049 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
11050 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
11051
11052 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
11053 SDValue SqrtD =
11054 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
11055 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
11056 }
11057
11058 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
11059
11060 SDValue ScaledDown =
11061 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
11062
11063 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
11064 SDValue IsZeroOrInf =
11065 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11066 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11067
11068 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
11069}
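// The scaling above keeps intermediates out of the denormal range: inputs below
// 2^-96 are multiplied by 2^32 before the square root and the result by 2^-16
// afterwards, since sqrt(x * 2^32) == sqrt(x) * 2^16. Zero and +inf inputs pass
// through unchanged via the final select.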
11070
11071SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
11072 // For double type, the SQRT and RSQ instructions don't have required
11073 // precision, we apply Goldschmidt's algorithm to improve the result:
11074 //
11075 // y0 = rsq(x)
11076 // g0 = x * y0
11077 // h0 = 0.5 * y0
11078 //
11079 // r0 = 0.5 - h0 * g0
11080 // g1 = g0 * r0 + g0
11081 // h1 = h0 * r0 + h0
11082 //
11083 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
11084 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
11085 // h2 = h1 * r1 + h1
11086 //
11087 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
11088 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
11089 //
11090 // sqrt(x) = g3
11091
11092 SDNodeFlags Flags = Op->getFlags();
11093
11094 SDLoc DL(Op);
11095
11096 SDValue X = Op.getOperand(0);
11097 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
11098
11099 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
11100
11101 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
11102
11103 // Scale up input if it is too small.
11104 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
11105 SDValue ScaleUp =
11106 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
11107 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
11108
11109 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
11110
11111 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
11112
11113 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
11114 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
11115
11116 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
11117 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
11118
11119 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
11120
11121 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
11122
11123 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
11124 SDValue SqrtD0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
11125
11126 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
11127
11128 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
11129 SDValue SqrtD1 =
11130 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
11131
11132 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
11133
11134 SDValue ScaleDownFactor = DAG.getConstant(-128, DL, MVT::i32);
11135 SDValue ScaleDown =
11136 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
11137 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
11138
11139 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
11140 // with finite only or nsz because rsq(+/-0) = +/-inf
11141
11142 // TODO: Check for DAZ and expand to subnormals
11143 SDValue IsZeroOrInf =
11144 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11145 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11146
11147 // If x is +INF, +0, or -0, use its original value
11148 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
11149 Flags);
11150}
11151
11152SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
11153 SDLoc DL(Op);
11154 EVT VT = Op.getValueType();
11155 SDValue Arg = Op.getOperand(0);
11156 SDValue TrigVal;
11157
11158 // Propagate fast-math flags so that the multiply we introduce can be folded
11159 // if Arg is already the result of a multiply by constant.
11160 auto Flags = Op->getFlags();
11161
11162 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
11163
11164 if (Subtarget->hasTrigReducedRange()) {
11165 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11166 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
11167 } else {
11168 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11169 }
11170
11171 switch (Op.getOpcode()) {
11172 case ISD::FCOS:
11173 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
11174 case ISD::FSIN:
11175 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
11176 default:
11177 llvm_unreachable("Wrong trig opcode");
11178 }
11179}
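// For example, on subtargets with reduced-range trig, sin(x) becomes
// sin_hw(fract(x * 1/(2*pi))); otherwise sin_hw(x * 1/(2*pi)). The hardware
// takes the angle as a fraction of a full revolution rather than in radians.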
11180
11181SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
11182 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
11183 assert(AtomicNode->isCompareAndSwap());
11184 unsigned AS = AtomicNode->getAddressSpace();
11185
11186 // No custom lowering required for local address space
11187  if (!AMDGPU::isFlatGlobalAddrSpace(AS))
11188    return Op;
11189
11190 // Non-local address space requires custom lowering for atomic compare
11191 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
11192 SDLoc DL(Op);
11193 SDValue ChainIn = Op.getOperand(0);
11194 SDValue Addr = Op.getOperand(1);
11195 SDValue Old = Op.getOperand(2);
11196 SDValue New = Op.getOperand(3);
11197 EVT VT = Op.getValueType();
11198 MVT SimpleVT = VT.getSimpleVT();
11199 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
11200
11201 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
11202 SDValue Ops[] = { ChainIn, Addr, NewOld };
11203
11204 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
11205 Ops, VT, AtomicNode->getMemOperand());
11206}
11207
11208//===----------------------------------------------------------------------===//
11209// Custom DAG optimizations
11210//===----------------------------------------------------------------------===//
11211
11212SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
11213 DAGCombinerInfo &DCI) const {
11214 EVT VT = N->getValueType(0);
11215 EVT ScalarVT = VT.getScalarType();
11216 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11217 return SDValue();
11218
11219 SelectionDAG &DAG = DCI.DAG;
11220 SDLoc DL(N);
11221
11222 SDValue Src = N->getOperand(0);
11223 EVT SrcVT = Src.getValueType();
11224
11225 // TODO: We could try to match extracting the higher bytes, which would be
11226 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
11227 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
11228 // about in practice.
11229 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11230 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
11231 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
11232 DCI.AddToWorklist(Cvt.getNode());
11233
11234 // For the f16 case, fold to a cast to f32 and then cast back to f16.
11235 if (ScalarVT != MVT::f32) {
11236 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
11237 DAG.getTargetConstant(0, DL, MVT::i32));
11238 }
11239 return Cvt;
11240 }
11241 }
11242
11243 return SDValue();
11244}
11245
11246SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
11247 DAGCombinerInfo &DCI) const {
11248 SDValue MagnitudeOp = N->getOperand(0);
11249 SDValue SignOp = N->getOperand(1);
11250 SelectionDAG &DAG = DCI.DAG;
11251 SDLoc DL(N);
11252
11253 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
11254 // lower half with a copy.
11255 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11256 if (MagnitudeOp.getValueType() == MVT::f64) {
11257 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
11258 SDValue MagLo =
11259 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11260 DAG.getConstant(0, DL, MVT::i32));
11261 SDValue MagHi =
11262 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11263 DAG.getConstant(1, DL, MVT::i32));
11264
11265 SDValue HiOp =
11266 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
11267
11268 SDValue Vector = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
11269
11270 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
11271 }
11272
11273 if (SignOp.getValueType() != MVT::f64)
11274 return SDValue();
11275
11276 // Reduce width of sign operand, we only need the highest bit.
11277 //
11278 // fcopysign f64:x, f64:y ->
11279 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
11280 // TODO: In some cases it might make sense to go all the way to f16.
11281 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
11282 SDValue SignAsF32 =
11283 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
11284 DAG.getConstant(1, DL, MVT::i32));
11285
11286 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
11287 SignAsF32);
11288}
11289
11290// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
11291// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
11292// bits
11293
11294// This is a variant of
11295// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
11296//
11297// The normal DAG combiner will do this, but only if the add has one use since
11298// that would increase the number of instructions.
11299//
11300// This prevents us from seeing a constant offset that can be folded into a
11301// memory instruction's addressing mode. If we know the resulting add offset of
11302// a pointer can be folded into an addressing offset, we can replace the pointer
11303// operand with the add of new constant offset. This eliminates one of the uses,
11304// and may allow the remaining use to also be simplified.
11305//
11306SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
11307 unsigned AddrSpace,
11308 EVT MemVT,
11309 DAGCombinerInfo &DCI) const {
11310 SDValue N0 = N->getOperand(0);
11311 SDValue N1 = N->getOperand(1);
11312
11313 // We only do this to handle cases where it's profitable when there are
11314 // multiple uses of the add, so defer to the standard combine.
11315 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
11316 N0->hasOneUse())
11317 return SDValue();
11318
11319 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
11320 if (!CN1)
11321 return SDValue();
11322
11323 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
11324 if (!CAdd)
11325 return SDValue();
11326
11327 SelectionDAG &DAG = DCI.DAG;
11328
11329 if (N0->getOpcode() == ISD::OR &&
11330 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
11331 return SDValue();
11332
11333 // If the resulting offset is too large, we can't fold it into the
11334 // addressing mode offset.
11335 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
11336 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
11337
11338 AddrMode AM;
11339 AM.HasBaseReg = true;
11340 AM.BaseOffs = Offset.getSExtValue();
11341 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
11342 return SDValue();
11343
11344 SDLoc SL(N);
11345 EVT VT = N->getValueType(0);
11346
11347 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
11348 SDValue COffset = DAG.getConstant(Offset, SL, VT);
11349
11350  SDNodeFlags Flags;
11351  Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
11352 (N0.getOpcode() == ISD::OR ||
11353 N0->getFlags().hasNoUnsignedWrap()));
11354
11355 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
11356}
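// For example, a pointer computed as (shl (add x, 4), 2) with multiple uses of
// the add becomes (add (shl x, 2), 16), so the constant 16 can be folded into
// the memory instruction's immediate offset when the addressing mode allows it.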
11357
11358/// MemSDNode::getBasePtr() does not work for intrinsics, which need to offset
11359/// past the chain and intrinsic ID. Theoretically we would also need to check the
11360/// specific intrinsic, but they all place the pointer operand first.
11361static unsigned getBasePtrIndex(const MemSDNode *N) {
11362 switch (N->getOpcode()) {
11363 case ISD::STORE:
11364  case ISD::INTRINSIC_W_CHAIN:
11365  case ISD::INTRINSIC_VOID:
11366    return 2;
11367 default:
11368 return 1;
11369 }
11370}
11371
11372SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
11373 DAGCombinerInfo &DCI) const {
11374 SelectionDAG &DAG = DCI.DAG;
11375 SDLoc SL(N);
11376
11377 unsigned PtrIdx = getBasePtrIndex(N);
11378 SDValue Ptr = N->getOperand(PtrIdx);
11379
11380 // TODO: We could also do this for multiplies.
11381 if (Ptr.getOpcode() == ISD::SHL) {
11382 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
11383 N->getMemoryVT(), DCI);
11384 if (NewPtr) {
11385 SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
11386
11387 NewOps[PtrIdx] = NewPtr;
11388 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
11389 }
11390 }
11391
11392 return SDValue();
11393}
11394
11395static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
11396 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11397 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11398 (Opc == ISD::XOR && Val == 0);
11399}
11400
11401// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
11402// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
11403// integer combine opportunities since most 64-bit operations are decomposed
11404// this way. TODO: We won't want this for SALU especially if it is an inline
11405// immediate.
11406SDValue SITargetLowering::splitBinaryBitConstantOp(
11407 DAGCombinerInfo &DCI,
11408 const SDLoc &SL,
11409 unsigned Opc, SDValue LHS,
11410 const ConstantSDNode *CRHS) const {
11411 uint64_t Val = CRHS->getZExtValue();
11412 uint32_t ValLo = Lo_32(Val);
11413 uint32_t ValHi = Hi_32(Val);
11414  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11415
11416 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
11417 bitOpWithConstantIsReducible(Opc, ValHi)) ||
11418 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
11419 // If we need to materialize a 64-bit immediate, it will be split up later
11420 // anyway. Avoid creating the harder to understand 64-bit immediate
11421 // materialization.
11422 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
11423 }
11424
11425 return SDValue();
11426}
11427
11428bool llvm::isBoolSGPR(SDValue V) {
11429  if (V.getValueType() != MVT::i1)
11430 return false;
11431 switch (V.getOpcode()) {
11432 default:
11433 break;
11434 case ISD::SETCC:
11435  case AMDGPUISD::FP_CLASS:
11436    return true;
11437 case ISD::AND:
11438 case ISD::OR:
11439 case ISD::XOR:
11440 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
11441 }
11442 return false;
11443}
11444
11445// If a constant has all zeroes or all ones within each byte return it.
11446// Otherwise return 0.
11447static uint32_t getConstantPermuteMask(uint32_t C) {
11448  // 0xff for any zero byte in the mask
11449 uint32_t ZeroByteMask = 0;
11450 if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
11451 if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
11452 if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
11453 if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
11454 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
11455 if ((NonZeroByteMask & C) != NonZeroByteMask)
11456 return 0; // Partial bytes selected.
11457 return C;
11458}
11459
11460// Check if a node selects whole bytes from its operand 0 starting at a byte
11461// boundary while masking the rest. Returns a select mask as used by v_perm_b32,
11462// or -1 if it does not succeed.
11463// Note byte select encoding:
11464// value 0-3 selects corresponding source byte;
11465// value 0xc selects zero;
11466// value 0xff selects 0xff.
11467static uint32_t getPermuteMask(SDValue V) {
11468  assert(V.getValueSizeInBits() == 32);
11469
11470 if (V.getNumOperands() != 2)
11471 return ~0;
11472
11473 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
11474 if (!N1)
11475 return ~0;
11476
11477 uint32_t C = N1->getZExtValue();
11478
11479 switch (V.getOpcode()) {
11480 default:
11481 break;
11482 case ISD::AND:
11483 if (uint32_t ConstMask = getConstantPermuteMask(C))
11484 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
11485 break;
11486
11487 case ISD::OR:
11488 if (uint32_t ConstMask = getConstantPermuteMask(C))
11489 return (0x03020100 & ~ConstMask) | ConstMask;
11490 break;
11491
11492 case ISD::SHL:
11493 if (C % 8)
11494 return ~0;
11495
11496 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
11497
11498 case ISD::SRL:
11499 if (C % 8)
11500 return ~0;
11501
11502 return uint32_t(0x0c0c0c0c03020100ull >> C);
11503 }
11504
11505 return ~0;
11506}
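// Worked examples of the encoding above:
//   (and x, 0x0000ffff)  ->  0x0c0c0100  (bytes 0-1 from x, bytes 2-3 zero)
//   (shl x, 16)          ->  0x01000c0c  (bytes 2-3 from x's bytes 0-1, low bytes zero)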
11507
11508SDValue SITargetLowering::performAndCombine(SDNode *N,
11509 DAGCombinerInfo &DCI) const {
11510 if (DCI.isBeforeLegalize())
11511 return SDValue();
11512
11513 SelectionDAG &DAG = DCI.DAG;
11514 EVT VT = N->getValueType(0);
11515 SDValue LHS = N->getOperand(0);
11516 SDValue RHS = N->getOperand(1);
11517
11518
11519 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
11520 if (VT == MVT::i64 && CRHS) {
11521 if (SDValue Split
11522 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
11523 return Split;
11524 }
11525
11526 if (CRHS && VT == MVT::i32) {
11527 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
11528 // nb = number of trailing zeroes in mask
11529 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
11530 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
11531 uint64_t Mask = CRHS->getZExtValue();
11532 unsigned Bits = llvm::popcount(Mask);
11533 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
11534 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
11535 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
11536 unsigned Shift = CShift->getZExtValue();
11537 unsigned NB = CRHS->getAPIntValue().countr_zero();
11538 unsigned Offset = NB + Shift;
11539 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
11540 SDLoc SL(N);
11541 SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
11542 LHS->getOperand(0),
11543 DAG.getConstant(Offset, SL, MVT::i32),
11544 DAG.getConstant(Bits, SL, MVT::i32));
11545 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
11546 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
11547 DAG.getValueType(NarrowVT));
11548 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
11549 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
11550 return Shl;
11551 }
11552 }
11553 }
11554
11555 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
11556 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
11557 isa<ConstantSDNode>(LHS.getOperand(2))) {
11558 uint32_t Sel = getConstantPermuteMask(Mask);
11559 if (!Sel)
11560 return SDValue();
11561
11562 // Select 0xc for all zero bytes
11563 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
11564 SDLoc DL(N);
11565 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
11566 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
11567 }
11568 }
11569
11570 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
11571 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
11572 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
11573 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11574 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
11575
11576 SDValue X = LHS.getOperand(0);
11577 SDValue Y = RHS.getOperand(0);
11578 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
11579 !isTypeLegal(X.getValueType()))
11580 return SDValue();
11581
11582 if (LCC == ISD::SETO) {
11583 if (X != LHS.getOperand(1))
11584 return SDValue();
11585
11586 if (RCC == ISD::SETUNE) {
11587 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
11588 if (!C1 || !C1->isInfinity() || C1->isNegative())
11589 return SDValue();
11590
11591          const uint32_t Mask = SIInstrFlags::N_NORMAL |
11592                                SIInstrFlags::N_SUBNORMAL |
11593                                SIInstrFlags::N_ZERO |
11594                                SIInstrFlags::P_ZERO |
11595                                SIInstrFlags::P_SUBNORMAL |
11596                                SIInstrFlags::P_NORMAL;
11597
11598          static_assert(((~(SIInstrFlags::S_NAN |
11599                            SIInstrFlags::Q_NAN |
11600                            SIInstrFlags::N_INFINITY |
11601                            SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
11602 "mask not equal");
11603
11604 SDLoc DL(N);
11605 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
11606 X, DAG.getConstant(Mask, DL, MVT::i32));
11607 }
11608 }
11609 }
11610
11611 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
11612 std::swap(LHS, RHS);
11613
11614 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
11615 RHS.hasOneUse()) {
11616 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11617 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
11618 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
11619 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
11620 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
11621 (RHS.getOperand(0) == LHS.getOperand(0) &&
11622 LHS.getOperand(0) == LHS.getOperand(1))) {
11623 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
11624 unsigned NewMask = LCC == ISD::SETO ?
11625 Mask->getZExtValue() & ~OrdMask :
11626 Mask->getZExtValue() & OrdMask;
11627
11628 SDLoc DL(N);
11629 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
11630 DAG.getConstant(NewMask, DL, MVT::i32));
11631 }
11632 }
11633
11634 if (VT == MVT::i32 &&
11635 (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
11636 // and x, (sext cc from i1) => select cc, x, 0
11637 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
11638 std::swap(LHS, RHS);
11639 if (isBoolSGPR(RHS.getOperand(0)))
11640 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
11641 LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
11642 }
11643
11644 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
11645  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11646  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
11647 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
11648 uint32_t LHSMask = getPermuteMask(LHS);
11649 uint32_t RHSMask = getPermuteMask(RHS);
11650 if (LHSMask != ~0u && RHSMask != ~0u) {
11651 // Canonicalize the expression in an attempt to have fewer unique masks
11652 // and therefore fewer registers used to hold the masks.
11653 if (LHSMask > RHSMask) {
11654 std::swap(LHSMask, RHSMask);
11655 std::swap(LHS, RHS);
11656 }
11657
11658 // Select 0xc for each lane used from source operand. Zero has 0xc mask
11659 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
11660 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11661 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11662
11663      // Check if we need to combine values from two sources within a byte.
11664 if (!(LHSUsedLanes & RHSUsedLanes) &&
11665 // If we select high and lower word keep it for SDWA.
11666 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
11667 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
11668 // Each byte in each mask is either selector mask 0-3, or has higher
11669 // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
11670 // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
11671        // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise the
11672        // mask which is not 0xff wins. By ANDing both masks we have a correct
11673 uint32_t Mask = LHSMask & RHSMask;
11674 for (unsigned I = 0; I < 32; I += 8) {
11675 uint32_t ByteSel = 0xff << I;
11676 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
11677 Mask &= (0x0c << I) & 0xffffffff;
11678 }
11679
11680 // Add 4 to each active LHS lane. It will not affect any existing 0xff
11681 // or 0x0c.
11682 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
11683 SDLoc DL(N);
11684
11685 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
11686 LHS.getOperand(0), RHS.getOperand(0),
11687 DAG.getConstant(Sel, DL, MVT::i32));
11688 }
11689 }
11690 }
11691
11692 return SDValue();
11693}
11694
11695 // A key component of v_perm is a mapping between the byte positions of the src
11696 // operands and the byte positions of the dest. To provide such a mapping, we
11697 // need: 1. the node that provides byte x of the dest of the OR, and 2. the byte
11698 // of that node used to provide byte x. calculateByteProvider finds which node
11699 // provides a certain byte of the dest of the OR, and calculateSrcByte takes
11700 // that node and finds the ultimate src and byte position. For example, the
11701 // supported LoadCombine pattern for vector loads is as follows:
11702// t1
11703// or
11704// / \
11705// t2 t3
11706// zext shl
11707// | | \
11708// t4 t5 16
11709// or anyext
11710// / \ |
11711// t6 t7 t8
11712// srl shl or
11713// / | / \ / \
11714// t9 t10 t11 t12 t13 t14
11715// trunc* 8 trunc* 8 and and
11716// | | / | | \
11717// t15 t16 t17 t18 t19 t20
11718// trunc* 255 srl -256
11719// | / \
11720// t15 t15 16
11721//
11722// *In this example, the truncs are from i32->i16
11723//
11724// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
11725// respectively. calculateSrcByte would find (given node) -> ultimate src &
11726// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
11727// After finding the mapping, we can combine the tree into vperm t15, t16,
11728// 0x05000407
11729
11730// Find the source and byte position from a node.
11731// \p DestByte is the byte position of the dest of the or that the src
11732 // ultimately provides. \p SrcIndex is the byte of the src that maps to that
11733 // byte of the dest of the or. \p Depth tracks how many recursive iterations
11734 // we have performed.
11735static const std::optional<ByteProvider<SDValue>>
11736calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
11737 unsigned Depth = 0) {
11738 // We may need to recursively traverse a series of SRLs
11739 if (Depth >= 6)
11740 return std::nullopt;
11741
11742 if (Op.getValueSizeInBits() < 8)
11743 return std::nullopt;
11744
11745 if (Op.getValueType().isVector())
11746 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11747
11748 switch (Op->getOpcode()) {
11749 case ISD::TRUNCATE: {
11750 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11751 }
11752
11753 case ISD::SIGN_EXTEND:
11754 case ISD::ZERO_EXTEND:
11755 case ISD::SIGN_EXTEND_INREG: {
11756 SDValue NarrowOp = Op->getOperand(0);
11757 auto NarrowVT = NarrowOp.getValueType();
11758 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
11759 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11760 NarrowVT = VTSign->getVT();
11761 }
11762 if (!NarrowVT.isByteSized())
11763 return std::nullopt;
11764 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
11765
11766 if (SrcIndex >= NarrowByteWidth)
11767 return std::nullopt;
11768 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11769 }
11770
11771 case ISD::SRA:
11772 case ISD::SRL: {
11773 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11774 if (!ShiftOp)
11775 return std::nullopt;
11776
11777 uint64_t BitShift = ShiftOp->getZExtValue();
11778
11779 if (BitShift % 8 != 0)
11780 return std::nullopt;
11781
11782 SrcIndex += BitShift / 8;
11783
11784 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11785 }
11786
11787 default: {
11788 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11789 }
11790 }
11791 llvm_unreachable("fully handled switch");
11792}
11793
11794// For a byte position in the result of an Or, traverse the tree and find the
11795// node (and the byte of the node) which ultimately provides this {Or,
11796// BytePosition}. \p Op is the operand we are currently examining. \p Index is
11797// the byte position of the Op that corresponds with the originally requested
11798 // byte of the Or. \p Depth tracks how many recursive iterations we have
11799 // performed. \p StartingIndex is the originally requested byte of the Or.
11800static const std::optional<ByteProvider<SDValue>>
11801calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
11802 unsigned StartingIndex = 0) {
11803 // Finding Src tree of RHS of or typically requires at least 1 additional
11804 // depth
11805 if (Depth > 6)
11806 return std::nullopt;
11807
11808 unsigned BitWidth = Op.getScalarValueSizeInBits();
11809 if (BitWidth % 8 != 0)
11810 return std::nullopt;
11811 if (Index > BitWidth / 8 - 1)
11812 return std::nullopt;
11813
11814 bool IsVec = Op.getValueType().isVector();
11815 switch (Op.getOpcode()) {
11816 case ISD::OR: {
11817 if (IsVec)
11818 return std::nullopt;
11819
11820 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
11821 StartingIndex);
11822 if (!RHS)
11823 return std::nullopt;
11824 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
11825 StartingIndex);
11826 if (!LHS)
11827 return std::nullopt;
11828 // A well formed Or will have two ByteProviders for each byte, one of which
11829 // is constant zero
11830 if (!LHS->isConstantZero() && !RHS->isConstantZero())
11831 return std::nullopt;
11832 if (!LHS || LHS->isConstantZero())
11833 return RHS;
11834 if (!RHS || RHS->isConstantZero())
11835 return LHS;
11836 return std::nullopt;
11837 }
11838
11839 case ISD::AND: {
11840 if (IsVec)
11841 return std::nullopt;
11842
11843 auto BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11844 if (!BitMaskOp)
11845 return std::nullopt;
11846
11847 uint32_t BitMask = BitMaskOp->getZExtValue();
11848 // Bits we expect for our StartingIndex
11849 uint32_t IndexMask = 0xFF << (Index * 8);
11850
11851 if ((IndexMask & BitMask) != IndexMask) {
11852 // If the result of the and partially provides the byte, then it
11853 // is not well formatted
11854 if (IndexMask & BitMask)
11855 return std::nullopt;
11856 return ByteProvider<SDValue>::getConstantZero();
11857 }
11858
11859 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
11860 }
11861
11862 case ISD::FSHR: {
11863 if (IsVec)
11864 return std::nullopt;
11865
11866 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
11867 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
11868 if (!ShiftOp || Op.getValueType().isVector())
11869 return std::nullopt;
11870
11871 uint64_t BitsProvided = Op.getValueSizeInBits();
11872 if (BitsProvided % 8 != 0)
11873 return std::nullopt;
11874
11875 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
11876 if (BitShift % 8)
11877 return std::nullopt;
11878
11879 uint64_t ConcatSizeInBytes = BitsProvided / 4;
11880 uint64_t ByteShift = BitShift / 8;
11881
11882 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
11883 uint64_t BytesProvided = BitsProvided / 8;
11884 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
11885 NewIndex %= BytesProvided;
11886 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
11887 }
11888
11889 case ISD::SRA:
11890 case ISD::SRL: {
11891 if (IsVec)
11892 return std::nullopt;
11893
11894 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11895 if (!ShiftOp)
11896 return std::nullopt;
11897
11898 uint64_t BitShift = ShiftOp->getZExtValue();
11899 if (BitShift % 8)
11900 return std::nullopt;
11901
11902 auto BitsProvided = Op.getScalarValueSizeInBits();
11903 if (BitsProvided % 8 != 0)
11904 return std::nullopt;
11905
11906 uint64_t BytesProvided = BitsProvided / 8;
11907 uint64_t ByteShift = BitShift / 8;
11908 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
11909 // If the byte we are trying to provide (as tracked by index) falls in this
11910 // range, then the SRL provides the byte. The byte of interest of the src of
11911 // the SRL is Index + ByteShift
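// E.g. for (srl i32:x, 16) and Index = 0: BytesProvided = 4 and ByteShift = 2,
// so byte 0 of the result is byte 2 of x, while Index >= 2 is provided by the
// zero bits shifted in.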
11912 return BytesProvided - ByteShift > Index
11913 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
11914 Index + ByteShift)
11915 : ByteProvider<SDValue>::getConstantZero();
11916 }
11917
11918 case ISD::SHL: {
11919 if (IsVec)
11920 return std::nullopt;
11921
11922 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11923 if (!ShiftOp)
11924 return std::nullopt;
11925
11926 uint64_t BitShift = ShiftOp->getZExtValue();
11927 if (BitShift % 8 != 0)
11928 return std::nullopt;
11929 uint64_t ByteShift = BitShift / 8;
11930
11931 // If we are shifting by an amount greater than the index we are trying to
11932 // provide, then this byte is known to be 0. If not, these bytes are not
11933 // definitively 0s, and the corresponding byte of interest is
11934 // Index - ByteShift of the src
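// E.g. for (shl i32:x, 8): ByteShift = 1, so byte 0 of the result is a known
// zero and byte 2 comes from byte 1 of x.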
11935 return Index < ByteShift
11936 ? ByteProvider<SDValue>::getConstantZero()
11937 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
11938 Depth + 1, StartingIndex);
11939 }
11940 case ISD::ANY_EXTEND:
11941 case ISD::SIGN_EXTEND:
11942 case ISD::ZERO_EXTEND:
11943 case ISD::SIGN_EXTEND_INREG:
11944 case ISD::AssertZext:
11945 case ISD::AssertSext: {
11946 if (IsVec)
11947 return std::nullopt;
11948
11949 SDValue NarrowOp = Op->getOperand(0);
11950 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
11951 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
11952 Op->getOpcode() == ISD::AssertZext ||
11953 Op->getOpcode() == ISD::AssertSext) {
11954 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11955 NarrowBitWidth = VTSign->getVT().getSizeInBits();
11956 }
11957 if (NarrowBitWidth % 8 != 0)
11958 return std::nullopt;
11959 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
11960
11961 if (Index >= NarrowByteWidth)
11962 return Op.getOpcode() == ISD::ZERO_EXTEND
11963 ? std::optional<ByteProvider<SDValue>>(
11964 ByteProvider<SDValue>::getConstantZero())
11965 : std::nullopt;
11966 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
11967 }
11968
11969 case ISD::TRUNCATE: {
11970 if (IsVec)
11971 return std::nullopt;
11972
11973 uint64_t NarrowByteWidth = BitWidth / 8;
11974
11975 if (NarrowByteWidth >= Index) {
11976 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
11977 StartingIndex);
11978 }
11979
11980 return std::nullopt;
11981 }
11982
11983 case ISD::CopyFromReg: {
11984 if (BitWidth / 8 > Index)
11985 return calculateSrcByte(Op, StartingIndex, Index);
11986
11987 return std::nullopt;
11988 }
11989
11990 case ISD::LOAD: {
11991 auto L = cast<LoadSDNode>(Op.getNode());
11992
11993 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
11994 if (NarrowBitWidth % 8 != 0)
11995 return std::nullopt;
11996 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
11997
11998 // If the width of the load does not reach the byte we are trying to provide
11999 // and it is not a ZEXTLOAD, then the load does not provide the byte in
12000 // question
12001 if (Index >= NarrowByteWidth) {
12002 return L->getExtensionType() == ISD::ZEXTLOAD
12003 ? std::optional<ByteProvider<SDValue>>(
12004 ByteProvider<SDValue>::getConstantZero())
12005 : std::nullopt;
12006 }
12007
12008 if (NarrowByteWidth > Index) {
12009 return calculateSrcByte(Op, StartingIndex, Index);
12010 }
12011
12012 return std::nullopt;
12013 }
12014
12015 case ISD::BSWAP: {
12016 if (IsVec)
12017 return std::nullopt;
12018
12019 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
12020 Depth + 1, StartingIndex);
12021 }
12022
12023 case ISD::EXTRACT_VECTOR_ELT: {
12024 auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12025 if (!IdxOp)
12026 return std::nullopt;
12027 auto VecIdx = IdxOp->getZExtValue();
12028 auto ScalarSize = Op.getScalarValueSizeInBits();
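// E.g. extracting element 1 from a v4i16 vector and asking for byte 0 of the
// result maps to byte 2 of the vector source (VecIdx * 2 + Index).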
12029 if (ScalarSize < 32)
12030 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
12031 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
12032 StartingIndex, Index);
12033 }
12034
12035 case AMDGPUISD::PERM: {
12036 if (IsVec)
12037 return std::nullopt;
12038
12039 auto PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12040 if (!PermMask)
12041 return std::nullopt;
12042
12043 auto IdxMask =
12044 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
12045 if (IdxMask > 0x07 && IdxMask != 0x0c)
12046 return std::nullopt;
12047
12048 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
12049 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12050
12051 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
12052 : ByteProvider<SDValue>(
12053 ByteProvider<SDValue>::getConstantZero());
12054 }
12055
12056 default: {
12057 return std::nullopt;
12058 }
12059 }
12060
12061 llvm_unreachable("fully handled switch");
12062}
12063
12064 // Returns true if the Operand is a scalar that was extended from a 16-bit value
12065static bool isExtendedFrom16Bits(SDValue &Operand) {
12066
12067 switch (Operand.getOpcode()) {
12068 case ISD::ANY_EXTEND:
12069 case ISD::SIGN_EXTEND:
12070 case ISD::ZERO_EXTEND: {
12071 auto OpVT = Operand.getOperand(0).getValueType();
12072 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
12073 }
12074 case ISD::LOAD: {
12075 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
12076 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12077 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
12078 ExtType == ISD::EXTLOAD) {
12079 auto MemVT = L->getMemoryVT();
12080 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
12081 }
12082 return L->getMemoryVT().getSizeInBits() == 16;
12083 }
12084 default:
12085 return false;
12086 }
12087}
12088
12089 // Returns true if the mask matches consecutive bytes, and the first byte
12090 // begins at an even (16-bit aligned) offset from the 0th byte
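// For example, mask 0x0302 (selecting bytes 2 and 3) passes both checks,
// while 0x0201 is consecutive but starts at an odd byte and is rejected.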
12091static bool addresses16Bits(int Mask) {
12092 int Low8 = Mask & 0xff;
12093 int Hi8 = (Mask & 0xff00) >> 8;
12094
12095 assert(Low8 < 8 && Hi8 < 8);
12096 // Are the bytes contiguous in the order of increasing addresses.
12097 bool IsConsecutive = (Hi8 - Low8 == 1);
12098 // Is the first byte at location that is aligned for 16 bit instructions.
12099 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
12100 // In this case, we still need code to extract the 16 bit operand, so it
12101 // is better to use i8 v_perm
12102 bool Is16Aligned = !(Low8 % 2);
12103
12104 return IsConsecutive && Is16Aligned;
12105}
12106
12107// Do not lower into v_perm if the operands are actually 16 bit
12108// and the selected bits (based on PermMask) correspond with two
12109// easily addressable 16 bit operands.
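// For example, PermMask 0x05040100 selects the low 16 bits of each operand,
// which both halves address cleanly, so two 16-bit operands are not lowered
// to v_perm; PermMask 0x05040200 is not cleanly 16-bit addressable and keeps
// the v_perm lowering.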
12110 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
12111 SDValue &OtherOp) {
12112 int Low16 = PermMask & 0xffff;
12113 int Hi16 = (PermMask & 0xffff0000) >> 16;
12114
12115 auto TempOp = peekThroughBitcasts(Op);
12116 auto TempOtherOp = peekThroughBitcasts(OtherOp);
12117
12118 auto OpIs16Bit =
12119 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
12120 if (!OpIs16Bit)
12121 return true;
12122
12123 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12124 isExtendedFrom16Bits(TempOtherOp);
12125 if (!OtherOpIs16Bit)
12126 return true;
12127
12128 // Do we cleanly address both
12129 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
12130}
12131
12132 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
12133 unsigned DWordOffset) {
12134 SDValue Ret;
12135
12136 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12137 // ByteProvider must be at least 8 bits
12138 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12139
12140 if (TypeSize <= 32)
12141 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
12142
12143 if (Src.getValueType().isVector()) {
12144 auto ScalarTySize = Src.getScalarValueSizeInBits();
12145 auto ScalarTy = Src.getValueType().getScalarType();
12146 if (ScalarTySize == 32) {
12147 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
12148 DAG.getConstant(DWordOffset, SL, MVT::i32));
12149 }
12150 if (ScalarTySize > 32) {
12151 Ret = DAG.getNode(
12152 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
12153 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12154 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12155 if (ShiftVal)
12156 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
12157 DAG.getConstant(ShiftVal, SL, MVT::i32));
12158 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12159 }
12160
12161 assert(ScalarTySize < 32);
12162 auto NumElements = TypeSize / ScalarTySize;
12163 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12164 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12165 auto NumElementsIn32 = 32 / ScalarTySize;
12166 auto NumAvailElements = DWordOffset < Trunc32Elements
12167 ? NumElementsIn32
12168 : NumElements - NormalizedTrunc;
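// E.g. for a v6i16 source with DWordOffset = 2: NumElementsIn32 = 2 and
// NumAvailElements = 2, so elements 4 and 5 are extracted, rebuilt as a
// v2i16, and bitcast to i32.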
12169
12170 SmallVector<SDValue, 4> VecSrcs;
12171 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
12172 NumAvailElements);
12173
12174 Ret = DAG.getBuildVector(
12175 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
12176 VecSrcs);
12177 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12178 }
12179
12180 /// Scalar Type
12181 auto ShiftVal = 32 * DWordOffset;
12182 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
12183 DAG.getConstant(ShiftVal, SL, MVT::i32));
12184 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12185}
12186
12187 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
12188 SelectionDAG &DAG = DCI.DAG;
12189 [[maybe_unused]] EVT VT = N->getValueType(0);
12190 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
12191
12192 // VT is known to be MVT::i32, so we need to provide 4 bytes.
12193 assert(VT == MVT::i32);
12194 for (int i = 0; i < 4; i++) {
12195 // Find the ByteProvider that provides the ith byte of the result of OR
12196 std::optional<ByteProvider<SDValue>> P =
12197 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
12198 // TODO support constantZero
12199 if (!P || P->isConstantZero())
12200 return SDValue();
12201
12202 PermNodes.push_back(*P);
12203 }
12204 if (PermNodes.size() != 4)
12205 return SDValue();
12206
12207 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12208 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12209 uint64_t PermMask = 0x00000000;
12210 for (size_t i = 0; i < PermNodes.size(); i++) {
12211 auto PermOp = PermNodes[i];
12212 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
12213 // by sizeof(Src2) = 4
12214 int SrcByteAdjust = 4;
12215
12216 // If the Src uses a byte from a different DWORD, then it corresponds
12217 // with a different source
12218 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
12219 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12220 if (SecondSrc)
12221 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12222 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12223 return SDValue();
12224
12225 // Set the index of the second distinct Src node
12226 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12227 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12228 SrcByteAdjust = 0;
12229 }
12230 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12231 assert(!DAG.getDataLayout().isBigEndian());
12232 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12233 }
12234 SDLoc DL(N);
12235 SDValue Op = *PermNodes[FirstSrc.first].Src;
12236 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
12237 assert(Op.getValueSizeInBits() == 32);
12238
12239 // Check that we are not just extracting the bytes in order from an op
12240 if (!SecondSrc) {
12241 int Low16 = PermMask & 0xffff;
12242 int Hi16 = (PermMask & 0xffff0000) >> 16;
12243
12244 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
12245 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
12246
12247 // The perm op would really just produce Op. So combine into Op
12248 if (WellFormedLow && WellFormedHi)
12249 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
12250 }
12251
12252 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
12253
12254 if (SecondSrc) {
12255 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
12256 assert(OtherOp.getValueSizeInBits() == 32);
12257 }
12258
12259 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
12260
12261 assert(Op.getValueType().isByteSized() &&
12262 OtherOp.getValueType().isByteSized());
12263
12264 // If the ultimate src is less than 32 bits, then we will only be
12265 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
12266 // CalculateByteProvider would not have returned Op as source if we
12267 // used a byte that is outside its ValueType. Thus, we are free to
12268 // ANY_EXTEND as the extended bits are dont-cares.
12269 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
12270 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
12271
12272 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
12273 DAG.getConstant(PermMask, DL, MVT::i32));
12274 }
12275 return SDValue();
12276}
12277
12278SDValue SITargetLowering::performOrCombine(SDNode *N,
12279 DAGCombinerInfo &DCI) const {
12280 SelectionDAG &DAG = DCI.DAG;
12281 SDValue LHS = N->getOperand(0);
12282 SDValue RHS = N->getOperand(1);
12283
12284 EVT VT = N->getValueType(0);
12285 if (VT == MVT::i1) {
12286 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
12287 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12288 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
12289 SDValue Src = LHS.getOperand(0);
12290 if (Src != RHS.getOperand(0))
12291 return SDValue();
12292
12293 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
12294 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
12295 if (!CLHS || !CRHS)
12296 return SDValue();
12297
12298 // Only 10 bits are used.
12299 static const uint32_t MaxMask = 0x3ff;
12300
12301 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
12302 SDLoc DL(N);
12303 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
12304 Src, DAG.getConstant(NewMask, DL, MVT::i32));
12305 }
12306
12307 return SDValue();
12308 }
12309
12310 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12311 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
12312 LHS.getOpcode() == AMDGPUISD::PERM &&
12313 isa<ConstantSDNode>(LHS.getOperand(2))) {
12314 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
12315 if (!Sel)
12316 return SDValue();
12317
12318 Sel |= LHS.getConstantOperandVal(2);
12319 SDLoc DL(N);
12320 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12321 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
12322 }
12323
12324 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
12325 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12326 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
12327 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12328
12329 // If all the uses of an or need to extract the individual elements, do not
12330 // attempt to lower into v_perm
12331 auto usesCombinedOperand = [](SDNode *OrUse) {
12332 // If we have any non-vectorized use, then it is a candidate for v_perm
12333 if (OrUse->getOpcode() != ISD::BITCAST ||
12334 !OrUse->getValueType(0).isVector())
12335 return true;
12336
12337 // If we have any non-vectorized use, then it is a candidate for v_perm
12338 for (auto VUse : OrUse->uses()) {
12339 if (!VUse->getValueType(0).isVector())
12340 return true;
12341
12342 // If the use of a vector is a store, then combining via a v_perm
12343 // is beneficial.
12344 // TODO -- whitelist more uses
12345 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
12346 if (VUse->getOpcode() == VectorwiseOp)
12347 return true;
12348 }
12349 return false;
12350 };
12351
12352 if (!any_of(N->uses(), usesCombinedOperand))
12353 return SDValue();
12354
12355 uint32_t LHSMask = getPermuteMask(LHS);
12356 uint32_t RHSMask = getPermuteMask(RHS);
12357
12358 if (LHSMask != ~0u && RHSMask != ~0u) {
12359 // Canonicalize the expression in an attempt to have fewer unique masks
12360 // and therefore fewer registers used to hold the masks.
12361 if (LHSMask > RHSMask) {
12362 std::swap(LHSMask, RHSMask);
12363 std::swap(LHS, RHS);
12364 }
12365
12366 // Select 0xc for each lane used from a source operand. A zero byte has 0xc
12367 // in the mask, a 0xff byte has 0xff, and actual lanes are in the 0-3 range.
12368 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12369 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12370
12371 // Check if we need to combine values from two sources within a byte.
12372 if (!(LHSUsedLanes & RHSUsedLanes) &&
12373 // If we select high and lower word keep it for SDWA.
12374 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
12375 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12376 // Kill zero bytes selected by other mask. Zero value is 0xc.
12377 LHSMask &= ~RHSUsedLanes;
12378 RHSMask &= ~LHSUsedLanes;
12379 // Add 4 to each active LHS lane
12380 LHSMask |= LHSUsedLanes & 0x04040404;
12381 // Combine masks
12382 uint32_t Sel = LHSMask | RHSMask;
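// Worked example (illustrative): for or (and x, 0x00ff00ff), (and y, 0xff00ff00)
// the canonicalized masks are LHSMask = 0x030c010c (y side) and
// RHSMask = 0x0c020c00 (x side). Killing the zero bytes and adding 4 to the
// active LHS lanes gives Sel = 0x07000500 | 0x00020000 = 0x07020500, i.e.
// perm y, x, 0x07020500 selecting bytes {y3, x2, y1, x0}.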
12383 SDLoc DL(N);
12384
12385 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
12386 LHS.getOperand(0), RHS.getOperand(0),
12387 DAG.getConstant(Sel, DL, MVT::i32));
12388 }
12389 }
12390 if (LHSMask == ~0u || RHSMask == ~0u) {
12391 if (SDValue Perm = matchPERM(N, DCI))
12392 return Perm;
12393 }
12394 }
12395
12396 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12397 return SDValue();
12398
12399 // TODO: This could be a generic combine with a predicate for extracting the
12400 // high half of an integer being free.
12401
12402 // (or i64:x, (zero_extend i32:y)) ->
12403 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
12404 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
12405 RHS.getOpcode() != ISD::ZERO_EXTEND)
12406 std::swap(LHS, RHS);
12407
12408 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
12409 SDValue ExtSrc = RHS.getOperand(0);
12410 EVT SrcVT = ExtSrc.getValueType();
12411 if (SrcVT == MVT::i32) {
12412 SDLoc SL(N);
12413 SDValue LowLHS, HiBits;
12414 std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
12415 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
12416
12417 DCI.AddToWorklist(LowOr.getNode());
12418 DCI.AddToWorklist(HiBits.getNode());
12419
12420 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
12421 LowOr, HiBits);
12422 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
12423 }
12424 }
12425
12426 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
12427 if (CRHS) {
12428 if (SDValue Split
12429 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
12430 N->getOperand(0), CRHS))
12431 return Split;
12432 }
12433
12434 return SDValue();
12435}
12436
12437SDValue SITargetLowering::performXorCombine(SDNode *N,
12438 DAGCombinerInfo &DCI) const {
12439 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
12440 return RV;
12441
12442 SDValue LHS = N->getOperand(0);
12443 SDValue RHS = N->getOperand(1);
12444
12445 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
12446 SelectionDAG &DAG = DCI.DAG;
12447
12448 EVT VT = N->getValueType(0);
12449 if (CRHS && VT == MVT::i64) {
12450 if (SDValue Split
12451 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
12452 return Split;
12453 }
12454
12455 // Make sure to apply the 64-bit constant splitting fold before trying to fold
12456 // fneg-like xors into 64-bit select.
12457 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
12458 // This looks like an fneg, try to fold as a source modifier.
12459 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
12460 shouldFoldFNegIntoSrc(N, LHS)) {
12461 // xor (select c, a, b), 0x80000000 ->
12462 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
12463 SDLoc DL(N);
12464 SDValue CastLHS =
12465 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
12466 SDValue CastRHS =
12467 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
12468 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
12469 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
12470 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
12471 LHS->getOperand(0), FNegLHS, FNegRHS);
12472 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
12473 }
12474 }
12475
12476 return SDValue();
12477}
12478
12479SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
12480 DAGCombinerInfo &DCI) const {
12481 if (!Subtarget->has16BitInsts() ||
12482 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
12483 return SDValue();
12484
12485 EVT VT = N->getValueType(0);
12486 if (VT != MVT::i32)
12487 return SDValue();
12488
12489 SDValue Src = N->getOperand(0);
12490 if (Src.getValueType() != MVT::i16)
12491 return SDValue();
12492
12493 return SDValue();
12494}
12495
12496SDValue
12497SITargetLowering::performSignExtendInRegCombine(SDNode *N,
12498 DAGCombinerInfo &DCI) const {
12499 SDValue Src = N->getOperand(0);
12500 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
12501
12502 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
12503 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
12504 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
12505 VTSign->getVT() == MVT::i8) ||
12506 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
12507 VTSign->getVT() == MVT::i16))) {
12508 assert(Subtarget->hasScalarSubwordLoads() &&
12509 "s_buffer_load_{u8, i8} are supported "
12510 "in GFX12 (or newer) architectures.");
12511 EVT VT = Src.getValueType();
12512 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
12513 ? AMDGPUISD::SBUFFER_LOAD_BYTE
12514 : AMDGPUISD::SBUFFER_LOAD_SHORT;
12515 SDLoc DL(N);
12516 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12517 SDValue Ops[] = {
12518 Src.getOperand(0), // source register
12519 Src.getOperand(1), // offset
12520 Src.getOperand(2) // cachePolicy
12521 };
12522 auto *M = cast<MemSDNode>(Src);
12523 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12524 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12525 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
12526 return LoadVal;
12527 }
12528 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
12529 VTSign->getVT() == MVT::i8) ||
12530 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
12531 VTSign->getVT() == MVT::i16)) &&
12532 Src.hasOneUse()) {
12533 auto *M = cast<MemSDNode>(Src);
12534 SDValue Ops[] = {
12535 Src.getOperand(0), // Chain
12536 Src.getOperand(1), // rsrc
12537 Src.getOperand(2), // vindex
12538 Src.getOperand(3), // voffset
12539 Src.getOperand(4), // soffset
12540 Src.getOperand(5), // offset
12541 Src.getOperand(6),
12542 Src.getOperand(7)
12543 };
12544 // replace with BUFFER_LOAD_BYTE/SHORT
12545 SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
12546 Src.getOperand(0).getValueType());
12547 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ?
12548 AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT;
12549 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N),
12550 ResList,
12551 Ops, M->getMemoryVT(),
12552 M->getMemOperand());
12553 return DCI.DAG.getMergeValues({BufferLoadSignExt,
12554 BufferLoadSignExt.getValue(1)}, SDLoc(N));
12555 }
12556 return SDValue();
12557}
12558
12559SDValue SITargetLowering::performClassCombine(SDNode *N,
12560 DAGCombinerInfo &DCI) const {
12561 SelectionDAG &DAG = DCI.DAG;
12562 SDValue Mask = N->getOperand(1);
12563
12564 // fp_class x, 0 -> false
12565 if (isNullConstant(Mask))
12566 return DAG.getConstant(0, SDLoc(N), MVT::i1);
12567
12568 if (N->getOperand(0).isUndef())
12569 return DAG.getUNDEF(MVT::i1);
12570
12571 return SDValue();
12572}
12573
12574SDValue SITargetLowering::performRcpCombine(SDNode *N,
12575 DAGCombinerInfo &DCI) const {
12576 EVT VT = N->getValueType(0);
12577 SDValue N0 = N->getOperand(0);
12578
12579 if (N0.isUndef()) {
12580 return DCI.DAG.getConstantFP(
12581 APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT)), SDLoc(N),
12582 VT);
12583 }
12584
12585 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
12586 N0.getOpcode() == ISD::SINT_TO_FP)) {
12587 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
12588 N->getFlags());
12589 }
12590
12591 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
12592 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
12593 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
12594 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT,
12595 N0.getOperand(0), N->getFlags());
12596 }
12597
12599}
12600
12601 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
12602 unsigned MaxDepth) const {
12603 unsigned Opcode = Op.getOpcode();
12604 if (Opcode == ISD::FCANONICALIZE)
12605 return true;
12606
12607 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
12608 const auto &F = CFP->getValueAPF();
12609 if (F.isNaN() && F.isSignaling())
12610 return false;
12611 if (!F.isDenormal())
12612 return true;
12613
12614 DenormalMode Mode =
12615 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
12616 return Mode == DenormalMode::getIEEE();
12617 }
12618
12619 // If source is a result of another standard FP operation it is already in
12620 // canonical form.
12621 if (MaxDepth == 0)
12622 return false;
12623
12624 switch (Opcode) {
12625 // These will flush denorms if required.
12626 case ISD::FADD:
12627 case ISD::FSUB:
12628 case ISD::FMUL:
12629 case ISD::FCEIL:
12630 case ISD::FFLOOR:
12631 case ISD::FMA:
12632 case ISD::FMAD:
12633 case ISD::FSQRT:
12634 case ISD::FDIV:
12635 case ISD::FREM:
12636 case ISD::FP_ROUND:
12637 case ISD::FP_EXTEND:
12638 case ISD::FP16_TO_FP:
12639 case ISD::FP_TO_FP16:
12640 case ISD::BF16_TO_FP:
12641 case ISD::FP_TO_BF16:
12642 case ISD::FLDEXP:
12643 case AMDGPUISD::FMUL_LEGACY:
12644 case AMDGPUISD::FMAD_FTZ:
12645 case AMDGPUISD::RCP:
12646 case AMDGPUISD::RSQ:
12647 case AMDGPUISD::RSQ_CLAMP:
12648 case AMDGPUISD::RCP_LEGACY:
12649 case AMDGPUISD::RCP_IFLAG:
12650 case AMDGPUISD::LOG:
12651 case AMDGPUISD::EXP:
12652 case AMDGPUISD::DIV_SCALE:
12653 case AMDGPUISD::DIV_FMAS:
12654 case AMDGPUISD::DIV_FIXUP:
12655 case AMDGPUISD::FRACT:
12656 case AMDGPUISD::CVT_PKRTZ_F16_F32:
12657 case AMDGPUISD::CVT_F32_UBYTE0:
12658 case AMDGPUISD::CVT_F32_UBYTE1:
12659 case AMDGPUISD::CVT_F32_UBYTE2:
12660 case AMDGPUISD::CVT_F32_UBYTE3:
12661 case AMDGPUISD::FP_TO_FP16:
12662 case AMDGPUISD::SIN_HW:
12663 case AMDGPUISD::COS_HW:
12664 return true;
12665
12666 // It can/will be lowered or combined as a bit operation.
12667 // Need to check their input recursively to handle.
12668 case ISD::FNEG:
12669 case ISD::FABS:
12670 case ISD::FCOPYSIGN:
12671 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12672
12673 case ISD::AND:
12674 if (Op.getValueType() == MVT::i32) {
12675 // Be careful as we only know it is a bitcast floating point type. It
12676 // could be f32, v2f16, we have no way of knowing. Luckily the constant
12677 // value that we optimize for, which comes up in fp32 to bf16 conversions,
12678 // is valid to optimize for all types.
12679 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
12680 if (RHS->getZExtValue() == 0xffff0000) {
12681 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12682 }
12683 }
12684 }
12685 break;
12686
12687 case ISD::FSIN:
12688 case ISD::FCOS:
12689 case ISD::FSINCOS:
12690 return Op.getValueType().getScalarType() != MVT::f16;
12691
12692 case ISD::FMINNUM:
12693 case ISD::FMAXNUM:
12694 case ISD::FMINNUM_IEEE:
12695 case ISD::FMAXNUM_IEEE:
12696 case ISD::FMINIMUM:
12697 case ISD::FMAXIMUM:
12698 case AMDGPUISD::CLAMP:
12699 case AMDGPUISD::FMED3:
12700 case AMDGPUISD::FMAX3:
12701 case AMDGPUISD::FMIN3:
12702 case AMDGPUISD::FMAXIMUM3:
12703 case AMDGPUISD::FMINIMUM3: {
12704 // FIXME: Shouldn't treat the generic operations differently based on these.
12705 // However, we aren't really required to flush the result from
12706 // minnum/maxnum.
12707
12708 // snans will be quieted, so we only need to worry about denormals.
12709 if (Subtarget->supportsMinMaxDenormModes() ||
12710 // FIXME: denormalsEnabledForType is broken for dynamic
12711 denormalsEnabledForType(DAG, Op.getValueType()))
12712 return true;
12713
12714 // Flushing may be required.
12715 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
12716 // targets we need to check their inputs recursively.
12717
12718 // FIXME: Does this apply with clamp? It's implemented with max.
12719 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
12720 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
12721 return false;
12722 }
12723
12724 return true;
12725 }
12726 case ISD::SELECT: {
12727 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
12728 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
12729 }
12730 case ISD::BUILD_VECTOR: {
12731 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
12732 SDValue SrcOp = Op.getOperand(i);
12733 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
12734 return false;
12735 }
12736
12737 return true;
12738 }
12739 case ISD::EXTRACT_VECTOR_ELT:
12740 case ISD::EXTRACT_SUBVECTOR: {
12741 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12742 }
12743 case ISD::INSERT_VECTOR_ELT: {
12744 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
12745 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
12746 }
12747 case ISD::UNDEF:
12748 // Could be anything.
12749 return false;
12750
12751 case ISD::BITCAST:
12752 // TODO: This is incorrect as it loses track of the operand's type. We may
12753 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
12754 // same bits that are canonicalized in one type need not be in the other.
12755 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12756 case ISD::TRUNCATE: {
12757 // Hack around the mess we make when legalizing extract_vector_elt
12758 if (Op.getValueType() == MVT::i16) {
12759 SDValue TruncSrc = Op.getOperand(0);
12760 if (TruncSrc.getValueType() == MVT::i32 &&
12761 TruncSrc.getOpcode() == ISD::BITCAST &&
12762 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
12763 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
12764 }
12765 }
12766 return false;
12767 }
12768 case ISD::INTRINSIC_WO_CHAIN: {
12769 unsigned IntrinsicID = Op.getConstantOperandVal(0);
12770 // TODO: Handle more intrinsics
12771 switch (IntrinsicID) {
12772 case Intrinsic::amdgcn_cvt_pkrtz:
12773 case Intrinsic::amdgcn_cubeid:
12774 case Intrinsic::amdgcn_frexp_mant:
12775 case Intrinsic::amdgcn_fdot2:
12776 case Intrinsic::amdgcn_rcp:
12777 case Intrinsic::amdgcn_rsq:
12778 case Intrinsic::amdgcn_rsq_clamp:
12779 case Intrinsic::amdgcn_rcp_legacy:
12780 case Intrinsic::amdgcn_rsq_legacy:
12781 case Intrinsic::amdgcn_trig_preop:
12782 case Intrinsic::amdgcn_log:
12783 case Intrinsic::amdgcn_exp2:
12784 case Intrinsic::amdgcn_sqrt:
12785 return true;
12786 default:
12787 break;
12788 }
12789
12790 break;
12791 }
12792 default:
12793 break;
12794 }
12795
12796 // FIXME: denormalsEnabledForType is broken for dynamic
12797 return denormalsEnabledForType(DAG, Op.getValueType()) &&
12798 DAG.isKnownNeverSNaN(Op);
12799}
12800
12801 bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
12802 unsigned MaxDepth) const {
12803 const MachineRegisterInfo &MRI = MF.getRegInfo();
12804 MachineInstr *MI = MRI.getVRegDef(Reg);
12805 unsigned Opcode = MI->getOpcode();
12806
12807 if (Opcode == AMDGPU::G_FCANONICALIZE)
12808 return true;
12809
12810 std::optional<FPValueAndVReg> FCR;
12811 // Constant splat (can be padded with undef) or scalar constant.
12812 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
12813 if (FCR->Value.isSignaling())
12814 return false;
12815 if (!FCR->Value.isDenormal())
12816 return true;
12817
12818 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
12819 return Mode == DenormalMode::getIEEE();
12820 }
12821
12822 if (MaxDepth == 0)
12823 return false;
12824
12825 switch (Opcode) {
12826 case AMDGPU::G_FADD:
12827 case AMDGPU::G_FSUB:
12828 case AMDGPU::G_FMUL:
12829 case AMDGPU::G_FCEIL:
12830 case AMDGPU::G_FFLOOR:
12831 case AMDGPU::G_FRINT:
12832 case AMDGPU::G_FNEARBYINT:
12833 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
12834 case AMDGPU::G_INTRINSIC_TRUNC:
12835 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
12836 case AMDGPU::G_FMA:
12837 case AMDGPU::G_FMAD:
12838 case AMDGPU::G_FSQRT:
12839 case AMDGPU::G_FDIV:
12840 case AMDGPU::G_FREM:
12841 case AMDGPU::G_FPOW:
12842 case AMDGPU::G_FPEXT:
12843 case AMDGPU::G_FLOG:
12844 case AMDGPU::G_FLOG2:
12845 case AMDGPU::G_FLOG10:
12846 case AMDGPU::G_FPTRUNC:
12847 case AMDGPU::G_AMDGPU_RCP_IFLAG:
12848 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
12849 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
12850 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
12851 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
12852 return true;
12853 case AMDGPU::G_FNEG:
12854 case AMDGPU::G_FABS:
12855 case AMDGPU::G_FCOPYSIGN:
12856 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
12857 case AMDGPU::G_FMINNUM:
12858 case AMDGPU::G_FMAXNUM:
12859 case AMDGPU::G_FMINNUM_IEEE:
12860 case AMDGPU::G_FMAXNUM_IEEE:
12861 case AMDGPU::G_FMINIMUM:
12862 case AMDGPU::G_FMAXIMUM: {
12863 if (Subtarget->supportsMinMaxDenormModes() ||
12864 // FIXME: denormalsEnabledForType is broken for dynamic
12865 denormalsEnabledForType(MRI.getType(Reg), MF))
12866 return true;
12867
12868 [[fallthrough]];
12869 }
12870 case AMDGPU::G_BUILD_VECTOR:
12871 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
12872 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
12873 return false;
12874 return true;
12875 case AMDGPU::G_INTRINSIC:
12876 case AMDGPU::G_INTRINSIC_CONVERGENT:
12877 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
12878 case Intrinsic::amdgcn_fmul_legacy:
12879 case Intrinsic::amdgcn_fmad_ftz:
12880 case Intrinsic::amdgcn_sqrt:
12881 case Intrinsic::amdgcn_fmed3:
12882 case Intrinsic::amdgcn_sin:
12883 case Intrinsic::amdgcn_cos:
12884 case Intrinsic::amdgcn_log:
12885 case Intrinsic::amdgcn_exp2:
12886 case Intrinsic::amdgcn_log_clamp:
12887 case Intrinsic::amdgcn_rcp:
12888 case Intrinsic::amdgcn_rcp_legacy:
12889 case Intrinsic::amdgcn_rsq:
12890 case Intrinsic::amdgcn_rsq_clamp:
12891 case Intrinsic::amdgcn_rsq_legacy:
12892 case Intrinsic::amdgcn_div_scale:
12893 case Intrinsic::amdgcn_div_fmas:
12894 case Intrinsic::amdgcn_div_fixup:
12895 case Intrinsic::amdgcn_fract:
12896 case Intrinsic::amdgcn_cvt_pkrtz:
12897 case Intrinsic::amdgcn_cubeid:
12898 case Intrinsic::amdgcn_cubema:
12899 case Intrinsic::amdgcn_cubesc:
12900 case Intrinsic::amdgcn_cubetc:
12901 case Intrinsic::amdgcn_frexp_mant:
12902 case Intrinsic::amdgcn_fdot2:
12903 case Intrinsic::amdgcn_trig_preop:
12904 return true;
12905 default:
12906 break;
12907 }
12908
12909 [[fallthrough]];
12910 default:
12911 return false;
12912 }
12913
12914 llvm_unreachable("invalid operation");
12915}
12916
12917// Constant fold canonicalize.
12918SDValue SITargetLowering::getCanonicalConstantFP(
12919 SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
12920 // Flush denormals to 0 if not enabled.
12921 if (C.isDenormal()) {
12922 DenormalMode Mode =
12923 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
12924 if (Mode == DenormalMode::getPreserveSign()) {
12925 return DAG.getConstantFP(
12926 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
12927 }
12928
12929 if (Mode != DenormalMode::getIEEE())
12930 return SDValue();
12931 }
12932
12933 if (C.isNaN()) {
12934 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
12935 if (C.isSignaling()) {
12936 // Quiet a signaling NaN.
12937 // FIXME: Is this supposed to preserve payload bits?
12938 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
12939 }
12940
12941 // Make sure it is the canonical NaN bitpattern.
12942 //
12943 // TODO: Can we use -1 as the canonical NaN value since it's an inline
12944 // immediate?
12945 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
12946 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
12947 }
12948
12949 // Already canonical.
12950 return DAG.getConstantFP(C, SL, VT);
12951}
12952
12953 static bool vectorEltWillFoldAway(SDValue Op) {
12954 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
12955}
12956
12957SDValue SITargetLowering::performFCanonicalizeCombine(
12958 SDNode *N,
12959 DAGCombinerInfo &DCI) const {
12960 SelectionDAG &DAG = DCI.DAG;
12961 SDValue N0 = N->getOperand(0);
12962 EVT VT = N->getValueType(0);
12963
12964 // fcanonicalize undef -> qnan
12965 if (N0.isUndef()) {
12966 APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
12967 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
12968 }
12969
12970 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
12971 EVT VT = N->getValueType(0);
12972 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
12973 }
12974
12975 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
12976 // (fcanonicalize k)
12977 //
12978 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
12979
12980 // TODO: This could be better with wider vectors that will be split to v2f16,
12981 // and to consider uses since there aren't that many packed operations.
12982 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
12983 isTypeLegal(MVT::v2f16)) {
12984 SDLoc SL(N);
12985 SDValue NewElts[2];
12986 SDValue Lo = N0.getOperand(0);
12987 SDValue Hi = N0.getOperand(1);
12988 EVT EltVT = Lo.getValueType();
12989
12990 if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
12991 for (unsigned I = 0; I != 2; ++I) {
12992 SDValue Op = N0.getOperand(I);
12993 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
12994 NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
12995 CFP->getValueAPF());
12996 } else if (Op.isUndef()) {
12997 // Handled below based on what the other operand is.
12998 NewElts[I] = Op;
12999 } else {
13000 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
13001 }
13002 }
13003
13004 // If one half is undef, and one is constant, prefer a splat vector rather
13005 // than the normal qNaN. If it's a register, prefer 0.0 since that's
13006 // cheaper to use and may be free with a packed operation.
13007 if (NewElts[0].isUndef()) {
13008 if (isa<ConstantFPSDNode>(NewElts[1]))
13009 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
13010 NewElts[1]: DAG.getConstantFP(0.0f, SL, EltVT);
13011 }
13012
13013 if (NewElts[1].isUndef()) {
13014 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
13015 NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
13016 }
13017
13018 return DAG.getBuildVector(VT, SL, NewElts);
13019 }
13020 }
13021
13022 return SDValue();
13023}
13024
13025static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
13026 switch (Opc) {
13027 case ISD::FMAXNUM:
13028 case ISD::FMAXNUM_IEEE:
13029 return AMDGPUISD::FMAX3;
13030 case ISD::FMAXIMUM:
13031 return AMDGPUISD::FMAXIMUM3;
13032 case ISD::SMAX:
13033 return AMDGPUISD::SMAX3;
13034 case ISD::UMAX:
13035 return AMDGPUISD::UMAX3;
13036 case ISD::FMINNUM:
13037 case ISD::FMINNUM_IEEE:
13038 return AMDGPUISD::FMIN3;
13039 case ISD::FMINIMUM:
13040 return AMDGPUISD::FMINIMUM3;
13041 case ISD::SMIN:
13042 return AMDGPUISD::SMIN3;
13043 case ISD::UMIN:
13044 return AMDGPUISD::UMIN3;
13045 default:
13046 llvm_unreachable("Not a min/max opcode");
13047 }
13048}
13049
13050SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
13051 const SDLoc &SL, SDValue Src,
13052 SDValue MinVal,
13053 SDValue MaxVal,
13054 bool Signed) const {
13055
13056 // med3 comes from
13057 // min(max(x, K0), K1), K0 < K1
13058 // max(min(x, K0), K1), K1 < K0
13059 //
13060 // "MinVal" and "MaxVal" respectively refer to the rhs of the
13061 // min/max op.
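// E.g. smin(smax(x, 3), 7) becomes smed3(x, 3, 7), since K0 = 3 < K1 = 7.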
13062 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
13063 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
13064
13065 if (!MinK || !MaxK)
13066 return SDValue();
13067
13068 if (Signed) {
13069 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
13070 return SDValue();
13071 } else {
13072 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
13073 return SDValue();
13074 }
13075
13076 EVT VT = MinK->getValueType(0);
13077 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
13078 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
13079 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
13080
13081 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
13082 // not available, but this is unlikely to be profitable as constants
13083 // will often need to be materialized & extended, especially on
13084 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
13085 return SDValue();
13086}
13087
13088 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
13089 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
13090 return C;
13091
13092 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
13093 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
13094 return C;
13095 }
13096
13097 return nullptr;
13098}
13099
13100SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
13101 const SDLoc &SL,
13102 SDValue Op0,
13103 SDValue Op1) const {
13104 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
13105 if (!K1)
13106 return SDValue();
13107
13108 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
13109 if (!K0)
13110 return SDValue();
13111
13112 // Ordered >= (although NaN inputs should have folded away by now).
13113 if (K0->getValueAPF() > K1->getValueAPF())
13114 return SDValue();
13115
13116 const MachineFunction &MF = DAG.getMachineFunction();
13117 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13118
13119 // TODO: Check IEEE bit enabled?
13120 EVT VT = Op0.getValueType();
13121 if (Info->getMode().DX10Clamp) {
13122 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
13123 // hardware fmed3 behavior converting to a min.
13124 // FIXME: Should this be allowing -0.0?
13125 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
13126 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
13127 }
13128
13129 // med3 for f16 is only available on gfx9+, and not available for v2f16.
13130 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
13131 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
13132 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
13133 // then give the other result, which is different from med3 with a NaN
13134 // input.
13135 SDValue Var = Op0.getOperand(0);
13136 if (!DAG.isKnownNeverSNaN(Var))
13137 return SDValue();
13138
13139 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13140
13141 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
13142 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
13143 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
13144 Var, SDValue(K0, 0), SDValue(K1, 0));
13145 }
13146 }
13147
13148 return SDValue();
13149}
13150
13151/// \return true if the subtarget supports minimum3 and maximum3 with the given
13152/// base min/max opcode \p Opc for type \p VT.
13153static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
13154 EVT VT) {
13155 switch (Opc) {
13156 case ISD::FMINNUM:
13157 case ISD::FMAXNUM:
13158 case ISD::FMINNUM_IEEE:
13159 case ISD::FMAXNUM_IEEE:
13160 case AMDGPUISD::FMIN_LEGACY:
13161 case AMDGPUISD::FMAX_LEGACY:
13162 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
13163 case ISD::FMINIMUM:
13164 case ISD::FMAXIMUM:
13165 return (VT == MVT::f32 || VT == MVT::f16) && Subtarget.hasIEEEMinMax3();
13166 case ISD::SMAX:
13167 case ISD::SMIN:
13168 case ISD::UMAX:
13169 case ISD::UMIN:
13170 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
13171 default:
13172 return false;
13173 }
13174
13175 llvm_unreachable("not a min/max opcode");
13176}
13177
13178SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
13179 DAGCombinerInfo &DCI) const {
13180 SelectionDAG &DAG = DCI.DAG;
13181
13182 EVT VT = N->getValueType(0);
13183 unsigned Opc = N->getOpcode();
13184 SDValue Op0 = N->getOperand(0);
13185 SDValue Op1 = N->getOperand(1);
13186
13187 // Only do this if the inner op has one use since this will just increase
13188 // register pressure for no benefit.
13189
13190 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
13191 // max(max(a, b), c) -> max3(a, b, c)
13192 // min(min(a, b), c) -> min3(a, b, c)
13193 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
13194 SDLoc DL(N);
13195 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
13196 DL,
13197 N->getValueType(0),
13198 Op0.getOperand(0),
13199 Op0.getOperand(1),
13200 Op1);
13201 }
13202
13203 // Try commuted.
13204 // max(a, max(b, c)) -> max3(a, b, c)
13205 // min(a, min(b, c)) -> min3(a, b, c)
13206 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
13207 SDLoc DL(N);
13208 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
13209 DL,
13210 N->getValueType(0),
13211 Op0,
13212 Op1.getOperand(0),
13213 Op1.getOperand(1));
13214 }
13215 }
13216
13217 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
13218 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
13219 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
13220 if (SDValue Med3 = performIntMed3ImmCombine(
13221 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
13222 return Med3;
13223 }
13224 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
13225 if (SDValue Med3 = performIntMed3ImmCombine(
13226 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
13227 return Med3;
13228 }
13229
13230 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
13231 if (SDValue Med3 = performIntMed3ImmCombine(
13232 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
13233 return Med3;
13234 }
13235 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
13236 if (SDValue Med3 = performIntMed3ImmCombine(
13237 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
13238 return Med3;
13239 }
13240
13241 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
13242 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
13243 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
13244 (Opc == AMDGPUISD::FMIN_LEGACY &&
13245 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
13246 (VT == MVT::f32 || VT == MVT::f64 ||
13247 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
13248 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
13249 Op0.hasOneUse()) {
13250 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
13251 return Res;
13252 }
13253
13254 return SDValue();
13255}
13256
13257 static bool isClampZeroToOne(SDValue A, SDValue B) {
13258 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
13259 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
13260 // FIXME: Should this be allowing -0.0?
13261 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13262 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
13263 }
13264 }
13265
13266 return false;
13267}
13268
13269// FIXME: Should only worry about snans for version with chain.
13270SDValue SITargetLowering::performFMed3Combine(SDNode *N,
13271 DAGCombinerInfo &DCI) const {
13272 EVT VT = N->getValueType(0);
13273 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
13274 // NaNs. With a NaN input, the order of the operands may change the result.
13275
13276 SelectionDAG &DAG = DCI.DAG;
13277 SDLoc SL(N);
13278
13279 SDValue Src0 = N->getOperand(0);
13280 SDValue Src1 = N->getOperand(1);
13281 SDValue Src2 = N->getOperand(2);
13282
13283 if (isClampZeroToOne(Src0, Src1)) {
13284 // const_a, const_b, x -> clamp is safe in all cases including signaling
13285 // nans.
13286 // FIXME: Should this be allowing -0.0?
13287 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
13288 }
13289
13290 const MachineFunction &MF = DAG.getMachineFunction();
13291 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13292
13293 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
13294 // handling no dx10-clamp?
13295 if (Info->getMode().DX10Clamp) {
13296 // If NaNs are clamped to 0, we are free to reorder the inputs.
13297
13298 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13299 std::swap(Src0, Src1);
13300
13301 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
13302 std::swap(Src1, Src2);
13303
13304 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13305 std::swap(Src0, Src1);
13306
13307 if (isClampZeroToOne(Src1, Src2))
13308 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
13309 }
13310
13311 return SDValue();
13312}
13313
13314SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
13315 DAGCombinerInfo &DCI) const {
13316 SDValue Src0 = N->getOperand(0);
13317 SDValue Src1 = N->getOperand(1);
13318 if (Src0.isUndef() && Src1.isUndef())
13319 return DCI.DAG.getUNDEF(N->getValueType(0));
13320 return SDValue();
13321}
13322
13323// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
13324// expanded into a set of cmp/select instructions.
13325 bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
13326 unsigned NumElem,
13327 bool IsDivergentIdx,
13328 const GCNSubtarget *Subtarget) {
13329 if (UseDivergentRegisterIndexing)
13330 return false;
13331
13332 unsigned VecSize = EltSize * NumElem;
13333
13334 // Sub-dword vectors of size 2 dwords or less have a better implementation.
13335 if (VecSize <= 64 && EltSize < 32)
13336 return false;
13337
13338 // Always expand the rest of sub-dword instructions, otherwise they will be
13339 // lowered via memory.
13340 if (EltSize < 32)
13341 return true;
13342
13343 // Always do this if var-idx is divergent, otherwise it will become a loop.
13344 if (IsDivergentIdx)
13345 return true;
13346
13347 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
13348 unsigned NumInsts = NumElem /* Number of compares */ +
13349 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
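// E.g. a dynamic extract from v8i32 gives NumInsts = 8 + 8 = 16, which is
// expanded only when movrel is unavailable (16 <= 16) and otherwise left for
// movrel (16 > 15).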
13350
13351 // On some architectures (GFX9) movrel is not available and it's better
13352 // to expand.
13353 if (!Subtarget->hasMovrel())
13354 return NumInsts <= 16;
13355
13356 // If movrel is available, use it instead of expanding for vector of 8
13357 // elements.
13358 return NumInsts <= 15;
13359}
13360
13361 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
13362 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
13363 if (isa<ConstantSDNode>(Idx))
13364 return false;
13365
13366 SDValue Vec = N->getOperand(0);
13367 EVT VecVT = Vec.getValueType();
13368 EVT EltVT = VecVT.getVectorElementType();
13369 unsigned EltSize = EltVT.getSizeInBits();
13370 unsigned NumElem = VecVT.getVectorNumElements();
13371
13372 return SITargetLowering::shouldExpandVectorDynExt(
13373 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
13374}
13375
13376SDValue SITargetLowering::performExtractVectorEltCombine(
13377 SDNode *N, DAGCombinerInfo &DCI) const {
13378 SDValue Vec = N->getOperand(0);
13379 SelectionDAG &DAG = DCI.DAG;
13380
13381 EVT VecVT = Vec.getValueType();
13382 EVT VecEltVT = VecVT.getVectorElementType();
13383 EVT ResVT = N->getValueType(0);
13384
13385 unsigned VecSize = VecVT.getSizeInBits();
13386 unsigned VecEltSize = VecEltVT.getSizeInBits();
13387
13388 if ((Vec.getOpcode() == ISD::FNEG ||
13389 Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
13390 SDLoc SL(N);
13391 SDValue Idx = N->getOperand(1);
13392 SDValue Elt =
13393 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
13394 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
13395 }
13396
13397 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
13398 // =>
13399 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
13400 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
13401 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
13402 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13403 SDLoc SL(N);
13404 SDValue Idx = N->getOperand(1);
13405 unsigned Opc = Vec.getOpcode();
13406
13407 switch(Opc) {
13408 default:
13409 break;
13410 // TODO: Support other binary operations.
13411 case ISD::FADD:
13412 case ISD::FSUB:
13413 case ISD::FMUL:
13414 case ISD::ADD:
13415 case ISD::UMIN:
13416 case ISD::UMAX:
13417 case ISD::SMIN:
13418 case ISD::SMAX:
13419 case ISD::FMAXNUM:
13420 case ISD::FMINNUM:
13421 case ISD::FMAXNUM_IEEE:
13422 case ISD::FMINNUM_IEEE:
13423 case ISD::FMAXIMUM:
13424 case ISD::FMINIMUM: {
13425 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13426 Vec.getOperand(0), Idx);
13427 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13428 Vec.getOperand(1), Idx);
13429
13430 DCI.AddToWorklist(Elt0.getNode());
13431 DCI.AddToWorklist(Elt1.getNode());
13432 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
13433 }
13434 }
13435 }
13436
13437 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
13438 if (shouldExpandVectorDynExt(N)) {
13439 SDLoc SL(N);
13440 SDValue Idx = N->getOperand(1);
13441 SDValue V;
13442 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13443 SDValue IC = DAG.getVectorIdxConstant(I, SL);
13444 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
13445 if (I == 0)
13446 V = Elt;
13447 else
13448 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
13449 }
13450 return V;
13451 }
13452
13453 if (!DCI.isBeforeLegalize())
13454 return SDValue();
13455
13456 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
13457 // elements. This exposes more load reduction opportunities by replacing
13458 // multiple small extract_vector_elements with a single 32-bit extract.
13459 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
13460 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
13461 VecSize > 32 && VecSize % 32 == 0 && Idx) {
13462 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
13463
13464 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
13465 unsigned EltIdx = BitIndex / 32;
13466 unsigned LeftoverBitIdx = BitIndex % 32;
13467 SDLoc SL(N);
13468
13469 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
13470 DCI.AddToWorklist(Cast.getNode());
13471
13472 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
13473 DAG.getConstant(EltIdx, SL, MVT::i32));
13474 DCI.AddToWorklist(Elt.getNode());
13475 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
13476 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
13477 DCI.AddToWorklist(Srl.getNode());
13478
13479 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
13480 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
13481 DCI.AddToWorklist(Trunc.getNode());
13482
13483 if (VecEltVT == ResVT) {
13484 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
13485 }
13486
13487 assert(ResVT.isScalarInteger());
13488 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
13489 }
13490
13491 return SDValue();
13492}
13493
13494SDValue
13495SITargetLowering::performInsertVectorEltCombine(SDNode *N,
13496 DAGCombinerInfo &DCI) const {
13497 SDValue Vec = N->getOperand(0);
13498 SDValue Idx = N->getOperand(2);
13499 EVT VecVT = Vec.getValueType();
13500 EVT EltVT = VecVT.getVectorElementType();
13501
13502 // INSERT_VECTOR_ELT (<n x e>, var-idx)
13503 // => BUILD_VECTOR n x select (e, const-idx)
13504 if (!shouldExpandVectorDynExt(N))
13505 return SDValue();
13506
13507 SelectionDAG &DAG = DCI.DAG;
13508 SDLoc SL(N);
13509 SDValue Ins = N->getOperand(1);
13510 EVT IdxVT = Idx.getValueType();
13511
13512 SmallVector<SDValue, 16> Ops;
13513 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13514 SDValue IC = DAG.getConstant(I, SL, IdxVT);
13515 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
13516 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
13517 Ops.push_back(V);
13518 }
13519
13520 return DAG.getBuildVector(VecVT, SL, Ops);
13521}
13522
13523/// Return the source of an fp_extend from f16 to f32, or a converted FP
13524/// constant.
13525static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
13526 if (Src.getOpcode() == ISD::FP_EXTEND &&
13527 Src.getOperand(0).getValueType() == MVT::f16) {
13528 return Src.getOperand(0);
13529 }
13530
13531 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13532 APFloat Val = CFP->getValueAPF();
13533 bool LosesInfo = true;
13534 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
13535 if (!LosesInfo)
13536 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
13537 }
13538
13539 return SDValue();
13540}
13541
13542SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
13543 DAGCombinerInfo &DCI) const {
13544 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
13545 "combine only useful on gfx8");
13546
13547 SDValue TruncSrc = N->getOperand(0);
13548 EVT VT = N->getValueType(0);
13549 if (VT != MVT::f16)
13550 return SDValue();
13551
13552 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
13553 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
13554 return SDValue();
13555
13556 SelectionDAG &DAG = DCI.DAG;
13557 SDLoc SL(N);
13558
13559 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
13560 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
13561 // casting back.
13562
13563 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
13564 // fmin(fmax(a, b), fmax(fmin(a, b), c))
13565 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
13566 if (!A)
13567 return SDValue();
13568
13569 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
13570 if (!B)
13571 return SDValue();
13572
13573 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
13574 if (!C)
13575 return SDValue();
13576
13577 // This changes signaling nan behavior. If an input is a signaling nan, it
13578 // would have been quieted by the fpext originally. We don't care because
13579 // these are unconstrained ops. If we needed to insert quieting canonicalizes
13580 // we would be worse off than just doing the promotion.
13581 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
13582 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
13583 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
13584 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
13585}
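// Sanity check of the min/max expansion above (editorial example): with
// a = 4.0, b = 1.0, c = 2.0 the combine produces
//   fmin(fmax(a, b), fmax(fmin(a, b), c)) = fmin(4.0, fmax(1.0, 2.0)) = 2.0,
// which matches fmed3(4.0, 1.0, 2.0) = 2.0 for non-NaN inputs.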
13586
13587unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
13588 const SDNode *N0,
13589 const SDNode *N1) const {
13590 EVT VT = N0->getValueType(0);
13591
13592 // Only do this if we are not trying to support denormals. v_mad_f32 does not
13593 // support denormals ever.
13594 if (((VT == MVT::f32 &&
13595 denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
13596 (VT == MVT::f16 && Subtarget->hasMadF16() &&
13597 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
13598 isOperationLegal(ISD::FMAD, VT))
13599 return ISD::FMAD;
13600
13601 const TargetOptions &Options = DAG.getTarget().Options;
13602 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
13603 (N0->getFlags().hasAllowContract() &&
13604 N1->getFlags().hasAllowContract())) &&
13605 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
13606 return ISD::FMA;
13607 }
13608
13609 return 0;
13610}
13611
13612// For a reassociatable opcode perform:
13613// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
13614SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
13615 SelectionDAG &DAG) const {
13616 EVT VT = N->getValueType(0);
13617 if (VT != MVT::i32 && VT != MVT::i64)
13618 return SDValue();
13619
13620 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
13621 return SDValue();
13622
13623 unsigned Opc = N->getOpcode();
13624 SDValue Op0 = N->getOperand(0);
13625 SDValue Op1 = N->getOperand(1);
13626
13627 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
13628 return SDValue();
13629
13630 if (Op0->isDivergent())
13631 std::swap(Op0, Op1);
13632
13633 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
13634 return SDValue();
13635
13636 SDValue Op2 = Op1.getOperand(1);
13637 Op1 = Op1.getOperand(0);
13638 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
13639 return SDValue();
13640
13641 if (Op1->isDivergent())
13642 std::swap(Op1, Op2);
13643
13644 SDLoc SL(N);
13645 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
13646 return DAG.getNode(Opc, SL, VT, Add1, Op2);
13647}
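// Example of the reassociation above (illustrative): with x and z uniform and
// y divergent, "op x, (op y, z)" is rewritten to "op (op x, z), y", so the
// inner op of two uniform values can be selected as a scalar (SALU)
// instruction and only the outer op stays on the VALU.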
13648
13649static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
13650 EVT VT,
13651 SDValue N0, SDValue N1, SDValue N2,
13652 bool Signed) {
13653 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
13654 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
13655 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
13656 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
13657}
13658
13659// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
13660// multiplies, if any.
13661//
13662// Full 64-bit multiplies that feed into an addition are lowered here instead
13663// of using the generic expansion. The generic expansion ends up with
13664// a tree of ADD nodes that prevents us from using the "add" part of the
13665// MAD instruction. The expansion produced here results in a chain of ADDs
13666// instead of a tree.
13667SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
13668 DAGCombinerInfo &DCI) const {
13669 assert(N->getOpcode() == ISD::ADD);
13670
13671 SelectionDAG &DAG = DCI.DAG;
13672 EVT VT = N->getValueType(0);
13673 SDLoc SL(N);
13674 SDValue LHS = N->getOperand(0);
13675 SDValue RHS = N->getOperand(1);
13676
13677 if (VT.isVector())
13678 return SDValue();
13679
13680 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
13681 // result in scalar registers for uniform values.
13682 if (!N->isDivergent() && Subtarget->hasSMulHi())
13683 return SDValue();
13684
13685 unsigned NumBits = VT.getScalarSizeInBits();
13686 if (NumBits <= 32 || NumBits > 64)
13687 return SDValue();
13688
13689 if (LHS.getOpcode() != ISD::MUL) {
13690 assert(RHS.getOpcode() == ISD::MUL);
13691 std::swap(LHS, RHS);
13692 }
13693
13694 // Avoid the fold if it would unduly increase the number of multiplies due to
13695 // multiple uses, except on hardware with full-rate multiply-add (which is
13696 // part of full-rate 64-bit ops).
13697 if (!Subtarget->hasFullRate64Ops()) {
13698 unsigned NumUsers = 0;
13699 for (SDNode *Use : LHS->uses()) {
13700 // There is a use that does not feed into addition, so the multiply can't
13701 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
13702 if (Use->getOpcode() != ISD::ADD)
13703 return SDValue();
13704
13705 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
13706 // MUL + 3xADD + 3xADDC over 3xMAD.
13707 ++NumUsers;
13708 if (NumUsers >= 3)
13709 return SDValue();
13710 }
13711 }
13712
13713 SDValue MulLHS = LHS.getOperand(0);
13714 SDValue MulRHS = LHS.getOperand(1);
13715 SDValue AddRHS = RHS;
13716
13717 // Always check whether operands are small unsigned values, since that
13718 // knowledge is useful in more cases. Check for small signed values only if
13719 // doing so can unlock a shorter code sequence.
13720 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
13721 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
13722
13723 bool MulSignedLo = false;
13724 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
13725 MulSignedLo = numBitsSigned(MulLHS, DAG) <= 32 &&
13726 numBitsSigned(MulRHS, DAG) <= 32;
13727 }
13728
13729 // The operands and final result all have the same number of bits. If
13730 // operands need to be extended, they can be extended with garbage. The
13731 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
13732 // truncated away in the end.
13733 if (VT != MVT::i64) {
13734 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
13735 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
13736 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
13737 }
13738
13739 // The basic code generated is conceptually straightforward. Pseudo code:
13740 //
13741 // accum = mad_64_32 lhs.lo, rhs.lo, accum
13742 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
13743 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
13744 //
13745 // The second and third lines are optional, depending on whether the factors
13746 // are {sign,zero}-extended or not.
13747 //
13748 // The actual DAG is noisier than the pseudo code, but only due to
13749 // instructions that disassemble values into low and high parts, and
13750 // assemble the final result.
13751 SDValue One = DAG.getConstant(1, SL, MVT::i32);
13752
13753 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
13754 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
13755 SDValue Accum =
13756 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
13757
13758 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
13759 SDValue AccumLo, AccumHi;
13760 std::tie(AccumLo, AccumHi) = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
13761
13762 if (!MulLHSUnsigned32) {
13763 auto MulLHSHi =
13764 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
13765 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
13766 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13767 }
13768
13769 if (!MulRHSUnsigned32) {
13770 auto MulRHSHi =
13771 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
13772 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
13773 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13774 }
13775
13776 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
13777 Accum = DAG.getBitcast(MVT::i64, Accum);
13778 }
13779
13780 if (VT != MVT::i64)
13781 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
13782 return Accum;
13783}
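// Arithmetic behind the expansion above (illustrative), writing each 64-bit
// factor as hi * 2^32 + lo:
//   lhs * rhs + accum = lhs.lo * rhs.lo + accum
//                       + ((lhs.hi * rhs.lo + lhs.lo * rhs.hi) << 32)  (mod 2^64)
// The first term is a single mad_[iu]64_[iu]32; each remaining term is a
// 32-bit multiply added into accum.hi, and is skipped when the corresponding
// factor is known to fit in 32 bits.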
13784
13785// Collect the ultimate src of each of the mul node's operands, and confirm
13786// each operand is at most 8 bits wide.
13787static std::optional<ByteProvider<SDValue>>
13788handleMulOperand(const SDValue &MulOperand) {
13789 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
13790 if (!Byte0 || Byte0->isConstantZero()) {
13791 return std::nullopt;
13792 }
13793 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
13794 if (Byte1 && !Byte1->isConstantZero()) {
13795 return std::nullopt;
13796 }
13797 return Byte0;
13798}
13799
13800static unsigned addPermMasks(unsigned First, unsigned Second) {
13801 unsigned FirstCs = First & 0x0c0c0c0c;
13802 unsigned SecondCs = Second & 0x0c0c0c0c;
13803 unsigned FirstNoCs = First & ~0x0c0c0c0c;
13804 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
13805
13806 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
13807 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
13808 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
13809 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
13810
13811 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
13812}
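// Worked example for addPermMasks (editorial example): merging a mask that
// selects byte 1 into lane 0 with one that selects byte 5 into lane 1,
//   addPermMasks(0x0c0c0c01, 0x0c0c050c) == 0x0c0c0501,
// i.e. the real selector from either input survives in its lane, and a lane
// stays 0x0c (constant zero) only where both inputs had 0x0c.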
13813
13814struct DotSrc {
13815 SDValue SrcOp;
13816 int64_t PermMask;
13817 int64_t DWordOffset;
13818};
13819
13820static void placeSources(ByteProvider<SDValue> &Src0,
13821 ByteProvider<SDValue> &Src1,
13822 SmallVectorImpl<DotSrc> &Src0s,
13823 SmallVectorImpl<DotSrc> &Src1s, int Step) {
13824
13825 assert(Src0.Src.has_value() && Src1.Src.has_value());
13826 // Src0s and Src1s are empty, just place arbitrarily.
13827 if (Step == 0) {
13828 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
13829 Src0.SrcOffset / 4});
13830 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
13831 Src1.SrcOffset / 4});
13832 return;
13833 }
13834
13835 for (int BPI = 0; BPI < 2; BPI++) {
13836 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
13837 if (BPI == 1) {
13838 BPP = {Src1, Src0};
13839 }
13840 unsigned ZeroMask = 0x0c0c0c0c;
13841 unsigned FMask = 0xFF << (8 * (3 - Step));
13842
13843 unsigned FirstMask =
13844 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13845 unsigned SecondMask =
13846 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13847 // Attempt to find the Src vector which contains our SDValue; if found, add
13848 // our perm mask to the existing one. If we are unable to find a match for
13849 // the first SDValue, attempt to find a match for the second.
13850 int FirstGroup = -1;
13851 for (int I = 0; I < 2; I++) {
13852 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
13853 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
13854 return IterElt.SrcOp == *BPP.first.Src &&
13855 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
13856 };
13857
13858 auto Match = llvm::find_if(Srcs, MatchesFirst);
13859 if (Match != Srcs.end()) {
13860 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
13861 FirstGroup = I;
13862 break;
13863 }
13864 }
13865 if (FirstGroup != -1) {
13866 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
13867 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
13868 return IterElt.SrcOp == *BPP.second.Src &&
13869 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
13870 };
13871 auto Match = llvm::find_if(Srcs, MatchesSecond);
13872 if (Match != Srcs.end()) {
13873 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
13874 } else
13875 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
13876 return;
13877 }
13878 }
13879
13880 // If we have made it here, then we could not find a match in Src0s or Src1s
13881 // for either Src0 or Src1, so just place them arbitrarily.
13882
13883 unsigned ZeroMask = 0x0c0c0c0c;
13884 unsigned FMask = 0xFF << (8 * (3 - Step));
13885
13886 Src0s.push_back(
13887 {*Src0.Src,
13888 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13889 Src1.SrcOffset / 4});
13890 Src1s.push_back(
13891 {*Src1.Src,
13892 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13893 Src1.SrcOffset / 4});
13894
13895 return;
13896}
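// Byte layout used above (illustrative): FMask = 0xFF << (8 * (3 - Step)), so
// Step 0 writes its selector into byte 3 of the perm mask, Step 1 into byte 2,
// and so on, while every other byte is left as 0x0c (zero). After a full chain
// of four steps all four selector bytes are populated.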
13897
13898static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
13899 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
13900 bool IsAny) {
13901
13902 // If we just have one source, just permute it accordingly.
13903 if (Srcs.size() == 1) {
13904 auto Elt = Srcs.begin();
13905 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
13906
13907 // v_perm will produce the original value
13908 if (Elt->PermMask == 0x3020100)
13909 return EltOp;
13910
13911 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
13912 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
13913 }
13914
13915 auto FirstElt = Srcs.begin();
13916 auto SecondElt = std::next(FirstElt);
13917
13918 SmallVector<SDValue, 2> Perms;
13919
13920 // If we have multiple sources in the chain, combine them via perms (using
13921 // calculated perm mask) and Ors.
13922 while (true) {
13923 auto FirstMask = FirstElt->PermMask;
13924 auto SecondMask = SecondElt->PermMask;
13925
13926 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
13927 unsigned FirstPlusFour = FirstMask | 0x04040404;
13928 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
13929 // original 0x0C.
13930 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
13931
13932 auto PermMask = addPermMasks(FirstMask, SecondMask);
13933 auto FirstVal =
13934 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
13935 auto SecondVal =
13936 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
13937
13938 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
13939 SecondVal,
13940 DAG.getConstant(PermMask, SL, MVT::i32)));
13941
13942 FirstElt = std::next(SecondElt);
13943 if (FirstElt == Srcs.end())
13944 break;
13945
13946 SecondElt = std::next(FirstElt);
13947 // If we only have a FirstElt, then just combine that into the cumulative
13948 // source node.
13949 if (SecondElt == Srcs.end()) {
13950 auto EltOp =
13951 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
13952
13953 Perms.push_back(
13954 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
13955 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
13956 break;
13957 }
13958 }
13959
13960 assert(Perms.size() == 1 || Perms.size() == 2);
13961 return Perms.size() == 2
13962 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
13963 : Perms[0];
13964}
13965
13966static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
13967 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
13968 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
13969 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
13970 EntryMask += ZeroMask;
13971 }
13972}
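// Example for fixMasks (illustrative): with ChainLength == 2, an entry mask
// built as 0x01030c0c (selectors in bytes 3..2, zeros below) becomes
//   (0x01030c0c >> 16) + 0x0c0c0000 == 0x0c0c0103,
// moving the selectors into the low bytes and zeroing the unused high ones.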
13973
13974static bool isMul(const SDValue Op) {
13975 auto Opcode = Op.getOpcode();
13976
13977 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
13978 Opcode == AMDGPUISD::MUL_I24);
13979}
13980
13981static std::optional<bool>
13982checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
13983 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
13984 const SDValue &S1Op, const SelectionDAG &DAG) {
13985 // If both ops are i8s (pre legalize-dag), then the signedness semantics
13986 // of the dot4 are irrelevant.
13987 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
13988 return false;
13989
13990 auto Known0 = DAG.computeKnownBits(S0Op, 0);
13991 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
13992 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
13993 auto Known1 = DAG.computeKnownBits(S1Op, 0);
13994 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
13995 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
13996
13997 assert(!(S0IsUnsigned && S0IsSigned));
13998 assert(!(S1IsUnsigned && S1IsSigned));
13999
14000 // There are 9 possible permutations of
14001 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
14002
14003 // In two permutations, the sign bits are known to be the same for both Ops,
14004 // so simply return Signed / Unsigned corresponding to the MSB
14005
14006 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
14007 return S0IsSigned;
14008
14009 // In another two permutations, the sign bits are known to be opposite. In
14010 // this case return std::nullopt to indicate a bad match.
14011
14012 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14013 return std::nullopt;
14014
14015 // In the remaining five permutations, we don't know the value of the sign
14016 // bit for at least one Op. Since we have a valid ByteProvider, we know that
14017 // the upper bits must be extension bits. Thus, the only way for the sign
14018 // bit to be unknown is if it was sign extended from an unknown value, or if
14019 // it was any extended. In either case, it is correct to use the signed
14020 // version of the signedness semantics of dot4
14021
14022 // In two of these permutations, we know the sign bit is set for
14023 // one op and the other is unknown. It is okay to use the signed version of
14024 // dot4.
14025 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14026 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14027 return true;
14028
14029 // In one such permutation, we don't know either of the sign bits. It is okay
14030 // to use the signed version of dot4.
14031 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14032 return true;
14033
14034 // In two of these permutations, we know the sign bit is unset for
14035 // one op, and the other is unknown. Return std::nullopt to indicate a
14036 // bad match.
14037 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14038 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14039 return std::nullopt;
14040
14041 llvm_unreachable("Fully covered condition");
14042}
14043
14044SDValue SITargetLowering::performAddCombine(SDNode *N,
14045 DAGCombinerInfo &DCI) const {
14046 SelectionDAG &DAG = DCI.DAG;
14047 EVT VT = N->getValueType(0);
14048 SDLoc SL(N);
14049 SDValue LHS = N->getOperand(0);
14050 SDValue RHS = N->getOperand(1);
14051
14052 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
14053 if (Subtarget->hasMad64_32()) {
14054 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14055 return Folded;
14056 }
14057 }
14058
14059 if (SDValue V = reassociateScalarOps(N, DAG)) {
14060 return V;
14061 }
14062
14063 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
14064 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
14065 SDValue TempNode(N, 0);
14066 std::optional<bool> IsSigned;
14067 SmallVector<DotSrc, 4> Src0s;
14068 SmallVector<DotSrc, 4> Src1s;
14069 SmallVector<SDValue, 4> Src2s;
14070
14071 // Match the v_dot4 tree, while collecting src nodes.
14072 int ChainLength = 0;
14073 for (int I = 0; I < 4; I++) {
14074 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
14075 if (MulIdx == -1)
14076 break;
14077 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14078 if (!Src0)
14079 break;
14080 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14081 if (!Src1)
14082 break;
14083
14084 auto IterIsSigned = checkDot4MulSignedness(
14085 TempNode->getOperand(MulIdx), *Src0, *Src1,
14086 TempNode->getOperand(MulIdx)->getOperand(0),
14087 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14088 if (!IterIsSigned)
14089 break;
14090 if (!IsSigned)
14091 IsSigned = *IterIsSigned;
14092 if (*IterIsSigned != *IsSigned)
14093 break;
14094 placeSources(*Src0, *Src1, Src0s, Src1s, I);
14095 auto AddIdx = 1 - MulIdx;
14096 // Allow the special case where add (add (mul24, 0), mul24) became ->
14097 // add (mul24, mul24).
14098 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14099 Src2s.push_back(TempNode->getOperand(AddIdx));
14100 auto Src0 =
14101 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
14102 if (!Src0)
14103 break;
14104 auto Src1 =
14105 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
14106 if (!Src1)
14107 break;
14108 auto IterIsSigned = checkDot4MulSignedness(
14109 TempNode->getOperand(AddIdx), *Src0, *Src1,
14110 TempNode->getOperand(AddIdx)->getOperand(0),
14111 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14112 if (!IterIsSigned)
14113 break;
14114 assert(IsSigned);
14115 if (*IterIsSigned != *IsSigned)
14116 break;
14117 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
14118 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
14119 ChainLength = I + 2;
14120 break;
14121 }
14122
14123 TempNode = TempNode->getOperand(AddIdx);
14124 Src2s.push_back(TempNode);
14125 ChainLength = I + 1;
14126 if (TempNode->getNumOperands() < 2)
14127 break;
14128 LHS = TempNode->getOperand(0);
14129 RHS = TempNode->getOperand(1);
14130 }
14131
14132 if (ChainLength < 2)
14133 return SDValue();
14134
14135 // Masks were constructed with the assumption that we would find a chain of
14136 // length 4. If not, then we need to zero out the unused high bytes (via the
14137 // 0x0c perm selector) so they do not affect the dot calculation.
14138 if (ChainLength < 4) {
14139 fixMasks(Src0s, ChainLength);
14140 fixMasks(Src1s, ChainLength);
14141 }
14142
14143 SDValue Src0, Src1;
14144
14145 // If we are just using a single source for both, and have permuted the
14146 // bytes consistently, we can just use the sources without permuting
14147 // (commutation).
14148 bool UseOriginalSrc = false;
14149 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
14150 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14151 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14152 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
14153 SmallVector<unsigned, 4> SrcBytes;
14154 auto Src0Mask = Src0s.begin()->PermMask;
14155 SrcBytes.push_back(Src0Mask & 0xFF000000);
14156 bool UniqueEntries = true;
14157 for (auto I = 1; I < 4; I++) {
14158 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14159
14160 if (is_contained(SrcBytes, NextByte)) {
14161 UniqueEntries = false;
14162 break;
14163 }
14164 SrcBytes.push_back(NextByte);
14165 }
14166
14167 if (UniqueEntries) {
14168 UseOriginalSrc = true;
14169
14170 auto FirstElt = Src0s.begin();
14171 auto FirstEltOp =
14172 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14173
14174 auto SecondElt = Src1s.begin();
14175 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
14176 SecondElt->DWordOffset);
14177
14178 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
14179 MVT::getIntegerVT(32));
14180 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
14181 MVT::getIntegerVT(32));
14182 }
14183 }
14184
14185 if (!UseOriginalSrc) {
14186 Src0 = resolveSources(DAG, SL, Src0s, false, true);
14187 Src1 = resolveSources(DAG, SL, Src1s, false, true);
14188 }
14189
14190 assert(IsSigned);
14191 SDValue Src2 =
14192 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14193
14194 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
14195 : Intrinsic::amdgcn_udot4,
14196 SL, MVT::i64);
14197
14198 assert(!VT.isVector());
14199 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
14200 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
14201
14202 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
14203 }
14204
14205 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14206 return SDValue();
14207
14208 // add x, zext (setcc) => uaddo_carry x, 0, setcc
14209 // add x, sext (setcc) => usubo_carry x, 0, setcc
14210 unsigned Opc = LHS.getOpcode();
14211 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
14212 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
14213 std::swap(RHS, LHS);
14214
14215 Opc = RHS.getOpcode();
14216 switch (Opc) {
14217 default: break;
14218 case ISD::ZERO_EXTEND:
14219 case ISD::SIGN_EXTEND:
14220 case ISD::ANY_EXTEND: {
14221 auto Cond = RHS.getOperand(0);
14222 // If this won't be a real VOPC output, we would still need to insert an
14223 // extra instruction anyway.
14224 if (!isBoolSGPR(Cond))
14225 break;
14226 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14227 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
14228 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
14229 return DAG.getNode(Opc, SL, VTList, Args);
14230 }
14231 case ISD::UADDO_CARRY: {
14232 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
14233 if (!isNullConstant(RHS.getOperand(1)))
14234 break;
14235 SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
14236 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
14237 }
14238 }
14239 return SDValue();
14240}
14241
14242SDValue SITargetLowering::performSubCombine(SDNode *N,
14243 DAGCombinerInfo &DCI) const {
14244 SelectionDAG &DAG = DCI.DAG;
14245 EVT VT = N->getValueType(0);
14246
14247 if (VT != MVT::i32)
14248 return SDValue();
14249
14250 SDLoc SL(N);
14251 SDValue LHS = N->getOperand(0);
14252 SDValue RHS = N->getOperand(1);
14253
14254 // sub x, zext (setcc) => usubo_carry x, 0, setcc
14255 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
14256 unsigned Opc = RHS.getOpcode();
14257 switch (Opc) {
14258 default: break;
14259 case ISD::ZERO_EXTEND:
14260 case ISD::SIGN_EXTEND:
14261 case ISD::ANY_EXTEND: {
14262 auto Cond = RHS.getOperand(0);
14263 // If this won't be a real VOPC output, we would still need to insert an
14264 // extra instruction anyway.
14265 if (!isBoolSGPR(Cond))
14266 break;
14267 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14268 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
14269 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
14270 return DAG.getNode(Opc, SL, VTList, Args);
14271 }
14272 }
14273
14274 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
14275 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
14276 if (!isNullConstant(LHS.getOperand(1)))
14277 return SDValue();
14278 SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
14279 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
14280 }
14281 return SDValue();
14282}
14283
14284SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
14285 DAGCombinerInfo &DCI) const {
14286
14287 if (N->getValueType(0) != MVT::i32)
14288 return SDValue();
14289
14290 if (!isNullConstant(N->getOperand(1)))
14291 return SDValue();
14292
14293 SelectionDAG &DAG = DCI.DAG;
14294 SDValue LHS = N->getOperand(0);
14295
14296 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
14297 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
14298 unsigned LHSOpc = LHS.getOpcode();
14299 unsigned Opc = N->getOpcode();
14300 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
14301 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
14302 SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
14303 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
14304 }
14305 return SDValue();
14306}
14307
14308SDValue SITargetLowering::performFAddCombine(SDNode *N,
14309 DAGCombinerInfo &DCI) const {
14310 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14311 return SDValue();
14312
14313 SelectionDAG &DAG = DCI.DAG;
14314 EVT VT = N->getValueType(0);
14315
14316 SDLoc SL(N);
14317 SDValue LHS = N->getOperand(0);
14318 SDValue RHS = N->getOperand(1);
14319
14320 // These should really be instruction patterns, but writing patterns with
14321 // source modifiers is a pain.
14322
14323 // fadd (fadd (a, a), b) -> mad 2.0, a, b
14324 if (LHS.getOpcode() == ISD::FADD) {
14325 SDValue A = LHS.getOperand(0);
14326 if (A == LHS.getOperand(1)) {
14327 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14328 if (FusedOp != 0) {
14329 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14330 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
14331 }
14332 }
14333 }
14334
14335 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
14336 if (RHS.getOpcode() == ISD::FADD) {
14337 SDValue A = RHS.getOperand(0);
14338 if (A == RHS.getOperand(1)) {
14339 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14340 if (FusedOp != 0) {
14341 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14342 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
14343 }
14344 }
14345 }
14346
14347 return SDValue();
14348}
14349
14350SDValue SITargetLowering::performFSubCombine(SDNode *N,
14351 DAGCombinerInfo &DCI) const {
14352 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14353 return SDValue();
14354
14355 SelectionDAG &DAG = DCI.DAG;
14356 SDLoc SL(N);
14357 EVT VT = N->getValueType(0);
14358 assert(!VT.isVector());
14359
14360 // Try to get the fneg to fold into the source modifier. This undoes generic
14361 // DAG combines and folds them into the mad.
14362 //
14363 // Only do this if we are not trying to support denormals. v_mad_f32 does
14364 // not support denormals ever.
14365 SDValue LHS = N->getOperand(0);
14366 SDValue RHS = N->getOperand(1);
14367 if (LHS.getOpcode() == ISD::FADD) {
14368 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
14369 SDValue A = LHS.getOperand(0);
14370 if (A == LHS.getOperand(1)) {
14371 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14372 if (FusedOp != 0){
14373 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14374 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
14375
14376 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
14377 }
14378 }
14379 }
14380
14381 if (RHS.getOpcode() == ISD::FADD) {
14382 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
14383
14384 SDValue A = RHS.getOperand(0);
14385 if (A == RHS.getOperand(1)) {
14386 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14387 if (FusedOp != 0){
14388 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
14389 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
14390 }
14391 }
14392 }
14393
14394 return SDValue();
14395}
14396
14397SDValue SITargetLowering::performFDivCombine(SDNode *N,
14398 DAGCombinerInfo &DCI) const {
14399 SelectionDAG &DAG = DCI.DAG;
14400 SDLoc SL(N);
14401 EVT VT = N->getValueType(0);
14402 if (VT != MVT::f16 || !Subtarget->has16BitInsts())
14403 return SDValue();
14404
14405 SDValue LHS = N->getOperand(0);
14406 SDValue RHS = N->getOperand(1);
14407
14408 SDNodeFlags Flags = N->getFlags();
14409 SDNodeFlags RHSFlags = RHS->getFlags();
14410 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
14411 !RHS->hasOneUse())
14412 return SDValue();
14413
14414 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
14415 bool IsNegative = false;
14416 if (CLHS->isExactlyValue(1.0) ||
14417 (IsNegative = CLHS->isExactlyValue(-1.0))) {
14418 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
14419 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
14420 if (RHS.getOpcode() == ISD::FSQRT) {
14421 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
14422 SDValue Rsq =
14423 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
14424 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
14425 }
14426 }
14427 }
14428
14429 return SDValue();
14430}
14431
14432SDValue SITargetLowering::performFMACombine(SDNode *N,
14433 DAGCombinerInfo &DCI) const {
14434 SelectionDAG &DAG = DCI.DAG;
14435 EVT VT = N->getValueType(0);
14436 SDLoc SL(N);
14437
14438 if (!Subtarget->hasDot7Insts() || VT != MVT::f32)
14439 return SDValue();
14440
14441 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
14442 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
14443 SDValue Op1 = N->getOperand(0);
14444 SDValue Op2 = N->getOperand(1);
14445 SDValue FMA = N->getOperand(2);
14446
14447 if (FMA.getOpcode() != ISD::FMA ||
14448 Op1.getOpcode() != ISD::FP_EXTEND ||
14449 Op2.getOpcode() != ISD::FP_EXTEND)
14450 return SDValue();
14451
14452 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
14453 // regardless of the denorm mode setting. Therefore,
14454 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
14455 const TargetOptions &Options = DAG.getTarget().Options;
14456 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
14457 (N->getFlags().hasAllowContract() &&
14458 FMA->getFlags().hasAllowContract())) {
14459 Op1 = Op1.getOperand(0);
14460 Op2 = Op2.getOperand(0);
14461 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14462 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14463 return SDValue();
14464
14465 SDValue Vec1 = Op1.getOperand(0);
14466 SDValue Idx1 = Op1.getOperand(1);
14467 SDValue Vec2 = Op2.getOperand(0);
14468
14469 SDValue FMAOp1 = FMA.getOperand(0);
14470 SDValue FMAOp2 = FMA.getOperand(1);
14471 SDValue FMAAcc = FMA.getOperand(2);
14472
14473 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
14474 FMAOp2.getOpcode() != ISD::FP_EXTEND)
14475 return SDValue();
14476
14477 FMAOp1 = FMAOp1.getOperand(0);
14478 FMAOp2 = FMAOp2.getOperand(0);
14479 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14480 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14481 return SDValue();
14482
14483 SDValue Vec3 = FMAOp1.getOperand(0);
14484 SDValue Vec4 = FMAOp2.getOperand(0);
14485 SDValue Idx2 = FMAOp1.getOperand(1);
14486
14487 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
14488 // Idx1 and Idx2 cannot be the same.
14489 Idx1 == Idx2)
14490 return SDValue();
14491
14492 if (Vec1 == Vec2 || Vec3 == Vec4)
14493 return SDValue();
14494
14495 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
14496 return SDValue();
14497
14498 if ((Vec1 == Vec3 && Vec2 == Vec4) ||
14499 (Vec1 == Vec4 && Vec2 == Vec3)) {
14500 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
14501 DAG.getTargetConstant(0, SL, MVT::i1));
14502 }
14503 }
14504 return SDValue();
14505}
14506
14507SDValue SITargetLowering::performSetCCCombine(SDNode *N,
14508 DAGCombinerInfo &DCI) const {
14509 SelectionDAG &DAG = DCI.DAG;
14510 SDLoc SL(N);
14511
14512 SDValue LHS = N->getOperand(0);
14513 SDValue RHS = N->getOperand(1);
14514 EVT VT = LHS.getValueType();
14515 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
14516
14517 auto CRHS = dyn_cast<ConstantSDNode>(RHS);
14518 if (!CRHS) {
14519 CRHS = dyn_cast<ConstantSDNode>(LHS);
14520 if (CRHS) {
14521 std::swap(LHS, RHS);
14522 CC = getSetCCSwappedOperands(CC);
14523 }
14524 }
14525
14526 if (CRHS) {
14527 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
14528 isBoolSGPR(LHS.getOperand(0))) {
14529 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
14530 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
14531 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
14532 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
14533 if ((CRHS->isAllOnes() &&
14534 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
14535 (CRHS->isZero() &&
14536 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
14537 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14538 DAG.getConstant(-1, SL, MVT::i1));
14539 if ((CRHS->isAllOnes() &&
14540 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
14541 (CRHS->isZero() &&
14542 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
14543 return LHS.getOperand(0);
14544 }
14545
14546 const APInt &CRHSVal = CRHS->getAPIntValue();
14547 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
14548 LHS.getOpcode() == ISD::SELECT &&
14549 isa<ConstantSDNode>(LHS.getOperand(1)) &&
14550 isa<ConstantSDNode>(LHS.getOperand(2)) &&
14551 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
14552 isBoolSGPR(LHS.getOperand(0))) {
14553 // Given CT != FT:
14554 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
14555 // setcc (select cc, CT, CF), CF, ne => cc
14556 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
14557 // setcc (select cc, CT, CF), CT, eq => cc
14558 const APInt &CT = LHS.getConstantOperandAPInt(1);
14559 const APInt &CF = LHS.getConstantOperandAPInt(2);
14560
14561 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
14562 (CT == CRHSVal && CC == ISD::SETNE))
14563 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14564 DAG.getConstant(-1, SL, MVT::i1));
14565 if ((CF == CRHSVal && CC == ISD::SETNE) ||
14566 (CT == CRHSVal && CC == ISD::SETEQ))
14567 return LHS.getOperand(0);
14568 }
14569 }
14570
14571 if (VT != MVT::f32 && VT != MVT::f64 &&
14572 (!Subtarget->has16BitInsts() || VT != MVT::f16))
14573 return SDValue();
14574
14575 // Match isinf/isfinite pattern
14576 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
14577 // (fcmp one (fabs x), inf) -> (fp_class x,
14578 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
14579 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
14580 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
14581 if (!CRHS)
14582 return SDValue();
14583
14584 const APFloat &APF = CRHS->getValueAPF();
14585 if (APF.isInfinity() && !APF.isNegative()) {
14586 const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
14587 SIInstrFlags::N_INFINITY;
14588 const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
14589 SIInstrFlags::P_ZERO |
14590 SIInstrFlags::N_NORMAL |
14591 SIInstrFlags::P_NORMAL |
14592 SIInstrFlags::N_SUBNORMAL |
14593 SIInstrFlags::P_SUBNORMAL;
14594 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
14595 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
14596 DAG.getConstant(Mask, SL, MVT::i32));
14597 }
14598 }
14599
14600 return SDValue();
14601}
14602
14603SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
14604 DAGCombinerInfo &DCI) const {
14605 SelectionDAG &DAG = DCI.DAG;
14606 SDLoc SL(N);
14607 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
14608
14609 SDValue Src = N->getOperand(0);
14610 SDValue Shift = N->getOperand(0);
14611
14612 // TODO: Extend type shouldn't matter (assuming legal types).
14613 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
14614 Shift = Shift.getOperand(0);
14615
14616 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
14617 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
14618 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
14619 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
14620 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
14621 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
14622 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
14623 SDValue Shifted = DAG.getZExtOrTrunc(Shift.getOperand(0),
14624 SDLoc(Shift.getOperand(0)), MVT::i32);
14625
14626 unsigned ShiftOffset = 8 * Offset;
14627 if (Shift.getOpcode() == ISD::SHL)
14628 ShiftOffset -= C->getZExtValue();
14629 else
14630 ShiftOffset += C->getZExtValue();
14631
14632 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
14633 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
14634 MVT::f32, Shifted);
14635 }
14636 }
14637 }
14638
14639 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14640 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
14641 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
14642 // We simplified Src. If this node is not dead, visit it again so it is
14643 // folded properly.
14644 if (N->getOpcode() != ISD::DELETED_NODE)
14645 DCI.AddToWorklist(N);
14646 return SDValue(N, 0);
14647 }
14648
14649 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
14650 if (SDValue DemandedSrc =
14651 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
14652 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
14653
14654 return SDValue();
14655}
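// Worked example of the shift folding above (illustrative): for
// cvt_f32_ubyte1 (srl x, 8) the byte offset is 8 * 1 + 8 = 16 bits, which is
// byte 2 of x, so the node is rewritten to cvt_f32_ubyte2 x and the shift
// disappears.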
14656
14657SDValue SITargetLowering::performClampCombine(SDNode *N,
14658 DAGCombinerInfo &DCI) const {
14659 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
14660 if (!CSrc)
14661 return SDValue();
14662
14663 const MachineFunction &MF = DCI.DAG.getMachineFunction();
14664 const APFloat &F = CSrc->getValueAPF();
14665 APFloat Zero = APFloat::getZero(F.getSemantics());
14666 if (F < Zero ||
14667 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
14668 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
14669 }
14670
14671 APFloat One(F.getSemantics(), "1.0");
14672 if (F > One)
14673 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
14674
14675 return SDValue(CSrc, 0);
14676}
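// Constant-folding behaviour of the clamp combine above (illustrative):
//   clamp(-0.5) -> 0.0, clamp(2.0) -> 1.0, clamp(0.25) -> 0.25,
// and clamp(NaN) -> 0.0 only when DX10Clamp is enabled; otherwise the NaN
// source constant is returned unchanged.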
14677
14678
14679SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
14680 DAGCombinerInfo &DCI) const {
14681 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
14682 return SDValue();
14683 switch (N->getOpcode()) {
14684 case ISD::ADD:
14685 return performAddCombine(N, DCI);
14686 case ISD::SUB:
14687 return performSubCombine(N, DCI);
14688 case ISD::UADDO_CARRY:
14689 case ISD::USUBO_CARRY:
14690 return performAddCarrySubCarryCombine(N, DCI);
14691 case ISD::FADD:
14692 return performFAddCombine(N, DCI);
14693 case ISD::FSUB:
14694 return performFSubCombine(N, DCI);
14695 case ISD::FDIV:
14696 return performFDivCombine(N, DCI);
14697 case ISD::SETCC:
14698 return performSetCCCombine(N, DCI);
14699 case ISD::FMAXNUM:
14700 case ISD::FMINNUM:
14701 case ISD::FMAXNUM_IEEE:
14702 case ISD::FMINNUM_IEEE:
14703 case ISD::FMAXIMUM:
14704 case ISD::FMINIMUM:
14705 case ISD::SMAX:
14706 case ISD::SMIN:
14707 case ISD::UMAX:
14708 case ISD::UMIN:
14709 case AMDGPUISD::FMIN_LEGACY:
14710 case AMDGPUISD::FMAX_LEGACY:
14711 return performMinMaxCombine(N, DCI);
14712 case ISD::FMA:
14713 return performFMACombine(N, DCI);
14714 case ISD::AND:
14715 return performAndCombine(N, DCI);
14716 case ISD::OR:
14717 return performOrCombine(N, DCI);
14718 case ISD::FSHR: {
14719 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14720 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
14721 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14722 return matchPERM(N, DCI);
14723 }
14724 break;
14725 }
14726 case ISD::XOR:
14727 return performXorCombine(N, DCI);
14728 case ISD::ZERO_EXTEND:
14729 return performZeroExtendCombine(N, DCI);
14730 case ISD::SIGN_EXTEND_INREG:
14731 return performSignExtendInRegCombine(N, DCI);
14732 case AMDGPUISD::FP_CLASS:
14733 return performClassCombine(N, DCI);
14734 case ISD::FCANONICALIZE:
14735 return performFCanonicalizeCombine(N, DCI);
14736 case AMDGPUISD::RCP:
14737 return performRcpCombine(N, DCI);
14738 case ISD::FLDEXP:
14739 case AMDGPUISD::FRACT:
14740 case AMDGPUISD::RSQ:
14741 case AMDGPUISD::RCP_LEGACY:
14742 case AMDGPUISD::RCP_IFLAG:
14743 case AMDGPUISD::RSQ_CLAMP: {
14744 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
14745 SDValue Src = N->getOperand(0);
14746 if (Src.isUndef())
14747 return Src;
14748 break;
14749 }
14750 case ISD::SINT_TO_FP:
14751 case ISD::UINT_TO_FP:
14752 return performUCharToFloatCombine(N, DCI);
14753 case ISD::FCOPYSIGN:
14754 return performFCopySignCombine(N, DCI);
14755 case AMDGPUISD::CVT_F32_UBYTE0:
14756 case AMDGPUISD::CVT_F32_UBYTE1:
14757 case AMDGPUISD::CVT_F32_UBYTE2:
14758 case AMDGPUISD::CVT_F32_UBYTE3:
14759 return performCvtF32UByteNCombine(N, DCI);
14760 case AMDGPUISD::FMED3:
14761 return performFMed3Combine(N, DCI);
14762 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14763 return performCvtPkRTZCombine(N, DCI);
14764 case AMDGPUISD::CLAMP:
14765 return performClampCombine(N, DCI);
14766 case ISD::SCALAR_TO_VECTOR: {
14767 SelectionDAG &DAG = DCI.DAG;
14768 EVT VT = N->getValueType(0);
14769
14770 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
14771 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
14772 SDLoc SL(N);
14773 SDValue Src = N->getOperand(0);
14774 EVT EltVT = Src.getValueType();
14775 if (EltVT != MVT::i16)
14776 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
14777
14778 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
14779 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
14780 }
14781
14782 break;
14783 }
14784 case ISD::EXTRACT_VECTOR_ELT:
14785 return performExtractVectorEltCombine(N, DCI);
14786 case ISD::INSERT_VECTOR_ELT:
14787 return performInsertVectorEltCombine(N, DCI);
14788 case ISD::FP_ROUND:
14789 return performFPRoundCombine(N, DCI);
14790 case ISD::LOAD: {
14791 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
14792 return Widened;
14793 [[fallthrough]];
14794 }
14795 default: {
14796 if (!DCI.isBeforeLegalize()) {
14797 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
14798 return performMemSDNodeCombine(MemNode, DCI);
14799 }
14800
14801 break;
14802 }
14803 }
14804
14805 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
14806}
14807
14808/// Helper function for adjustWritemask
14809static unsigned SubIdx2Lane(unsigned Idx) {
14810 switch (Idx) {
14811 default: return ~0u;
14812 case AMDGPU::sub0: return 0;
14813 case AMDGPU::sub1: return 1;
14814 case AMDGPU::sub2: return 2;
14815 case AMDGPU::sub3: return 3;
14816 case AMDGPU::sub4: return 4; // Possible with TFE/LWE
14817 }
14818}
14819
14820/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
14821SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
14822 SelectionDAG &DAG) const {
14823 unsigned Opcode = Node->getMachineOpcode();
14824
14825 // Subtract 1 because the vdata output is not a MachineSDNode operand.
14826 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
14827 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
14828 return Node; // not implemented for D16
14829
14830 SDNode *Users[5] = { nullptr };
14831 unsigned Lane = 0;
14832 unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
14833 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
14834 unsigned NewDmask = 0;
14835 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
14836 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
14837 bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
14838 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
14839 ? true
14840 : false;
14841 unsigned TFCLane = 0;
14842 bool HasChain = Node->getNumValues() > 1;
14843
14844 if (OldDmask == 0) {
14845 // These are folded out, but on the chance it happens don't assert.
14846 return Node;
14847 }
14848
14849 unsigned OldBitsSet = llvm::popcount(OldDmask);
14850 // Work out which is the TFE/LWE lane if that is enabled.
14851 if (UsesTFC) {
14852 TFCLane = OldBitsSet;
14853 }
14854
14855 // Try to figure out the used register components
14856 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
14857 I != E; ++I) {
14858
14859 // Don't look at users of the chain.
14860 if (I.getUse().getResNo() != 0)
14861 continue;
14862
14863 // Abort if we can't understand the usage
14864 if (!I->isMachineOpcode() ||
14865 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
14866 return Node;
14867
14868 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
14869 // Note that subregs are packed, i.e. Lane==0 is the first bit set
14870 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
14871 // set, etc.
14872 Lane = SubIdx2Lane(I->getConstantOperandVal(1));
14873 if (Lane == ~0u)
14874 return Node;
14875
14876 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
14877 if (UsesTFC && Lane == TFCLane) {
14878 Users[Lane] = *I;
14879 } else {
14880 // Set which texture component corresponds to the lane.
14881 unsigned Comp;
14882 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
14883 Comp = llvm::countr_zero(Dmask);
14884 Dmask &= ~(1 << Comp);
14885 }
14886
14887 // Abort if we have more than one user per component.
14888 if (Users[Lane])
14889 return Node;
14890
14891 Users[Lane] = *I;
14892 NewDmask |= 1 << Comp;
14893 }
14894 }
14895
14896 // Don't allow 0 dmask, as hardware assumes one channel enabled.
14897 bool NoChannels = !NewDmask;
14898 if (NoChannels) {
14899 if (!UsesTFC) {
14900 // No uses of the result and not using TFC. Then do nothing.
14901 return Node;
14902 }
14903 // If the original dmask has one channel - then nothing to do
14904 if (OldBitsSet == 1)
14905 return Node;
14906 // Use an arbitrary dmask - required for the instruction to work
14907 NewDmask = 1;
14908 }
14909 // Abort if there's no change
14910 if (NewDmask == OldDmask)
14911 return Node;
14912
14913 unsigned BitsSet = llvm::popcount(NewDmask);
14914
14915 // Check for TFE or LWE - increase the number of channels by one to account
14916 // for the extra return value
14917 // This will need adjustment for D16 if this is also included in
14918 // adjustWriteMask (this function) but at present D16 are excluded.
14919 unsigned NewChannels = BitsSet + UsesTFC;
14920
14921 int NewOpcode =
14922 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
14923 assert(NewOpcode != -1 &&
14924 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
14925 "failed to find equivalent MIMG op");
14926
14927 // Adjust the writemask in the node
14928 SmallVector<SDValue, 12> Ops;
14929 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
14930 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
14931 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
14932
14933 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
14934
14935 MVT ResultVT = NewChannels == 1 ?
14936 SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
14937 NewChannels == 5 ? 8 : NewChannels);
14938 SDVTList NewVTList = HasChain ?
14939 DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
14940
14941
14942 MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
14943 NewVTList, Ops);
14944
14945 if (HasChain) {
14946 // Update chain.
14947 DAG.setNodeMemRefs(NewNode, Node->memoperands());
14948 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
14949 }
14950
14951 if (NewChannels == 1) {
14952 assert(Node->hasNUsesOfValue(1, 0));
14953 SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
14954 SDLoc(Node), Users[Lane]->getValueType(0),
14955 SDValue(NewNode, 0));
14956 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
14957 return nullptr;
14958 }
14959
14960 // Update the users of the node with the new indices
14961 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
14962 SDNode *User = Users[i];
14963 if (!User) {
14964 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
14965 // Users[0] is still nullptr because channel 0 doesn't really have a use.
14966 if (i || !NoChannels)
14967 continue;
14968 } else {
14969 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
14970 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
14971 if (NewUser != User) {
14972 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
14973 DAG.RemoveDeadNode(User);
14974 }
14975 }
14976
14977 switch (Idx) {
14978 default: break;
14979 case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
14980 case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
14981 case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
14982 case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
14983 }
14984 }
14985
14986 DAG.RemoveDeadNode(Node);
14987 return nullptr;
14988}
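// Example of the writemask shrinking above (editorial example): an image
// sample selected with dmask = 0xf whose users only extract sub0 and sub2
// gets NewDmask = 0x5 and NewChannels = 2 (no TFE/LWE); the instruction is
// remapped to its 2-channel variant and the two users are rewritten to sub0
// and sub1 of the narrower result.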
14989
14990static bool isFrameIndexOp(SDValue Op) {
14991 if (Op.getOpcode() == ISD::AssertZext)
14992 Op = Op.getOperand(0);
14993
14994 return isa<FrameIndexSDNode>(Op);
14995}
14996
14997/// Legalize target independent instructions (e.g. INSERT_SUBREG)
14998/// with frame index operands.
14999/// LLVM assumes that inputs to these instructions are registers.
15000SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
15001 SelectionDAG &DAG) const {
15002 if (Node->getOpcode() == ISD::CopyToReg) {
15003 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
15004 SDValue SrcVal = Node->getOperand(2);
15005
15006 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
15007 // to try understanding copies to physical registers.
15008 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
15009 SDLoc SL(Node);
15010 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
15011 SDValue VReg = DAG.getRegister(
15012 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
15013
15014 SDNode *Glued = Node->getGluedNode();
15015 SDValue ToVReg
15016 = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
15017 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
15018 SDValue ToResultReg
15019 = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
15020 VReg, ToVReg.getValue(1));
15021 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
15022 DAG.RemoveDeadNode(Node);
15023 return ToResultReg.getNode();
15024 }
15025 }
15026
15027 SmallVector<SDValue, 8> Ops;
15028 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
15029 if (!isFrameIndexOp(Node->getOperand(i))) {
15030 Ops.push_back(Node->getOperand(i));
15031 continue;
15032 }
15033
15034 SDLoc DL(Node);
15035 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
15036 Node->getOperand(i).getValueType(),
15037 Node->getOperand(i)), 0));
15038 }
15039
15040 return DAG.UpdateNodeOperands(Node, Ops);
15041}
15042
15043/// Fold the instructions after selecting them.
15044/// Returns null if users were already updated.
15046 SelectionDAG &DAG) const {
15048 unsigned Opcode = Node->getMachineOpcode();
15049
15050 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
15051 !TII->isGather4(Opcode) &&
15052 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
15053 return adjustWritemask(Node, DAG);
15054 }
15055
15056 if (Opcode == AMDGPU::INSERT_SUBREG ||
15057 Opcode == AMDGPU::REG_SEQUENCE) {
15059 return Node;
15060 }
15061
15062 switch (Opcode) {
15063 case AMDGPU::V_DIV_SCALE_F32_e64:
15064 case AMDGPU::V_DIV_SCALE_F64_e64: {
15065 // Satisfy the operand register constraint when one of the inputs is
15066 // undefined. Ordinarily each undef value will have its own implicit_def of
15067 // a vreg, so force these to use a single register.
15068 SDValue Src0 = Node->getOperand(1);
15069 SDValue Src1 = Node->getOperand(3);
15070 SDValue Src2 = Node->getOperand(5);
15071
15072 if ((Src0.isMachineOpcode() &&
15073 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
15074 (Src0 == Src1 || Src0 == Src2))
15075 break;
15076
15077 MVT VT = Src0.getValueType().getSimpleVT();
15078 const TargetRegisterClass *RC =
15079 getRegClassFor(VT, Src0.getNode()->isDivergent());
15080
15081 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
15082 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
15083
15084 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
15085 UndefReg, Src0, SDValue());
15086
15087 // src0 must be the same register as src1 or src2, even if the value is
15088 // undefined, so make sure we don't violate this constraint.
15089 if (Src0.isMachineOpcode() &&
15090 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
15091 if (Src1.isMachineOpcode() &&
15092 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15093 Src0 = Src1;
15094 else if (Src2.isMachineOpcode() &&
15095 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15096 Src0 = Src2;
15097 else {
15098 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
15099 Src0 = UndefReg;
15100 Src1 = UndefReg;
15101 }
15102 } else
15103 break;
15104
15105 SmallVector<SDValue, 9> Ops(Node->op_begin(), Node->op_end());
15106 Ops[1] = Src0;
15107 Ops[3] = Src1;
15108 Ops[5] = Src2;
15109 Ops.push_back(ImpDef.getValue(1));
15110 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
15111 }
15112 default:
15113 break;
15114 }
15115
15116 return Node;
15117}
15118
15119// Any MIMG instructions that use tfe or lwe require an initialization of the
15120// result register that will be written in the case of a memory access failure.
15121// The required code is also added to tie this init code to the result of the
15122// img instruction.
15125 const SIRegisterInfo &TRI = TII->getRegisterInfo();
15126 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
15127 MachineBasicBlock &MBB = *MI.getParent();
15128
15129 int DstIdx =
15130 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
15131 unsigned InitIdx = 0;
15132
15133 if (TII->isImage(MI)) {
15134 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
15135 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
15136 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
15137
15138 if (!TFE && !LWE) // intersect_ray
15139 return;
15140
15141 unsigned TFEVal = TFE ? TFE->getImm() : 0;
15142 unsigned LWEVal = LWE ? LWE->getImm() : 0;
15143 unsigned D16Val = D16 ? D16->getImm() : 0;
15144
15145 if (!TFEVal && !LWEVal)
15146 return;
15147
15148 // At least one of TFE or LWE is non-zero
15149 // We have to insert a suitable initialization of the result value and
15150 // tie this to the dest of the image instruction.
15151
15152 // Calculate which dword we have to initialize to 0.
15153 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
15154
15155 // check that dmask operand is found.
15156 assert(MO_Dmask && "Expected dmask operand in instruction");
15157
15158 unsigned dmask = MO_Dmask->getImm();
15159 // Determine the number of active lanes taking into account the
15160 // Gather4 special case
15161 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
15162
15163 bool Packed = !Subtarget->hasUnpackedD16VMem();
15164
15165 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
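 // For example (hypothetical operands): a non-gather4 image op with
 // dmask = 0xb gives ActiveLanes = 3, so a non-D16 (or unpacked D16) result
 // needs InitIdx = 4 dwords, while a packed D16 result needs
 // ((3 + 1) >> 1) + 1 = 3 dwords, the extra dword in each case holding the
 // TFE/LWE error indication.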
15166
15167 // Abandon attempt if the dst size isn't large enough
15168 // - this is in fact an error but this is picked up elsewhere and
15169 // reported correctly.
15170 uint32_t DstSize =
15171 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15172 if (DstSize < InitIdx)
15173 return;
15174 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
15175 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15176 } else {
15177 return;
15178 }
15179
15180 const DebugLoc &DL = MI.getDebugLoc();
15181
15182 // Create a register for the initialization value.
15183 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
15184 unsigned NewDst = 0; // Final initialized value will be in here
15185
15186 // If PRTStrictNull feature is enabled (the default) then initialize
15187 // all the result registers to 0, otherwise just the error indication
15188 // register (VGPRn+1)
15189 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
15190 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
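 // For example (hypothetical case): with InitIdx = 4, PRTStrictNull
 // initializes all four result dwords (SizeLeft = 4, CurrIdx = 0), whereas
 // with the feature disabled only the last dword (the error indication
 // register) is zeroed (SizeLeft = 1, CurrIdx = 3).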
15191
15192 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
15193 for (; SizeLeft; SizeLeft--, CurrIdx++) {
15194 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
15195 // Initialize dword
15196 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
15197 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
15198 .addImm(0);
15199 // Insert into the super-reg
15200 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
15201 .addReg(PrevDst)
15202 .addReg(SubReg)
15203 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
15204
15205 PrevDst = NewDst;
15206 }
15207
15208 // Add as an implicit operand
15209 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
15210
15211 // Tie the just added implicit operand to the dst
15212 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
15213}
15214
15215/// Assign the register class depending on the number of
15216/// bits set in the writemask
15218 SDNode *Node) const {
15220
15221 MachineFunction *MF = MI.getParent()->getParent();
15224
15225 if (TII->isVOP3(MI.getOpcode())) {
15226 // Make sure constant bus requirements are respected.
15227 TII->legalizeOperandsVOP3(MRI, MI);
15228
15229 // Prefer VGPRs over AGPRs in mAI instructions where possible.
15230 // This saves a chain-copy of registers and better balances register
15231 // use between vgpr and agpr as agpr tuples tend to be big.
15232 if (!MI.getDesc().operands().empty()) {
15233 unsigned Opc = MI.getOpcode();
15234 bool HasAGPRs = Info->mayNeedAGPRs();
15235 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15236 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
15237 for (auto I :
15238 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
15239 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
15240 if (I == -1)
15241 break;
15242 if ((I == Src2Idx) && (HasAGPRs))
15243 break;
15244 MachineOperand &Op = MI.getOperand(I);
15245 if (!Op.isReg() || !Op.getReg().isVirtual())
15246 continue;
15247 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
15248 if (!TRI->hasAGPRs(RC))
15249 continue;
15250 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
15251 if (!Src || !Src->isCopy() ||
15252 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
15253 continue;
15254 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
15255 // All uses of agpr64 and agpr32 can also accept vgpr except for
15256 // v_accvgpr_read, but we do not produce agpr reads during selection,
15257 // so no use checks are needed.
15258 MRI.setRegClass(Op.getReg(), NewRC);
15259 }
15260
15261 if (!HasAGPRs)
15262 return;
15263
15264 // Resolve the rest of AV operands to AGPRs.
15265 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
15266 if (Src2->isReg() && Src2->getReg().isVirtual()) {
15267 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
15268 if (TRI->isVectorSuperClass(RC)) {
15269 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
15270 MRI.setRegClass(Src2->getReg(), NewRC);
15271 if (Src2->isTied())
15272 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
15273 }
15274 }
15275 }
15276 }
15277
15278 return;
15279 }
15280
15281 if (TII->isImage(MI))
15282 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
15283}
15284
15286 uint64_t Val) {
15287 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
15288 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
15289}
15290
15292 const SDLoc &DL,
15293 SDValue Ptr) const {
15295
15296 // Build the half of the subregister with the constants before building the
15297 // full 128-bit register. If we are building multiple resource descriptors,
15298 // this will allow CSEing of the 2-component register.
15299 const SDValue Ops0[] = {
15300 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
15301 buildSMovImm32(DAG, DL, 0),
15302 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15303 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
15304 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
15305 };
15306
15307 SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
15308 MVT::v2i32, Ops0), 0);
15309
15310 // Combine the constants and the pointer.
15311 const SDValue Ops1[] = {
15312 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15313 Ptr,
15314 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
15315 SubRegHi,
15316 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
15317 };
15318
15319 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
15320}
15321
15322/// Return a resource descriptor with the 'Add TID' bit enabled
15323/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
15324/// of the resource descriptor) to create an offset, which is added to
15325/// the resource pointer.
15327 SDValue Ptr, uint32_t RsrcDword1,
15328 uint64_t RsrcDword2And3) const {
15329 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
15330 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
15331 if (RsrcDword1) {
15332 PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
15333 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
15334 0);
15335 }
15336
15337 SDValue DataLo = buildSMovImm32(DAG, DL,
15338 RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
15339 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
15340
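 // The resulting 128-bit descriptor is assembled as four dwords:
 // dword0 = low half of Ptr, dword1 = high half of Ptr optionally OR'd with
 // RsrcDword1 (for example, a hypothetical stride value placed in bits
 // [29:16] of that dword, which correspond to bits [61:48] of the full
 // descriptor), and dword2/dword3 = the low and high halves of
 // RsrcDword2And3.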
15341 const SDValue Ops[] = {
15342 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15343 PtrLo,
15344 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15345 PtrHi,
15346 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
15347 DataLo,
15348 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
15349 DataHi,
15350 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
15351 };
15352
15353 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
15354}
15355
15356//===----------------------------------------------------------------------===//
15357// SI Inline Assembly Support
15358//===----------------------------------------------------------------------===//
15359
15360std::pair<unsigned, const TargetRegisterClass *>
15362 StringRef Constraint,
15363 MVT VT) const {
15364 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
15365
15366 const TargetRegisterClass *RC = nullptr;
15367 if (Constraint.size() == 1) {
15368 const unsigned BitWidth = VT.getSizeInBits();
15369 switch (Constraint[0]) {
15370 default:
15371 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15372 case 's':
15373 case 'r':
15374 switch (BitWidth) {
15375 case 16:
15376 RC = &AMDGPU::SReg_32RegClass;
15377 break;
15378 case 64:
15379 RC = &AMDGPU::SGPR_64RegClass;
15380 break;
15381 default:
15383 if (!RC)
15384 return std::pair(0U, nullptr);
15385 break;
15386 }
15387 break;
15388 case 'v':
15389 switch (BitWidth) {
15390 case 16:
15391 RC = &AMDGPU::VGPR_32RegClass;
15392 break;
15393 default:
15394 RC = TRI->getVGPRClassForBitWidth(BitWidth);
15395 if (!RC)
15396 return std::pair(0U, nullptr);
15397 break;
15398 }
15399 break;
15400 case 'a':
15401 if (!Subtarget->hasMAIInsts())
15402 break;
15403 switch (BitWidth) {
15404 case 16:
15405 RC = &AMDGPU::AGPR_32RegClass;
15406 break;
15407 default:
15408 RC = TRI->getAGPRClassForBitWidth(BitWidth);
15409 if (!RC)
15410 return std::pair(0U, nullptr);
15411 break;
15412 }
15413 break;
15414 }
15415 // We actually support i128, i16 and f16 as inline parameters
15416 // even if they are not reported as legal
15417 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
15418 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
15419 return std::pair(0U, RC);
15420 }
15421
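 // Handle explicit register references such as "{v8}" or "{s[4:7]}". As a
 // hypothetical example, the constraint "{v[8:11]}" parses as Idx = 8 and
 // End = 11, so Width = (11 - 8 + 1) * 32 = 128 bits, and the matching
 // super-register of v8 in the 128-bit VGPR class is returned.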
15422 if (Constraint.starts_with("{") && Constraint.ends_with("}")) {
15423 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
15424 if (RegName.consume_front("v")) {
15425 RC = &AMDGPU::VGPR_32RegClass;
15426 } else if (RegName.consume_front("s")) {
15427 RC = &AMDGPU::SGPR_32RegClass;
15428 } else if (RegName.consume_front("a")) {
15429 RC = &AMDGPU::AGPR_32RegClass;
15430 }
15431
15432 if (RC) {
15433 uint32_t Idx;
15434 if (RegName.consume_front("[")) {
15435 uint32_t End;
15436 bool Failed = RegName.consumeInteger(10, Idx);
15437 Failed |= !RegName.consume_front(":");
15438 Failed |= RegName.consumeInteger(10, End);
15439 Failed |= !RegName.consume_back("]");
15440 if (!Failed) {
15441 uint32_t Width = (End - Idx + 1) * 32;
15442 MCRegister Reg = RC->getRegister(Idx);
15444 RC = TRI->getVGPRClassForBitWidth(Width);
15445 else if (SIRegisterInfo::isSGPRClass(RC))
15446 RC = TRI->getSGPRClassForBitWidth(Width);
15447 else if (SIRegisterInfo::isAGPRClass(RC))
15448 RC = TRI->getAGPRClassForBitWidth(Width);
15449 if (RC) {
15450 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15451 return std::pair(Reg, RC);
15452 }
15453 }
15454 } else {
15455 bool Failed = RegName.getAsInteger(10, Idx);
15456 if (!Failed && Idx < RC->getNumRegs())
15457 return std::pair(RC->getRegister(Idx), RC);
15458 }
15459 }
15460 }
15461
15462 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15463 if (Ret.first)
15464 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
15465
15466 return Ret;
15467}
15468
15469static bool isImmConstraint(StringRef Constraint) {
15470 if (Constraint.size() == 1) {
15471 switch (Constraint[0]) {
15472 default: break;
15473 case 'I':
15474 case 'J':
15475 case 'A':
15476 case 'B':
15477 case 'C':
15478 return true;
15479 }
15480 } else if (Constraint == "DA" ||
15481 Constraint == "DB") {
15482 return true;
15483 }
15484 return false;
15485}
15486
15489 if (Constraint.size() == 1) {
15490 switch (Constraint[0]) {
15491 default: break;
15492 case 's':
15493 case 'v':
15494 case 'a':
15495 return C_RegisterClass;
15496 }
15497 }
15498 if (isImmConstraint(Constraint)) {
15499 return C_Other;
15500 }
15501 return TargetLowering::getConstraintType(Constraint);
15502}
15503
15504static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
15506 Val = Val & maskTrailingOnes<uint64_t>(Size);
15507 }
15508 return Val;
15509}
15510
15512 StringRef Constraint,
15513 std::vector<SDValue> &Ops,
15514 SelectionDAG &DAG) const {
15515 if (isImmConstraint(Constraint)) {
15516 uint64_t Val;
15517 if (getAsmOperandConstVal(Op, Val) &&
15518 checkAsmConstraintVal(Op, Constraint, Val)) {
15519 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
15520 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
15521 }
15522 } else {
15523 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
15524 }
15525}
15526
15528 unsigned Size = Op.getScalarValueSizeInBits();
15529 if (Size > 64)
15530 return false;
15531
15532 if (Size == 16 && !Subtarget->has16BitInsts())
15533 return false;
15534
15535 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
15536 Val = C->getSExtValue();
15537 return true;
15538 }
15539 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
15540 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15541 return true;
15542 }
15543 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
15544 if (Size != 16 || Op.getNumOperands() != 2)
15545 return false;
15546 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
15547 return false;
15548 if (ConstantSDNode *C = V->getConstantSplatNode()) {
15549 Val = C->getSExtValue();
15550 return true;
15551 }
15552 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
15553 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15554 return true;
15555 }
15556 }
15557
15558 return false;
15559}
15560
15562 uint64_t Val) const {
15563 if (Constraint.size() == 1) {
15564 switch (Constraint[0]) {
15565 case 'I':
15567 case 'J':
15568 return isInt<16>(Val);
15569 case 'A':
15570 return checkAsmConstraintValA(Op, Val);
15571 case 'B':
15572 return isInt<32>(Val);
15573 case 'C':
15574 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
15576 default:
15577 break;
15578 }
15579 } else if (Constraint.size() == 2) {
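    // For example (hypothetical value): a 64-bit operand 0x3ff0000000000000
    // satisfies "DA" only if both 32-bit halves, 0x3ff00000 and 0x0, are
    // individually acceptable as 32-bit operands.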
15580 if (Constraint == "DA") {
15581 int64_t HiBits = static_cast<int32_t>(Val >> 32);
15582 int64_t LoBits = static_cast<int32_t>(Val);
15583 return checkAsmConstraintValA(Op, HiBits, 32) &&
15584 checkAsmConstraintValA(Op, LoBits, 32);
15585 }
15586 if (Constraint == "DB") {
15587 return true;
15588 }
15589 }
15590 llvm_unreachable("Invalid asm constraint");
15591}
15592
15594 unsigned MaxSize) const {
15595 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
15596 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
15597 if (Size == 16) {
15598 MVT VT = Op.getSimpleValueType();
15599 switch (VT.SimpleTy) {
15600 default:
15601 return false;
15602 case MVT::i16:
15603 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
15604 case MVT::f16:
15605 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
15606 case MVT::bf16:
15607 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
15608 case MVT::v2i16:
15609 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
15610 case MVT::v2f16:
15611 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
15612 case MVT::v2bf16:
15613 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
15614 }
15615 }
15616 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
15617 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
15618 return true;
15619 return false;
15620}
15621
15622static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
15623 switch (UnalignedClassID) {
15624 case AMDGPU::VReg_64RegClassID:
15625 return AMDGPU::VReg_64_Align2RegClassID;
15626 case AMDGPU::VReg_96RegClassID:
15627 return AMDGPU::VReg_96_Align2RegClassID;
15628 case AMDGPU::VReg_128RegClassID:
15629 return AMDGPU::VReg_128_Align2RegClassID;
15630 case AMDGPU::VReg_160RegClassID:
15631 return AMDGPU::VReg_160_Align2RegClassID;
15632 case AMDGPU::VReg_192RegClassID:
15633 return AMDGPU::VReg_192_Align2RegClassID;
15634 case AMDGPU::VReg_224RegClassID:
15635 return AMDGPU::VReg_224_Align2RegClassID;
15636 case AMDGPU::VReg_256RegClassID:
15637 return AMDGPU::VReg_256_Align2RegClassID;
15638 case AMDGPU::VReg_288RegClassID:
15639 return AMDGPU::VReg_288_Align2RegClassID;
15640 case AMDGPU::VReg_320RegClassID:
15641 return AMDGPU::VReg_320_Align2RegClassID;
15642 case AMDGPU::VReg_352RegClassID:
15643 return AMDGPU::VReg_352_Align2RegClassID;
15644 case AMDGPU::VReg_384RegClassID:
15645 return AMDGPU::VReg_384_Align2RegClassID;
15646 case AMDGPU::VReg_512RegClassID:
15647 return AMDGPU::VReg_512_Align2RegClassID;
15648 case AMDGPU::VReg_1024RegClassID:
15649 return AMDGPU::VReg_1024_Align2RegClassID;
15650 case AMDGPU::AReg_64RegClassID:
15651 return AMDGPU::AReg_64_Align2RegClassID;
15652 case AMDGPU::AReg_96RegClassID:
15653 return AMDGPU::AReg_96_Align2RegClassID;
15654 case AMDGPU::AReg_128RegClassID:
15655 return AMDGPU::AReg_128_Align2RegClassID;
15656 case AMDGPU::AReg_160RegClassID:
15657 return AMDGPU::AReg_160_Align2RegClassID;
15658 case AMDGPU::AReg_192RegClassID:
15659 return AMDGPU::AReg_192_Align2RegClassID;
15660 case AMDGPU::AReg_256RegClassID:
15661 return AMDGPU::AReg_256_Align2RegClassID;
15662 case AMDGPU::AReg_512RegClassID:
15663 return AMDGPU::AReg_512_Align2RegClassID;
15664 case AMDGPU::AReg_1024RegClassID:
15665 return AMDGPU::AReg_1024_Align2RegClassID;
15666 default:
15667 return -1;
15668 }
15669}
15670
15671// Figure out which registers should be reserved for stack access. Only after
15672// the function is legalized do we know all of the non-spill stack objects or if
15673// calls are present.
15677 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
15678 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15679 const SIInstrInfo *TII = ST.getInstrInfo();
15680
15681 if (Info->isEntryFunction()) {
15682 // Callable functions have fixed registers used for stack access.
15684 }
15685
15686 // TODO: Move this logic to getReservedRegs()
15687 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
15688 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
15689 Register SReg = ST.isWave32()
15690 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
15691 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
15692 &AMDGPU::SGPR_64RegClass);
15693 Info->setSGPRForEXECCopy(SReg);
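 // As a hypothetical example, in wave32 mode with MaxNumSGPRs = 106 this
 // reserves s105, while wave64 mode reserves an even-aligned SGPR pair near
 // the top of the allocatable range.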
15694
15695 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
15696 Info->getStackPtrOffsetReg()));
15697 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
15698 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
15699
15700 // We need to worry about replacing the default register with itself in case
15701 // of MIR testcases missing the MFI.
15702 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
15703 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
15704
15705 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
15706 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
15707
15708 Info->limitOccupancy(MF);
15709
15710 if (ST.isWave32() && !MF.empty()) {
15711 for (auto &MBB : MF) {
15712 for (auto &MI : MBB) {
15713 TII->fixImplicitOperands(MI);
15714 }
15715 }
15716 }
15717
15718 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
15719 // classes if required. Ideally the register class constraints would differ
15720 // per-subtarget, but there's no easy way to achieve that right now. This is
15721 // not a problem for VGPRs because the correctly aligned VGPR class is implied
15722 // from using them as the register class for legal types.
15723 if (ST.needsAlignedVGPRs()) {
15724 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
15725 const Register Reg = Register::index2VirtReg(I);
15726 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
15727 if (!RC)
15728 continue;
15729 int NewClassID = getAlignedAGPRClassID(RC->getID());
15730 if (NewClassID != -1)
15731 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
15732 }
15733 }
15734
15736}
15737
15739 KnownBits &Known,
15740 const APInt &DemandedElts,
15741 const SelectionDAG &DAG,
15742 unsigned Depth) const {
15743 Known.resetAll();
15744 unsigned Opc = Op.getOpcode();
15745 switch (Opc) {
15747 unsigned IID = Op.getConstantOperandVal(0);
15748 switch (IID) {
15749 case Intrinsic::amdgcn_mbcnt_lo:
15750 case Intrinsic::amdgcn_mbcnt_hi: {
15751 const GCNSubtarget &ST =
15753 // These return at most the (wavefront size - 1) + src1
15754 // As long as src1 is an immediate we can calc known bits
15755 KnownBits Src1Known = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
15756 unsigned Src1ValBits = Src1Known.countMaxActiveBits();
15757 unsigned MaxActiveBits = std::max(Src1ValBits, ST.getWavefrontSizeLog2());
15758 // Cater for potential carry
15759 MaxActiveBits += Src1ValBits ? 1 : 0;
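 // Hypothetical example: in wave64 mode (wavefront size log2 == 6), if src1
 // is known to fit in 8 bits then MaxActiveBits = max(8, 6) + 1 = 9, so the
 // upper 23 bits of an i32 result are known to be zero.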
15760 unsigned Size = Op.getValueType().getSizeInBits();
15761 if (MaxActiveBits < Size)
15762 Known.Zero.setHighBits(Size - MaxActiveBits);
15763 return;
15764 }
15765 }
15766 break;
15767 }
15768 }
15770 Op, Known, DemandedElts, DAG, Depth);
15771}
15772
15774 const int FI, KnownBits &Known, const MachineFunction &MF) const {
15776
15777 // Set the high bits to zero based on the maximum allowed scratch size per
15778 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
15779 // calculation won't overflow, so assume the sign bit is never set.
15780 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
15781}
15782
15784 KnownBits &Known, unsigned Dim) {
15785 unsigned MaxValue =
15786 ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
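 // Hypothetical example: if the maximum workitem ID in this dimension is
 // 1023 (0x3ff), countl_zero(1023) == 22, so the top 22 bits of the 32-bit
 // ID are known to be zero.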
15787 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
15788}
15789
15791 GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
15792 const MachineRegisterInfo &MRI, unsigned Depth) const {
15793 const MachineInstr *MI = MRI.getVRegDef(R);
15794 switch (MI->getOpcode()) {
15795 case AMDGPU::G_INTRINSIC:
15796 case AMDGPU::G_INTRINSIC_CONVERGENT: {
15797 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
15798 case Intrinsic::amdgcn_workitem_id_x:
15799 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
15800 break;
15801 case Intrinsic::amdgcn_workitem_id_y:
15802 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
15803 break;
15804 case Intrinsic::amdgcn_workitem_id_z:
15805 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
15806 break;
15807 case Intrinsic::amdgcn_mbcnt_lo:
15808 case Intrinsic::amdgcn_mbcnt_hi: {
15809 // These return at most the wavefront size - 1.
15810 unsigned Size = MRI.getType(R).getSizeInBits();
15811 Known.Zero.setHighBits(Size - getSubtarget()->getWavefrontSizeLog2());
15812 break;
15813 }
15814 case Intrinsic::amdgcn_groupstaticsize: {
15815 // We can report everything over the maximum size as 0. We can't report
15816 // based on the actual size because we don't know if it's accurate or not
15817 // at any given point.
15818 Known.Zero.setHighBits(
15819 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
15820 break;
15821 }
15822 }
15823 break;
15824 }
15825 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
15826 Known.Zero.setHighBits(24);
15827 break;
15828 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
15829 Known.Zero.setHighBits(16);
15830 break;
15831 case AMDGPU::G_AMDGPU_SMED3:
15832 case AMDGPU::G_AMDGPU_UMED3: {
15833 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
15834
15835 KnownBits Known2;
15836 KB.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
15837 if (Known2.isUnknown())
15838 break;
15839
15840 KnownBits Known1;
15841 KB.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
15842 if (Known1.isUnknown())
15843 break;
15844
15845 KnownBits Known0;
15846 KB.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
15847 if (Known0.isUnknown())
15848 break;
15849
15850 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
15851 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
15852 Known.One = Known0.One & Known1.One & Known2.One;
15853 break;
15854 }
15855 }
15856}
15857
15860 unsigned Depth) const {
15861 const MachineInstr *MI = MRI.getVRegDef(R);
15862 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
15863 // FIXME: Can this move to generic code? What about the case where the call
15864 // site specifies a lower alignment?
15865 Intrinsic::ID IID = GI->getIntrinsicID();
15867 AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
15868 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
15869 return *RetAlign;
15870 }
15871 return Align(1);
15872}
15873
15876 const Align CacheLineAlign = Align(64);
15877
15878 // Pre-GFX10 targets did not benefit from loop alignment
15879 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
15880 getSubtarget()->hasInstFwdPrefetchBug())
15881 return PrefAlign;
15882
15883 // On GFX10 I$ is 4 x 64 bytes cache lines.
15884 // By default prefetcher keeps one cache line behind and reads two ahead.
15885 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
15886 // behind and one ahead.
15887 // Therefore we can benefit from aligning loop headers if the loop fits in 192 bytes.
15888 // If the loop fits in 64 bytes it always spans no more than two cache lines and
15889 // does not need an alignment.
15890 // Otherwise, if the loop is at most 128 bytes we do not need to modify the
15891 // prefetch; if it is at most 192 bytes we need two lines behind.
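 // As a concrete (hypothetical) illustration: a 100-byte loop is simply
 // aligned to the 64-byte cache line with the default prefetch mode, while a
 // 180-byte loop is also aligned to 64 bytes but additionally gets
 // S_INST_PREFETCH inserted in the preheader (two lines behind the PC) and
 // another S_INST_PREFETCH at the loop exit switching back to one line
 // behind.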
15892
15894 const MachineBasicBlock *Header = ML->getHeader();
15895 if (Header->getAlignment() != PrefAlign)
15896 return Header->getAlignment(); // Already processed.
15897
15898 unsigned LoopSize = 0;
15899 for (const MachineBasicBlock *MBB : ML->blocks()) {
15900 // If an inner loop block is aligned, assume on average half of the alignment
15901 // size is added as nops.
15902 if (MBB != Header)
15903 LoopSize += MBB->getAlignment().value() / 2;
15904
15905 for (const MachineInstr &MI : *MBB) {
15906 LoopSize += TII->getInstSizeInBytes(MI);
15907 if (LoopSize > 192)
15908 return PrefAlign;
15909 }
15910 }
15911
15912 if (LoopSize <= 64)
15913 return PrefAlign;
15914
15915 if (LoopSize <= 128)
15916 return CacheLineAlign;
15917
15918 // If any of the parent loops is surrounded by prefetch instructions, do not
15919 // insert new ones for the inner loop, as that would reset the parent's settings.
15920 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
15921 if (MachineBasicBlock *Exit = P->getExitBlock()) {
15922 auto I = Exit->getFirstNonDebugInstr();
15923 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
15924 return CacheLineAlign;
15925 }
15926 }
15927
15928 MachineBasicBlock *Pre = ML->getLoopPreheader();
15929 MachineBasicBlock *Exit = ML->getExitBlock();
15930
15931 if (Pre && Exit) {
15932 auto PreTerm = Pre->getFirstTerminator();
15933 if (PreTerm == Pre->begin() ||
15934 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
15935 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
15936 .addImm(1); // prefetch 2 lines behind PC
15937
15938 auto ExitHead = Exit->getFirstNonDebugInstr();
15939 if (ExitHead == Exit->end() ||
15940 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
15941 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
15942 .addImm(2); // prefetch 1 line behind PC
15943 }
15944
15945 return CacheLineAlign;
15946}
15947
15949static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
15950 assert(N->getOpcode() == ISD::CopyFromReg);
15951 do {
15952 // Follow the chain until we find an INLINEASM node.
15953 N = N->getOperand(0).getNode();
15954 if (N->getOpcode() == ISD::INLINEASM ||
15955 N->getOpcode() == ISD::INLINEASM_BR)
15956 return true;
15957 } while (N->getOpcode() == ISD::CopyFromReg);
15958 return false;
15959}
15960
15963 UniformityInfo *UA) const {
15964 switch (N->getOpcode()) {
15965 case ISD::CopyFromReg: {
15966 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
15967 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
15968 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15969 Register Reg = R->getReg();
15970
15971 // FIXME: Why does this need to consider isLiveIn?
15972 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
15973 return !TRI->isSGPRReg(MRI, Reg);
15974
15975 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
15976 return UA->isDivergent(V);
15977
15979 return !TRI->isSGPRReg(MRI, Reg);
15980 }
15981 case ISD::LOAD: {
15982 const LoadSDNode *L = cast<LoadSDNode>(N);
15983 unsigned AS = L->getAddressSpace();
15984 // A flat load may access private memory.
15986 }
15987 case ISD::CALLSEQ_END:
15988 return true;
15990 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
15992 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
16011 // Target-specific read-modify-write atomics are sources of divergence.
16012 return true;
16013 default:
16014 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
16015 // Generic read-modify-write atomics are sources of divergence.
16016 return A->readMem() && A->writeMem();
16017 }
16018 return false;
16019 }
16020}
16021
16023 EVT VT) const {
16024 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
16025 case MVT::f32:
16027 case MVT::f64:
16028 case MVT::f16:
16030 default:
16031 return false;
16032 }
16033}
16034
16036 LLT Ty, const MachineFunction &MF) const {
16037 switch (Ty.getScalarSizeInBits()) {
16038 case 32:
16039 return !denormalModeIsFlushAllF32(MF);
16040 case 64:
16041 case 16:
16042 return !denormalModeIsFlushAllF64F16(MF);
16043 default:
16044 return false;
16045 }
16046}
16047
16049 const SelectionDAG &DAG,
16050 bool SNaN,
16051 unsigned Depth) const {
16052 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
16053 const MachineFunction &MF = DAG.getMachineFunction();
16055
16056 if (Info->getMode().DX10Clamp)
16057 return true; // Clamped to 0.
16058 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
16059 }
16060
16062 SNaN, Depth);
16063}
16064
16065#if 0
16066// FIXME: This should be checked before unsafe fp atomics are enabled
16067// Global FP atomic instructions have a hardcoded FP mode and do not support
16068// FP32 denormals, and only support v2f16 denormals.
16069static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
16071 auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
16072 if (&Flt == &APFloat::IEEEsingle())
16073 return DenormMode == DenormalMode::getPreserveSign();
16074 return DenormMode == DenormalMode::getIEEE();
16075}
16076#endif
16077
16078// The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
16079// floating point atomic instructions. May generate more efficient code,
16080// but may not respect rounding and denormal modes, and may give incorrect
16081// results for certain memory destinations.
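// For example, a function whose IR carries the attribute
//   "amdgpu-unsafe-fp-atomics"="true"
// opts in to this lowering; any other value, or the absence of the attribute,
// keeps the conservative expansion.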
16083 return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
16084 "true";
16085}
16086
16088 LLVMContext &Ctx = RMW->getContext();
16090 Ctx.getSyncScopeNames(SSNs);
16091 StringRef MemScope = SSNs[RMW->getSyncScopeID()].empty()
16092 ? "system"
16093 : SSNs[RMW->getSyncScopeID()];
16094
16095 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
16096 << "Hardware instruction generated for atomic "
16097 << RMW->getOperationName(RMW->getOperation())
16098 << " operation at memory scope " << MemScope;
16099}
16100
16101static bool isHalf2OrBFloat2(Type *Ty) {
16102 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
16103 Type *EltTy = VT->getElementType();
16104 return VT->getNumElements() == 2 &&
16105 (EltTy->isHalfTy() || EltTy->isBFloatTy());
16106 }
16107
16108 return false;
16109}
16110
16111static bool isHalf2(Type *Ty) {
16112 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
16113 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
16114}
16115
16116static bool isBFloat2(Type *Ty) {
16117 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
16118 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
16119}
16120
16123 unsigned AS = RMW->getPointerAddressSpace();
16124 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
16126
16127 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
16129 ORE.emit([=]() {
16130 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
16131 });
16132 return Kind;
16133 };
16134
16135 auto SSID = RMW->getSyncScopeID();
16136 bool HasSystemScope =
16137 SSID == SyncScope::System ||
16138 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
16139
16140 switch (RMW->getOperation()) {
16141 case AtomicRMWInst::Sub:
16142 case AtomicRMWInst::Or:
16143 case AtomicRMWInst::Xor: {
16144 // Atomic sub/or/xor do not work over PCI express, but atomic add
16145 // does. InstCombine transforms these with 0 to or, so undo that.
16146 if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
16147 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
16148 ConstVal && ConstVal->isNullValue())
16150 }
16151
16152 break;
16153 }
16154 case AtomicRMWInst::FAdd: {
16155 Type *Ty = RMW->getType();
16156
16157 // TODO: Handle REGION_ADDRESS
16158 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16159 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
16160 // is fixed to round-to-nearest-even.
16161 //
16162 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
16163 // round-to-nearest-even.
16164 //
16165 // We ignore the rounding mode problem, even in strictfp. The C++ standard
16166 // suggests it is OK if the floating-point mode may not match the calling
16167 // thread.
16168 if (Ty->isFloatTy()) {
16171 }
16172
16173 if (Ty->isDoubleTy()) {
16174 // Ignores denormal mode, but we don't consider flushing mandatory.
16177 }
16178
16179 if (Subtarget->hasAtomicDsPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16181
16183 }
16184
16188
16189 if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
16191
16192 if (AS == AMDGPUAS::FLAT_ADDRESS) {
16193 // gfx940, gfx12
16194 // FIXME: Needs to account for no fine-grained memory
16195 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16197 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16198 // gfx90a, gfx940, gfx12
16199 // FIXME: Needs to account for no fine-grained memory
16200 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16202
16203 // gfx940, gfx12
16204 // FIXME: Needs to account for no fine-grained memory
16205 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
16207 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16208 // gfx90a, gfx940, gfx12
16209 // FIXME: Needs to account for no fine-grained memory
16210 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16212
16213 // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16214 // buffer. gfx12 does have the buffer version.
16215 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
16217 }
16218
16221
16222 // Always expand system scope fp atomics.
16223 if (HasSystemScope)
16225
16226 // global and flat atomic fadd f64: gfx90a, gfx940.
16227 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
16228 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16229
16230 if (AS != AMDGPUAS::FLAT_ADDRESS) {
16231 if (Ty->isFloatTy()) {
16232 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
16233 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16234 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16235 // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16236 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16237 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16238 } else {
16239 // gfx908
16240 if (RMW->use_empty() &&
16242 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16243 }
16244 }
16245
16246 // flat atomic fadd f32: gfx940, gfx11+.
16247 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16248 if (Subtarget->hasFlatAtomicFaddF32Inst())
16249 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16250
16251 // If it is in the flat address space and the type is float, we will try to
16252 // expand it if the target supports global and LDS atomic fadd. We need
16253 // that because, in the expansion, we emit a check of the address space:
16254 // if it is in the global address space, we emit the global atomic fadd;
16255 // if it is in the shared address space, we emit the LDS atomic fadd.
16256 if (Subtarget->hasLDSFPAtomicAddF32()) {
16257 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16259 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16261 }
16262 }
16263
16265 }
16267 case AtomicRMWInst::FMax: {
16268 Type *Ty = RMW->getType();
16269
16270 // LDS float and double fmin/fmax were always supported.
16271 if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy()))
16273
16276
16277 // Always expand system scope fp atomics.
16278 if (HasSystemScope)
16280
16281 // For flat and global cases:
16282 // float, double in gfx7. Manual claims denormal support.
16283 // Removed in gfx8.
16284 // float, double restored in gfx10.
16285 // double removed again in gfx11, so only f32 for gfx11/gfx12.
16286 //
16287 // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but no
16288 // f32.
16289 //
16290 // FIXME: Check scope and fine grained memory
16291 if (AS == AMDGPUAS::FLAT_ADDRESS) {
16292 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
16293 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16294 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
16295 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16296 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
16298 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
16299 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16300 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
16301 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16302 }
16303
16305 }
16306 case AtomicRMWInst::Min:
16307 case AtomicRMWInst::Max:
16309 case AtomicRMWInst::UMax: {
16312 // Always expand system scope min/max atomics.
16313 if (HasSystemScope)
16315 }
16316 break;
16317 }
16318 default:
16319 break;
16320 }
16321
16323}
16324
16330}
16331
16334 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16337}
16338
16344}
16345
16346const TargetRegisterClass *
16347SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
16349 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16350 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16351 return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
16352 : &AMDGPU::SReg_32RegClass;
16353 if (!TRI->isSGPRClass(RC) && !isDivergent)
16354 return TRI->getEquivalentSGPRClass(RC);
16355 if (TRI->isSGPRClass(RC) && isDivergent)
16356 return TRI->getEquivalentVGPRClass(RC);
16357
16358 return RC;
16359}
16360
16361// FIXME: This is a workaround for DivergenceAnalysis not understanding always
16362// uniform values (as produced by the mask results of control flow intrinsics)
16363// used outside of divergent blocks. The phi users need to also be treated as
16364// always uniform.
16365//
16366// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
16367static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
16368 unsigned WaveSize) {
16369 // FIXME: We assume we never cast the mask results of a control flow
16370 // intrinsic.
16371 // Early exit if the type won't be consistent as a compile time hack.
16372 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
16373 if (!IT || IT->getBitWidth() != WaveSize)
16374 return false;
16375
16376 if (!isa<Instruction>(V))
16377 return false;
16378 if (!Visited.insert(V).second)
16379 return false;
16380 bool Result = false;
16381 for (const auto *U : V->users()) {
16382 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
16383 if (V == U->getOperand(1)) {
16384 switch (Intrinsic->getIntrinsicID()) {
16385 default:
16386 Result = false;
16387 break;
16388 case Intrinsic::amdgcn_if_break:
16389 case Intrinsic::amdgcn_if:
16390 case Intrinsic::amdgcn_else:
16391 Result = true;
16392 break;
16393 }
16394 }
16395 if (V == U->getOperand(0)) {
16396 switch (Intrinsic->getIntrinsicID()) {
16397 default:
16398 Result = false;
16399 break;
16400 case Intrinsic::amdgcn_end_cf:
16401 case Intrinsic::amdgcn_loop:
16402 Result = true;
16403 break;
16404 }
16405 }
16406 } else {
16407 Result = hasCFUser(U, Visited, WaveSize);
16408 }
16409 if (Result)
16410 break;
16411 }
16412 return Result;
16413}
16414
16416 const Value *V) const {
16417 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
16418 if (CI->isInlineAsm()) {
16419 // FIXME: This cannot give a correct answer. This should only trigger in
16420 // the case where inline asm returns mixed SGPR and VGPR results, used
16421 // outside the defining block. We don't have a specific result to
16422 // consider, so this assumes if any value is SGPR, the overall register
16423 // also needs to be SGPR.
16424 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
16426 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
16427 for (auto &TC : TargetConstraints) {
16428 if (TC.Type == InlineAsm::isOutput) {
16431 SIRI, TC.ConstraintCode, TC.ConstraintVT).second;
16432 if (RC && SIRI->isSGPRClass(RC))
16433 return true;
16434 }
16435 }
16436 }
16437 }
16439 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
16440}
16441
16443 SDNode::use_iterator I = N->use_begin(), E = N->use_end();
16444 for (; I != E; ++I) {
16445 if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) {
16446 if (getBasePtrIndex(M) == I.getOperandNo())
16447 return true;
16448 }
16449 }
16450 return false;
16451}
16452
16454 SDValue N1) const {
16455 if (!N0.hasOneUse())
16456 return false;
16457 // Take care of the opportunity to keep N0 uniform
16458 if (N0->isDivergent() || !N1->isDivergent())
16459 return true;
16460 // Check if we have a good chance to form the memory access pattern with the
16461 // base and offset
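 // For example (hypothetical IR): if N0 is (add %base, 16) feeding a memory
 // node, keeping that sum intact gives a good chance of folding the constant
 // into the addressing mode rather than re-associating it with N1.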
16462 return (DAG.isBaseWithConstantOffset(N0) &&
16463 hasMemSDNodeUser(*N0->use_begin()));
16464}
16465
16467 Register N0, Register N1) const {
16468 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
16469}
16470
16473 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
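 // For example (hypothetical IR): a load annotated as
 //   %v = load i32, ptr addrspace(1) %p, !amdgpu.noclobber !0
 // receives the MONoClobber flag on its memory operand.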
16475 if (I.getMetadata("amdgpu.noclobber"))
16476 Flags |= MONoClobber;
16477 if (I.getMetadata("amdgpu.last.use"))
16478 Flags |= MOLastUse;
16479 return Flags;
16480}
16481
16483 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
16484 const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const {
16485 if (User->getOpcode() != ISD::CopyToReg)
16486 return false;
16487 if (!Def->isMachineOpcode())
16488 return false;
16489 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
16490 if (!MDef)
16491 return false;
16492
16493 unsigned ResNo = User->getOperand(Op).getResNo();
16494 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
16495 return false;
16496 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
16497 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
16498 PhysReg = AMDGPU::SCC;
16499 const TargetRegisterClass *RC =
16500 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
16501 Cost = RC->getCopyCost();
16502 return true;
16503 }
16504 return false;
16505}
16506
16509
16512 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16513 assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
16514 "this cannot be replaced with add");
16516 return;
16517 }
16518
16519 assert(Subtarget->hasAtomicFaddInsts() &&
16520 "target should have atomic fadd instructions");
16521 assert(AI->getType()->isFloatTy() &&
16523 "generic atomicrmw expansion only supports FP32 operand in flat "
16524 "address space");
16525 assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
16526
16527 // Given: atomicrmw fadd ptr %addr, float %val ordering
16528 //
16529 // With this expansion we produce the following code:
16530 // [...]
16531 // br label %atomicrmw.check.shared
16532 //
16533 // atomicrmw.check.shared:
16534 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
16535 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
16536 //
16537 // atomicrmw.shared:
16538 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
16539 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
16540 // float %val ordering
16541 // br label %atomicrmw.phi
16542 //
16543 // atomicrmw.check.private:
16544 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
16545 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
16546 //
16547 // atomicrmw.private:
16548 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
16549 // %loaded.private = load float, ptr addrspace(5) %cast.private
16550 // %val.new = fadd float %loaded.private, %val
16551 // store float %val.new, ptr addrspace(5) %cast.private
16552 // br label %atomicrmw.phi
16553 //
16554 // atomicrmw.global:
16555 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
16556 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
16557 // float %val ordering
16558 // br label %atomicrmw.phi
16559 //
16560 // atomicrmw.phi:
16561 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
16562 // [ %loaded.private, %atomicrmw.private ],
16563 // [ %loaded.global, %atomicrmw.global ]
16564 // br label %atomicrmw.end
16565 //
16566 // atomicrmw.end:
16567 // [...]
16568
16569 IRBuilder<> Builder(AI);
16570 LLVMContext &Ctx = Builder.getContext();
16571
16572 BasicBlock *BB = Builder.GetInsertBlock();
16573 Function *F = BB->getParent();
16574 BasicBlock *ExitBB =
16575 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
16576 BasicBlock *CheckSharedBB =
16577 BasicBlock::Create(Ctx, "atomicrmw.check.shared", F, ExitBB);
16578 BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
16579 BasicBlock *CheckPrivateBB =
16580 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
16581 BasicBlock *PrivateBB =
16582 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
16583 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
16584 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
16585
16586 Value *Val = AI->getValOperand();
16587 Type *ValTy = Val->getType();
16588 Value *Addr = AI->getPointerOperand();
16589
16590 auto CreateNewAtomicRMW = [AI](IRBuilder<> &Builder, Value *Addr,
16591 Value *Val) -> Value * {
16592 AtomicRMWInst *OldVal =
16593 Builder.CreateAtomicRMW(AI->getOperation(), Addr, Val, AI->getAlign(),
16594 AI->getOrdering(), AI->getSyncScopeID());
16596 AI->getAllMetadata(MDs);
16597 for (auto &P : MDs)
16598 OldVal->setMetadata(P.first, P.second);
16599 return OldVal;
16600 };
16601
16602 std::prev(BB->end())->eraseFromParent();
16603 Builder.SetInsertPoint(BB);
16604 Builder.CreateBr(CheckSharedBB);
16605
16606 Builder.SetInsertPoint(CheckSharedBB);
16607 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
16608 {Addr}, nullptr, "is.shared");
16609 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
16610
16611 Builder.SetInsertPoint(SharedBB);
16612 Value *CastToLocal = Builder.CreateAddrSpaceCast(
16614 Value *LoadedShared = CreateNewAtomicRMW(Builder, CastToLocal, Val);
16615 Builder.CreateBr(PhiBB);
16616
16617 Builder.SetInsertPoint(CheckPrivateBB);
16618 CallInst *IsPrivate = Builder.CreateIntrinsic(
16619 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
16620 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
16621
16622 Builder.SetInsertPoint(PrivateBB);
16623 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
16625 Value *LoadedPrivate =
16626 Builder.CreateLoad(ValTy, CastToPrivate, "loaded.private");
16627 Value *NewVal = Builder.CreateFAdd(LoadedPrivate, Val, "val.new");
16628 Builder.CreateStore(NewVal, CastToPrivate);
16629 Builder.CreateBr(PhiBB);
16630
16631 Builder.SetInsertPoint(GlobalBB);
16632 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
16634 Value *LoadedGlobal = CreateNewAtomicRMW(Builder, CastToGlobal, Val);
16635 Builder.CreateBr(PhiBB);
16636
16637 Builder.SetInsertPoint(PhiBB);
16638 PHINode *Loaded = Builder.CreatePHI(ValTy, 3, "loaded.phi");
16639 Loaded->addIncoming(LoadedShared, SharedBB);
16640 Loaded->addIncoming(LoadedPrivate, PrivateBB);
16641 Loaded->addIncoming(LoadedGlobal, GlobalBB);
16642 Builder.CreateBr(ExitBB);
16643
16644 AI->replaceAllUsesWith(Loaded);
16645 AI->eraseFromParent();
16646}
16647
16648LoadInst *
16650 IRBuilder<> Builder(AI);
16651 auto Order = AI->getOrdering();
16652
16653 // The optimization removes the store aspect of the atomicrmw. Therefore, the
16654 // cache must be flushed if the atomic ordering had release semantics. This is
16655 // not necessarily a fence; a release fence just happens to do that flush.
16656 // So avoid replacing an atomicrmw that has release semantics.
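 // For example (hypothetical IR): "atomicrmw or ptr %p, i32 0 monotonic" can
 // be rewritten as "load atomic i32, ptr %p monotonic" because the value
 // written back always equals the value loaded.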
16657 if (isReleaseOrStronger(Order))
16658 return nullptr;
16659
16660 LoadInst *LI = Builder.CreateAlignedLoad(
16661 AI->getType(), AI->getPointerOperand(), AI->getAlign());
16662 LI->setAtomic(Order, AI->getSyncScopeID());
16663 LI->copyMetadata(*AI);
16664 LI->takeName(AI);
16665 AI->replaceAllUsesWith(LI);
16666 AI->eraseFromParent();
16667 return LI;
16668}
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isHalf2OrBFloat2(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static bool isHalf2(Type *Ty)
bool unsafeFPAtomicsDisabled(Function *F)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool isBFloat2(Type *Ty)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
raw_pwrite_stream & OS
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
LLVM IR instance of the generic uniformity analysis.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
static constexpr int Concat[]
Value * RHS
Value * LHS
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool hasMadMacF32Insts() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition: APFloat.h:1026
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition: APFloat.cpp:5317
bool isNegative() const
Definition: APFloat.h:1354
APInt bitcastToAPInt() const
Definition: APFloat.h:1260
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition: APFloat.h:1044
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:1004
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition: APFloat.h:988
bool isInfinity() const
Definition: APFloat.h:1351
Class for arbitrary precision integers.
Definition: APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1372
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:238
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition: APInt.h:446
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1598
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:276
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1217
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1201
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:495
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:632
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:696
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:809
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:708
@ Add
*p = old + v
Definition: Instructions.h:712
@ FAdd
*p = old + v
Definition: Instructions.h:733
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:726
@ Or
*p = old | v
Definition: Instructions.h:720
@ Sub
*p = old - v
Definition: Instructions.h:714
@ Xor
*p = old ^ v
Definition: Instructions.h:722
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:724
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:730
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:744
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:728
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:740
Value * getPointerOperand()
Definition: Instructions.h:852
void setOperation(BinOp Operation)
Definition: Instructions.h:803
BinOp getOperation() const
Definition: Instructions.h:787
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
Definition: Instructions.h:843
Value * getValOperand()
Definition: Instructions.h:856
static StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
Definition: Instructions.h:829
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:860
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator end()
Definition: BasicBlock.h:451
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:202
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:575
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:209
BitVector & set()
Definition: BitVector.h:351
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
Definition: ByteProvider.h:30
static ByteProvider getConstantZero()
Definition: ByteProvider.h:73
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
Definition: ByteProvider.h:66
std::optional< ISelOp > Src
Definition: ByteProvider.h:57
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool isMemLoc() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1465
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
Definition: InstrTypes.h:1551
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1410
unsigned arg_size() const
Definition: InstrTypes.h:1408
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:757
@ ICMP_NE
not equal
Definition: InstrTypes.h:779
bool isSigned() const
Definition: InstrTypes.h:1007
bool isFPPredicate() const
Definition: InstrTypes.h:864
bool isIntPredicate() const
Definition: InstrTypes.h:865
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:206
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition: Constant.h:42
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
Definition: Constants.cpp:90
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:865
bool isBigEndian() const
Definition: DataLayout.h:239
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
unsigned getNumElements() const
Definition: DerivedTypes.h:582
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowerinInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Definition: DerivedTypes.h:103
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:135
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:207
iterator_range< arg_iterator > args()
Definition: Function.h:855
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:274
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:358
bool hasPrefetch() const
Definition: GCNSubtarget.h:940
bool hasD16Images() const
Definition: GCNSubtarget.h:696
bool hasAtomicDsPkAdd16Insts() const
Definition: GCNSubtarget.h:845
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:477
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
Definition: GCNSubtarget.h:468
bool hasAtomicFMinFMaxF64FlatInsts() const
Definition: GCNSubtarget.h:841
bool hasDot7Insts() const
Definition: GCNSubtarget.h:795
bool hasApertureRegs() const
Definition: GCNSubtarget.h:597
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:627
bool hasAtomicFMinFMaxF32FlatInsts() const
Definition: GCNSubtarget.h:837
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasDLInsts() const
Definition: GCNSubtarget.h:765
bool hasBCNT(unsigned Size) const
Definition: GCNSubtarget.h:411
bool hasMAIInsts() const
Definition: GCNSubtarget.h:815
bool hasMultiDwordFlatScratchAddressing() const
Definition: GCNSubtarget.h:676
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
Definition: GCNSubtarget.h:527
bool hasPrivEnabledTrap2NopBug() const
bool hasUnalignedDSAccessEnabled() const
Definition: GCNSubtarget.h:585
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:266
bool hasDot1Insts() const
Definition: GCNSubtarget.h:771
bool hasAtomicFaddRtnInsts() const
Definition: GCNSubtarget.h:853
Align getStackAlignment() const
Definition: GCNSubtarget.h:953
bool hasScalarSubwordLoads() const
Definition: GCNSubtarget.h:455
bool enableFlatScratch() const
Definition: GCNSubtarget.h:652
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
Definition: GCNSubtarget.h:623
bool supportsGetDoorbellID() const
Definition: GCNSubtarget.h:461
bool hasFlatAtomicFaddF32Inst() const
Definition: GCNSubtarget.h:873
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:278
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasMad64_32() const
Definition: GCNSubtarget.h:741
bool useDS128() const
Definition: GCNSubtarget.h:537
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
Definition: GCNSubtarget.h:457
const SIFrameLowering * getFrameLowering() const override
Definition: GCNSubtarget.h:270
bool hasUnalignedScratchAccess() const
Definition: GCNSubtarget.h:589
bool hasAtomicFMinFMaxF32GlobalInsts() const
Definition: GCNSubtarget.h:829
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
Definition: GCNSubtarget.h:427
bool hasIntClamp() const
Definition: GCNSubtarget.h:357
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
bool hasGFX940Insts() const
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:377
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:601
bool hasLDSFPAtomicAddF64() const
bool hasFlatGlobalInsts() const
Definition: GCNSubtarget.h:631
bool getScalarizeGlobalBehavior() const
Definition: GCNSubtarget.h:966
bool hasScalarSMulU64() const
Definition: GCNSubtarget.h:730
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
Definition: GCNSubtarget.h:336
bool hasShaderCyclesHiLoRegisters() const
Definition: GCNSubtarget.h:920
bool hasFFBL() const
Definition: GCNSubtarget.h:415
bool hasNSAEncoding() const
bool hasSMemRealTime() const
Definition: GCNSubtarget.h:985
bool usePRTStrictNull() const
Definition: GCNSubtarget.h:559
bool hasAtomicFMinFMaxF64GlobalInsts() const
Definition: GCNSubtarget.h:833
bool hasMed3_16() const
Definition: GCNSubtarget.h:423
bool hasMovrel() const
Definition: GCNSubtarget.h:989
bool hasAtomicFlatPkAdd16Insts() const
Definition: GCNSubtarget.h:847
bool hasBFI() const
Definition: GCNSubtarget.h:403
bool hasUnalignedBufferAccessEnabled() const
Definition: GCNSubtarget.h:577
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:344
bool hasImageGather4D16Bug() const
bool supportsMinMaxDenormModes() const
Definition: GCNSubtarget.h:522
bool hasFFBH() const
Definition: GCNSubtarget.h:419
bool hasAtomicFaddInsts() const
Definition: GCNSubtarget.h:849
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const
Definition: GCNSubtarget.h:857
bool hasAtomicBufferPkAddBF16Inst() const
Definition: GCNSubtarget.h:869
bool hasAtomicFaddNoRtnInsts() const
Definition: GCNSubtarget.h:855
bool hasFlatBufferGlobalAtomicFaddF64Inst() const
Definition: GCNSubtarget.h:877
bool hasScalarDwordx3Loads() const
bool hasLDSFPAtomicAddF32() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
Definition: GCNSubtarget.h:547
bool hasDot8Insts() const
Definition: GCNSubtarget.h:799
bool hasDS96AndDS128() const
Definition: GCNSubtarget.h:542
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:531
Generation getGeneration() const
Definition: GCNSubtarget.h:317
bool hasAtomicBufferGlobalPkAddF16Insts() const
Definition: GCNSubtarget.h:861
bool hasScalarAddSub64() const
Definition: GCNSubtarget.h:728
bool hasIEEEMinMax3() const
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:732
bool hasAtomicGlobalPkAddBF16Inst() const
Definition: GCNSubtarget.h:865
bool hasAddr64() const
Definition: GCNSubtarget.h:381
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
Definition: GCNSubtarget.h:431
bool hasPackedTID() const
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:724
bool hasFractBug() const
Definition: GCNSubtarget.h:395
bool hasGDS() const
bool hasBFE() const
Definition: GCNSubtarget.h:399
bool hasGWSAutoReplay() const
Definition: GCNSubtarget.h:711
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
Definition: GlobalValue.h:511
unsigned getAddressSpace() const
Definition: GlobalValue.h:205
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
Type * getValueType() const
Definition: GlobalValue.h:296
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1812
Value * CreateFAdd(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1538
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:933
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:172
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:171
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2402
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1125
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1795
LLVMContext & getContext() const
Definition: IRBuilder.h:173
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1808
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1859
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1119
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:177
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2137
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2671
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
Definition: Instruction.h:363
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:92
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:70
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1635
void getAllMetadata(SmallVectorImpl< std::pair< unsigned, MDNode * > > &MDs) const
Get all metadata attached to this Instruction.
Definition: Instruction.h:399
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
Definition: DerivedTypes.h:40
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:267
constexpr bool isScalar() const
Definition: LowLevelType.h:146
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
Definition: LowLevelType.h:57
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:193
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
Definition: LowLevelType.h:221
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
void getSyncScopeNames(SmallVectorImpl< StringRef > &SSNs) const
getSyncScopeNames - Populates client supplied SmallVector with synchronization scope names registered...
An instruction for reading from memory.
Definition: Instructions.h:174
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:259
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:239
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
Metadata node.
Definition: Metadata.h:1067
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:230
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Align getAlignment() const
Return alignment of the basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
Definition: MachineInstr.h:69
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:579
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition: ModRef.h:198
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition: ModRef.h:192
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition: ModRef.h:195
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1852
Register getReg() const
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition: Register.h:84
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if be combined with to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isMemOpUniform(const SDNode *N) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns true if it is reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
bool getAddrModeArguments(IntrinsicInst *, SmallVectorImpl< Value * > &, Type *&) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:227
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:737
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
Definition: SelectionDAG.h:969
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:568
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const Pass * getPass() const
Definition: SelectionDAG.h:484
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
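As a hedged sketch of what the entry above expands to: getNOT is a convenience wrapper over an XOR with an all-ones constant. DAG, DL and the value V are assumed to be in scope inside lowering or combine code.
  // Sketch only: DAG, DL and V are assumed to exist in the surrounding code.
  SDValue NotV = DAG.getNOT(DL, V, V.getValueType());
  // Hand-written equivalent of what getNOT produces:
  SDValue AllOnes = DAG.getAllOnesConstant(DL, V.getValueType());
  SDValue NotV2 = DAG.getNode(ISD::XOR, DL, V.getValueType(), V, AllOnes);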
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:494
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:843
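A minimal sketch of getBuildVector, assuming DAG and DL are in scope; the {0, 1, 2, 3} contents are arbitrary example values.
  // Build a v4i32 constant vector {0, 1, 2, 3}. Requires llvm/ADT/SmallVector.h.
  SmallVector<SDValue, 4> Elts;
  for (unsigned I = 0; I < 4; ++I)
    Elts.push_back(DAG.getConstant(I, DL, MVT::i32));
  SDValue Vec = DAG.getBuildVector(MVT::v4i32, DL, Elts);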
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
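A sketch combining getSetCC and getSelect into a small clamp-to-nonnegative helper; the helper name is hypothetical and the code assumes it runs inside target lowering with a SelectionDAG available.
  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;

  // Hypothetical helper: select 0 when X is negative, otherwise X.
  static SDValue clampNonNegative(SelectionDAG &DAG, const SDLoc &DL, SDValue X) {
    EVT VT = X.getValueType();
    SDValue Zero = DAG.getConstant(0, DL, VT);
    EVT CCVT = DAG.getTargetLoweringInfo().getSetCCResultType(
        DAG.getDataLayout(), *DAG.getContext(), VT);
    SDValue IsNeg = DAG.getSetCC(DL, CCVT, X, Zero, ISD::SETLT);
    return DAG.getSelect(DL, VT, IsNeg, Zero, X);
  }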
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:488
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
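A hedged sketch chaining getLoad and getStore to copy a single i32; Chain, DL, SrcPtr and DstPtr are assumed to exist, and the empty MachinePointerInfo() plus Align(4) are placeholder choices.
  // Load one i32 from SrcPtr, then store it to DstPtr, threading the chain.
  SDValue Val = DAG.getLoad(MVT::i32, DL, Chain, SrcPtr, MachinePointerInfo());
  SDValue NewChain = DAG.getStore(Val.getValue(1), DL, Val, DstPtr,
                                  MachinePointerInfo(), Align(4));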
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
SDValue getRegister(unsigned Reg, EVT VT)
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:489
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:788
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:691
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:483
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:814
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:860
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
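A sketch of the known-bits queries above, assuming a 32-bit Op and a DAG in scope; the 16-bit mask is an arbitrary example.
  // Requires llvm/Support/KnownBits.h and llvm/ADT/APInt.h.
  KnownBits Known = DAG.computeKnownBits(Op);
  bool SignKnownZero = Known.isNonNegative();            // sign bit known clear
  bool Low16Zero = DAG.MaskedValueIsZero(Op, APInt::getLowBitsSet(32, 16));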
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:501
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:577
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:571
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:344
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:479
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
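A self-contained sketch of the usual SmallVector/SmallPtrSet worklist idiom described above; the helper is illustrative only.
  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/SmallPtrSet.h"
  #include "llvm/ADT/SmallVector.h"
  using namespace llvm;

  // Collect unique pointers while preserving first-seen order.
  template <typename T>
  SmallVector<T *, 8> uniquePreserveOrder(ArrayRef<T *> In) {
    SmallPtrSet<T *, 8> Seen;
    SmallVector<T *, 8> Out;
    for (T *P : In)
      if (Seen.insert(P).second)   // insert() reports whether P was new
        Out.push_back(P);
    return Out;
  }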
An instruction for storing to memory.
Definition: Instructions.h:290
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:838
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:250
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition: StringRef.h:131
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:262
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
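A sketch showing how the StringRef queries and StringSwitch dispatch above are typically combined; the constraint strings and return codes are illustrative, not the actual SI constraint set.
  #include "llvm/ADT/StringRef.h"
  #include "llvm/ADT/StringSwitch.h"
  using namespace llvm;

  static int classifyConstraint(StringRef C) {
    if (C.starts_with("{") && C.ends_with("}"))
      C = C.drop_front().drop_back();     // strip the braces of "{...}" forms
    return StringSwitch<int>(C)
        .Case("s", 0)
        .Case("v", 1)
        .Case("a", 2)
        .Default(-1);
  }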
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
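An illustrative-only sketch of how setOperationAction, AddPromotedToType and setTruncStoreAction are usually combined inside a target's TargetLowering constructor; the opcode/type/action choices below are made up and do not reflect the SI configuration.
  // Assumed to run in the body of a TargetLowering-derived constructor.
  setOperationAction(ISD::SELECT, MVT::f64, Promote);
  AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);   // perform f64 selects as i64
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);      // no direct i64->i16 truncating store
  setOperationAction(ISD::FSIN, MVT::f32, Custom);      // route through LowerOperation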
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:384
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:154
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:146
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:302
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:143
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:157
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition: Type.h:246
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:140
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
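A tiny sketch of the scalar-type queries above; the helper name is hypothetical.
  #include "llvm/IR/Type.h"
  using namespace llvm;

  // True for half and for any vector whose elements are half.
  static bool isHalfOrHalfVector(Type *Ty) {
    return Ty->getScalarType()->isHalfTy();
  }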
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
iterator_range< use_iterator > uses()
Definition: Value.h:376
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
Type * getElementType() const
Definition: DerivedTypes.h:436
constexpr bool isZero() const
Definition: TypeSize.h:156
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: Lint.cpp:86
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:415
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the inline values intended for floating-point types.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:422
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ MaxID
The highest possible ID. Must be some 2^k - 1.
Definition: CallingConv.h:274
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:779
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:243
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1165
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:752
@ ATOMIC_LOAD_FMAX
Definition: ISDOpcodes.h:1319
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ SET_FPENV
Sets the current floating-point environment.
Definition: ISDOpcodes.h:1041
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1312
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:573
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:743
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1314
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1284
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1315
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:501
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1074
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:813
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:497
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:205
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:1297
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:820
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:557
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:716
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:943
@ FPTRUNC_ROUND
FPTRUNC_ROUND - This corresponds to the fptrunc_round intrinsic.
Definition: ISDOpcodes.h:494
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1310
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:933
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1311
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:976
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1455
@ ATOMIC_LOAD_FADD
Definition: ISDOpcodes.h:1317
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:915
@ CONVERGENCECTRL_GLUE
Definition: ISDOpcodes.h:1441
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:804
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:634
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readfixedcounter intrinsic.
Definition: ISDOpcodes.h:1231
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1090
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:751
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1264
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1031
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:960
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1120
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1313
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:514
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition: ISDOpcodes.h:521
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:756
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1280
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:218
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:229
@ ATOMIC_LOAD_FMIN
Definition: ISDOpcodes.h:1320
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:215
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition: ISDOpcodes.h:910
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:673
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1059
@ GET_FPENV
Gets the current floating-point environment.
Definition: ISDOpcodes.h:1036
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:734
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:614
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1308
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:587
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1021
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:549
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:810
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1254
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:771
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1291
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1316
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1008
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:338
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1084
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:828
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:696
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:918
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1140
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:952
@ ATOMIC_LOAD_UDEC_WRAP
Definition: ISDOpcodes.h:1322
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1306
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:479
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1027
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1307
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:866
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1225
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:484
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:708
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1251
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:538
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1305
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:981
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:899
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:421
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:937
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1137
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:816
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1113
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:793
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ ATOMIC_LOAD_UINC_WRAP
Definition: ISDOpcodes.h:1321
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:507
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ AssertZext
Definition: ISDOpcodes.h:62
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:529
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1578
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1558
StringRef getName(ID id)
Return the LLVM name for an intrinsic, such as "llvm.ppc.altivec.lvx".
Definition: Function.cpp:1071
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Offsets
Offsets in bytes from the start of the input buffer.
Definition: SIInstrInfo.h:1581
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:57
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
constexpr double inv_pi
Definition: MathExtras.h:54
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Offset
Definition: DWP.cpp:480
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition: Analysis.cpp:233
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition: MathExtras.h:244
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
@ Done
Definition: Threading.h:61
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2067
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition: MathExtras.h:547
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:394
int countr_zero(T Val)
Count the number of 0s from the least significant bit to the most significant bit, stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:285
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:41
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
int countl_zero(T Val)
Count the number of 0s from the most significant bit to the least significant bit, stopping at the first 1.
Definition: bit.h:281
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:154
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition: Analysis.cpp:199
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:159
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition: DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
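A small standalone sketch exercising several of the MathExtras/bit helpers indexed above against concrete values.
  #include "llvm/Support/Alignment.h"
  #include "llvm/Support/MathExtras.h"
  #include <cassert>
  using namespace llvm;

  int main() {
    assert(isPowerOf2_32(64));
    assert(Log2_32(64) == 6);
    assert(divideCeil(10, 4) == 3);                      // ceil(10/4)
    assert(alignTo(10, Align(8)) == 16);                 // next multiple of 8
    assert(Hi_32(0x1234567800000000ULL) == 0x12345678u);
    assert(Lo_32(0x00000000deadbeefULL) == 0xdeadbeefu);
    return 0;
  }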
DWARFExpression::Operation Op
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
Definition: VE.h:375
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition: MathExtras.h:235
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
@ DS_Warning
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:45
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
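A sketch exercising the range helpers indexed above (any_of, find_if, is_contained, append_range, drop_begin) on a plain SmallVector; the function is illustrative only.
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"
  using namespace llvm;

  static bool demoRangeHelpers() {
    SmallVector<int, 4> V = {1, 2, 3, 4};
    bool HasEven = any_of(V, [](int X) { return X % 2 == 0; });
    auto It = find_if(V, [](int X) { return X > 2; });   // points at 3
    bool HasThree = is_contained(V, 3);
    SmallVector<int, 4> Tail;
    append_range(Tail, drop_begin(V));                   // {2, 3, 4}
    return HasEven && It != V.end() && HasThree && Tail.size() == 3;
  }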
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
SDValue SrcOp
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:276
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:250
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:274
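The APFloat pieces above (IEEEhalf, IEEEsingle, rmNearestTiesToEven) are typically used together when converting constants between f16 and f32. A minimal sketch under that assumption; apFloatSketch is a hypothetical name.

// Sketch only; apFloatSketch is a hypothetical helper.
#include "llvm/ADT/APFloat.h"
#include <cassert>

static void apFloatSketch() {
  llvm::APFloat Val(llvm::APFloat::IEEEsingle(), "1.5");
  bool LosesInfo = false;
  // f32 -> f16 with round-to-nearest-even; 1.5 is exactly representable in f16.
  llvm::APFloat::opStatus St = Val.convert(
      llvm::APFloat::IEEEhalf(), llvm::APFloat::rmNearestTiesToEven, &LosesInfo);
  assert(St == llvm::APFloat::opOK && !LosesInfo);
}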
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
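Align, together with the alignTo and commonAlignment helpers listed earlier, is how byte alignments are manipulated during lowering. A minimal sketch, assuming llvm/Support/Alignment.h; alignmentSketch is a hypothetical name.

// Sketch only; alignmentSketch is a hypothetical helper.
#include "llvm/Support/Alignment.h"
#include <cassert>

static void alignmentSketch() {
  llvm::Align A(16);                                         // must be a non-zero power of two
  assert(llvm::alignTo(/*Size=*/20, A) == 32);               // round 20 up to a 16-byte multiple
  assert(llvm::commonAlignment(A, /*Offset=*/8).value() == 8); // alignment of A plus 8 bytes
  assert(A.value() == 16);                                   // raw byte value
}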
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
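ArgDescriptor is the AMDGPU backend's description of where a preloaded function argument lives: a register, part of a register selected by a mask, or a stack offset. A minimal sketch built only from the three factory functions listed above, assuming the backend-internal AMDGPUArgumentUsageInfo.h header is reachable; argDescriptorSketch and PreloadReg are hypothetical names.

// Sketch only; argDescriptorSketch and PreloadReg are hypothetical.
#include "AMDGPUArgumentUsageInfo.h"
#include "llvm/CodeGen/Register.h"

static void argDescriptorSketch(llvm::Register PreloadReg) {
  // Argument preloaded into a whole register.
  llvm::ArgDescriptor InReg = llvm::ArgDescriptor::createRegister(PreloadReg);
  // Same register, but only the low 16 bits carry the value.
  llvm::ArgDescriptor Masked = llvm::ArgDescriptor::createArg(InReg, 0xffffu);
  // Argument passed on the stack at byte offset 8.
  llvm::ArgDescriptor OnStack = llvm::ArgDescriptor::createStack(8);
  (void)Masked;
  (void)OnStack;
}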
Helper struct shared between Function Specialization and SCCP Solver.
Definition: SCCPSolver.h:41
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
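DenormalMode describes how denormal (subnormal) inputs and outputs of floating-point instructions are treated. A minimal sketch, assuming llvm/ADT/FloatingPointMode.h; denormalModeSketch is a hypothetical name.

// Sketch only; denormalModeSketch is a hypothetical helper.
#include "llvm/ADT/FloatingPointMode.h"
#include <cassert>

static void denormalModeSketch() {
  llvm::DenormalMode IEEE = llvm::DenormalMode::getIEEE();
  llvm::DenormalMode PS = llvm::DenormalMode::getPreserveSign();
  // IEEE keeps denormal inputs and outputs; PreserveSign flushes them to +/-0.
  assert(IEEE.Input == llvm::DenormalMode::IEEE &&
         IEEE.Output == llvm::DenormalMode::IEEE);
  assert(PS.Input == llvm::DenormalMode::PreserveSign &&
         PS.Output == llvm::DenormalMode::PreserveSign);
}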
Extended Value Type.
Definition: ValueTypes.h:34
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:290
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:233
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:455
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:246
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:203
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:151
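EVT is the extended value-type wrapper used throughout DAG lowering, and the queries listed above cover its most common uses. A minimal sketch, assuming llvm/CodeGen/ValueTypes.h and an LLVMContext supplied by the caller; evtSketch is a hypothetical name.

// Sketch only; evtSketch is a hypothetical helper.
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

static void evtSketch(llvm::LLVMContext &Ctx) {
  llvm::EVT V4F32 = llvm::EVT::getVectorVT(Ctx, llvm::MVT::f32, 4);
  assert(V4F32.isVector() && V4F32.isFloatingPoint() && V4F32.isPow2VectorType());
  assert(V4F32.getVectorNumElements() == 4 &&
         V4F32.getVectorElementType() == llvm::MVT::f32);
  assert(V4F32.getSizeInBits() == 128 && V4F32.getStoreSize() == 16);

  llvm::EVT AsInt = V4F32.changeTypeToInteger();    // v4i32
  assert(AsInt.isInteger() && AsInt.bitsEq(V4F32));

  llvm::EVT I20 = llvm::EVT::getIntegerVT(Ctx, 20); // extended type, not simple
  assert(I20.isScalarInteger() && !I20.isSimple() && !I20.isByteSized());
  assert(I20.bitsLT(V4F32.getScalarType()));        // 20 bits < 32 bits
}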
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
unsigned getOrigArgIndex() const
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:62
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:70
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:285
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:237
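KnownBits tracks, per bit position, whether a bit is known zero, known one, or unknown; DAG combines typically use queries like these to narrow operations. A minimal sketch, assuming llvm/Support/KnownBits.h; knownBitsSketch is a hypothetical name.

// Sketch only; knownBitsSketch is a hypothetical helper.
#include "llvm/Support/KnownBits.h"
#include <cassert>

static void knownBitsSketch() {
  llvm::KnownBits Known(/*BitWidth=*/32);
  assert(Known.isUnknown());                  // nothing known yet
  Known.Zero.setHighBits(24);                 // top 24 bits known to be zero
  assert(Known.countMinLeadingZeros() == 24); // at least 24 leading zeros
  assert(Known.countMaxActiveBits() == 8);    // value fits in 8 bits
  Known.resetAll();                           // back to fully unknown
  assert(Known.isUnknown());
}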
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
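MachinePointerInfo describes what a machine memory operand points at. A minimal sketch of the three factories listed above, assuming a MachineFunction and frame index supplied by the caller; pointerInfoSketch is a hypothetical name.

// Sketch only; pointerInfoSketch is a hypothetical helper.
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"

static void pointerInfoSketch(llvm::MachineFunction &MF, int FI) {
  // Access relative to a fixed stack object (e.g. an incoming stack argument).
  llvm::MachinePointerInfo FixedPI =
      llvm::MachinePointerInfo::getFixedStack(MF, FI, /*Offset=*/8);
  // Generic stack-pointer-relative access at a byte offset.
  llvm::MachinePointerInfo StackPI =
      llvm::MachinePointerInfo::getStack(MF, /*Offset=*/16);
  // Access through the GOT, e.g. for a lowered global address.
  llvm::MachinePointerInfo GOTPI = llvm::MachinePointerInfo::getGOT(MF);
  (void)FixedPI;
  (void)StackPI;
  (void)GOTPI;
}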
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
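CallLoweringInfo bundles everything a target needs while lowering a call: Outs and OutVals describe the outgoing argument flags/types and their SDValues, and Ins describes the values coming back. A minimal sketch, assuming llvm/CodeGen/TargetLowering.h; countByValOutgoingArgs is a hypothetical name, not an API of this file.

// Sketch only; countByValOutgoingArgs is a hypothetical helper.
#include "llvm/CodeGen/TargetLowering.h"

static unsigned
countByValOutgoingArgs(const llvm::TargetLowering::CallLoweringInfo &CLI) {
  unsigned NumByVal = 0;
  // Outs[i] carries the flags/type of the i-th outgoing value; OutVals[i] is
  // the matching SDValue, and Ins describes the call's returned values.
  for (const llvm::ISD::OutputArg &Out : CLI.Outs)
    if (Out.Flags.isByVal())
      ++NumByVal;
  return NumByVal;
}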