1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/IRBuilder.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
44#include "llvm/Support/ModRef.h"
45#include <optional>
46
47using namespace llvm;
48
49#define DEBUG_TYPE "si-lower"
50
51STATISTIC(NumTailCalls, "Number of tail calls");
52
54 "amdgpu-disable-loop-alignment",
55 cl::desc("Do not align and prefetch loops"),
56 cl::init(false));
57
59 "amdgpu-use-divergent-register-indexing",
61 cl::desc("Use indirect register addressing for divergent indexes"),
62 cl::init(false));
63
66 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
67}
68
71 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
72}
73
74static unsigned findFirstFreeSGPR(CCState &CCInfo) {
75 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
76 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
77 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
78 return AMDGPU::SGPR0 + Reg;
79 }
80 }
81 llvm_unreachable("Cannot allocate sgpr");
82}
83
85 const GCNSubtarget &STI)
87 Subtarget(&STI) {
88 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
89 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
90
91 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
92 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
93
94 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
95
96 const SIRegisterInfo *TRI = STI.getRegisterInfo();
97 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
98
99 addRegisterClass(MVT::f64, V64RegClass);
100 addRegisterClass(MVT::v2f32, V64RegClass);
101 addRegisterClass(MVT::Untyped, V64RegClass);
102
103 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
104 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
105
106 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
107 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
108
109 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
110 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
111
112 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
113 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
114
115 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
116 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
117
118 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
119 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
120
121 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
122 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
123
124 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
125 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
126
127 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
128 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
129
130 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
131 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
132
133 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
134 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
135
136 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
137 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
138
139 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
140 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
141
142 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
143 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
144
145 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
146 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
147
148 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
149 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
150
151 if (Subtarget->has16BitInsts()) {
152 if (Subtarget->useRealTrue16Insts()) {
153 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
154 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
155 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
156 } else {
157 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
158 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
159 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
160 }
161
162 // Unless there are also VOP3P operations, no operations are really legal.
163 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
164 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
165 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
166 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
167 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
168 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
169 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
170 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
171 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
172 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
173 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
174 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
175 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
176 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
177 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
178 }
179
180 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
181 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
182
184
185 // The boolean content concept here is too inflexible. Compares only ever
186 // really produce a 1-bit result. Any copy/extend from these will turn into a
187 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
188 // it's what most targets use.
191
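  // Illustrative note: with zero-or-one boolean contents, (zext (setcc ...))
  // to i32 becomes a select between 0 and 1 (roughly a single v_cndmask_b32);
  // sign-extended booleans would instead select between 0 and -1, at about
  // the same cost.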
192 // We need to custom lower vector stores from local memory
194 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
195 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
196 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
197 MVT::i1, MVT::v32i32},
198 Custom);
199
201 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
202 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
203 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
204 MVT::i1, MVT::v32i32},
205 Custom);
206
207 if (isTypeLegal(MVT::bf16)) {
208 for (unsigned Opc :
217 ISD::SETCC}) {
218 // FIXME: The promoted-to type shouldn't need to be explicit
219 setOperationAction(Opc, MVT::bf16, Promote);
220 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
221 }
222
224
226 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
227
231
232 // We only need to custom lower because we can't specify an action for bf16
233 // sources.
236 }
237
238 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
239 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
240 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
241 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
242 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
243 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
244 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
245 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
246 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
247 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
248 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
249 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
250 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
251 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
252 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
253 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
254
255 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
256 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
257 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
258 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
259 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
260 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
261 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
262
263 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
264
268 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
269
270 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
271
273 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
274
276 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
277 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
278
280 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
281 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
282 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
283 Expand);
285 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
286 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
287 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
288 Expand);
289
291 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
292 MVT::v3i16, MVT::v4i16, MVT::Other},
293 Custom);
294
297 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
298
300
302
304 Expand);
305
306#if 0
308#endif
309
310 // We only support LOAD/STORE and vector manipulation ops for vectors
311 // with > 4 elements.
312 for (MVT VT :
313 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
314 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
315 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
316 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
317 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
318 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
319 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
320 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
321 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
322 switch (Op) {
323 case ISD::LOAD:
324 case ISD::STORE:
326 case ISD::BITCAST:
327 case ISD::UNDEF:
331 case ISD::IS_FPCLASS:
332 break;
337 break;
338 default:
340 break;
341 }
342 }
343 }
344
346
347 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
348 // is expanded to avoid having two separate loops in case the index is a VGPR.
349
350 // Most operations are naturally 32-bit vector operations. We only support
351 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
352 for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
354 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
355
357 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
358
360 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
361
363 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
364 }
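  // For example, after this promotion a BUILD_VECTOR of v2i64 is handled as a
  // v4i32 BUILD_VECTOR of the 32-bit halves plus a bitcast; only loads and
  // stores keep the 64-bit element type.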
365
366 for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
368 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
369
371 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
372
374 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
375
377 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
378 }
379
380 for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
382 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
383
385 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
386
388 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
389
391 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
392 }
393
394 for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
396 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
397
399 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
400
402 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
403
405 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
406 }
407
408 for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
410 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
411
413 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
414
416 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
417
419 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
420 }
421
423 {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
424 Expand);
425
426 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
427 Custom);
428
429 // Avoid stack access for these.
430 // TODO: Generalize to more vector types.
432 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
433 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
434 Custom);
435
436 // Deal with vec3 vector operations when widened to vec4.
438 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
439
440 // Deal with vec5/6/7 vector operations when widened to vec8.
442 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
443 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
444 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
445 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
446 Custom);
447
448 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
449 // and output demarshalling
450 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
451
452 // We can't return success/failure, only the old value,
453 // let LLVM add the comparison
455 Expand);
456
457 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
458
459 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
460
461 // FIXME: This should be narrowed to i32, but that only happens if i64 is
462 // illegal.
463 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
464 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
465
466 // This is s_memtime on SI and s_memrealtime on VI.
468
469 if (Subtarget->hasSMemRealTime() ||
473
474 if (Subtarget->has16BitInsts()) {
477 } else {
479 }
480
481 if (Subtarget->hasMadMacF32Insts())
483
484 if (!Subtarget->hasBFI())
485 // fcopysign can be done in a single instruction with BFI.
486 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
487
488 if (!Subtarget->hasBCNT(32))
490
491 if (!Subtarget->hasBCNT(64))
493
494 if (Subtarget->hasFFBH())
496
497 if (Subtarget->hasFFBL())
499
500 // We only really have 32-bit BFE instructions (and 16-bit on VI).
501 //
502 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
503 // effort to match them now. We want this to be false for i64 cases when the
504 // extraction isn't restricted to the upper or lower half. Ideally we would
505 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
506 // span the midpoint are probably relatively rare, so don't worry about them
507 // for now.
508 if (Subtarget->hasBFE())
510
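  // With BFE available, a pattern such as (and (srl x, 8), 0xff) on i32 can
  // typically be matched to a single s_bfe_u32 / v_bfe_u32 with offset 8 and
  // width 8.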
511 // Clamp modifier on add/sub
512 if (Subtarget->hasIntClamp())
514
515 if (Subtarget->hasAddNoCarry())
516 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
517 Legal);
518
519 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
520 Custom);
521
522 // These are really only legal for ieee_mode functions. We should be avoiding
523 // them for functions that don't have ieee_mode enabled, so just say they are
524 // legal.
526 {MVT::f32, MVT::f64}, Legal);
527
528 if (Subtarget->haveRoundOpsF64())
530 Legal);
531 else
533 MVT::f64, Custom);
534
536 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
537 Legal);
538 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
539
542
543 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
544 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
545
546 // Custom lower these because we can't specify a rule based on an illegal
547 // source bf16.
550
551 if (Subtarget->has16BitInsts()) {
554 MVT::i16, Legal);
555
556 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
557
559 MVT::i16, Expand);
560
564 ISD::CTPOP},
565 MVT::i16, Promote);
566
568
569 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
570
572 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
574 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
575
579
581
582 // F16 - Constant Actions.
585
586 // F16 - Load/Store Actions.
588 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
590 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
591
592 // BF16 - Load/Store Actions.
594 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
596 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
597
598 // F16 - VOP1 Actions.
601 MVT::f16, Custom);
602
605
606 // F16 - VOP2 Actions.
607 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
608 Expand);
612
613 // F16 - VOP3 Actions.
615 if (STI.hasMadF16())
617
618 for (MVT VT :
619 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
620 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
621 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
622 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
623 switch (Op) {
624 case ISD::LOAD:
625 case ISD::STORE:
627 case ISD::BITCAST:
628 case ISD::UNDEF:
634 case ISD::IS_FPCLASS:
635 break;
638 break;
639 default:
641 break;
642 }
643 }
644 }
645
646 // v_perm_b32 can handle either of these.
647 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
649
650 // XXX - Do these do anything? Vector constants turn into build_vector.
651 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
652
653 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
654 Legal);
655
657 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
659 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
660
662 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
664 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
665
666 setOperationAction(ISD::AND, MVT::v2i16, Promote);
667 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
668 setOperationAction(ISD::OR, MVT::v2i16, Promote);
669 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
670 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
671 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
672
674 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
676 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
677 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
678 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
679
681 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
683 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
685 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
686
688 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
690 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
691 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
692 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
693
695 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
697 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
698
700 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
702 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
704 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
705
706 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
707 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
708 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
709 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
710 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
711 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
712
714 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
716 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
717 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
718 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
719
720 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
721 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
722 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
723 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
724 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
725 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
726
728 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
730 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
731 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
732 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
733
735 MVT::v2i32, Expand);
737
739 MVT::v4i32, Expand);
740
742 MVT::v8i32, Expand);
743
744 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
745 Subtarget->hasVOP3PInsts() ? Legal : Custom);
746
747 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
748 // This isn't really legal, but this avoids the legalizer unrolling it (and
749 // allows matching fneg (fabs x) patterns)
750 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
751
754
756 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
757 Custom);
758
760 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
761 Expand);
762
763 for (MVT Vec16 :
764 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
765 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
768 Vec16, Custom);
770 }
771 }
772
773 if (Subtarget->hasVOP3PInsts()) {
777 MVT::v2i16, Legal);
778
781 MVT::v2f16, Legal);
782
783 setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
784 Custom);
785
787 {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
788 MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
789 Custom);
790
791 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
792 // Split vector operations.
797 VT, Custom);
798
799 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
800 // Split vector operations.
802 VT, Custom);
803
804 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
805 Custom);
806
807 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
808 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
809 Custom);
810
811 if (Subtarget->hasPackedFP32Ops()) {
813 MVT::v2f32, Legal);
815 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
816 Custom);
817 }
818 }
819
821
822 if (Subtarget->has16BitInsts()) {
824 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
826 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
827 } else {
828 // Legalization hack.
829 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
830
832 }
833
835 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
836 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
837 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
838 MVT::v32f16, MVT::v32bf16},
839 Custom);
840
842
843 if (Subtarget->hasScalarSMulU64())
845
846 if (Subtarget->hasMad64_32())
848
849 if (Subtarget->hasPrefetch())
851
852 if (Subtarget->hasIEEEMinMax()) {
854 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
856 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
857 Custom);
858 }
859
861 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
862 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
863 MVT::i8},
864 Custom);
865
867 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
868 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
869 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
870 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
871 Custom);
872
874 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
875 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
876 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
877 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
878 Custom);
879
885
886 // TODO: Could move this to custom lowering, could benefit from combines on
887 // extract of relevant bits.
889
891
894 ISD::SUB,
896 ISD::FADD,
897 ISD::FSUB,
898 ISD::FDIV,
905 ISD::FMA,
906 ISD::SMIN,
907 ISD::SMAX,
908 ISD::UMIN,
909 ISD::UMAX,
911 ISD::AND,
912 ISD::OR,
913 ISD::XOR,
914 ISD::FSHR,
924
925 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
927
928 // All memory operations. Some folding on the pointer operand is done to help
929 // match the constant offsets in the addressing modes.
954
955 // FIXME: In other contexts we pretend this is a per-function property.
957
959}
960
962 return Subtarget;
963}
964
966 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
967 return RCRegs;
968}
969
970//===----------------------------------------------------------------------===//
971// TargetLowering queries
972//===----------------------------------------------------------------------===//
973
974// v_mad_mix* support a conversion from f16 to f32.
975//
976 // There is only one special case, when denormals are enabled, that we don't
977 // currently handle, where this would be OK to use.
978bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
979 EVT DestVT, EVT SrcVT) const {
980 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
981 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
982 DestVT.getScalarType() == MVT::f32 &&
983 SrcVT.getScalarType() == MVT::f16 &&
984 // TODO: This probably only requires no input flushing?
986}
987
989 LLT DestTy, LLT SrcTy) const {
990 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
991 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
992 DestTy.getScalarSizeInBits() == 32 &&
993 SrcTy.getScalarSizeInBits() == 16 &&
994 // TODO: This probably only requires no input flushing?
996}
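// For instance, (fma (fpext f16 %a), (fpext f16 %b), f32 %c) can roughly be
// selected as a single v_fma_mix_f32 when FMA mix instructions are present,
// so the fpext nodes are treated as free to fold here.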
997
999 // SI has some legal vector types, but no legal vector operations. Say no
1000 // shuffles are legal in order to prefer scalarizing some vector operations.
1001 return false;
1002}
1003
1006 EVT VT) const {
1009
1010 if (VT.isVector()) {
1011 EVT ScalarVT = VT.getScalarType();
1012 unsigned Size = ScalarVT.getSizeInBits();
1013 if (Size == 16) {
1014 if (Subtarget->has16BitInsts()) {
1015 if (VT.isInteger())
1016 return MVT::v2i16;
1017 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1018 }
1019 return VT.isInteger() ? MVT::i32 : MVT::f32;
1020 }
1021
1022 if (Size < 16)
1023 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1024 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1025 }
1026
1027 if (VT.getSizeInBits() > 32)
1028 return MVT::i32;
1029
1031}
1032
1035 EVT VT) const {
1038
1039 if (VT.isVector()) {
1040 unsigned NumElts = VT.getVectorNumElements();
1041 EVT ScalarVT = VT.getScalarType();
1042 unsigned Size = ScalarVT.getSizeInBits();
1043
1044 // FIXME: Should probably promote 8-bit vectors to i16.
1045 if (Size == 16 && Subtarget->has16BitInsts())
1046 return (NumElts + 1) / 2;
1047
1048 if (Size <= 32)
1049 return NumElts;
1050
1051 if (Size > 32)
1052 return NumElts * ((Size + 31) / 32);
1053 } else if (VT.getSizeInBits() > 32)
1054 return (VT.getSizeInBits() + 31) / 32;
1055
1057}
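// For example, with 16-bit instructions a v5f16 argument takes
// (5 + 1) / 2 = 3 two-element registers, while a v3i64 argument takes
// 3 * ((64 + 31) / 32) = 6 32-bit registers.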
1058
1060 LLVMContext &Context, CallingConv::ID CC,
1061 EVT VT, EVT &IntermediateVT,
1062 unsigned &NumIntermediates, MVT &RegisterVT) const {
1063 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1064 unsigned NumElts = VT.getVectorNumElements();
1065 EVT ScalarVT = VT.getScalarType();
1066 unsigned Size = ScalarVT.getSizeInBits();
1067 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1068 // support, but unless we can properly handle 3-vectors, it will still be
1069 // inconsistent.
1070 if (Size == 16 && Subtarget->has16BitInsts()) {
1071 if (ScalarVT == MVT::bf16) {
1072 RegisterVT = MVT::i32;
1073 IntermediateVT = MVT::v2bf16;
1074 } else {
1075 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1076 IntermediateVT = RegisterVT;
1077 }
1078 NumIntermediates = (NumElts + 1) / 2;
1079 return NumIntermediates;
1080 }
1081
1082 if (Size == 32) {
1083 RegisterVT = ScalarVT.getSimpleVT();
1084 IntermediateVT = RegisterVT;
1085 NumIntermediates = NumElts;
1086 return NumIntermediates;
1087 }
1088
1089 if (Size < 16 && Subtarget->has16BitInsts()) {
1090 // FIXME: Should probably form v2i16 pieces
1091 RegisterVT = MVT::i16;
1092 IntermediateVT = ScalarVT;
1093 NumIntermediates = NumElts;
1094 return NumIntermediates;
1095 }
1096
1097
1098 if (Size != 16 && Size <= 32) {
1099 RegisterVT = MVT::i32;
1100 IntermediateVT = ScalarVT;
1101 NumIntermediates = NumElts;
1102 return NumIntermediates;
1103 }
1104
1105 if (Size > 32) {
1106 RegisterVT = MVT::i32;
1107 IntermediateVT = RegisterVT;
1108 NumIntermediates = NumElts * ((Size + 31) / 32);
1109 return NumIntermediates;
1110 }
1111 }
1112
1114 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1115}
1116
1118 const DataLayout &DL, Type *Ty,
1119 unsigned MaxNumLanes) {
1120 assert(MaxNumLanes != 0);
1121
1122 LLVMContext &Ctx = Ty->getContext();
1123 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1124 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1125 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1126 NumElts);
1127 }
1128
1129 return TLI.getValueType(DL, Ty);
1130}
1131
1132// Peek through TFE struct returns to only use the data size.
1134 const DataLayout &DL, Type *Ty,
1135 unsigned MaxNumLanes) {
1136 auto *ST = dyn_cast<StructType>(Ty);
1137 if (!ST)
1138 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1139
1140 // TFE intrinsics return an aggregate type.
1141 assert(ST->getNumContainedTypes() == 2 &&
1142 ST->getContainedType(1)->isIntegerTy(32));
1143 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1144}
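// For example, an image load intrinsic returning { <4 x float>, i32 } (a TFE
// result) queried with MaxNumLanes == 2 is recorded as loading v2f32; the
// trailing i32 status word is not counted in the memory VT.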
1145
1146/// Map address space 7 to MVT::v5i32 because that's its in-memory
1147/// representation. This return value is vector-typed because there is no
1148/// MVT::i160 and it is not clear if one can be added. While this could
1149/// cause issues during codegen, these address space 7 pointers will be
1150/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
1151/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1152/// modeling, to work.
1154 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1155 return MVT::v5i32;
1157 DL.getPointerSizeInBits(AS) == 192)
1158 return MVT::v6i32;
1160}
1161/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1162/// v8i32 when padding is added.
1163/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1164/// also v8i32 with padding.
1166 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1167 DL.getPointerSizeInBits(AS) == 160) ||
1169 DL.getPointerSizeInBits(AS) == 192))
1170 return MVT::v8i32;
1172}
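// So a loaded or stored ptr addrspace(7) value occupies 256 bits of memory
// (v8i32: the 128-bit resource, the 32-bit offset, and padding) even though
// it is only 160 bits (v5i32) in registers.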
1173
1175 const CallInst &CI,
1176 MachineFunction &MF,
1177 unsigned IntrID) const {
1179 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1181
1182 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1185 (Intrinsic::ID)IntrID);
1186 MemoryEffects ME = Attr.getMemoryEffects();
1187 if (ME.doesNotAccessMemory())
1188 return false;
1189
1190 // TODO: Should images get their own address space?
1191 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1192
1193 if (RsrcIntr->IsImage)
1194 Info.align.reset();
1195
1196 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1197 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1198 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1199 // We conservatively set the memory operand of a buffer intrinsic to the
1200 // base resource pointer, so that we can access alias information about
1201 // those pointers. Cases like "this points at the same value
1202 // but with a different offset" are handled in
1203 // areMemAccessesTriviallyDisjoint.
1204 Info.ptrVal = RsrcArg;
1205 }
1206
1207 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1208 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1211 if (ME.onlyReadsMemory()) {
1212 if (RsrcIntr->IsImage) {
1213 unsigned MaxNumLanes = 4;
1214
1217 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1219
1220 if (!BaseOpcode->Gather4) {
1221 // If this isn't a gather, we may have excess loaded elements in the
1222 // IR type. Check the dmask for the real number of elements loaded.
1223 unsigned DMask
1224 = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1225 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1226 }
1227
1228 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1229 CI.getType(), MaxNumLanes);
1230 } else {
1231 Info.memVT =
1233 std::numeric_limits<unsigned>::max());
1234 }
1235
1236 // FIXME: What does alignment mean for an image?
1239 } else if (ME.onlyWritesMemory()) {
1241
1242 Type *DataTy = CI.getArgOperand(0)->getType();
1243 if (RsrcIntr->IsImage) {
1244 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1245 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1246 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1247 DMaskLanes);
1248 } else
1249 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1250
1252 } else {
1253 // Atomic
1254 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
1259
1260 switch (IntrID) {
1261 default:
1262 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1263 // XXX - Should this be volatile without known ordering?
1265 break;
1266 case Intrinsic::amdgcn_raw_buffer_load_lds:
1267 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1268 case Intrinsic::amdgcn_struct_buffer_load_lds:
1269 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1270 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1271 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1272 Info.ptrVal = CI.getArgOperand(1);
1273 return true;
1274 }
1275 }
1276 }
1277 return true;
1278 }
1279
1280 switch (IntrID) {
1281 case Intrinsic::amdgcn_ds_ordered_add:
1282 case Intrinsic::amdgcn_ds_ordered_swap: {
1284 Info.memVT = MVT::getVT(CI.getType());
1285 Info.ptrVal = CI.getOperand(0);
1286 Info.align.reset();
1288
1289 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1290 if (!Vol->isZero())
1292
1293 return true;
1294 }
1295 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1296 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1298 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1299 Info.ptrVal = nullptr;
1300 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1302 return true;
1303 }
1304 case Intrinsic::amdgcn_ds_append:
1305 case Intrinsic::amdgcn_ds_consume: {
1307 Info.memVT = MVT::getVT(CI.getType());
1308 Info.ptrVal = CI.getOperand(0);
1309 Info.align.reset();
1311
1312 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1313 if (!Vol->isZero())
1315
1316 return true;
1317 }
1318 case Intrinsic::amdgcn_global_atomic_csub: {
1320 Info.memVT = MVT::getVT(CI.getType());
1321 Info.ptrVal = CI.getOperand(0);
1322 Info.align.reset();
1326 return true;
1327 }
1328 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1330 Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1331
1332 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1333 Info.align.reset();
1336 return true;
1337 }
1338 case Intrinsic::amdgcn_global_atomic_fadd:
1339 case Intrinsic::amdgcn_global_atomic_fmin:
1340 case Intrinsic::amdgcn_global_atomic_fmax:
1341 case Intrinsic::amdgcn_global_atomic_fmin_num:
1342 case Intrinsic::amdgcn_global_atomic_fmax_num:
1343 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1344 case Intrinsic::amdgcn_flat_atomic_fadd:
1345 case Intrinsic::amdgcn_flat_atomic_fmin:
1346 case Intrinsic::amdgcn_flat_atomic_fmax:
1347 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1348 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1349 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1350 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1351 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
1353 Info.memVT = MVT::getVT(CI.getType());
1354 Info.ptrVal = CI.getOperand(0);
1355 Info.align.reset();
1360 return true;
1361 }
1362 case Intrinsic::amdgcn_global_load_tr_b64:
1363 case Intrinsic::amdgcn_global_load_tr_b128: {
1365 Info.memVT = MVT::getVT(CI.getType());
1366 Info.ptrVal = CI.getOperand(0);
1367 Info.align.reset();
1369 return true;
1370 }
1371 case Intrinsic::amdgcn_ds_gws_init:
1372 case Intrinsic::amdgcn_ds_gws_barrier:
1373 case Intrinsic::amdgcn_ds_gws_sema_v:
1374 case Intrinsic::amdgcn_ds_gws_sema_br:
1375 case Intrinsic::amdgcn_ds_gws_sema_p:
1376 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1378
1379 const GCNTargetMachine &TM =
1380 static_cast<const GCNTargetMachine &>(getTargetMachine());
1381
1383 Info.ptrVal = MFI->getGWSPSV(TM);
1384
1385 // This is an abstract access, but we need to specify a type and size.
1386 Info.memVT = MVT::i32;
1387 Info.size = 4;
1388 Info.align = Align(4);
1389
1390 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1392 else
1394 return true;
1395 }
1396 case Intrinsic::amdgcn_global_load_lds: {
1398 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1399 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1400 Info.ptrVal = CI.getArgOperand(1);
1402 return true;
1403 }
1404 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1406
1407 const GCNTargetMachine &TM =
1408 static_cast<const GCNTargetMachine &>(getTargetMachine());
1409
1411 Info.ptrVal = MFI->getGWSPSV(TM);
1412
1413 // This is an abstract access, but we need to specify a type and size.
1414 Info.memVT = MVT::i32;
1415 Info.size = 4;
1416 Info.align = Align(4);
1417
1419 return true;
1420 }
1421 default:
1422 return false;
1423 }
1424}
1425
1427 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1428 switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
1429 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1430 // The DAG's ValueType loses the addrspaces.
1431 // Add them as 2 extra Constant operands "from" and "to".
1432 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1433 unsigned DstAS = I.getType()->getPointerAddressSpace();
1434 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1435 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1436 break;
1437 }
1438 default:
1439 break;
1440 }
1441}
1442
1445 Type *&AccessTy) const {
1446 Value *Ptr = nullptr;
1447 switch (II->getIntrinsicID()) {
1448 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1449 case Intrinsic::amdgcn_ds_append:
1450 case Intrinsic::amdgcn_ds_consume:
1451 case Intrinsic::amdgcn_ds_ordered_add:
1452 case Intrinsic::amdgcn_ds_ordered_swap:
1453 case Intrinsic::amdgcn_flat_atomic_fadd:
1454 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
1455 case Intrinsic::amdgcn_flat_atomic_fmax:
1456 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1457 case Intrinsic::amdgcn_flat_atomic_fmin:
1458 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1459 case Intrinsic::amdgcn_global_atomic_csub:
1460 case Intrinsic::amdgcn_global_atomic_fadd:
1461 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1462 case Intrinsic::amdgcn_global_atomic_fmax:
1463 case Intrinsic::amdgcn_global_atomic_fmax_num:
1464 case Intrinsic::amdgcn_global_atomic_fmin:
1465 case Intrinsic::amdgcn_global_atomic_fmin_num:
1466 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1467 case Intrinsic::amdgcn_global_load_tr_b64:
1468 case Intrinsic::amdgcn_global_load_tr_b128:
1469 Ptr = II->getArgOperand(0);
1470 break;
1471 case Intrinsic::amdgcn_global_load_lds:
1472 Ptr = II->getArgOperand(1);
1473 break;
1474 default:
1475 return false;
1476 }
1477 AccessTy = II->getType();
1478 Ops.push_back(Ptr);
1479 return true;
1480}
1481
1483 unsigned AddrSpace) const {
1484 if (!Subtarget->hasFlatInstOffsets()) {
1485 // Flat instructions do not have offsets, and only have the register
1486 // address.
1487 return AM.BaseOffs == 0 && AM.Scale == 0;
1488 }
1489
1490 decltype(SIInstrFlags::FLAT) FlatVariant =
1494
1495 return AM.Scale == 0 &&
1496 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1497 AM.BaseOffs, AddrSpace, FlatVariant));
1498}
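// For example, on subtargets with flat instruction offsets an access of the
// form (base + 1020) can keep the constant in the instruction's immediate
// field, while any scaled-index form such as (base + 4*i) cannot.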
1499
1501 if (Subtarget->hasFlatGlobalInsts())
1503
1504 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1505 // Assume that we will use FLAT for all global memory accesses
1506 // on VI.
1507 // FIXME: This assumption is currently wrong. On VI we still use
1508 // MUBUF instructions for the r + i addressing mode. As currently
1509 // implemented, the MUBUF instructions only work on buffers < 4GB.
1510 // It may be possible to support > 4GB buffers with MUBUF instructions,
1511 // by setting the stride value in the resource descriptor which would
1512 // increase the size limit to (stride * 4GB). However, this is risky,
1513 // because it has never been validated.
1515 }
1516
1517 return isLegalMUBUFAddressingMode(AM);
1518}
1519
1520bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1521 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1522 // additionally can do r + r + i with addr64. 32-bit has more addressing
1523 // mode options. Depending on the resource constant, it can also do
1524 // (i64 r0) + (i32 r1) * (i14 i).
1525 //
1526 // Private arrays end up using a scratch buffer most of the time, so also
1527 // assume those use MUBUF instructions. Scratch loads / stores are currently
1528 // implemented as mubuf instructions with offen bit set, so slightly
1529 // different than the normal addr64.
1530 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1531 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1532 return false;
1533
1534 // FIXME: Since we can split immediate into soffset and immediate offset,
1535 // would it make sense to allow any immediate?
1536
1537 switch (AM.Scale) {
1538 case 0: // r + i or just i, depending on HasBaseReg.
1539 return true;
1540 case 1:
1541 return true; // We have r + r or r + i.
1542 case 2:
1543 if (AM.HasBaseReg) {
1544 // Reject 2 * r + r.
1545 return false;
1546 }
1547
1548 // Allow 2 * r as r + r
1549 // Or 2 * r + i is allowed as r + r + i.
1550 return true;
1551 default: // Don't allow n * r
1552 return false;
1553 }
1554}
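// E.g. the checks above accept (r + 8), (r0 + r1 + 8) and 2*r (treated as
// r + r), but reject 2*r + r and any other n*r scaling.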
1555
1557 const AddrMode &AM, Type *Ty,
1558 unsigned AS, Instruction *I) const {
1559 // No global is ever allowed as a base.
1560 if (AM.BaseGV)
1561 return false;
1562
1563 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1564 return isLegalGlobalAddressingMode(AM);
1565
1566 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1570 // If the offset isn't a multiple of 4, it probably isn't going to be
1571 // correctly aligned.
1572 // FIXME: Can we get the real alignment here?
1573 if (AM.BaseOffs % 4 != 0)
1574 return isLegalMUBUFAddressingMode(AM);
1575
1576 if (!Subtarget->hasScalarSubwordLoads()) {
1577 // There are no SMRD extloads, so if we have to do a small type access we
1578 // will use a MUBUF load.
1579 // FIXME?: We also need to do this if unaligned, but we don't know the
1580 // alignment here.
1581 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1582 return isLegalGlobalAddressingMode(AM);
1583 }
1584
1586 // SMRD instructions have an 8-bit, dword offset on SI.
1587 if (!isUInt<8>(AM.BaseOffs / 4))
1588 return false;
1589 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1590 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1591 // in 8-bits, it can use a smaller encoding.
1592 if (!isUInt<32>(AM.BaseOffs / 4))
1593 return false;
1594 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1595 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1596 if (!isUInt<20>(AM.BaseOffs))
1597 return false;
1598 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1599 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1600 // for S_BUFFER_* instructions).
1601 if (!isInt<21>(AM.BaseOffs))
1602 return false;
1603 } else {
1604 // On GFX12, all offsets are signed 24-bit in bytes.
1605 if (!isInt<24>(AM.BaseOffs))
1606 return false;
1607 }
1608
1609 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1611 AM.BaseOffs < 0) {
1612 // Scalar (non-buffer) loads can only use a negative offset if
1613 // soffset+offset is non-negative. Since the compiler can only prove that
1614 // in a few special cases, it is safer to claim that negative offsets are
1615 // not supported.
1616 return false;
1617 }
1618
1619 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1620 return true;
1621
1622 if (AM.Scale == 1 && AM.HasBaseReg)
1623 return true;
1624
1625 return false;
1626 }
1627
1628 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1629 return Subtarget->enableFlatScratch()
1631 : isLegalMUBUFAddressingMode(AM);
1632
1633 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1634 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1635 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1636 // field.
1637 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1638 // an 8-bit dword offset but we don't know the alignment here.
1639 if (!isUInt<16>(AM.BaseOffs))
1640 return false;
1641
1642 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1643 return true;
1644
1645 if (AM.Scale == 1 && AM.HasBaseReg)
1646 return true;
1647
1648 return false;
1649 }
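  // E.g. for LDS, a base register plus an immediate of 65532 fits the 16-bit
  // unsigned DS offset field checked above, while 65540 does not and would
  // have to be added to the pointer separately.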
1650
1652 // For an unknown address space, this usually means that this is for some
1653 // reason being used for pure arithmetic, and not based on some addressing
1654 // computation. We don't have instructions that compute pointers with any
1655 // addressing modes, so treat them as having no offset like flat
1656 // instructions.
1658 }
1659
1660 // Assume a user alias of global for unknown address spaces.
1661 return isLegalGlobalAddressingMode(AM);
1662}
1663
1665 const MachineFunction &MF) const {
1667 return (MemVT.getSizeInBits() <= 4 * 32);
1668 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1669 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1670 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1671 }
1673 return (MemVT.getSizeInBits() <= 2 * 32);
1674 return true;
1675}
1676
1678 unsigned Size, unsigned AddrSpace, Align Alignment,
1679 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1680 if (IsFast)
1681 *IsFast = 0;
1682
1683 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1684 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1685 // Check if alignment requirements for ds_read/write instructions are
1686 // disabled.
1687 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1688 return false;
1689
1690 Align RequiredAlignment(PowerOf2Ceil(Size/8)); // Natural alignment.
1691 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1692 Alignment < RequiredAlignment)
1693 return false;
1694
1695 // Either the alignment requirements are "enabled", or there is an
1696 // unaligned LDS access related hardware bug even though alignment requirements
1697 // are "disabled". In either case, we need to check for proper alignment
1698 // requirements.
1699 //
1700 switch (Size) {
1701 case 64:
1702 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1703 // address is negative, then the instruction is incorrectly treated as
1704 // out-of-bounds even if base + offset is in bounds. Split vectorized
1705 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1706 // load later in the SILoadStoreOptimizer.
1707 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1708 return false;
1709
1710 // 8 byte accesses via ds_read/write_b64 require 8-byte alignment, but we
1711 // can do a 4 byte aligned, 8 byte access in a single operation using
1712 // ds_read2/write2_b32 with adjacent offsets.
1713 RequiredAlignment = Align(4);
1714
1715 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1716 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1717 // ds_write2_b32 depending on the alignment. In either case with either
1718 // alignment there is no faster way of doing this.
1719
1720 // The numbers returned here and below are not additive, it is a 'speed
1721 // rank'. They are just meant to be compared to decide if a certain way
1722 // of lowering an operation is faster than another. For that purpose
1723 // naturally aligned operation gets it bitsize to indicate that "it
1724 // operates with a speed comparable to N-bit wide load". With the full
1725 // alignment ds128 is slower than ds96 for example. If underaligned it
1726 // is comparable to a speed of a single dword access, which would then
1727 // mean 32 < 128 and it is faster to issue a wide load regardless.
1728 // 1 is simply "slow, don't do it". I.e. when comparing an aligned load to a
1729 // wider load which will no longer be aligned, the latter is slower.
1730 if (IsFast)
1731 *IsFast = (Alignment >= RequiredAlignment) ? 64
1732 : (Alignment < Align(4)) ? 32
1733 : 1;
1734 return true;
1735 }
1736
1737 break;
1738 case 96:
1739 if (!Subtarget->hasDS96AndDS128())
1740 return false;
1741
1742 // 12 byte accesses via ds_read/write_b96 require 16-byte alignment on
1743 // gfx8 and older.
1744
1745 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1746 // Naturally aligned access is fastest. However, also report it is Fast
1747 // if memory is aligned less than DWORD. A narrow load or store will be
1748 // as slow as a single ds_read_b96/ds_write_b96, but there will
1749 // be more of them, so overall we will pay less penalty issuing a single
1750 // instruction.
1751
1752 // See comment on the values above.
1753 if (IsFast)
1754 *IsFast = (Alignment >= RequiredAlignment) ? 96
1755 : (Alignment < Align(4)) ? 32
1756 : 1;
1757 return true;
1758 }
1759
1760 break;
1761 case 128:
1762 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1763 return false;
1764
1765 // 16 byte accesses via ds_read/write_b128 require 16-byte alignment on
1766 // gfx8 and older, but we can do an 8 byte aligned, 16 byte access in a
1767 // single operation using ds_read2/write2_b64.
1768 RequiredAlignment = Align(8);
1769
1770 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1771 // Naturally aligned access is fastest. However, also report it is Fast
1772 // if memory is aligned less than DWORD. A narrow load or store will be
1773 // as slow as a single ds_read_b128/ds_write_b128, but there
1774 // will be more of them, so overall we will pay less penalty issuing a
1775 // single instruction.
1776
1777 // See comment on the values above.
1778 if (IsFast)
1779 *IsFast = (Alignment >= RequiredAlignment) ? 128
1780 : (Alignment < Align(4)) ? 32
1781 : 1;
1782 return true;
1783 }
1784
1785 break;
1786 default:
1787 if (Size > 32)
1788 return false;
1789
1790 break;
1791 }
1792
1793 // See comment on the values above.
1794 // Note that we have a single-dword or sub-dword here, so if underaligned
1795 // it is the slowest possible access, hence the returned value is 0.
1796 if (IsFast)
1797 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1798
1799 return Alignment >= RequiredAlignment ||
1800 Subtarget->hasUnalignedDSAccessEnabled();
1801 }
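  // For illustration, with unaligned DS access enabled a 96-bit LDS access
  // reports a speed rank of 96 when 16-byte aligned, 1 when only dword
  // aligned, and 32 when under-dword aligned (comparable to splitting it into
  // dword accesses).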
1802
1803 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
1804 bool AlignedBy4 = Alignment >= Align(4);
1805 if (IsFast)
1806 *IsFast = AlignedBy4;
1807
1808 return AlignedBy4 ||
1809 Subtarget->enableFlatScratch() ||
1810 Subtarget->hasUnalignedScratchAccess();
1811 }
1812
1813 // FIXME: We have to be conservative here and assume that flat operations
1814 // will access scratch. If we had access to the IR function, then we
1815 // could determine if any private memory was used in the function.
1816 if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
1817 !Subtarget->hasUnalignedScratchAccess()) {
1818 bool AlignedBy4 = Alignment >= Align(4);
1819 if (IsFast)
1820 *IsFast = AlignedBy4;
1821
1822 return AlignedBy4;
1823 }
1824
1825 // So long as they are correct, wide global memory operations perform better
1826 // than multiple smaller memory ops -- even when misaligned
1827 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
1828 if (IsFast)
1829 *IsFast = Size;
1830
1831 return Alignment >= Align(4) ||
1833 }
1834
1835 // Values smaller than a dword must be aligned.
1836 if (Size < 32)
1837 return false;
1838
1839 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1840 // byte-address are ignored, thus forcing Dword alignment.
1841 // This applies to private, global, and constant memory.
1842 if (IsFast)
1843 *IsFast = 1;
1844
1845 return Size >= 32 && Alignment >= Align(4);
1846}
1847
1849 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1850 unsigned *IsFast) const {
1852 Alignment, Flags, IsFast);
1853}
1854
1856 const MemOp &Op, const AttributeList &FuncAttributes) const {
1857 // FIXME: Should account for address space here.
1858
1859 // The default fallback uses the private pointer size as a guess for a type to
1860 // use. Make sure we switch these to 64-bit accesses.
1861
1862 if (Op.size() >= 16 &&
1863 Op.isDstAligned(Align(4))) // XXX: Should only do for global
1864 return MVT::v4i32;
1865
1866 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1867 return MVT::v2i32;
1868
1869 // Use the default.
1870 return MVT::Other;
1871}
1872
1874 const MemSDNode *MemNode = cast<MemSDNode>(N);
1875 return MemNode->getMemOperand()->getFlags() & MONoClobber;
1876}
1877
1879 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1881}
1882
1884 unsigned DestAS) const {
1885 // Flat -> private/local is a simple truncate.
1886 // Flat -> global is no-op
1887 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1888 return true;
1889
1890 const GCNTargetMachine &TM =
1891 static_cast<const GCNTargetMachine &>(getTargetMachine());
1892 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1893}
1894
1896 const MemSDNode *MemNode = cast<MemSDNode>(N);
1897
1899}
1900
1903 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1904 VT.getScalarType().bitsLE(MVT::i16))
1907}
1908
1910 Type *Ty) const {
1911 // FIXME: Could be smarter if called for vector constants.
1912 return true;
1913}
1914
1916 unsigned Index) const {
1918 return false;
1919
1920 // TODO: Add more cases that are cheap.
1921 return Index == 0;
1922}
1923
1925 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1926 switch (Op) {
1927 case ISD::LOAD:
1928 case ISD::STORE:
1929
1930 // These operations are done with 32-bit instructions anyway.
1931 case ISD::AND:
1932 case ISD::OR:
1933 case ISD::XOR:
1934 case ISD::SELECT:
1935 // TODO: Extensions?
1936 return true;
1937 default:
1938 return false;
1939 }
1940 }
1941
1942 // SimplifySetCC uses this function to determine whether or not it should
1943 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1944 if (VT == MVT::i1 && Op == ISD::SETCC)
1945 return false;
1946
1948}
1949
1950SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1951 const SDLoc &SL,
1952 SDValue Chain,
1953 uint64_t Offset) const {
1954 const DataLayout &DL = DAG.getDataLayout();
1957
1958 const ArgDescriptor *InputPtrReg;
1959 const TargetRegisterClass *RC;
1960 LLT ArgTy;
1962
1963 std::tie(InputPtrReg, RC, ArgTy) =
1965
1966 // We may not have the kernarg segment argument if we have no kernel
1967 // arguments.
1968 if (!InputPtrReg)
1969 return DAG.getConstant(Offset, SL, PtrVT);
1970
1972 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1973 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1974
1975 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
1976}
1977
1978SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1979 const SDLoc &SL) const {
1982 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1983}
1984
1985SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
1986 const SDLoc &SL) const {
1987
1989 std::optional<uint32_t> KnownSize =
1991 if (KnownSize.has_value())
1992 return DAG.getConstant(*KnownSize, SL, MVT::i32);
1993 return SDValue();
1994}
1995
1996SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1997 const SDLoc &SL, SDValue Val,
1998 bool Signed,
1999 const ISD::InputArg *Arg) const {
2000 // First, if it is a widened vector, narrow it.
2001 if (VT.isVector() &&
2003 EVT NarrowedVT =
2006 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2007 DAG.getConstant(0, SL, MVT::i32));
2008 }
2009
2010 // Then convert the vector elements or scalar value.
2011 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
2012 VT.bitsLT(MemVT)) {
2013 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2014 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2015 }
2016
2017 if (MemVT.isFloatingPoint())
2018 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2019 else if (Signed)
2020 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2021 else
2022 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2023
2024 return Val;
2025}
2026
2027SDValue SITargetLowering::lowerKernargMemParameter(
2028 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2029 uint64_t Offset, Align Alignment, bool Signed,
2030 const ISD::InputArg *Arg) const {
2032
2033 // Try to avoid using an extload by loading earlier than the argument address,
2034 // and extracting the relevant bits. The load should hopefully be merged with
2035 // the previous argument.
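 // For example, an i16 argument at byte offset 2 (2-byte alignment) is handled
 // by loading the 4-byte-aligned dword at offset 0, shifting it right by
 // OffsetDiff * 8 = 16 bits, and truncating back to 16 bits; that dword load
 // can then be merged with the load of the argument that precedes it.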
2036 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2037 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2038 int64_t AlignDownOffset = alignDown(Offset, 4);
2039 int64_t OffsetDiff = Offset - AlignDownOffset;
2040
2041 EVT IntVT = MemVT.changeTypeToInteger();
2042
2043 // TODO: If we passed in the base kernel offset we could have a better
2044 // alignment than 4, but we don't really need it.
2045 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2046 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2049
2050 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2051 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2052
2053 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2054 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2055 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2056
2057
2058 return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
2059 }
2060
2061 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2062 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2065
2066 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2067 return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
2068}
2069
2070SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
2071 const SDLoc &SL, SDValue Chain,
2072 const ISD::InputArg &Arg) const {
2074 MachineFrameInfo &MFI = MF.getFrameInfo();
2075
2076 if (Arg.Flags.isByVal()) {
2077 unsigned Size = Arg.Flags.getByValSize();
2078 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2079 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2080 }
2081
2082 unsigned ArgOffset = VA.getLocMemOffset();
2083 unsigned ArgSize = VA.getValVT().getStoreSize();
2084
2085 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2086
2087 // Create load nodes to retrieve arguments from the stack.
2088 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2089 SDValue ArgValue;
2090
2091 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2093 MVT MemVT = VA.getValVT();
2094
2095 switch (VA.getLocInfo()) {
2096 default:
2097 break;
2098 case CCValAssign::BCvt:
2099 MemVT = VA.getLocVT();
2100 break;
2101 case CCValAssign::SExt:
2102 ExtType = ISD::SEXTLOAD;
2103 break;
2104 case CCValAssign::ZExt:
2105 ExtType = ISD::ZEXTLOAD;
2106 break;
2107 case CCValAssign::AExt:
2108 ExtType = ISD::EXTLOAD;
2109 break;
2110 }
2111
2112 ArgValue = DAG.getExtLoad(
2113 ExtType, SL, VA.getLocVT(), Chain, FIN,
2115 MemVT);
2116 return ArgValue;
2117}
2118
2119SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
2120 const SIMachineFunctionInfo &MFI,
2121 EVT VT,
2123 const ArgDescriptor *Reg = nullptr;
2124 const TargetRegisterClass *RC;
2125 LLT Ty;
2126
2128 const ArgDescriptor WorkGroupIDX =
2129 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2130 // If GridZ is not programmed in an entry function then the hardware will set
2131 // it to all zeros, so there is no need to mask the GridY value in the low
2132 // order bits.
2133 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2134 AMDGPU::TTMP7,
2135 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2136 const ArgDescriptor WorkGroupIDZ =
2137 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
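 // With architected SGPRs the packed layout is therefore: workgroup ID X in
 // TTMP9, Y in TTMP7[15:0], and Z in TTMP7[31:16], as described by the masks
 // above.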
2138 if (Subtarget->hasArchitectedSGPRs() &&
2140 switch (PVID) {
2142 Reg = &WorkGroupIDX;
2143 RC = &AMDGPU::SReg_32RegClass;
2144 Ty = LLT::scalar(32);
2145 break;
2147 Reg = &WorkGroupIDY;
2148 RC = &AMDGPU::SReg_32RegClass;
2149 Ty = LLT::scalar(32);
2150 break;
2152 Reg = &WorkGroupIDZ;
2153 RC = &AMDGPU::SReg_32RegClass;
2154 Ty = LLT::scalar(32);
2155 break;
2156 default:
2157 break;
2158 }
2159 }
2160
2161 if (!Reg)
2162 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2163 if (!Reg) {
2165 // It's possible for a kernarg intrinsic call to appear in a kernel with
2166 // no allocated segment, in which case we do not add the user sgpr
2167 // argument, so just return null.
2168 return DAG.getConstant(0, SDLoc(), VT);
2169 }
2170
2171 // It's undefined behavior if a function marked with the amdgpu-no-*
2172 // attributes uses the corresponding intrinsic.
2173 return DAG.getUNDEF(VT);
2174 }
2175
2176 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2177}
2178
2180 CallingConv::ID CallConv,
2181 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2182 FunctionType *FType,
2183 SIMachineFunctionInfo *Info) {
2184 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2185 const ISD::InputArg *Arg = &Ins[I];
2186
2187 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2188 "vector type argument should have been split");
2189
2190 // First check if it's a PS input addr.
2191 if (CallConv == CallingConv::AMDGPU_PS &&
2192 !Arg->Flags.isInReg() && PSInputNum <= 15) {
2193 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2194
2195 // Inconveniently only the first part of the split is marked as isSplit,
2196 // so skip to the end. We only want to increment PSInputNum once for the
2197 // entire split argument.
2198 if (Arg->Flags.isSplit()) {
2199 while (!Arg->Flags.isSplitEnd()) {
2200 assert((!Arg->VT.isVector() ||
2201 Arg->VT.getScalarSizeInBits() == 16) &&
2202 "unexpected vector split in ps argument type");
2203 if (!SkipArg)
2204 Splits.push_back(*Arg);
2205 Arg = &Ins[++I];
2206 }
2207 }
2208
2209 if (SkipArg) {
2210 // We can safely skip PS inputs.
2211 Skipped.set(Arg->getOrigArgIndex());
2212 ++PSInputNum;
2213 continue;
2214 }
2215
2216 Info->markPSInputAllocated(PSInputNum);
2217 if (Arg->Used)
2218 Info->markPSInputEnabled(PSInputNum);
2219
2220 ++PSInputNum;
2221 }
2222
2223 Splits.push_back(*Arg);
2224 }
2225}
2226
2227// Allocate special inputs passed in VGPRs.
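// With packed TIDs all three workitem IDs share VGPR0, occupying bit ranges
// [9:0], [19:10], and [29:20]; otherwise IDs X, Y, and Z are passed in VGPR0,
// VGPR1, and VGPR2 respectively.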
2229 MachineFunction &MF,
2230 const SIRegisterInfo &TRI,
2231 SIMachineFunctionInfo &Info) const {
2232 const LLT S32 = LLT::scalar(32);
2234
2235 if (Info.hasWorkItemIDX()) {
2236 Register Reg = AMDGPU::VGPR0;
2237 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2238
2239 CCInfo.AllocateReg(Reg);
2240 unsigned Mask = (Subtarget->hasPackedTID() &&
2241 Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2242 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2243 }
2244
2245 if (Info.hasWorkItemIDY()) {
2246 assert(Info.hasWorkItemIDX());
2247 if (Subtarget->hasPackedTID()) {
2248 Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2249 0x3ff << 10));
2250 } else {
2251 unsigned Reg = AMDGPU::VGPR1;
2252 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2253
2254 CCInfo.AllocateReg(Reg);
2255 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2256 }
2257 }
2258
2259 if (Info.hasWorkItemIDZ()) {
2260 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2261 if (Subtarget->hasPackedTID()) {
2262 Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2263 0x3ff << 20));
2264 } else {
2265 unsigned Reg = AMDGPU::VGPR2;
2266 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2267
2268 CCInfo.AllocateReg(Reg);
2269 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2270 }
2271 }
2272}
2273
2274// Try to allocate a VGPR at the end of the argument list, or, if no argument
2275// VGPRs are left, allocate a stack slot.
2276// If \p Mask is given, it indicates the bitfield position in the register.
2277// If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
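// This masking is what lets the three workitem IDs share a single VGPR in the
// packed layout used below (bit offsets 0, 10, and 20).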
2278static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2279 ArgDescriptor Arg = ArgDescriptor()) {
2280 if (Arg.isSet())
2281 return ArgDescriptor::createArg(Arg, Mask);
2282
2283 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2284 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2285 if (RegIdx == ArgVGPRs.size()) {
2286 // Spill to stack required.
2287 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2288
2289 return ArgDescriptor::createStack(Offset, Mask);
2290 }
2291
2292 unsigned Reg = ArgVGPRs[RegIdx];
2293 Reg = CCInfo.AllocateReg(Reg);
2294 assert(Reg != AMDGPU::NoRegister);
2295
2296 MachineFunction &MF = CCInfo.getMachineFunction();
2297 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2298 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2299 return ArgDescriptor::createRegister(Reg, Mask);
2300}
2301
2303 const TargetRegisterClass *RC,
2304 unsigned NumArgRegs) {
2305 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2306 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2307 if (RegIdx == ArgSGPRs.size())
2308 report_fatal_error("ran out of SGPRs for arguments");
2309
2310 unsigned Reg = ArgSGPRs[RegIdx];
2311 Reg = CCInfo.AllocateReg(Reg);
2312 assert(Reg != AMDGPU::NoRegister);
2313
2314 MachineFunction &MF = CCInfo.getMachineFunction();
2315 MF.addLiveIn(Reg, RC);
2317}
2318
2319// If this has a fixed position, we still should allocate the register in the
2320// CCInfo state. Technically we could get away with this for values passed
2321// outside of the normal argument range.
2323 const TargetRegisterClass *RC,
2324 MCRegister Reg) {
2325 Reg = CCInfo.AllocateReg(Reg);
2326 assert(Reg != AMDGPU::NoRegister);
2327 MachineFunction &MF = CCInfo.getMachineFunction();
2328 MF.addLiveIn(Reg, RC);
2329}
2330
2331static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2332 if (Arg) {
2333 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2334 Arg.getRegister());
2335 } else
2336 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2337}
2338
2339static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2340 if (Arg) {
2341 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2342 Arg.getRegister());
2343 } else
2344 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2345}
2346
2347/// Allocate implicit function VGPR arguments at the end of allocated user
2348/// arguments.
2350 CCState &CCInfo, MachineFunction &MF,
2351 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2352 const unsigned Mask = 0x3ff;
2353 ArgDescriptor Arg;
2354
2355 if (Info.hasWorkItemIDX()) {
2356 Arg = allocateVGPR32Input(CCInfo, Mask);
2357 Info.setWorkItemIDX(Arg);
2358 }
2359
2360 if (Info.hasWorkItemIDY()) {
2361 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2362 Info.setWorkItemIDY(Arg);
2363 }
2364
2365 if (Info.hasWorkItemIDZ())
2366 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2367}
2368
2369/// Allocate implicit function VGPR arguments in fixed registers.
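/// In this fixed ABI all three workitem IDs are packed into VGPR31, at bit
/// offsets 0, 10, and 20 respectively.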
2371 CCState &CCInfo, MachineFunction &MF,
2372 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2373 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2374 if (!Reg)
2375 report_fatal_error("failed to allocate VGPR for implicit arguments");
2376
2377 const unsigned Mask = 0x3ff;
2378 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2379 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2380 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2381}
2382
2384 CCState &CCInfo,
2385 MachineFunction &MF,
2386 const SIRegisterInfo &TRI,
2387 SIMachineFunctionInfo &Info) const {
2388 auto &ArgInfo = Info.getArgInfo();
2389 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2390
2391 // TODO: Unify handling with private memory pointers.
2392 if (UserSGPRInfo.hasDispatchPtr())
2393 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2394
2395 const Module *M = MF.getFunction().getParent();
2396 if (UserSGPRInfo.hasQueuePtr() &&
2398 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2399
2400 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2401 // constant offset from the kernarg segment.
2402 if (Info.hasImplicitArgPtr())
2403 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2404
2405 if (UserSGPRInfo.hasDispatchID())
2406 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2407
2408 // flat_scratch_init is not applicable for non-kernel functions.
2409
2410 if (Info.hasWorkGroupIDX())
2411 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2412
2413 if (Info.hasWorkGroupIDY())
2414 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2415
2416 if (Info.hasWorkGroupIDZ())
2417 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2418
2419 if (Info.hasLDSKernelId())
2420 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2421}
2422
2423// Allocate special inputs passed in user SGPRs.
2425 MachineFunction &MF,
2426 const SIRegisterInfo &TRI,
2427 SIMachineFunctionInfo &Info) const {
2428 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2429 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2430 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2431 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2432 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2433 }
2434
2435 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2436 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2437 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2438 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2439 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2440 }
2441
2442 if (UserSGPRInfo.hasDispatchPtr()) {
2443 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2444 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2445 CCInfo.AllocateReg(DispatchPtrReg);
2446 }
2447
2448 const Module *M = MF.getFunction().getParent();
2449 if (UserSGPRInfo.hasQueuePtr() &&
2451 Register QueuePtrReg = Info.addQueuePtr(TRI);
2452 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2453 CCInfo.AllocateReg(QueuePtrReg);
2454 }
2455
2456 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2458 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2459 CCInfo.AllocateReg(InputPtrReg);
2460
2461 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2462 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2463 }
2464
2465 if (UserSGPRInfo.hasDispatchID()) {
2466 Register DispatchIDReg = Info.addDispatchID(TRI);
2467 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2468 CCInfo.AllocateReg(DispatchIDReg);
2469 }
2470
2471 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2472 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2473 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2474 CCInfo.AllocateReg(FlatScratchInitReg);
2475 }
2476
2477 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2478 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2479 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2480 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2481 }
2482
2483 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2484 // these from the dispatch pointer.
2485}
2486
2487// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2488// sequential, starting from the first argument.
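// Each preloaded argument consumes alignTo(size-in-bits, 32) / 32 user SGPRs,
// plus one padding SGPR per 4 bytes of alignment gap between consecutive
// arguments; preloading stops once the free user SGPRs are exhausted.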
2490 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2492 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2493 Function &F = MF.getFunction();
2494 unsigned LastExplicitArgOffset =
2495 MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset();
2496 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2497 bool InPreloadSequence = true;
2498 unsigned InIdx = 0;
2499 for (auto &Arg : F.args()) {
2500 if (!InPreloadSequence || !Arg.hasInRegAttr())
2501 break;
2502
2503 int ArgIdx = Arg.getArgNo();
2504 // Don't preload non-original args or parts not in the current preload
2505 // sequence.
2506 if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
2507 (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
2508 break;
2509
2510 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2511 (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
2512 InIdx++) {
2513 assert(ArgLocs[ArgIdx].isMemLoc());
2514 auto &ArgLoc = ArgLocs[InIdx];
2515 const Align KernelArgBaseAlign = Align(16);
2516 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2517 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2518 unsigned NumAllocSGPRs =
2519 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2520
2521 // Arg is preloaded into the previous SGPR.
2522 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2523 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2524 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2525 continue;
2526 }
2527
2528 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2529 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2530 // Check for free user SGPRs for preloading.
2531 if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
2532 SGPRInfo.getNumFreeUserSGPRs()) {
2533 InPreloadSequence = false;
2534 break;
2535 }
2536
2537 // Preload this argument.
2538 const TargetRegisterClass *RC =
2539 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2540 SmallVectorImpl<MCRegister> *PreloadRegs =
2541 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2542
2543 if (PreloadRegs->size() > 1)
2544 RC = &AMDGPU::SGPR_32RegClass;
2545 for (auto &Reg : *PreloadRegs) {
2546 assert(Reg);
2547 MF.addLiveIn(Reg, RC);
2548 CCInfo.AllocateReg(Reg);
2549 }
2550
2551 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2552 }
2553 }
2554}
2555
2557 const SIRegisterInfo &TRI,
2558 SIMachineFunctionInfo &Info) const {
2559 // Always allocate this last since it is a synthetic preload.
2560 if (Info.hasLDSKernelId()) {
2561 Register Reg = Info.addLDSKernelId();
2562 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2563 CCInfo.AllocateReg(Reg);
2564 }
2565}
2566
2567// Allocate special input registers that are initialized per-wave.
2569 MachineFunction &MF,
2571 CallingConv::ID CallConv,
2572 bool IsShader) const {
2573 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2574 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2575 // Note: user SGPRs are handled by the front-end for graphics shaders
2576 // Pad up the used user SGPRs with dead inputs.
2577
2578 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2579 // before enabling architected SGPRs for workgroup IDs.
2580 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2581
2582 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2583 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2584 // rely on it to reach 16 since if we end up having no stack usage, it will
2585 // not really be added.
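 // With the workaround enabled, reserved user SGPRs are added below until the
 // preloaded SGPR count (current user SGPRs plus the required system SGPRs)
 // reaches 16.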
2586 unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() +
2587 Info.hasWorkGroupIDY() +
2588 Info.hasWorkGroupIDZ() +
2589 Info.hasWorkGroupInfo();
2590 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2591 Register Reg = Info.addReservedUserSGPR();
2592 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2593 CCInfo.AllocateReg(Reg);
2594 }
2595 }
2596
2597 if (!HasArchitectedSGPRs) {
2598 if (Info.hasWorkGroupIDX()) {
2599 Register Reg = Info.addWorkGroupIDX();
2600 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2601 CCInfo.AllocateReg(Reg);
2602 }
2603
2604 if (Info.hasWorkGroupIDY()) {
2605 Register Reg = Info.addWorkGroupIDY();
2606 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2607 CCInfo.AllocateReg(Reg);
2608 }
2609
2610 if (Info.hasWorkGroupIDZ()) {
2611 Register Reg = Info.addWorkGroupIDZ();
2612 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2613 CCInfo.AllocateReg(Reg);
2614 }
2615 }
2616
2617 if (Info.hasWorkGroupInfo()) {
2618 Register Reg = Info.addWorkGroupInfo();
2619 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2620 CCInfo.AllocateReg(Reg);
2621 }
2622
2623 if (Info.hasPrivateSegmentWaveByteOffset()) {
2624 // Scratch wave offset passed in system SGPR.
2625 unsigned PrivateSegmentWaveByteOffsetReg;
2626
2627 if (IsShader) {
2628 PrivateSegmentWaveByteOffsetReg =
2629 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2630
2631 // This is true if the scratch wave byte offset doesn't have a fixed
2632 // location.
2633 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2634 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2635 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2636 }
2637 } else
2638 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2639
2640 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2641 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2642 }
2643
2644 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2645 Info.getNumPreloadedSGPRs() >= 16);
2646}
2647
2649 MachineFunction &MF,
2650 const SIRegisterInfo &TRI,
2651 SIMachineFunctionInfo &Info) {
2652 // Now that we've figured out where the scratch register inputs are, see if
2653 // we should reserve the arguments and use them directly.
2654 MachineFrameInfo &MFI = MF.getFrameInfo();
2655 bool HasStackObjects = MFI.hasStackObjects();
2656 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2657
2658 // Record that we know we have non-spill stack objects so we don't need to
2659 // check all stack objects later.
2660 if (HasStackObjects)
2661 Info.setHasNonSpillStackObjects(true);
2662
2663 // Everything live out of a block is spilled with fast regalloc, so it's
2664 // almost certain that spilling will be required.
2665 if (TM.getOptLevel() == CodeGenOptLevel::None)
2666 HasStackObjects = true;
2667
2668 // For now assume stack access is needed in any callee functions, so we need
2669 // the scratch registers to pass in.
2670 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2671
2672 if (!ST.enableFlatScratch()) {
2673 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2674 // If we have stack objects, we unquestionably need the private buffer
2675 // resource. For the Code Object V2 ABI, this will be the first 4 user
2676 // SGPR inputs. We can reserve those and use them directly.
2677
2678 Register PrivateSegmentBufferReg =
2680 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2681 } else {
2682 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2683 // We tentatively reserve the last registers (skipping those which may
2684 // contain VCC, FLAT_SCR, and XNACK). After register allocation,
2685 // we'll replace these with the ones immediately after those which were
2686 // really allocated. In the prologue copies will be inserted from the
2687 // argument to these reserved registers.
2688
2689 // Without HSA, relocations are used for the scratch pointer and the
2690 // buffer resource setup is always inserted in the prologue. Scratch wave
2691 // offset is still in an input SGPR.
2692 Info.setScratchRSrcReg(ReservedBufferReg);
2693 }
2694 }
2695
2697
2698 // For entry functions we have to set up the stack pointer if we use it,
2699 // whereas non-entry functions get this "for free". This means there is no
2700 // intrinsic advantage to using S32 over S34 in cases where we do not have
2701 // calls but do need a frame pointer (i.e. if we are requested to have one
2702 // because frame pointer elimination is disabled). To keep things simple we
2703 // only ever use S32 as the call ABI stack pointer, and so using it does not
2704 // imply we need a separate frame pointer.
2705 //
2706 // Try to use s32 as the SP, but move it if it would interfere with input
2707 // arguments. This won't work with calls though.
2708 //
2709 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2710 // registers.
2711 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2712 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2713 } else {
2715
2716 if (MFI.hasCalls())
2717 report_fatal_error("call in graphics shader with too many input SGPRs");
2718
2719 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2720 if (!MRI.isLiveIn(Reg)) {
2721 Info.setStackPtrOffsetReg(Reg);
2722 break;
2723 }
2724 }
2725
2726 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2727 report_fatal_error("failed to find register for SP");
2728 }
2729
2730 // hasFP should be accurate for entry functions even before the frame is
2731 // finalized, because it does not rely on the known stack size, only
2732 // properties like whether variable sized objects are present.
2733 if (ST.getFrameLowering()->hasFP(MF)) {
2734 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2735 }
2736}
2737
2740 return !Info->isEntryFunction();
2741}
2742
2744
2745}
2746
2748 MachineBasicBlock *Entry,
2749 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2751
2752 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2753 if (!IStart)
2754 return;
2755
2756 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2757 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2758 MachineBasicBlock::iterator MBBI = Entry->begin();
2759 for (const MCPhysReg *I = IStart; *I; ++I) {
2760 const TargetRegisterClass *RC = nullptr;
2761 if (AMDGPU::SReg_64RegClass.contains(*I))
2762 RC = &AMDGPU::SGPR_64RegClass;
2763 else if (AMDGPU::SReg_32RegClass.contains(*I))
2764 RC = &AMDGPU::SGPR_32RegClass;
2765 else
2766 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2767
2768 Register NewVR = MRI->createVirtualRegister(RC);
2769 // Create copy from CSR to a virtual register.
2770 Entry->addLiveIn(*I);
2771 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2772 .addReg(*I);
2773
2774 // Insert the copy-back instructions right before the terminator.
2775 for (auto *Exit : Exits)
2776 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2777 TII->get(TargetOpcode::COPY), *I)
2778 .addReg(NewVR);
2779 }
2780}
2781
2783 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2784 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2785 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2787
2789 const Function &Fn = MF.getFunction();
2792
2793 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2794 DiagnosticInfoUnsupported NoGraphicsHSA(
2795 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2796 DAG.getContext()->diagnose(NoGraphicsHSA);
2797 return DAG.getEntryNode();
2798 }
2799
2802 BitVector Skipped(Ins.size());
2803 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2804 *DAG.getContext());
2805
2806 bool IsGraphics = AMDGPU::isGraphics(CallConv);
2807 bool IsKernel = AMDGPU::isKernel(CallConv);
2808 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2809
2810 if (IsGraphics) {
2811 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2812 assert(!UserSGPRInfo.hasDispatchPtr() &&
2813 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2814 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2815 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2816 (void)UserSGPRInfo;
2817 if (!Subtarget->enableFlatScratch())
2818 assert(!UserSGPRInfo.hasFlatScratchInit());
2819 if ((CallConv != CallingConv::AMDGPU_CS &&
2820 CallConv != CallingConv::AMDGPU_Gfx) ||
2821 !Subtarget->hasArchitectedSGPRs())
2822 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2823 !Info->hasWorkGroupIDZ());
2824 }
2825
2826 if (CallConv == CallingConv::AMDGPU_PS) {
2827 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2828
2829 // At least one interpolation mode must be enabled or else the GPU will
2830 // hang.
2831 //
2832 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2833 // set PSInputAddr, the user wants to enable some bits after compilation
2834 // based on run-time states. Since we can't know what the final PSInputEna
2835 // will look like, we shouldn't do anything here and the user should take
2836 // responsibility for the correct programming.
2837 //
2838 // Otherwise, the following restrictions apply:
2839 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2840 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2841 // enabled too.
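 // If neither requirement is met, the code below force-enables PS input 0 (the
 // PERSP_SAMPLE slot) and reserves VGPR0/VGPR1 for it, so that at least one
 // interpolation input stays live.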
2842 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2843 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2844 CCInfo.AllocateReg(AMDGPU::VGPR0);
2845 CCInfo.AllocateReg(AMDGPU::VGPR1);
2846 Info->markPSInputAllocated(0);
2847 Info->markPSInputEnabled(0);
2848 }
2849 if (Subtarget->isAmdPalOS()) {
2850 // For isAmdPalOS, the user does not enable some bits after compilation
2851 // based on run-time states; the register values being generated here are
2852 // the final ones set in hardware. Therefore we need to apply the
2853 // workaround to PSInputAddr and PSInputEnable together. (The case where
2854 // a bit is set in PSInputAddr but not PSInputEnable is where the
2855 // frontend set up an input arg for a particular interpolation mode, but
2856 // nothing uses that input arg. Really we should have an earlier pass
2857 // that removes such an arg.)
2858 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2859 if ((PsInputBits & 0x7F) == 0 ||
2860 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2861 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
2862 }
2863 } else if (IsKernel) {
2864 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2865 } else {
2866 Splits.append(Ins.begin(), Ins.end());
2867 }
2868
2869 if (IsKernel)
2870 analyzeFormalArgumentsCompute(CCInfo, Ins);
2871
2872 if (IsEntryFunc) {
2873 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2874 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2875 if (IsKernel && Subtarget->hasKernargPreload())
2876 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
2877
2878 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
2879 } else if (!IsGraphics) {
2880 // For the fixed ABI, pass workitem IDs in the last argument register.
2881 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2882
2883 // FIXME: Sink this into allocateSpecialInputSGPRs
2884 if (!Subtarget->enableFlatScratch())
2885 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2886
2887 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2888 }
2889
2890 if (!IsKernel) {
2891 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2892 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2893 }
2894
2896
2897 // FIXME: This is the minimum kernel argument alignment. We should improve
2898 // this to the maximum alignment of the arguments.
2899 //
2900 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2901 // kern arg offset.
2902 const Align KernelArgBaseAlign = Align(16);
2903
2904 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2905 const ISD::InputArg &Arg = Ins[i];
2906 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2907 InVals.push_back(DAG.getUNDEF(Arg.VT));
2908 continue;
2909 }
2910
2911 CCValAssign &VA = ArgLocs[ArgIdx++];
2912 MVT VT = VA.getLocVT();
2913
2914 if (IsEntryFunc && VA.isMemLoc()) {
2915 VT = Ins[i].VT;
2916 EVT MemVT = VA.getLocVT();
2917
2918 const uint64_t Offset = VA.getLocMemOffset();
2919 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2920
2921 if (Arg.Flags.isByRef()) {
2922 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2923
2924 const GCNTargetMachine &TM =
2925 static_cast<const GCNTargetMachine &>(getTargetMachine());
2926 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2927 Arg.Flags.getPointerAddrSpace())) {
2930 }
2931
2932 InVals.push_back(Ptr);
2933 continue;
2934 }
2935
2936 SDValue NewArg;
2937 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
2938 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2939 // In this case the argument is packed into the previous preload SGPR.
2940 int64_t AlignDownOffset = alignDown(Offset, 4);
2941 int64_t OffsetDiff = Offset - AlignDownOffset;
2942 EVT IntVT = MemVT.changeTypeToInteger();
2943
2947 Register Reg =
2948 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2949
2950 assert(Reg);
2951 Register VReg = MRI.getLiveInVirtReg(Reg);
2952 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2953
2954 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
2955 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
2956
2957 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
2958 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
2959 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
2960 Ins[i].Flags.isSExt(), &Ins[i]);
2961
2962 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
2963 } else {
2967 const SmallVectorImpl<MCRegister> &PreloadRegs =
2968 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
2969
2970 SDValue Copy;
2971 if (PreloadRegs.size() == 1) {
2972 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
2973 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
2974 NewArg = DAG.getCopyFromReg(
2975 Chain, DL, VReg,
2977 TRI->getRegSizeInBits(*RC)));
2978
2979 } else {
2980 // If the kernarg alignment does not match the alignment of the SGPR
2981 // tuple RC that can accommodate this argument, it will be built up
2982 // via copies from the individual SGPRs that the argument was
2983 // preloaded to.
2985 for (auto Reg : PreloadRegs) {
2986 Register VReg = MRI.getLiveInVirtReg(Reg);
2987 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2988 Elts.push_back(Copy);
2989 }
2990 NewArg =
2991 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
2992 PreloadRegs.size()),
2993 DL, Elts);
2994 }
2995
2996 // If the argument was preloaded to multiple consecutive 32-bit
2997 // registers because of misalignment between addressable SGPR tuples
2998 // and the argument size, we can still assume, because of kernarg
2999 // segment alignment restrictions, that NewArg's size is the same as
3000 // MemVT and just do a bitcast. If MemVT is less than 32 bits we add a
3001 // truncate, since we cannot preload to less than a single SGPR and the
3002 // MemVT may be smaller.
3003 EVT MemVTInt =
3005 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3006 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3007
3008 NewArg = DAG.getBitcast(MemVT, NewArg);
3009 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3010 Ins[i].Flags.isSExt(), &Ins[i]);
3011 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3012 }
3013 } else {
3014 NewArg =
3015 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3016 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3017 }
3018 Chains.push_back(NewArg.getValue(1));
3019
3020 auto *ParamTy =
3021 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3023 ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3024 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3025 // On SI local pointers are just offsets into LDS, so they are always
3026 // less than 16-bits. On CI and newer they could potentially be
3027 // real pointers, so we can't guarantee their size.
3028 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3029 DAG.getValueType(MVT::i16));
3030 }
3031
3032 InVals.push_back(NewArg);
3033 continue;
3034 }
3035 if (!IsEntryFunc && VA.isMemLoc()) {
3036 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3037 InVals.push_back(Val);
3038 if (!Arg.Flags.isByVal())
3039 Chains.push_back(Val.getValue(1));
3040 continue;
3041 }
3042
3043 assert(VA.isRegLoc() && "Parameter must be in a register!");
3044
3045 Register Reg = VA.getLocReg();
3046 const TargetRegisterClass *RC = nullptr;
3047 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3048 RC = &AMDGPU::VGPR_32RegClass;
3049 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3050 RC = &AMDGPU::SGPR_32RegClass;
3051 else
3052 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3053 EVT ValVT = VA.getValVT();
3054
3055 Reg = MF.addLiveIn(Reg, RC);
3056 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3057
3058 if (Arg.Flags.isSRet()) {
3059 // The return object should be reasonably addressable.
3060
3061 // FIXME: This helps when the return is a real sret. If it is an
3062 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3063 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3064 unsigned NumBits
3066 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
3067 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3068 }
3069
3070 // If this is an 8 or 16-bit value, it is really passed promoted
3071 // to 32 bits. Insert an assert[sz]ext to capture this, then
3072 // truncate to the right size.
3073 switch (VA.getLocInfo()) {
3074 case CCValAssign::Full:
3075 break;
3076 case CCValAssign::BCvt:
3077 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3078 break;
3079 case CCValAssign::SExt:
3080 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
3081 DAG.getValueType(ValVT));
3082 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3083 break;
3084 case CCValAssign::ZExt:
3085 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
3086 DAG.getValueType(ValVT));
3087 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3088 break;
3089 case CCValAssign::AExt:
3090 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3091 break;
3092 default:
3093 llvm_unreachable("Unknown loc info!");
3094 }
3095
3096 InVals.push_back(Val);
3097 }
3098
3099 // Start adding system SGPRs.
3100 if (IsEntryFunc)
3101 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3102
3103 // DAG.getPass() returns nullptr when using new pass manager.
3104 // TODO: Use DAG.getMFAM() to access analysis result.
3105 if (DAG.getPass()) {
3106 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3107 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3108 }
3109
3110 unsigned StackArgSize = CCInfo.getStackSize();
3111 Info->setBytesInStackArgArea(StackArgSize);
3112
3113 return Chains.empty() ? Chain :
3114 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3115}
3116
3117// TODO: If return values can't fit in registers, we should return as many as
3118// possible in registers before passing on stack.
3120 CallingConv::ID CallConv,
3121 MachineFunction &MF, bool IsVarArg,
3123 LLVMContext &Context) const {
3124 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3125 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3126 // for shaders. Vector types should be explicitly handled by CC.
3127 if (AMDGPU::isEntryFunctionCC(CallConv))
3128 return true;
3129
3131 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3132 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3133 return false;
3134
3135 // We must use the stack if return would require unavailable registers.
3136 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3137 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3138 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3139 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3140 return false;
3141
3142 return true;
3143}
3144
3145SDValue
3147 bool isVarArg,
3149 const SmallVectorImpl<SDValue> &OutVals,
3150 const SDLoc &DL, SelectionDAG &DAG) const {
3153
3154 if (AMDGPU::isKernel(CallConv)) {
3155 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3156 OutVals, DL, DAG);
3157 }
3158
3159 bool IsShader = AMDGPU::isShader(CallConv);
3160
3161 Info->setIfReturnsVoid(Outs.empty());
3162 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3163
3164 // CCValAssign - represent the assignment of the return value to a location.
3167
3168 // CCState - Info about the registers and stack slots.
3169 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3170 *DAG.getContext());
3171
3172 // Analyze outgoing return values.
3173 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3174
3175 SDValue Glue;
3177 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3178
3179 // Copy the result values into the output registers.
3180 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3181 ++I, ++RealRVLocIdx) {
3182 CCValAssign &VA = RVLocs[I];
3183 assert(VA.isRegLoc() && "Can only return in registers!");
3184 // TODO: Partially return in registers if return values don't fit.
3185 SDValue Arg = OutVals[RealRVLocIdx];
3186
3187 // Copied from other backends.
3188 switch (VA.getLocInfo()) {
3189 case CCValAssign::Full:
3190 break;
3191 case CCValAssign::BCvt:
3192 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3193 break;
3194 case CCValAssign::SExt:
3195 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3196 break;
3197 case CCValAssign::ZExt:
3198 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3199 break;
3200 case CCValAssign::AExt:
3201 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3202 break;
3203 default:
3204 llvm_unreachable("Unknown loc info!");
3205 }
3206
3207 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3208 Glue = Chain.getValue(1);
3209 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3210 }
3211
3212 // FIXME: Does sret work properly?
3213 if (!Info->isEntryFunction()) {
3214 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3215 const MCPhysReg *I =
3216 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3217 if (I) {
3218 for (; *I; ++I) {
3219 if (AMDGPU::SReg_64RegClass.contains(*I))
3220 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3221 else if (AMDGPU::SReg_32RegClass.contains(*I))
3222 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3223 else
3224 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3225 }
3226 }
3227 }
3228
3229 // Update chain and glue.
3230 RetOps[0] = Chain;
3231 if (Glue.getNode())
3232 RetOps.push_back(Glue);
3233
3234 unsigned Opc = AMDGPUISD::ENDPGM;
3235 if (!IsWaveEnd)
3237 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3238}
3239
3241 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3242 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3243 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3244 SDValue ThisVal) const {
3245 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3246
3247 // Assign locations to each value returned by this call.
3249 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3250 *DAG.getContext());
3251 CCInfo.AnalyzeCallResult(Ins, RetCC);
3252
3253 // Copy all of the result registers out of their specified physreg.
3254 for (CCValAssign VA : RVLocs) {
3255 SDValue Val;
3256
3257 if (VA.isRegLoc()) {
3258 Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3259 Chain = Val.getValue(1);
3260 InGlue = Val.getValue(2);
3261 } else if (VA.isMemLoc()) {
3262 report_fatal_error("TODO: return values in memory");
3263 } else
3264 llvm_unreachable("unknown argument location type");
3265
3266 switch (VA.getLocInfo()) {
3267 case CCValAssign::Full:
3268 break;
3269 case CCValAssign::BCvt:
3270 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3271 break;
3272 case CCValAssign::ZExt:
3273 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3274 DAG.getValueType(VA.getValVT()));
3275 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3276 break;
3277 case CCValAssign::SExt:
3278 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3279 DAG.getValueType(VA.getValVT()));
3280 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3281 break;
3282 case CCValAssign::AExt:
3283 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3284 break;
3285 default:
3286 llvm_unreachable("Unknown loc info!");
3287 }
3288
3289 InVals.push_back(Val);
3290 }
3291
3292 return Chain;
3293}
3294
3295// Add code to pass special inputs required depending on used features separate
3296// from the explicit user arguments present in the IR.
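// Each special input is guarded by an "amdgpu-no-*" attribute on the call
// site; when the attribute is present the copy is skipped, otherwise the value
// is forwarded in its designated register or spilled to a stack slot.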
3298 CallLoweringInfo &CLI,
3299 CCState &CCInfo,
3300 const SIMachineFunctionInfo &Info,
3301 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3302 SmallVectorImpl<SDValue> &MemOpChains,
3303 SDValue Chain) const {
3304 // If we don't have a call site, this was a call inserted by
3305 // legalization. These can never use special inputs.
3306 if (!CLI.CB)
3307 return;
3308
3309 SelectionDAG &DAG = CLI.DAG;
3310 const SDLoc &DL = CLI.DL;
3311 const Function &F = DAG.getMachineFunction().getFunction();
3312
3313 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3314 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3315
3316 const AMDGPUFunctionArgInfo *CalleeArgInfo
3318 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3319 // DAG.getPass() returns nullptr when using new pass manager.
3320 // TODO: Use DAG.getMFAM() to access analysis result.
3321 if (DAG.getPass()) {
3322 auto &ArgUsageInfo =
3324 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3325 }
3326 }
3327
3328 // TODO: Unify with private memory register handling. This is complicated by
3329 // the fact that at least in kernels, the input argument is not necessarily
3330 // in the same location as the input.
3331 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3333 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3334 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr"},
3335 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3336 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3337 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3338 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, "amdgpu-no-workgroup-id-y"},
3339 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z"},
3340 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, "amdgpu-no-lds-kernel-id"},
3341 };
3342
3343 for (auto Attr : ImplicitAttrs) {
3344 const ArgDescriptor *OutgoingArg;
3345 const TargetRegisterClass *ArgRC;
3346 LLT ArgTy;
3347
3348 AMDGPUFunctionArgInfo::PreloadedValue InputID = Attr.first;
3349
3350 // If the callee does not use the attribute value, skip copying the value.
3351 if (CLI.CB->hasFnAttr(Attr.second))
3352 continue;
3353
3354 std::tie(OutgoingArg, ArgRC, ArgTy) =
3355 CalleeArgInfo->getPreloadedValue(InputID);
3356 if (!OutgoingArg)
3357 continue;
3358
3359 const ArgDescriptor *IncomingArg;
3360 const TargetRegisterClass *IncomingArgRC;
3361 LLT Ty;
3362 std::tie(IncomingArg, IncomingArgRC, Ty) =
3363 CallerArgInfo.getPreloadedValue(InputID);
3364 assert(IncomingArgRC == ArgRC);
3365
3366 // All special arguments are ints for now.
3367 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3368 SDValue InputReg;
3369
3370 if (IncomingArg) {
3371 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3372 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3373 // The implicit arg ptr is special because it doesn't have a corresponding
3374 // input for kernels, and is computed from the kernarg segment pointer.
3375 InputReg = getImplicitArgPtr(DAG, DL);
3376 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3377 std::optional<uint32_t> Id =
3379 if (Id.has_value()) {
3380 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3381 } else {
3382 InputReg = DAG.getUNDEF(ArgVT);
3383 }
3384 } else {
3385 // We may have proven the input wasn't needed, although the ABI still
3386 // requires it. We just need to allocate the register appropriately.
3387 InputReg = DAG.getUNDEF(ArgVT);
3388 }
3389
3390 if (OutgoingArg->isRegister()) {
3391 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3392 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3393 report_fatal_error("failed to allocate implicit input argument");
3394 } else {
3395 unsigned SpecialArgOffset =
3396 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3397 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
3398 SpecialArgOffset);
3399 MemOpChains.push_back(ArgStore);
3400 }
3401 }
3402
3403 // Pack workitem IDs into a single register, or pass them as-is if already
3404 // packed.
3405 const ArgDescriptor *OutgoingArg;
3406 const TargetRegisterClass *ArgRC;
3407 LLT Ty;
3408
3409 std::tie(OutgoingArg, ArgRC, Ty) =
3411 if (!OutgoingArg)
3412 std::tie(OutgoingArg, ArgRC, Ty) =
3414 if (!OutgoingArg)
3415 std::tie(OutgoingArg, ArgRC, Ty) =
3417 if (!OutgoingArg)
3418 return;
3419
3420 const ArgDescriptor *IncomingArgX = std::get<0>(
3422 const ArgDescriptor *IncomingArgY = std::get<0>(
3424 const ArgDescriptor *IncomingArgZ = std::get<0>(
3426
3427 SDValue InputReg;
3428 SDLoc SL;
3429
3430 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3431 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3432 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3433
3434 // If incoming ids are not packed we need to pack them.
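 // The packed form produced below is X | (Y << 10) | (Z << 20), matching the
 // fixed-register layout used for the callee's workitem ID argument.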
3435 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3436 NeedWorkItemIDX) {
3437 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3438 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3439 } else {
3440 InputReg = DAG.getConstant(0, DL, MVT::i32);
3441 }
3442 }
3443
3444 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3445 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3446 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3447 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3448 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3449 InputReg = InputReg.getNode() ?
3450 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
3451 }
3452
3453 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3454 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3455 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3456 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3457 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3458 InputReg = InputReg.getNode() ?
3459 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
3460 }
3461
3462 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3463 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3464 // We're in a situation where the outgoing function requires the workitem
3465 // ID, but the calling function does not have it (e.g. a graphics function
3466 // calling a C calling convention function). This is illegal, but we need
3467 // to produce something.
3468 InputReg = DAG.getUNDEF(MVT::i32);
3469 } else {
3470 // Workitem ids are already packed, so any of the present incoming
3471 // arguments will carry all required fields.
3473 IncomingArgX ? *IncomingArgX :
3474 IncomingArgY ? *IncomingArgY :
3475 *IncomingArgZ, ~0u);
3476 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3477 }
3478 }
3479
3480 if (OutgoingArg->isRegister()) {
3481 if (InputReg)
3482 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3483
3484 CCInfo.AllocateReg(OutgoingArg->getRegister());
3485 } else {
3486 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3487 if (InputReg) {
3488 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
3489 SpecialArgOffset);
3490 MemOpChains.push_back(ArgStore);
3491 }
3492 }
3493}
3494
3496 return CC == CallingConv::Fast;
3497}
3498
3499/// Return true if we might ever do TCO for calls with this calling convention.
3501 switch (CC) {
3502 case CallingConv::C:
3504 return true;
3505 default:
3506 return canGuaranteeTCO(CC);
3507 }
3508}
3509
3511 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3513 const SmallVectorImpl<SDValue> &OutVals,
3514 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3515 if (AMDGPU::isChainCC(CalleeCC))
3516 return true;
3517
3518 if (!mayTailCallThisCC(CalleeCC))
3519 return false;
3520
3521 // For a divergent call target, we need to do a waterfall loop over the
3522 // possible callees which precludes us from using a simple jump.
3523 if (Callee->isDivergent())
3524 return false;
3525
3527 const Function &CallerF = MF.getFunction();
3528 CallingConv::ID CallerCC = CallerF.getCallingConv();
3530 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3531
3532 // Kernels aren't callable, and don't have a live-in return address, so it
3533 // doesn't make sense to do a tail call with entry functions.
3534 if (!CallerPreserved)
3535 return false;
3536
3537 bool CCMatch = CallerCC == CalleeCC;
3538
3540 if (canGuaranteeTCO(CalleeCC) && CCMatch)
3541 return true;
3542 return false;
3543 }
3544
3545 // TODO: Can we handle var args?
3546 if (IsVarArg)
3547 return false;
3548
3549 for (const Argument &Arg : CallerF.args()) {
3550 if (Arg.hasByValAttr())
3551 return false;
3552 }
3553
3554 LLVMContext &Ctx = *DAG.getContext();
3555
3556 // Check that the call results are passed in the same way.
3557 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3558 CCAssignFnForCall(CalleeCC, IsVarArg),
3559 CCAssignFnForCall(CallerCC, IsVarArg)))
3560 return false;
3561
3562 // The callee has to preserve all registers the caller needs to preserve.
3563 if (!CCMatch) {
3564 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3565 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3566 return false;
3567 }
3568
3569 // Nothing more to check if the callee is taking no arguments.
3570 if (Outs.empty())
3571 return true;
3572
3574 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3575
3576 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3577
3578 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3579 // If the stack arguments for this call do not fit into our own save area then
3580 // the call cannot be made tail.
3581 // TODO: Is this really necessary?
3582 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3583 return false;
3584
3585 const MachineRegisterInfo &MRI = MF.getRegInfo();
3586 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3587}
3588
3590 if (!CI->isTailCall())
3591 return false;
3592
3593 const Function *ParentFn = CI->getParent()->getParent();
3595 return false;
3596 return true;
3597}
3598
3599// The wave scratch offset register is used as the global base pointer.
3601 SmallVectorImpl<SDValue> &InVals) const {
3602 CallingConv::ID CallConv = CLI.CallConv;
3603 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3604
3605 SelectionDAG &DAG = CLI.DAG;
3606
3607 TargetLowering::ArgListEntry RequestedExec;
3608 if (IsChainCallConv) {
3609 // The last argument should be the value that we need to put in EXEC.
3610 // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
3611 // don't treat it like the rest of the arguments.
3612 RequestedExec = CLI.Args.back();
3613 assert(RequestedExec.Node && "No node for EXEC");
3614
3615 if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3616 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3617
3618 assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
3619 CLI.Outs.pop_back();
3620 CLI.OutVals.pop_back();
3621
3622 if (RequestedExec.Ty->isIntegerTy(64)) {
3623 assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
3624 CLI.Outs.pop_back();
3625 CLI.OutVals.pop_back();
3626 }
3627
3628 assert(CLI.Outs.back().OrigArgIndex != 2 &&
3629 "Haven't popped all the pieces of the EXEC mask");
3630 }
3631
3632 const SDLoc &DL = CLI.DL;
3634 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3636 SDValue Chain = CLI.Chain;
3637 SDValue Callee = CLI.Callee;
3638 bool &IsTailCall = CLI.IsTailCall;
3639 bool IsVarArg = CLI.IsVarArg;
3640 bool IsSibCall = false;
3642
3643 if (Callee.isUndef() || isNullConstant(Callee)) {
3644 if (!CLI.IsTailCall) {
3645 for (ISD::InputArg &Arg : CLI.Ins)
3646 InVals.push_back(DAG.getUNDEF(Arg.VT));
3647 }
3648
3649 return Chain;
3650 }
3651
3652 if (IsVarArg) {
3653 return lowerUnhandledCall(CLI, InVals,
3654 "unsupported call to variadic function ");
3655 }
3656
3657 if (!CLI.CB)
3658 report_fatal_error("unsupported libcall legalization");
3659
3660 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3661 return lowerUnhandledCall(CLI, InVals,
3662 "unsupported required tail call to function ");
3663 }
3664
3665 if (IsTailCall) {
3667 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3668 if (!IsTailCall &&
3669 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3670 report_fatal_error("failed to perform tail call elimination on a call "
3671 "site marked musttail or on llvm.amdgcn.cs.chain");
3672 }
3673
3674 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3675
3676 // A sibling call is one where we're under the usual C ABI and not planning
3677 // to change that but can still do a tail call:
3678 if (!TailCallOpt && IsTailCall)
3679 IsSibCall = true;
3680
3681 if (IsTailCall)
3682 ++NumTailCalls;
3683 }
3684
3687 SmallVector<SDValue, 8> MemOpChains;
3688
3689 // Analyze operands of the call, assigning locations to each operand.
3691 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3692 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3693
3694 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
3695 // With a fixed ABI, allocate fixed registers before user arguments.
3696 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3697 }
3698
3699 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3700
3701 // Get a count of how many bytes are to be pushed on the stack.
3702 unsigned NumBytes = CCInfo.getStackSize();
3703
3704 if (IsSibCall) {
3705 // Since we're not changing the ABI to make this a tail call, the memory
3706 // operands are already available in the caller's incoming argument space.
3707 NumBytes = 0;
3708 }
3709
3710 // FPDiff is the byte offset of the call's argument area from the callee's.
3711 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3712 // by this amount for a tail call. In a sibling call it must be 0 because the
3713 // caller will deallocate the entire stack and the callee still expects its
3714 // arguments to begin at SP+0. Completely unused for non-tail calls.
3715 int32_t FPDiff = 0;
3716 MachineFrameInfo &MFI = MF.getFrameInfo();
3717
3718 // Adjust the stack pointer for the new arguments...
3719 // These operations are automatically eliminated by the prolog/epilog pass
3720 if (!IsSibCall)
3721 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3722
3723 if (!IsSibCall || IsChainCallConv) {
3724 if (!Subtarget->enableFlatScratch()) {
3725 SmallVector<SDValue, 4> CopyFromChains;
3726
3727 // In the HSA case, this should be an identity copy.
3728 SDValue ScratchRSrcReg
3729 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3730 RegsToPass.emplace_back(IsChainCallConv
3731 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3732 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3733 ScratchRSrcReg);
3734 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3735 Chain = DAG.getTokenFactor(DL, CopyFromChains);
3736 }
3737 }
3738
3739 MVT PtrVT = MVT::i32;
3740
3741 // Walk the register/memloc assignments, inserting copies/loads.
3742 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3743 CCValAssign &VA = ArgLocs[i];
3744 SDValue Arg = OutVals[i];
3745
3746 // Promote the value if needed.
3747 switch (VA.getLocInfo()) {
3748 case CCValAssign::Full:
3749 break;
3750 case CCValAssign::BCvt:
3751 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3752 break;
3753 case CCValAssign::ZExt:
3754 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3755 break;
3756 case CCValAssign::SExt:
3757 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3758 break;
3759 case CCValAssign::AExt:
3760 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3761 break;
3762 case CCValAssign::FPExt:
3763 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3764 break;
3765 default:
3766 llvm_unreachable("Unknown loc info!");
3767 }
3768
3769 if (VA.isRegLoc()) {
3770 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
3771 } else {
3772 assert(VA.isMemLoc());
3773
3774 SDValue DstAddr;
3775 MachinePointerInfo DstInfo;
3776
3777 unsigned LocMemOffset = VA.getLocMemOffset();
3778 int32_t Offset = LocMemOffset;
3779
3780 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3781 MaybeAlign Alignment;
3782
3783 if (IsTailCall) {
3784 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3785 unsigned OpSize = Flags.isByVal() ?
3786 Flags.getByValSize() : VA.getValVT().getStoreSize();
3787
3788 // FIXME: We can have better than the minimum byval required alignment.
3789 Alignment =
3790 Flags.isByVal()
3791 ? Flags.getNonZeroByValAlign()
3792 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3793
3794 Offset = Offset + FPDiff;
3795 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3796
3797 DstAddr = DAG.getFrameIndex(FI, PtrVT);
3798 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3799
3800 // Make sure any stack arguments overlapping with where we're storing
3801 // are loaded before this eventual operation. Otherwise they'll be
3802 // clobbered.
3803
3804 // FIXME: Why is this really necessary? This seems to just result in a
3805 // lot of code to copy the stack and write them back to the same
3806 // locations, which are supposed to be immutable?
3807 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3808 } else {
3809 // Stores to the argument stack area are relative to the stack pointer.
3810 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3811 MVT::i32);
3812 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3813 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3814 Alignment =
3815 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3816 }
3817
3818 if (Outs[i].Flags.isByVal()) {
3819 SDValue SizeNode =
3820 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3821 SDValue Cpy =
3822 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3823 Outs[i].Flags.getNonZeroByValAlign(),
3824 /*isVol = */ false, /*AlwaysInline = */ true,
3825 /*CI=*/nullptr, std::nullopt, DstInfo,
3827
3828 MemOpChains.push_back(Cpy);
3829 } else {
3830 SDValue Store =
3831 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3832 MemOpChains.push_back(Store);
3833 }
3834 }
3835 }
3836
3837 if (!MemOpChains.empty())
3838 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3839
3840 // Build a sequence of copy-to-reg nodes chained together with token chain
3841 // and flag operands which copy the outgoing args into the appropriate regs.
3842 SDValue InGlue;
3843 for (auto &RegToPass : RegsToPass) {
3844 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3845 RegToPass.second, InGlue);
3846 InGlue = Chain.getValue(1);
3847 }
3848
3849
3850 // We don't usually want to end the call-sequence here because we would tidy
3851 // the frame up *after* the call. However, in the ABI-changing tail-call case
3852 // we've carefully laid out the parameters so that when sp is reset they'll be
3853 // in the correct location.
3854 if (IsTailCall && !IsSibCall) {
3855 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
3856 InGlue = Chain.getValue(1);
3857 }
3858
3859 std::vector<SDValue> Ops;
3860 Ops.push_back(Chain);
3861 Ops.push_back(Callee);
3862 // Add a redundant copy of the callee global which will not be legalized, as
3863 // we need direct access to the callee later.
3864 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3865 const GlobalValue *GV = GSD->getGlobal();
3866 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3867 } else {
3868 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3869 }
3870
3871 if (IsTailCall) {
3872 // Each tail call may have to adjust the stack by a different amount, so
3873 // this information must travel along with the operation for eventual
3874 // consumption by emitEpilogue.
3875 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3876 }
3877
3878 if (IsChainCallConv)
3879 Ops.push_back(RequestedExec.Node);
3880
3881 // Add argument registers to the end of the list so that they are known live
3882 // into the call.
3883 for (auto &RegToPass : RegsToPass) {
3884 Ops.push_back(DAG.getRegister(RegToPass.first,
3885 RegToPass.second.getValueType()));
3886 }
3887
3888 // Add a register mask operand representing the call-preserved registers.
3889 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3890 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3891 assert(Mask && "Missing call preserved mask for calling convention");
3892 Ops.push_back(DAG.getRegisterMask(Mask));
3893
3894 if (SDValue Token = CLI.ConvergenceControlToken) {
3896 GlueOps.push_back(Token);
3897 if (InGlue)
3898 GlueOps.push_back(InGlue);
3899
3900 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
3901 MVT::Glue, GlueOps),
3902 0);
3903 }
3904
3905 if (InGlue)
3906 Ops.push_back(InGlue);
3907
3908 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3909
3910 // If we're doing a tail call, use a TC_RETURN here rather than an
3911 // actual call instruction.
3912 if (IsTailCall) {
3913 MFI.setHasTailCall();
3914 unsigned OPC = AMDGPUISD::TC_RETURN;
3915 switch (CallConv) {
3916 case CallingConv::AMDGPU_Gfx:
3917 OPC = AMDGPUISD::TC_RETURN_GFX;
3918 break;
3919 case CallingConv::AMDGPU_CS_Chain:
3920 case CallingConv::AMDGPU_CS_ChainPreserve:
3921 OPC = AMDGPUISD::TC_RETURN_CHAIN;
3922 break;
3923 }
3924
3925 return DAG.getNode(OPC, DL, NodeTys, Ops);
3926 }
3927
3928 // Returns a chain and a flag for retval copy to use.
3929 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
3930 Chain = Call.getValue(0);
3931 InGlue = Call.getValue(1);
3932
3933 uint64_t CalleePopBytes = NumBytes;
3934 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
3935 if (!Ins.empty())
3936 InGlue = Chain.getValue(1);
3937
3938 // Handle result values, copying them out of physregs into vregs that we
3939 // return.
3940 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
3941 InVals, /*IsThisReturn=*/false, SDValue());
3942}
3943
3944// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
3945// except for applying the wave size scale to the increment amount.
3947 SDValue Op, SelectionDAG &DAG) const {
3948 const MachineFunction &MF = DAG.getMachineFunction();
3950
3951 SDLoc dl(Op);
3952 EVT VT = Op.getValueType();
3953 SDValue Tmp1 = Op;
3954 SDValue Tmp2 = Op.getValue(1);
3955 SDValue Tmp3 = Op.getOperand(2);
3956 SDValue Chain = Tmp1.getOperand(0);
3957
3958 Register SPReg = Info->getStackPtrOffsetReg();
3959
3960 // Chain the dynamic stack allocation so that it doesn't modify the stack
3961 // pointer when other instructions are using the stack.
3962 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
3963
3964 SDValue Size = Tmp2.getOperand(1);
3965 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
3966 Chain = SP.getValue(1);
3967 MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
3968 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
3969 unsigned Opc =
3972
3973 SDValue ScaledSize = DAG.getNode(
3974 ISD::SHL, dl, VT, Size,
3975 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
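// For illustration: the stack pointer here is a wave-uniform SGPR that
// addresses the scratch backing of every lane, so a per-lane allocation of
// Size bytes must be scaled by the wave size (Size << 6 for wave64,
// Size << 5 for wave32).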
3976
3977 Align StackAlign = TFL->getStackAlign();
3978 Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
3979 if (Alignment && *Alignment > StackAlign) {
3980 Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
3981 DAG.getConstant(-(uint64_t)Alignment->value()
3982 << Subtarget->getWavefrontSizeLog2(),
3983 dl, VT));
3984 }
3985
3986 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
3987 Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
3988
3989 return DAG.getMergeValues({Tmp1, Tmp2}, dl);
3990}
3991
3993 SelectionDAG &DAG) const {
3994 // We only handle constant sizes here to allow non-entry block, static sized
3995 // allocas. A truly dynamic value is more difficult to support because we
3996 // don't know if the size value is uniform or not. If the size isn't uniform,
3997 // we would need to do a wave reduction to get the maximum size to know how
3998 // much to increment the uniform stack pointer.
3999 SDValue Size = Op.getOperand(1);
4000 if (isa<ConstantSDNode>(Size))
4001 return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
4002
4004}
4005
4007 if (Op.getValueType() != MVT::i32)
4008 return Op; // Defer to cannot select error.
4009
4011 SDLoc SL(Op);
4012
4013 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4014
4015 // Convert from wave uniform to swizzled vector address. This should protect
4016 // from any edge cases where the stacksave result isn't directly used with
4017 // stackrestore.
4018 SDValue VectorAddress =
4019 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4020 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4021}
4022
4024 SelectionDAG &DAG) const {
4025 SDLoc SL(Op);
4026 assert(Op.getValueType() == MVT::i32);
4027
4028 uint32_t BothRoundHwReg =
4030 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4031
4032 SDValue IntrinID =
4033 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4034 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4035 Op.getOperand(0), IntrinID, GetRoundBothImm);
4036
4037 // There are two rounding modes, one for f32 and one for f64/f16. We only
4038 // report in the standard value range if both are the same.
4039 //
4040 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4041 // ties away from zero is not supported, and the other values are rotated by
4042 // 1.
4043 //
4044 // If the two rounding modes are not the same, report a target defined value.
4045
4046 // Mode register rounding mode fields:
4047 //
4048 // [1:0] Single-precision round mode.
4049 // [3:2] Double/Half-precision round mode.
4050 //
4051 // 0 = nearest even; 1 = +infinity; 2 = -infinity; 3 = toward zero.
4052 //
4053 //                Hardware   Spec
4054 // Toward-0          3        0
4055 // Nearest Even      0        1
4056 // +Inf              1        2
4057 // -Inf              2        3
4058 // NearestAway0     N/A       4
4059 //
4060 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4061 // table we can index by the raw hardware mode.
4062 //
4063 // (trunc (FltRoundConversionTable >> (MODE.fp_round * 4))) & 0xf
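// Worked example (illustrative): a raw MODE.fp_round value of 5 (0b0101, both
// fields set to round toward +infinity) gives a shift amount of 20, i.e. the
// entry is (FltRoundConversionTable >> 20) & 0xf, bits [23:20] of the table.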
4064
4065 SDValue BitTable =
4067
4068 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4069 SDValue RoundModeTimesNumBits =
4070 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4071
4072 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4073 // knew only one mode was demanded.
4074 SDValue TableValue =
4075 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4076 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4077
4078 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4079 SDValue TableEntry =
4080 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4081
4082 // There's a gap in the 4-bit encoded table and actual enum values, so offset
4083 // if it's an extended value.
4084 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4085 SDValue IsStandardValue =
4086 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4087 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4088 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4089 TableEntry, EnumOffset);
4090
4091 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4092}
4093
4095 SelectionDAG &DAG) const {
4096 SDLoc SL(Op);
4097
4098 SDValue NewMode = Op.getOperand(1);
4099 assert(NewMode.getValueType() == MVT::i32);
4100
4101 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4102 // hardware MODE.fp_round values.
4103 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4104 uint32_t ClampedVal = std::min(
4105 static_cast<uint32_t>(ConstMode->getZExtValue()),
4107 NewMode = DAG.getConstant(
4108 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4109 } else {
4110 // If we know the input can only be one of the supported standard modes in
4111 // the range 0-3, we can use a simplified mapping to hardware values.
4112 KnownBits KB = DAG.computeKnownBits(NewMode);
4113 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
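// (A 32-bit value with at least 30 known leading zero bits fits in 2 bits,
// i.e. it must be one of the standard FLT_ROUNDS modes 0-3.)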
4114 // The supported standard values are 0-3. The extended values start at 8. We
4115 // need to offset by 4 if the value is in the extended range.
4116
4117 if (UseReducedTable) {
4118 // Only the standard modes 0-3 are possible here, so the low 16 bits of the
4119 // table suffice; materialize them in a 32-bit constant.
4119 SDValue BitTable = DAG.getConstant(
4120 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4121
4122 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4123 SDValue RoundModeTimesNumBits =
4124 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4125
4126 NewMode =
4127 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4128
4129 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4130 // the table extracted bits into inline immediates.
4131 } else {
4132 // table_index = umin(value, value - 4)
4133 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
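// For illustration: a standard value such as 2 gives umin(2, 2 - 4) = 2 (the
// subtraction wraps around in 32 bits), while the first extended value 8 gives
// umin(8, 4) = 4, so the extended modes index the table entries immediately
// after the standard ones.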
4134 SDValue BitTable =
4136
4137 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4138 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4139 SDValue IndexVal =
4140 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4141
4142 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4143 SDValue RoundModeTimesNumBits =
4144 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4145
4146 SDValue TableValue =
4147 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4148 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4149
4150 // No need to mask out the high bits since the setreg will ignore them
4151 // anyway.
4152 NewMode = TruncTable;
4153 }
4154
4155 // Insert a readfirstlane in case the value is a VGPR. We could do this
4156 // earlier and keep more operations scalar, but that interferes with
4157 // combining the source.
4158 SDValue ReadFirstLaneID =
4159 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4160 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4161 ReadFirstLaneID, NewMode);
4162 }
4163
4164 // N.B. The setreg will be later folded into s_round_mode on supported
4165 // targets.
4166 SDValue IntrinID =
4167 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4168 uint32_t BothRoundHwReg =
4170 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4171
4172 SDValue SetReg =
4173 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4174 IntrinID, RoundBothImm, NewMode);
4175
4176 return SetReg;
4177}
4178
4180 if (Op->isDivergent())
4181 return SDValue();
4182
4183 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4188 break;
4189 default:
4190 return SDValue();
4191 }
4192
4193 return Op;
4194}
4195
4196// Work around DAG legality rules only based on the result type.
4198 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4199 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4200 EVT SrcVT = Src.getValueType();
4201
4202 if (SrcVT.getScalarType() != MVT::bf16)
4203 return Op;
4204
4205 SDLoc SL(Op);
4206 SDValue BitCast =
4207 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4208
4209 EVT DstVT = Op.getValueType();
4210 if (IsStrict)
4211 llvm_unreachable("Need STRICT_BF16_TO_FP");
4212
4213 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4214}
4215
4217 SDLoc SL(Op);
4218 if (Op.getValueType() != MVT::i64)
4219 return Op;
4220
4221 uint32_t ModeHwReg =
4223 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4224 uint32_t TrapHwReg =
4226 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4227
4228 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4229 SDValue IntrinID =
4230 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4231 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4232 Op.getOperand(0), IntrinID, ModeHwRegImm);
4233 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4234 Op.getOperand(0), IntrinID, TrapHwRegImm);
4235 SDValue TokenReg =
4236 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4237 GetTrapReg.getValue(1));
4238
4239 SDValue CvtPtr =
4240 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4241 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4242
4243 return DAG.getMergeValues({Result, TokenReg}, SL);
4244}
4245
4247 SDLoc SL(Op);
4248 if (Op.getOperand(1).getValueType() != MVT::i64)
4249 return Op;
4250
4251 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4252 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4253 DAG.getConstant(0, SL, MVT::i32));
4254 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4255 DAG.getConstant(1, SL, MVT::i32));
4256
4257 SDValue ReadFirstLaneID =
4258 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4259 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4260 ReadFirstLaneID, NewModeReg);
4261 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4262 ReadFirstLaneID, NewTrapReg);
4263
4264 unsigned ModeHwReg =
4266 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4267 unsigned TrapHwReg =
4269 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4270
4271 SDValue IntrinID =
4272 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4273 SDValue SetModeReg =
4274 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4275 IntrinID, ModeHwRegImm, NewModeReg);
4276 SDValue SetTrapReg =
4277 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4278 IntrinID, TrapHwRegImm, NewTrapReg);
4279 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4280}
4281
4283 const MachineFunction &MF) const {
4285 .Case("m0", AMDGPU::M0)
4286 .Case("exec", AMDGPU::EXEC)
4287 .Case("exec_lo", AMDGPU::EXEC_LO)
4288 .Case("exec_hi", AMDGPU::EXEC_HI)
4289 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4290 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4291 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4292 .Default(Register());
4293
4294 if (Reg == AMDGPU::NoRegister) {
4295 report_fatal_error(Twine("invalid register name \""
4296 + StringRef(RegName) + "\"."));
4297
4298 }
4299
4300 if (!Subtarget->hasFlatScrRegister() &&
4301 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4302 report_fatal_error(Twine("invalid register \""
4303 + StringRef(RegName) + "\" for subtarget."));
4304 }
4305
4306 switch (Reg) {
4307 case AMDGPU::M0:
4308 case AMDGPU::EXEC_LO:
4309 case AMDGPU::EXEC_HI:
4310 case AMDGPU::FLAT_SCR_LO:
4311 case AMDGPU::FLAT_SCR_HI:
4312 if (VT.getSizeInBits() == 32)
4313 return Reg;
4314 break;
4315 case AMDGPU::EXEC:
4316 case AMDGPU::FLAT_SCR:
4317 if (VT.getSizeInBits() == 64)
4318 return Reg;
4319 break;
4320 default:
4321 llvm_unreachable("missing register type checking");
4322 }
4323
4324 report_fatal_error(Twine("invalid type for register \""
4325 + StringRef(RegName) + "\"."));
4326}
4327
4328// If kill is not the last instruction, split the block so kill is always a
4329// proper terminator.
4332 MachineBasicBlock *BB) const {
4333 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
4335 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4336 return SplitBB;
4337}
4338
4339// Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
4340// \p MI will be the only instruction in the loop body block. Otherwise, it will
4341// be the first instruction in the remainder block.
4342//
4343/// \returns { LoopBody, Remainder }
4344static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4348
4349 // To insert the loop we need to split the block. Move everything after this
4350 // point to a new block, and insert a new empty block between the two.
4352 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4354 ++MBBI;
4355
4356 MF->insert(MBBI, LoopBB);
4357 MF->insert(MBBI, RemainderBB);
4358
4359 LoopBB->addSuccessor(LoopBB);
4360 LoopBB->addSuccessor(RemainderBB);
4361
4362 // Move the rest of the block into a new block.
4363 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4364
4365 if (InstInLoop) {
4366 auto Next = std::next(I);
4367
4368 // Move instruction to loop body.
4369 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4370
4371 // Move the rest of the block.
4372 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4373 } else {
4374 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4375 }
4376
4377 MBB.addSuccessor(LoopBB);
4378
4379 return std::pair(LoopBB, RemainderBB);
4380}
4381
4382/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4384 MachineBasicBlock *MBB = MI.getParent();
4386 auto I = MI.getIterator();
4387 auto E = std::next(I);
4388
4389 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4390 .addImm(0);
4391
4392 MIBundleBuilder Bundler(*MBB, I, E);
4393 finalizeBundle(*MBB, Bundler.begin());
4394}
4395
4398 MachineBasicBlock *BB) const {
4399 const DebugLoc &DL = MI.getDebugLoc();
4400
4402
4403 MachineBasicBlock *LoopBB;
4404 MachineBasicBlock *RemainderBB;
4406
4407 // Apparently kill flags are only valid if the def is in the same block?
4408 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4409 Src->setIsKill(false);
4410
4411 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
4412
4413 MachineBasicBlock::iterator I = LoopBB->end();
4414
4415 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4417
4418 // Clear TRAP_STS.MEM_VIOL
4419 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4420 .addImm(0)
4421 .addImm(EncodedReg);
4422
4424
4425 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4426
4427 // Load and check TRAP_STS.MEM_VIOL
4428 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4429 .addImm(EncodedReg);
4430
4431 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4432 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4433 .addReg(Reg, RegState::Kill)
4434 .addImm(0);
4435 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4436 .addMBB(LoopBB);
4437
4438 return RemainderBB;
4439}
4440
4441// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4442// wavefront. If the value is uniform and just happens to be in a VGPR, this
4443// will only do one iteration. In the worst case, this will loop 64 times.
4444//
4445// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
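// Sketch of one iteration of the loop emitted below: v_readfirstlane_b32 picks
// the index from the first remaining active lane, v_cmp_eq_u32 builds the mask
// of lanes holding that same index, s_and_saveexec narrows EXEC to that subset,
// the indexed access executes for those lanes, and s_xor_*_term clears them
// from EXEC before s_cbranch_execnz loops back while any lanes remain.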
4448 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4449 const DebugLoc &DL, const MachineOperand &Idx,
4450 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4451 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4452 Register &SGPRIdxReg) {
4453
4454 MachineFunction *MF = OrigBB.getParent();
4455 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4456 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4458
4459 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4460 Register PhiExec = MRI.createVirtualRegister(BoolRC);
4461 Register NewExec = MRI.createVirtualRegister(BoolRC);
4462 Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4463 Register CondReg = MRI.createVirtualRegister(BoolRC);
4464
4465 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4466 .addReg(InitReg)
4467 .addMBB(&OrigBB)
4468 .addReg(ResultReg)
4469 .addMBB(&LoopBB);
4470
4471 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4472 .addReg(InitSaveExecReg)
4473 .addMBB(&OrigBB)
4474 .addReg(NewExec)
4475 .addMBB(&LoopBB);
4476
4477 // Read the next variant <- also loop target.
4478 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4479 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
4480
4481 // Compare the just read M0 value to all possible Idx values.
4482 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4483 .addReg(CurrentIdxReg)
4484 .addReg(Idx.getReg(), 0, Idx.getSubReg());
4485
4486 // Update EXEC, save the original EXEC value to VCC.
4487 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4488 : AMDGPU::S_AND_SAVEEXEC_B64),
4489 NewExec)
4490 .addReg(CondReg, RegState::Kill);
4491
4492 MRI.setSimpleHint(NewExec, CondReg);
4493
4494 if (UseGPRIdxMode) {
4495 if (Offset == 0) {
4496 SGPRIdxReg = CurrentIdxReg;
4497 } else {
4498 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4499 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4500 .addReg(CurrentIdxReg, RegState::Kill)
4501 .addImm(Offset);
4502 }
4503 } else {
4504 // Move index from VCC into M0
4505 if (Offset == 0) {
4506 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4507 .addReg(CurrentIdxReg, RegState::Kill);
4508 } else {
4509 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4510 .addReg(CurrentIdxReg, RegState::Kill)
4511 .addImm(Offset);
4512 }
4513 }
4514
4515 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4516 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4517 MachineInstr *InsertPt =
4518 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4519 : AMDGPU::S_XOR_B64_term), Exec)
4520 .addReg(Exec)
4521 .addReg(NewExec);
4522
4523 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4524 // s_cbranch_scc0?
4525
4526 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4527 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4528 .addMBB(&LoopBB);
4529
4530 return InsertPt->getIterator();
4531}
4532
4533 // This has slightly sub-optimal register allocation when the source vector is
4534 // killed by the read. The register allocator does not understand that the kill
4535 // is per-workitem, so the vector is kept alive for the whole loop and we end up
4536 // not reusing a subregister from it, using one more VGPR than necessary. That
4537 // extra VGPR was saved back when this was expanded after register allocation.
4540 unsigned InitResultReg, unsigned PhiReg, int Offset,
4541 bool UseGPRIdxMode, Register &SGPRIdxReg) {
4543 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4544 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4546 const DebugLoc &DL = MI.getDebugLoc();
4548
4549 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4550 Register DstReg = MI.getOperand(0).getReg();
4551 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4552 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
4553 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4554 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4555
4556 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4557
4558 // Save the EXEC mask
4559 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
4560 .addReg(Exec);
4561
4562 MachineBasicBlock *LoopBB;
4563 MachineBasicBlock *RemainderBB;
4564 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
4565
4566 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4567
4568 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
4569 InitResultReg, DstReg, PhiReg, TmpExec,
4570 Offset, UseGPRIdxMode, SGPRIdxReg);
4571
4572 MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock();
4574 ++MBBI;
4575 MF->insert(MBBI, LandingPad);
4576 LoopBB->removeSuccessor(RemainderBB);
4577 LandingPad->addSuccessor(RemainderBB);
4578 LoopBB->addSuccessor(LandingPad);
4579 MachineBasicBlock::iterator First = LandingPad->begin();
4580 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
4581 .addReg(SaveExec);
4582
4583 return InsPt;
4584}
4585
4586// Returns subreg index, offset
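// For example, with a 128-bit (4 x 32-bit) vector a constant offset of 2 yields
// {sub2, 0}, while an out-of-bounds offset such as 7 is returned unchanged as
// {sub0, 7} so it gets folded into the index register at runtime instead.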
4587static std::pair<unsigned, int>
4589 const TargetRegisterClass *SuperRC,
4590 unsigned VecReg,
4591 int Offset) {
4592 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
4593
4594 // Skip out of bounds offsets, or else we would end up using an undefined
4595 // register.
4596 if (Offset >= NumElts || Offset < 0)
4597 return std::pair(AMDGPU::sub0, Offset);
4598
4599 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
4600}
4601
4604 int Offset) {
4605 MachineBasicBlock *MBB = MI.getParent();
4606 const DebugLoc &DL = MI.getDebugLoc();
4608
4609 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4610
4611 assert(Idx->getReg() != AMDGPU::NoRegister);
4612
4613 if (Offset == 0) {
4614 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx);
4615 } else {
4616 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4617 .add(*Idx)
4618 .addImm(Offset);
4619 }
4620}
4621
4624 int Offset) {
4625 MachineBasicBlock *MBB = MI.getParent();
4626 const DebugLoc &DL = MI.getDebugLoc();
4628
4629 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4630
4631 if (Offset == 0)
4632 return Idx->getReg();
4633
4634 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4635 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
4636 .add(*Idx)
4637 .addImm(Offset);
4638 return Tmp;
4639}
4640
4643 const GCNSubtarget &ST) {
4644 const SIInstrInfo *TII = ST.getInstrInfo();
4645 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4648
4649 Register Dst = MI.getOperand(0).getReg();
4650 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4651 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
4652 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4653
4654 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
4655 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4656
4657 unsigned SubReg;
4658 std::tie(SubReg, Offset)
4659 = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
4660
4661 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4662
4663 // Check for a SGPR index.
4664 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4666 const DebugLoc &DL = MI.getDebugLoc();
4667
4668 if (UseGPRIdxMode) {
4669 // TODO: Look at the uses to avoid the copy. This may require rescheduling
4670 // to avoid interfering with other uses, so probably requires a new
4671 // optimization pass.
4673
4674 const MCInstrDesc &GPRIDXDesc =
4675 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4676 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4677 .addReg(SrcReg)
4678 .addReg(Idx)
4679 .addImm(SubReg);
4680 } else {
4682
4683 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4684 .addReg(SrcReg, 0, SubReg)
4685 .addReg(SrcReg, RegState::Implicit);
4686 }
4687
4688 MI.eraseFromParent();
4689
4690 return &MBB;
4691 }
4692
4693 // Control flow needs to be inserted if indexing with a VGPR.
4694 const DebugLoc &DL = MI.getDebugLoc();
4696
4697 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4698 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4699
4700 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
4701
4702 Register SGPRIdxReg;
4703 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
4704 UseGPRIdxMode, SGPRIdxReg);
4705
4706 MachineBasicBlock *LoopBB = InsPt->getParent();
4707
4708 if (UseGPRIdxMode) {
4709 const MCInstrDesc &GPRIDXDesc =
4710 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4711
4712 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4713 .addReg(SrcReg)
4714 .addReg(SGPRIdxReg)
4715 .addImm(SubReg);
4716 } else {
4717 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4718 .addReg(SrcReg, 0, SubReg)
4719 .addReg(SrcReg, RegState::Implicit);
4720 }
4721
4722 MI.eraseFromParent();
4723
4724 return LoopBB;
4725}
4726
4729 const GCNSubtarget &ST) {
4730 const SIInstrInfo *TII = ST.getInstrInfo();
4731 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4734
4735 Register Dst = MI.getOperand(0).getReg();
4736 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
4737 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4738 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
4739 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4740 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
4741 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4742
4743 // This can be an immediate, but will be folded later.
4744 assert(Val->getReg());
4745
4746 unsigned SubReg;
4747 std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
4748 SrcVec->getReg(),
4749 Offset);
4750 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4751
4752 if (Idx->getReg() == AMDGPU::NoRegister) {
4754 const DebugLoc &DL = MI.getDebugLoc();
4755
4756 assert(Offset == 0);
4757
4758 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
4759 .add(*SrcVec)
4760 .add(*Val)
4761 .addImm(SubReg);
4762
4763 MI.eraseFromParent();
4764 return &MBB;
4765 }
4766
4767 // Check for a SGPR index.
4768 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4770 const DebugLoc &DL = MI.getDebugLoc();
4771
4772 if (UseGPRIdxMode) {
4774
4775 const MCInstrDesc &GPRIDXDesc =
4776 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4777 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4778 .addReg(SrcVec->getReg())
4779 .add(*Val)
4780 .addReg(Idx)
4781 .addImm(SubReg);
4782 } else {
4784
4785 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4786 TRI.getRegSizeInBits(*VecRC), 32, false);
4787 BuildMI(MBB, I, DL, MovRelDesc, Dst)
4788 .addReg(SrcVec->getReg())
4789 .add(*Val)
4790 .addImm(SubReg);
4791 }
4792 MI.eraseFromParent();
4793 return &MBB;
4794 }
4795
4796 // Control flow needs to be inserted if indexing with a VGPR.
4797 if (Val->isReg())
4798 MRI.clearKillFlags(Val->getReg());
4799
4800 const DebugLoc &DL = MI.getDebugLoc();
4801
4802 Register PhiReg = MRI.createVirtualRegister(VecRC);
4803
4804 Register SGPRIdxReg;
4805 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
4806 UseGPRIdxMode, SGPRIdxReg);
4807 MachineBasicBlock *LoopBB = InsPt->getParent();
4808
4809 if (UseGPRIdxMode) {
4810 const MCInstrDesc &GPRIDXDesc =
4811 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4812
4813 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4814 .addReg(PhiReg)
4815 .add(*Val)
4816 .addReg(SGPRIdxReg)
4817 .addImm(SubReg);
4818 } else {
4819 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4820 TRI.getRegSizeInBits(*VecRC), 32, false);
4821 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
4822 .addReg(PhiReg)
4823 .add(*Val)
4824 .addImm(SubReg);
4825 }
4826
4827 MI.eraseFromParent();
4828 return LoopBB;
4829}
4830
4833 const GCNSubtarget &ST,
4834 unsigned Opc) {
4836 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4837 const DebugLoc &DL = MI.getDebugLoc();
4838 const SIInstrInfo *TII = ST.getInstrInfo();
4839
4840 // Reduction operations depend on whether the input operand is SGPR or VGPR.
4841 Register SrcReg = MI.getOperand(1).getReg();
4842 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
4843 Register DstReg = MI.getOperand(0).getReg();
4844 MachineBasicBlock *RetBB = nullptr;
4845 if (isSGPR) {
4846 // These operations are idempotent on a uniform value, i.e. an SGPR input;
4847 // the reduced value is the same as the given SGPR.
4848 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
4849 RetBB = &BB;
4850 } else {
4851 // TODO: Implement DPP Strategy and switch based on immediate strategy
4852 // operand. For now, for all the cases (default, Iterative and DPP) we use
4853 // the iterative approach by default.
4854
4855 // To reduce the VGPR with the iterative approach, we need to iterate over
4856 // all the active lanes. The lowering consists of a ComputeLoop that visits
4857 // only the active lanes. A copy of the EXEC register is used as the
4858 // induction variable; each iteration clears the bit of the lane it just
4859 // processed (bitset0) so the next iteration picks the next active lane.
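// Roughly, the generated loop behaves like this illustrative sketch:
//   acc = identity; live = EXEC;
//   do {
//     lane = ff1(live);                    // lowest remaining active lane
//     acc  = op(acc, readlane(src, lane)); // fold that lane's value in
//     live = bitset0(live, lane);          // retire the lane
//   } while (live != 0);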
4861 Register SrcReg = MI.getOperand(1).getReg();
4862
4863 // Create control flow for the loop by splitting MI's basic block
4864 // into a for-loop structure.
4865 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
4866
4867 // Create virtual registers required for lowering.
4868 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4869 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
4870 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
4871 Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
4872
4873 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
4874 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4875 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4876
4877 Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
4878 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
4879
4880 bool IsWave32 = ST.isWave32();
4881 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4882 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4883
4884 // Create the initial values of the induction variable (from EXEC) and the
4885 // accumulator, and insert a branch to the newly created ComputeLoop block.
4886 uint32_t InitalValue =
4887 (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
4888 auto TmpSReg =
4889 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
4890 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
4891 .addImm(InitalValue);
4892 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
4893
4894 // Start constructing ComputeLoop
4895 I = ComputeLoop->end();
4896 auto Accumulator =
4897 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
4898 .addReg(InitalValReg)
4899 .addMBB(&BB);
4900 auto ActiveBits =
4901 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
4902 .addReg(TmpSReg->getOperand(0).getReg())
4903 .addMBB(&BB);
4904
4905 // Perform the computations
4906 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
4907 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
4908 .addReg(ActiveBits->getOperand(0).getReg());
4909 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
4910 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
4911 .addReg(SrcReg)
4912 .addReg(FF1->getOperand(0).getReg());
4913 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
4914 .addReg(Accumulator->getOperand(0).getReg())
4915 .addReg(LaneValue->getOperand(0).getReg());
4916
4917 // Manipulate the iterator to get the next active lane
4918 unsigned BITSETOpc =
4919 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
4920 auto NewActiveBits =
4921 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
4922 .addReg(FF1->getOperand(0).getReg())
4923 .addReg(ActiveBits->getOperand(0).getReg());
4924
4925 // Add phi nodes
4926 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
4927 .addMBB(ComputeLoop);
4928 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
4929 .addMBB(ComputeLoop);
4930
4931 // Create the loop-back branch.
4932 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
4933 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
4934 .addReg(NewActiveBits->getOperand(0).getReg())
4935 .addImm(0);
4936 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4937 .addMBB(ComputeLoop);
4938
4939 RetBB = ComputeEnd;
4940 }
4941 MI.eraseFromParent();
4942 return RetBB;
4943}
4944
4946 MachineInstr &MI, MachineBasicBlock *BB) const {
4947
4949 MachineFunction *MF = BB->getParent();
4951
4952 switch (MI.getOpcode()) {
4953 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
4954 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
4955 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
4956 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
4957 case AMDGPU::S_UADDO_PSEUDO:
4958 case AMDGPU::S_USUBO_PSEUDO: {
4959 const DebugLoc &DL = MI.getDebugLoc();
4960 MachineOperand &Dest0 = MI.getOperand(0);
4961 MachineOperand &Dest1 = MI.getOperand(1);
4962 MachineOperand &Src0 = MI.getOperand(2);
4963 MachineOperand &Src1 = MI.getOperand(3);
4964
4965 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
4966 ? AMDGPU::S_ADD_I32
4967 : AMDGPU::S_SUB_I32;
4968 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()).add(Src0).add(Src1);
4969
4970 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
4971 .addImm(1)
4972 .addImm(0);
4973
4974 MI.eraseFromParent();
4975 return BB;
4976 }
4977 case AMDGPU::S_ADD_U64_PSEUDO:
4978 case AMDGPU::S_SUB_U64_PSEUDO: {
4979 // For targets older than GFX12, we emit a sequence of 32-bit operations.
4980 // For GFX12, we emit s_add_u64 and s_sub_u64.
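// Illustrative shape of the pre-GFX12 expansion of a 64-bit add:
//   dst.lo = s_add_u32 (src0.lo, src1.lo)   ; sets SCC to the carry-out
//   dst.hi = s_addc_u32(src0.hi, src1.hi)   ; consumes SCC as the carry-in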
4981 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4983 const DebugLoc &DL = MI.getDebugLoc();
4984 MachineOperand &Dest = MI.getOperand(0);
4985 MachineOperand &Src0 = MI.getOperand(1);
4986 MachineOperand &Src1 = MI.getOperand(2);
4987 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
4988 if (Subtarget->hasScalarAddSub64()) {
4989 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
4990 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
4991 .add(Src0)
4992 .add(Src1);
4993 } else {
4994 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4995 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4996
4997 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4998 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4999
5000 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5001 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5002 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5003 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5004
5005 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5006 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5007 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5008 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5009
5010 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5011 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5012 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5013 .add(Src0Sub0)
5014 .add(Src1Sub0);
5015 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5016 .add(Src0Sub1)
5017 .add(Src1Sub1);
5018 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5019 .addReg(DestSub0)
5020 .addImm(AMDGPU::sub0)
5021 .addReg(DestSub1)
5022 .addImm(AMDGPU::sub1);
5023 }
5024 MI.eraseFromParent();
5025 return BB;
5026 }
5027 case AMDGPU::V_ADD_U64_PSEUDO:
5028 case AMDGPU::V_SUB_U64_PSEUDO: {
5030 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5031 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5032 const DebugLoc &DL = MI.getDebugLoc();
5033
5034 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5035
5036 MachineOperand &Dest = MI.getOperand(0);
5037 MachineOperand &Src0 = MI.getOperand(1);
5038 MachineOperand &Src1 = MI.getOperand(2);
5039
5040 if (IsAdd && ST.hasLshlAddB64()) {
5041 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
5042 Dest.getReg())
5043 .add(Src0)
5044 .addImm(0)
5045 .add(Src1);
5046 TII->legalizeOperands(*Add);
5047 MI.eraseFromParent();
5048 return BB;
5049 }
5050
5051 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5052
5053 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5054 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5055
5056 Register CarryReg = MRI.createVirtualRegister(CarryRC);
5057 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
5058
5059 const TargetRegisterClass *Src0RC = Src0.isReg()
5060 ? MRI.getRegClass(Src0.getReg())
5061 : &AMDGPU::VReg_64RegClass;
5062 const TargetRegisterClass *Src1RC = Src1.isReg()
5063 ? MRI.getRegClass(Src1.getReg())
5064 : &AMDGPU::VReg_64RegClass;
5065
5066 const TargetRegisterClass *Src0SubRC =
5067 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5068 const TargetRegisterClass *Src1SubRC =
5069 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5070
5071 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
5072 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5073 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
5074 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5075
5076 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
5077 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5078 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
5079 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5080
5081 unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5082 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5083 .addReg(CarryReg, RegState::Define)
5084 .add(SrcReg0Sub0)
5085 .add(SrcReg1Sub0)
5086 .addImm(0); // clamp bit
5087
5088 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5089 MachineInstr *HiHalf =
5090 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5091 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
5092 .add(SrcReg0Sub1)
5093 .add(SrcReg1Sub1)
5094 .addReg(CarryReg, RegState::Kill)
5095 .addImm(0); // clamp bit
5096
5097 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5098 .addReg(DestSub0)
5099 .addImm(AMDGPU::sub0)
5100 .addReg(DestSub1)
5101 .addImm(AMDGPU::sub1);
5102 TII->legalizeOperands(*LoHalf);
5103 TII->legalizeOperands(*HiHalf);
5104 MI.eraseFromParent();
5105 return BB;
5106 }
5107 case AMDGPU::S_ADD_CO_PSEUDO:
5108 case AMDGPU::S_SUB_CO_PSEUDO: {
5109 // This pseudo can only be selected from a uniform add/subcarry node, so all
5110 // of the VGPR operands are assumed to be splat vectors (wave-uniform values
5111 // that merely happen to live in VGPRs).
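// Because the operands are wave-uniform, the v_readfirstlane_b32 copies below
// move any VGPR operand into an SGPR without changing its value.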
5113 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5114 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5116 const DebugLoc &DL = MI.getDebugLoc();
5117 MachineOperand &Dest = MI.getOperand(0);
5118 MachineOperand &CarryDest = MI.getOperand(1);
5119 MachineOperand &Src0 = MI.getOperand(2);
5120 MachineOperand &Src1 = MI.getOperand(3);
5121 MachineOperand &Src2 = MI.getOperand(4);
5122 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5123 ? AMDGPU::S_ADDC_U32
5124 : AMDGPU::S_SUBB_U32;
5125 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5126 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5127 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5128 .addReg(Src0.getReg());
5129 Src0.setReg(RegOp0);
5130 }
5131 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5132 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5133 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5134 .addReg(Src1.getReg());
5135 Src1.setReg(RegOp1);
5136 }
5137 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5138 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
5139 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5140 .addReg(Src2.getReg());
5141 Src2.setReg(RegOp2);
5142 }
5143
5144 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
5145 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
5146 assert(WaveSize == 64 || WaveSize == 32);
5147
5148 if (WaveSize == 64) {
5149 if (ST.hasScalarCompareEq64()) {
5150 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
5151 .addReg(Src2.getReg())
5152 .addImm(0);
5153 } else {
5154 const TargetRegisterClass *SubRC =
5155 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5156 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5157 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5158 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5159 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5160 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5161
5162 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
5163 .add(Src2Sub0)
5164 .add(Src2Sub1);
5165
5166 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5167 .addReg(Src2_32, RegState::Kill)
5168 .addImm(0);
5169 }
5170 } else {
5171 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5172 .addReg(Src2.getReg())
5173 .addImm(0);
5174 }
5175
5176 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
5177
5178 unsigned SelOpc =
5179 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5180
5181 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
5182 .addImm(-1)
5183 .addImm(0);
5184
5185 MI.eraseFromParent();
5186 return BB;
5187 }
5188 case AMDGPU::SI_INIT_M0: {
5189 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5190 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5191 .add(MI.getOperand(0));
5192 MI.eraseFromParent();
5193 return BB;
5194 }
5195 case AMDGPU::GET_GROUPSTATICSIZE: {
5196 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
5197 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
5198 DebugLoc DL = MI.getDebugLoc();
5199 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
5200 .add(MI.getOperand(0))
5201 .addImm(MFI->getLDSSize());
5202 MI.eraseFromParent();
5203 return BB;
5204 }
5205 case AMDGPU::GET_SHADERCYCLESHILO: {
5208 const DebugLoc &DL = MI.getDebugLoc();
5209 // The algorithm is:
5210 //
5211 // hi1 = getreg(SHADER_CYCLES_HI)
5212 // lo1 = getreg(SHADER_CYCLES_LO)
5213 // hi2 = getreg(SHADER_CYCLES_HI)
5214 //
5215 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
5216 // Otherwise there was overflow and the result is hi2:0. In both cases the
5217 // result should represent the actual time at some point during the sequence
5218 // of three getregs.
5219 using namespace AMDGPU::Hwreg;
5220 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5221 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
5222 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5223 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5224 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
5225 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
5226 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5227 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
5228 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5229 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
5230 .addReg(RegHi1)
5231 .addReg(RegHi2);
5232 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5233 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
5234 .addReg(RegLo1)
5235 .addImm(0);
5236 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
5237 .add(MI.getOperand(0))
5238 .addReg(RegLo)
5239 .addImm(AMDGPU::sub0)
5240 .addReg(RegHi2)
5241 .addImm(AMDGPU::sub1);
5242 MI.eraseFromParent();
5243 return BB;
5244 }
5245 case AMDGPU::SI_INDIRECT_SRC_V1:
5246 case AMDGPU::SI_INDIRECT_SRC_V2:
5247 case AMDGPU::SI_INDIRECT_SRC_V4:
5248 case AMDGPU::SI_INDIRECT_SRC_V8:
5249 case AMDGPU::SI_INDIRECT_SRC_V9:
5250 case AMDGPU::SI_INDIRECT_SRC_V10:
5251 case AMDGPU::SI_INDIRECT_SRC_V11:
5252 case AMDGPU::SI_INDIRECT_SRC_V12:
5253 case AMDGPU::SI_INDIRECT_SRC_V16:
5254 case AMDGPU::SI_INDIRECT_SRC_V32:
5255 return emitIndirectSrc(MI, *BB, *getSubtarget());
5256 case AMDGPU::SI_INDIRECT_DST_V1:
5257 case AMDGPU::SI_INDIRECT_DST_V2:
5258 case AMDGPU::SI_INDIRECT_DST_V4:
5259 case AMDGPU::SI_INDIRECT_DST_V8:
5260 case AMDGPU::SI_INDIRECT_DST_V9:
5261 case AMDGPU::SI_INDIRECT_DST_V10:
5262 case AMDGPU::SI_INDIRECT_DST_V11:
5263 case AMDGPU::SI_INDIRECT_DST_V12:
5264 case AMDGPU::SI_INDIRECT_DST_V16:
5265 case AMDGPU::SI_INDIRECT_DST_V32:
5266 return emitIndirectDst(MI, *BB, *getSubtarget());
5267 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5268 case AMDGPU::SI_KILL_I1_PSEUDO:
5269 return splitKillBlock(MI, BB);
5270 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5272 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5273 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5274
5275 Register Dst = MI.getOperand(0).getReg();
5276 const MachineOperand &Src0 = MI.getOperand(1);
5277 const MachineOperand &Src1 = MI.getOperand(2);
5278 const DebugLoc &DL = MI.getDebugLoc();
5279 Register SrcCond = MI.getOperand(3).getReg();
5280
5281 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5282 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5283 const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5284 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5285
5286 const TargetRegisterClass *Src0RC = Src0.isReg()
5287 ? MRI.getRegClass(Src0.getReg())
5288 : &AMDGPU::VReg_64RegClass;
5289 const TargetRegisterClass *Src1RC = Src1.isReg()
5290 ? MRI.getRegClass(Src1.getReg())
5291 : &AMDGPU::VReg_64RegClass;
5292
5293 const TargetRegisterClass *Src0SubRC =
5294 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5295 const TargetRegisterClass *Src1SubRC =
5296 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5297
5298 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5299 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5300 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5301 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5302
5303 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5304 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5305 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5306 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5307
5308 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
5309 .addReg(SrcCond);
5310 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5311 .addImm(0)
5312 .add(Src0Sub0)
5313 .addImm(0)
5314 .add(Src1Sub0)
5315 .addReg(SrcCondCopy);
5316 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5317 .addImm(0)
5318 .add(Src0Sub1)
5319 .addImm(0)
5320 .add(Src1Sub1)
5321 .addReg(SrcCondCopy);
5322
5323 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5324 .addReg(DstLo)
5325 .addImm(AMDGPU::sub0)
5326 .addReg(DstHi)
5327 .addImm(AMDGPU::sub1);
5328 MI.eraseFromParent();
5329 return BB;
5330 }
5331 case AMDGPU::SI_BR_UNDEF: {
5333 const DebugLoc &DL = MI.getDebugLoc();
5334 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5335 .add(MI.getOperand(0));
5336 Br->getOperand(1).setIsUndef(); // read undef SCC
5337 MI.eraseFromParent();
5338 return BB;
5339 }
5340 case AMDGPU::ADJCALLSTACKUP:
5341 case AMDGPU::ADJCALLSTACKDOWN: {
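// Attach an implicit def and use of the stack pointer offset register so the
// call-frame pseudos are visibly tied to stack pointer adjustments.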
5343 MachineInstrBuilder MIB(*MF, &MI);
5344 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
5345 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
5346 return BB;
5347 }
5348 case AMDGPU::SI_CALL_ISEL: {
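// Rewrite SI_CALL_ISEL into the real SI_CALL pseudo that defines the return
// address register, copying over all operands and memory references.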
5350 const DebugLoc &DL = MI.getDebugLoc();
5351
5352 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
5353
5355 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5356
5357 for (const MachineOperand &MO : MI.operands())
5358 MIB.add(MO);
5359
5360 MIB.cloneMemRefs(MI);
5361 MI.eraseFromParent();
5362 return BB;
5363 }
5364 case AMDGPU::V_ADD_CO_U32_e32:
5365 case AMDGPU::V_SUB_CO_U32_e32:
5366 case AMDGPU::V_SUBREV_CO_U32_e32: {
5367 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
5368 const DebugLoc &DL = MI.getDebugLoc();
5369 unsigned Opc = MI.getOpcode();
5370
5371 bool NeedClampOperand = false;
5372 if (TII->pseudoToMCOpcode(Opc) == -1) {
5373 Opc = AMDGPU::getVOPe64(Opc);
5374 NeedClampOperand = true;
5375 }
5376
5377 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
5378 if (TII->isVOP3(*I)) {
5379 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5380 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5381 I.addReg(TRI->getVCC(), RegState::Define);
5382 }
5383 I.add(MI.getOperand(1))
5384 .add(MI.getOperand(2));
5385 if (NeedClampOperand)
5386 I.addImm(0); // clamp bit for e64 encoding
5387
5388 TII->legalizeOperands(*I);
5389
5390 MI.eraseFromParent();
5391 return BB;
5392 }
5393 case AMDGPU::V_ADDC_U32_e32:
5394 case AMDGPU::V_SUBB_U32_e32:
5395 case AMDGPU::V_SUBBREV_U32_e32:
5396 // These instructions have an implicit use of vcc which counts towards the
5397 // constant bus limit.
5398 TII->legalizeOperands(MI);
5399 return BB;
5400 case AMDGPU::DS_GWS_INIT:
5401 case AMDGPU::DS_GWS_SEMA_BR:
5402 case AMDGPU::DS_GWS_BARRIER:
5403 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
5404 [[fallthrough]];
5405 case AMDGPU::DS_GWS_SEMA_V:
5406 case AMDGPU::DS_GWS_SEMA_P:
5407 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5408 // An s_waitcnt 0 is required to be the instruction immediately following.
5409 if (getSubtarget()->hasGWSAutoReplay()) {
5410 bundleInstWithWaitcnt(MI);
5411 return BB;
5412 }
5413
5414 return emitGWSMemViolTestLoop(MI, BB);
5415 case AMDGPU::S_SETREG_B32: {
5416 // Try to optimize cases that only set the denormal mode or rounding mode.
5417 //
5418 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
5419 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
5420 // instead.
5421 //
5422 // FIXME: This could be predicated on the immediate, but tablegen doesn't
5423 // allow you to have a no-side-effect instruction in the output of a
5424 // side-effecting pattern.
5425 auto [ID, Offset, Width] =
5426 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
5427 if (ID != AMDGPU::Hwreg::ID_MODE)
5428 return BB;
5429
5430 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5431 const unsigned SetMask = WidthMask << Offset;
5432
5433 if (getSubtarget()->hasDenormModeInst()) {
5434 unsigned SetDenormOp = 0;
5435 unsigned SetRoundOp = 0;
5436
5437 // The dedicated instructions can only set the whole denorm or round mode
5438 // at once, not a subset of bits in either.
5439 if (SetMask ==
5440 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
5441 // If this fully sets both the round and denorm mode, emit the two
5442 // dedicated instructions for these.
5443 SetRoundOp = AMDGPU::S_ROUND_MODE;
5444 SetDenormOp = AMDGPU::S_DENORM_MODE;
5445 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
5446 SetRoundOp = AMDGPU::S_ROUND_MODE;
5447 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
5448 SetDenormOp = AMDGPU::S_DENORM_MODE;
5449 }
5450
5451 if (SetRoundOp || SetDenormOp) {
5453 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
5454 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5455 unsigned ImmVal = Def->getOperand(1).getImm();
5456 if (SetRoundOp) {
5457 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
5458 .addImm(ImmVal & 0xf);
5459
5460 // If we also have the denorm mode, get just the denorm mode bits.
5461 ImmVal >>= 4;
5462 }
5463
5464 if (SetDenormOp) {
5465 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
5466 .addImm(ImmVal & 0xf);
5467 }
5468
5469 MI.eraseFromParent();
5470 return BB;
5471 }
5472 }
5473 }
5474
5475 // If only FP bits are touched, use the no-side-effects pseudo.
5476 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
5477 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
5478 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
5479
5480 return BB;
5481 }
5482 case AMDGPU::S_INVERSE_BALLOT_U32:
5483 case AMDGPU::S_INVERSE_BALLOT_U64:
5484 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
5485 // necessary. After that they are equivalent to a COPY.
5486 MI.setDesc(TII->get(AMDGPU::COPY));
5487 return BB;
5488 case AMDGPU::ENDPGM_TRAP: {
5489 const DebugLoc &DL = MI.getDebugLoc();
5490 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
5491 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
5492 MI.addOperand(MachineOperand::CreateImm(0));
5493 return BB;
5494 }
5495
5496 // We need a block split to make the real endpgm a terminator. We also don't
5497 // want to break phis in successor blocks, so we can't just delete to the
5498 // end of the block.
5499
5500 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
5501 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
5502 MF->push_back(TrapBB);
5503 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
5504 .addImm(0);
5505 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5506 .addMBB(TrapBB);
5507
5508 BB->addSuccessor(TrapBB);
5509 MI.eraseFromParent();
5510 return SplitBB;
5511 }
5512 case AMDGPU::SIMULATED_TRAP: {
5513 assert(Subtarget->hasPrivEnabledTrap2NopBug());
5515 MachineBasicBlock *SplitBB =
5516 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
5517 MI.eraseFromParent();
5518 return SplitBB;
5519 }
5520 default:
5521 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
5522 if (!MI.mayStore())
5523 AddMemOpInit(MI);
5524 return BB;
5525 }
5526 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
5527 }
5528}
5529
5531 // This currently forces unfolding various combinations of fsub into fma with
5532 // free fneg'd operands. As long as we have fast FMA (controlled by
5533 // isFMAFasterThanFMulAndFAdd), we should perform these.
5534
5535 // When fma is quarter rate, for f64 where add / sub are at best half rate,
5536 // most of these combines appear to be cycle neutral but save on instruction
5537 // count / code size.
5538 return true;
5539}
5540
5542
5544 EVT VT) const {
5545 if (!VT.isVector()) {
5546 return MVT::i1;
5547 }
5548 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
5549}
5550
5552 // TODO: Should i16 be used always if legal? For now it would force VALU
5553 // shifts.
5554 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5555}
5556
5558 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
5559 ? Ty.changeElementSize(16)
5560 : Ty.changeElementSize(32);
5561}
5562
5563 // Answering this is somewhat tricky and depends on the specific device, as
5564 // devices have different rates for fma or for all f64 operations.
5565//
5566// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
5567// regardless of which device (although the number of cycles differs between
5568// devices), so it is always profitable for f64.
5569//
5570// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
5571// only on full rate devices. Normally, we should prefer selecting v_mad_f32
5572// which we can always do even without fused FP ops since it returns the same
5573// result as the separate operations and since it is always full
5574// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
5575// however does not support denormals, so we do report fma as faster if we have
5576// a fast fma device and require denormals.
5577//
5579 EVT VT) const {
5580 VT = VT.getScalarType();
5581
5582 switch (VT.getSimpleVT().SimpleTy) {
5583 case MVT::f32: {
5584 // If mad is not available this depends only on if f32 fma is full rate.
5585 if (!Subtarget->hasMadMacF32Insts())
5586 return Subtarget->hasFastFMAF32();
5587
5588 // Otherwise f32 mad is always full rate and returns the same result as
5589 // the separate operations so should be preferred over fma.
5590 // However, it does not support denormals.
5591 if (!denormalModeIsFlushAllF32(MF))
5592 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5593
5594 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
5595 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5596 }
5597 case MVT::f64:
5598 return true;
5599 case MVT::f16:
5600 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
5601 default:
5602 break;
5603 }
5604
5605 return false;
5606}
5607
5609 LLT Ty) const {
5610 switch (Ty.getScalarSizeInBits()) {
5611 case 16:
5612 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
5613 case 32:
5614 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
5615 case 64:
5616 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
5617 default:
5618 break;
5619 }
5620
5621 return false;
5622}
5623
5624 bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const {
5625 if (!Ty.isScalar())
5626 return false;
5627
5628 if (Ty.getScalarSizeInBits() == 16)
5629 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
5630 if (Ty.getScalarSizeInBits() == 32)
5631 return Subtarget->hasMadMacF32Insts() &&
5632 denormalModeIsFlushAllF32(*MI.getMF());
5633
5634 return false;
5635}
5636
5637 bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
5638 const SDNode *N) const {
5639 // TODO: Check future ftz flag
5640 // v_mad_f32/v_mac_f32 do not support denormals.
5641 EVT VT = N->getValueType(0);
5642 if (VT == MVT::f32)
5643 return Subtarget->hasMadMacF32Insts() &&
5644 denormalModeIsFlushAllF32(DAG.getMachineFunction());
5645 if (VT == MVT::f16) {
5646 return Subtarget->hasMadF16() &&
5647 denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
5648 }
5649
5650 return false;
5651}
5652
5653//===----------------------------------------------------------------------===//
5654// Custom DAG Lowering Operations
5655//===----------------------------------------------------------------------===//
5656
5657// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5658// wider vector type is legal.
5660 SelectionDAG &DAG) const {
5661 unsigned Opc = Op.getOpcode();
5662 EVT VT = Op.getValueType();
5663 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5664 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5665 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5666 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5667
5668 SDValue Lo, Hi;
5669 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
5670
5671 SDLoc SL(Op);
5672 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
5673 Op->getFlags());
5674 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
5675 Op->getFlags());
5676
5677 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5678}
5679
5680// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5681// wider vector type is legal.
5683 SelectionDAG &DAG) const {
5684 unsigned Opc = Op.getOpcode();
5685 EVT VT = Op.getValueType();
5686 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5687 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5688 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5689 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5690
5691 SDValue Lo0, Hi0;
5692 std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
5693 SDValue Lo1, Hi1;
5694 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
5695
5696 SDLoc SL(Op);
5697
5698 SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
5699 Op->getFlags());
5700 SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
5701 Op->getFlags());
5702
5703 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5704}
5705
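// Ternary variant of the splitting above: every vector operand is split into
// low/high halves, while a non-vector operand 0 is reused for both halves
// (used, e.g., for FMA in LowerOperation below).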
5707 SelectionDAG &DAG) const {
5708 unsigned Opc = Op.getOpcode();
5709 EVT VT = Op.getValueType();
5710 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
5711 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
5712 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5713 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
5714 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
5715 VT == MVT::v32bf16);
5716
5717 SDValue Lo0, Hi0;
5718 SDValue Op0 = Op.getOperand(0);
5719 std::tie(Lo0, Hi0) = Op0.getValueType().isVector()
5720 ? DAG.SplitVectorOperand(Op.getNode(), 0)
5721 : std::pair(Op0, Op0);
5722 SDValue Lo1, Hi1;
5723 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
5724 SDValue Lo2, Hi2;
5725 std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2);
5726
5727 SDLoc SL(Op);
5728 auto ResVT = DAG.GetSplitDestVTs(VT);
5729
5730 SDValue OpLo = DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2,
5731 Op->getFlags());
5732 SDValue OpHi = DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2,
5733 Op->getFlags());
5734
5735 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5736}
5737
5738
5740 switch (Op.getOpcode()) {
5741 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
5742 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
5743 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
5744 case ISD::LOAD: {
5745 SDValue Result = LowerLOAD(Op, DAG);
5746 assert((!Result.getNode() ||
5747 Result.getNode()->getNumValues() == 2) &&
5748 "Load should return a value and a chain");
5749 return Result;
5750 }
5751 case ISD::FSQRT: {
5752 EVT VT = Op.getValueType();
5753 if (VT == MVT::f32)
5754 return lowerFSQRTF32(Op, DAG);
5755 if (VT == MVT::f64)
5756 return lowerFSQRTF64(Op, DAG);
5757 return SDValue();
5758 }
5759 case ISD::FSIN:
5760 case ISD::FCOS:
5761 return LowerTrig(Op, DAG);
5762 case ISD::SELECT: return LowerSELECT(Op, DAG);
5763 case ISD::FDIV: return LowerFDIV(Op, DAG);
5764 case ISD::FFREXP: return LowerFFREXP(Op, DAG);
5765 case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
5766 case ISD::STORE: return LowerSTORE(Op, DAG);
5767 case ISD::GlobalAddress: {
5770 return LowerGlobalAddress(MFI, Op, DAG);
5771 }
5772 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
5773 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
5774 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
5775 case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
5777 return lowerINSERT_SUBVECTOR(Op, DAG);
5779 return lowerINSERT_VECTOR_ELT(Op, DAG);
5781 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
5783 return lowerVECTOR_SHUFFLE(Op, DAG);
5785 return lowerSCALAR_TO_VECTOR(Op, DAG);
5786 case ISD::BUILD_VECTOR:
5787 return lowerBUILD_VECTOR(Op, DAG);
5788 case ISD::FP_ROUND:
5790 return lowerFP_ROUND(Op, DAG);
5791 case ISD::FPTRUNC_ROUND: {
5792 unsigned Opc;
5793 SDLoc DL(Op);
5794
5795 if (Op.getOperand(0)->getValueType(0) != MVT::f32)
5796 return SDValue();
5797
5798 // Get the rounding mode from the last operand
5799 int RoundMode = Op.getConstantOperandVal(1);
5800 if (RoundMode == (int)RoundingMode::TowardPositive)
5801 Opc = AMDGPUISD::FPTRUNC_ROUND_UPWARD;
5802 else if (RoundMode == (int)RoundingMode::TowardNegative)
5803 Opc = AMDGPUISD::FPTRUNC_ROUND_DOWNWARD;
5804 else
5805 return SDValue();
5806
5807 return DAG.getNode(Opc, DL, Op.getNode()->getVTList(), Op->getOperand(0));
5808 }
5809 case ISD::TRAP:
5810 return lowerTRAP(Op, DAG);
5811 case ISD::DEBUGTRAP:
5812 return lowerDEBUGTRAP(Op, DAG);
5813 case ISD::ABS:
5814 case ISD::FABS:
5815 case ISD::FNEG:
5816 case ISD::FCANONICALIZE:
5817 case ISD::BSWAP:
5818 return splitUnaryVectorOp(Op, DAG);
5819 case ISD::FMINNUM:
5820 case ISD::FMAXNUM:
5821 return lowerFMINNUM_FMAXNUM(Op, DAG);
5822 case ISD::FLDEXP:
5823 case ISD::STRICT_FLDEXP:
5824 return lowerFLDEXP(Op, DAG);
5825 case ISD::FMA:
5826 return splitTernaryVectorOp(Op, DAG);
5827 case ISD::FP_TO_SINT:
5828 case ISD::FP_TO_UINT:
5829 return LowerFP_TO_INT(Op, DAG);
5830 case ISD::SHL:
5831 case ISD::SRA:
5832 case ISD::SRL:
5833 case ISD::ADD:
5834 case ISD::SUB:
5835 case ISD::SMIN:
5836 case ISD::SMAX:
5837 case ISD::UMIN:
5838 case ISD::UMAX:
5839 case ISD::FADD:
5840 case ISD::FMUL:
5841 case ISD::FMINNUM_IEEE:
5842 case ISD::FMAXNUM_IEEE:
5843 case ISD::FMINIMUM:
5844 case ISD::FMAXIMUM:
5845 case ISD::UADDSAT:
5846 case ISD::USUBSAT:
5847 case ISD::SADDSAT:
5848 case ISD::SSUBSAT:
5849 return splitBinaryVectorOp(Op, DAG);
5850 case ISD::MUL:
5851 return lowerMUL(Op, DAG);
5852 case ISD::SMULO:
5853 case ISD::UMULO:
5854 return lowerXMULO(Op, DAG);
5855 case ISD::SMUL_LOHI:
5856 case ISD::UMUL_LOHI:
5857 return lowerXMUL_LOHI(Op, DAG);
5859 return LowerDYNAMIC_STACKALLOC(Op, DAG);
5860 case ISD::STACKSAVE:
5861 return LowerSTACKSAVE(Op, DAG);
5862 case ISD::GET_ROUNDING:
5863 return lowerGET_ROUNDING(Op, DAG);
5864 case ISD::SET_ROUNDING:
5865 return lowerSET_ROUNDING(Op, DAG);
5866 case ISD::PREFETCH:
5867 return lowerPREFETCH(Op, DAG);
5868 case ISD::FP_EXTEND:
5870 return lowerFP_EXTEND(Op, DAG);
5871 case ISD::GET_FPENV:
5872 return lowerGET_FPENV(Op, DAG);
5873 case ISD::SET_FPENV:
5874 return lowerSET_FPENV(Op, DAG);
5875 }
5876 return SDValue();
5877}
5878
5879// Used for D16: Casts the result of an instruction into the right vector,
5880// packs values if loads return unpacked values.
5882 const SDLoc &DL,
5883 SelectionDAG &DAG, bool Unpacked) {
5884 if (!LoadVT.isVector())
5885 return Result;
5886
5887 // Cast back to the original packed type or to a larger type that is a
5888 // multiple of 32 bits for D16. Widening the return type is required for
5889 // legalization.
5890 EVT FittingLoadVT = LoadVT;
5891 if ((LoadVT.getVectorNumElements() % 2) == 1) {
5892 FittingLoadVT =
5894 LoadVT.getVectorNumElements() + 1);
5895 }
5896
5897 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
5898 // Truncate to v2i16/v4i16.
5899 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
5900
5901 // Workaround legalizer not scalarizing truncate after vector op
5902 // legalization but not creating intermediate vector trunc.
5904 DAG.ExtractVectorElements(Result, Elts);
5905 for (SDValue &Elt : Elts)
5906 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
5907
5908 // Pad illegal v1i16/v3f16 to v4i16
5909 if ((LoadVT.getVectorNumElements() % 2) == 1)
5910 Elts.push_back(DAG.getUNDEF(MVT::i16));
5911
5912 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
5913
5914 // Bitcast to original type (v2f16/v4f16).
5915 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
5916 }
5917
5918 // Cast back to the original packed type.
5919 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
5920}
5921
5922SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
5923 MemSDNode *M,
5924 SelectionDAG &DAG,
5926 bool IsIntrinsic) const {
5927 SDLoc DL(M);
5928
5929 bool Unpacked = Subtarget->hasUnpackedD16VMem();
5930 EVT LoadVT = M->getValueType(0);
5931
5932 EVT EquivLoadVT = LoadVT;
5933 if (LoadVT.isVector()) {
5934 if (Unpacked) {
5935 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
5936 LoadVT.getVectorNumElements());
5937 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
5938 // Widen v3f16 to legal type
5939 EquivLoadVT =
5941 LoadVT.getVectorNumElements() + 1);
5942 }
5943 }
5944
5945 // Change from v4f16/v2f16 to EquivLoadVT.
5946 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
5947
5948 SDValue Load
5949 = DAG.getMemIntrinsicNode(
5950 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
5951 VTList, Ops, M->getMemoryVT(),
5952 M->getMemOperand());
5953
5954 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
5955
5956 return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
5957}
5958
5959SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
5960 SelectionDAG &DAG,
5961 ArrayRef<SDValue> Ops) const {
5962 SDLoc DL(M);
5963 EVT LoadVT = M->getValueType(0);
5964 EVT EltType = LoadVT.getScalarType();
5965 EVT IntVT = LoadVT.changeTypeToInteger();
5966
5967 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
5968
5969 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
5970 bool IsTFE = M->getNumValues() == 3;
5971
5972 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
5973 : AMDGPUISD::BUFFER_LOAD_FORMAT)
5974 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
5975 : AMDGPUISD::BUFFER_LOAD;
5976 
5977 if (IsD16) {
5978 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
5979 }
5980
5981 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
5982 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
5983 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
5984 IsTFE);
5985
5986 if (isTypeLegal(LoadVT)) {
5987 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
5988 M->getMemOperand(), DAG);
5989 }
5990
5991 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
5992 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
5993 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
5994 M->getMemOperand(), DAG);
5995 return DAG.getMergeValues(
5996 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
5997 DL);
5998}
5999
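// Lower llvm.amdgcn.icmp: promote illegal i16 operands to i32, then emit an
// AMDGPUISD::SETCC producing a wavefront-sized lane mask that is zero-extended
// or truncated to the requested result type.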
6001 SDNode *N, SelectionDAG &DAG) {
6002 EVT VT = N->getValueType(0);
6003 unsigned CondCode = N->getConstantOperandVal(3);
6004 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
6005 return DAG.getUNDEF(VT);
6006
6007 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6008
6009 SDValue LHS = N->getOperand(1);
6010 SDValue RHS = N->getOperand(2);
6011
6012 SDLoc DL(N);
6013
6014 EVT CmpVT = LHS.getValueType();
6015 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
6016 unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
6017 ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6018 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
6019 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
6020 }
6021
6022 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
6023
6024 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6025 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6026
6027 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
6028 DAG.getCondCode(CCOpcode));
6029 if (VT.bitsEq(CCVT))
6030 return SetCC;
6031 return DAG.getZExtOrTrunc(SetCC, DL, VT);
6032}
6033
6035 SDNode *N, SelectionDAG &DAG) {
6036 EVT VT = N->getValueType(0);
6037
6038 unsigned CondCode = N->getConstantOperandVal(3);
6039 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
6040 return DAG.getUNDEF(VT);
6041
6042 SDValue Src0 = N->getOperand(1);
6043 SDValue Src1 = N->getOperand(2);
6044 EVT CmpVT = Src0.getValueType();
6045 SDLoc SL(N);
6046
6047 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
6048 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6049 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6050 }
6051
6052 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
6053 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
6054 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6055 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6056 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0,
6057 Src1, DAG.getCondCode(CCOpcode));
6058 if (VT.bitsEq(CCVT))
6059 return SetCC;
6060 return DAG.getZExtOrTrunc(SetCC, SL, VT);
6061}
6062
6064 SelectionDAG &DAG) {
6065 EVT VT = N->getValueType(0);
6066 SDValue Src = N->getOperand(1);
6067 SDLoc SL(N);
6068
6069 if (Src.getOpcode() == ISD::SETCC) {
6070 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
6071 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
6072 Src.getOperand(1), Src.getOperand(2));
6073 }
6074 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
6075 // (ballot 0) -> 0
6076 if (Arg->isZero())
6077 return DAG.getConstant(0, SL, VT);
6078
6079 // (ballot 1) -> EXEC/EXEC_LO
6080 if (Arg->isOne()) {
6081 Register Exec;
6082 if (VT.getScalarSizeInBits() == 32)
6083 Exec = AMDGPU::EXEC_LO;
6084 else if (VT.getScalarSizeInBits() == 64)
6085 Exec = AMDGPU::EXEC;
6086 else
6087 return SDValue();
6088
6089 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
6090 }
6091 }
6092
6093 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
6094 // ISD::SETNE)
6095 return DAG.getNode(
6096 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
6097 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
6098}
6099
6101 SelectionDAG &DAG) {
6102 EVT VT = N->getValueType(0);
6103 unsigned ValSize = VT.getSizeInBits();
6104 unsigned IID = N->getConstantOperandVal(0);
6105 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6106 IID == Intrinsic::amdgcn_permlanex16;
6107 SDLoc SL(N);
6108 MVT IntVT = MVT::getIntegerVT(ValSize);
6109
6110 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
6111 SDValue Src2, MVT ValT) -> SDValue {
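// Rebuild the lane intrinsic with the given sources and value type. Operands
// are collected in reverse and then flipped so the intrinsic ID ends up first;
// any convergence-control glue from the original node is re-attached last.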
6113 switch (IID) {
6114 case Intrinsic::amdgcn_permlane16:
6115 case Intrinsic::amdgcn_permlanex16:
6116 Operands.push_back(N->getOperand(6));
6117 Operands.push_back(N->getOperand(5));
6118 Operands.push_back(N->getOperand(4));
6119 [[fallthrough]];
6120 case Intrinsic::amdgcn_writelane:
6121 Operands.push_back(Src2);
6122 [[fallthrough]];
6123 case Intrinsic::amdgcn_readlane:
6124 Operands.push_back(Src1);
6125 [[fallthrough]];
6126 case Intrinsic::amdgcn_readfirstlane:
6127 case Intrinsic::amdgcn_permlane64:
6128 Operands.push_back(Src0);
6129 break;
6130 default:
6131 llvm_unreachable("unhandled lane op");
6132 }
6133
6134 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
6135 std::reverse(Operands.begin(), Operands.end());
6136
6137 if (SDNode *GL = N->getGluedNode()) {
6138 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6139 GL = GL->getOperand(0).getNode();
6140 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6141 SDValue(GL, 0)));
6142 }
6143
6144 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
6145 };
6146
6147 SDValue Src0 = N->getOperand(1);
6148 SDValue Src1, Src2;
6149 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6150 IsPermLane16) {
6151 Src1 = N->getOperand(2);
6152 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
6153 Src2 = N->getOperand(3);
6154 }
6155
6156 if (ValSize == 32) {
6157 // Already legal
6158 return SDValue();
6159 }
6160
6161 if (ValSize < 32) {
6162 bool IsFloat = VT.isFloatingPoint();
6163 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
6164 SL, MVT::i32);
6165
6166 if (IsPermLane16) {
6167 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
6168 SL, MVT::i32);
6169 }
6170
6171 if (IID == Intrinsic::amdgcn_writelane) {
6172 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
6173 SL, MVT::i32);
6174 }
6175
6176 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
6177 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
6178 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
6179 }
6180
6181 if (ValSize % 32 != 0)
6182 return SDValue();
6183
6184 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
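// Scalarize a lane operation with a vector result: extract element i from each
// vector operand (scalar operands are used as-is), re-attach the convergence
// glue, emit one scalar node per element, and rebuild the vector.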
6185 EVT VT = N->getValueType(0);
6186 unsigned NE = VT.getVectorNumElements();
6187 EVT EltVT = VT.getVectorElementType();
6189 unsigned NumOperands = N->getNumOperands();
6190 SmallVector<SDValue, 4> Operands(NumOperands);
6191 SDNode *GL = N->getGluedNode();
6192
6193 // only handle convergencectrl_glue
6195
6196 for (unsigned i = 0; i != NE; ++i) {
6197 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
6198 ++j) {
6199 SDValue Operand = N->getOperand(j);
6200 EVT OperandVT = Operand.getValueType();
6201 if (OperandVT.isVector()) {
6202 // A vector operand; extract a single element.
6203 EVT OperandEltVT = OperandVT.getVectorElementType();
6204 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
6205 Operand, DAG.getVectorIdxConstant(i, SL));
6206 } else {
6207 // A scalar operand; just use it as is.
6208 Operands[j] = Operand;
6209 }
6210 }
6211
6212 if (GL)
6213 Operands[NumOperands - 1] =
6214 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6215 SDValue(GL->getOperand(0).getNode(), 0));
6216
6217 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
6218 }
6219
6220 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
6221 return DAG.getBuildVector(VecVT, SL, Scalars);
6222 };
6223
6224 if (VT.isVector()) {
6225 switch (MVT::SimpleValueType EltTy =
6227 case MVT::i32:
6228 case MVT::f32: {
6229 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
6230 return unrollLaneOp(LaneOp.getNode());
6231 }
6232 case MVT::i16:
6233 case MVT::f16:
6234 case MVT::bf16: {
6235 MVT SubVecVT = MVT::getVectorVT(EltTy, 2);
6237 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
6238 for (unsigned i = 0, EltIdx = 0; i < ValSize / 32; i++) {
6239 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
6240 DAG.getConstant(EltIdx, SL, MVT::i32));
6241
6242 if (IsPermLane16)
6243 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
6244 DAG.getConstant(EltIdx, SL, MVT::i32));
6245
6246 if (IID == Intrinsic::amdgcn_writelane)
6247 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
6248 DAG.getConstant(EltIdx, SL, MVT::i32));
6249
6250 Pieces.push_back(
6251 IsPermLane16
6252 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
6253 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
6254 EltIdx += 2;
6255 }
6256 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
6257 }
6258 default:
6259 // Handle all other cases by bitcasting to i32 vectors
6260 break;
6261 }
6262 }
6263
6264 MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
6265 Src0 = DAG.getBitcast(VecVT, Src0);
6266
6267 if (IsPermLane16)
6268 Src1 = DAG.getBitcast(VecVT, Src1);
6269
6270 if (IID == Intrinsic::amdgcn_writelane)
6271 Src2 = DAG.getBitcast(VecVT, Src2);
6272
6273 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
6274 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
6275 return DAG.getBitcast(VT, UnrolledLaneOp);
6276}
6277
6280 SelectionDAG &DAG) const {
6281 switch (N->getOpcode()) {
6283 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
6284 Results.push_back(Res);
6285 return;
6286 }
6288 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
6289 Results.push_back(Res);
6290 return;
6291 }
6293 unsigned IID = N->getConstantOperandVal(0);
6294 switch (IID) {
6295 case Intrinsic::amdgcn_make_buffer_rsrc:
6296 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
6297 return;
6298 case Intrinsic::amdgcn_cvt_pkrtz: {
6299 SDValue Src0 = N->getOperand(1);
6300 SDValue Src1 = N->getOperand(2);
6301 SDLoc SL(N);
6302 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
6303 Src0, Src1);
6304 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
6305 return;
6306 }
6307 case Intrinsic::amdgcn_cvt_pknorm_i16:
6308 case Intrinsic::amdgcn_cvt_pknorm_u16:
6309 case Intrinsic::amdgcn_cvt_pk_i16:
6310 case Intrinsic::amdgcn_cvt_pk_u16: {
6311 SDValue Src0 = N->getOperand(1);
6312 SDValue Src1 = N->getOperand(2);
6313 SDLoc SL(N);
6314 unsigned Opcode;
6315
6316 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6317 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
6318 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6319 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
6320 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6321 Opcode = AMDGPUISD::CVT_PK_I16_I32;
6322 else
6323 Opcode = AMDGPUISD::CVT_PK_U16_U32;
6324 
6325 EVT VT = N->getValueType(0);
6326 if (isTypeLegal(VT))
6327 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
6328 else {
6329 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
6330 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
6331 }
6332 return;
6333 }
6334 case Intrinsic::amdgcn_s_buffer_load: {
6335 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
6336 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
6337 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
6338 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
6339 // s_buffer_load_i8.
6340 if (!Subtarget->hasScalarSubwordLoads())
6341 return;
6342 SDValue Op = SDValue(N, 0);
6343 SDValue Rsrc = Op.getOperand(1);
6344 SDValue Offset = Op.getOperand(2);
6345 SDValue CachePolicy = Op.getOperand(3);
6346 EVT VT = Op.getValueType();
6347 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
6348 SDLoc DL(Op);
6350 const DataLayout &DataLayout = DAG.getDataLayout();
6351 Align Alignment =
6357 VT.getStoreSize(), Alignment);
6358 SDValue LoadVal;
6359 if (!Offset->isDivergent()) {
6360 SDValue Ops[] = {Rsrc, // source register
6361 Offset, CachePolicy};
6362 SDValue BufferLoad =
6364 DAG.getVTList(MVT::i32), Ops, VT, MMO);
6365 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
6366 } else {
6367 SDValue Ops[] = {
6368 DAG.getEntryNode(), // Chain
6369 Rsrc, // rsrc
6370 DAG.getConstant(0, DL, MVT::i32), // vindex
6371 {}, // voffset
6372 {}, // soffset
6373 {}, // offset
6374 CachePolicy, // cachepolicy
6375 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
6376 };
6377 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
6378 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
6379 }
6380 Results.push_back(LoadVal);
6381 return;
6382 }
6383 }
6384 break;
6385 }
6387 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
6388 if (Res.getOpcode() == ISD::MERGE_VALUES) {
6389 // FIXME: Hacky
6390 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
6391 Results.push_back(Res.getOperand(I));
6392 }
6393 } else {
6394 Results.push_back(Res);
6395 Results.push_back(Res.getValue(1));
6396 }
6397 return;
6398 }
6399
6400 break;
6401 }
6402 case ISD::SELECT: {
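// Replace a select on an illegal type by bitcasting the operands to an
// equivalent integer type (widened to i32 if needed), selecting, and then
// truncating/bitcasting back to the original type.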
6403 SDLoc SL(N);
6404 EVT VT = N->getValueType(0);
6405 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
6406 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
6407 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
6408
6409 EVT SelectVT = NewVT;
6410 if (NewVT.bitsLT(MVT::i32)) {
6411 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
6412 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
6413 SelectVT = MVT::i32;
6414 }
6415
6416 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
6417 N->getOperand(0), LHS, RHS);
6418
6419 if (NewVT != SelectVT)
6420 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
6421 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
6422 return;
6423 }
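// v2f16 fneg and fabs are lowered as integer bit manipulation on the 32-bit
// bitcast: XOR with 0x80008000 flips both sign bits, AND with 0x7fff7fff
// clears them.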
6424 case ISD::FNEG: {
6425 if (N->getValueType(0) != MVT::v2f16)
6426 break;
6427
6428 SDLoc SL(N);
6429 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6430
6431 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
6432 BC,
6433 DAG.getConstant(0x80008000, SL, MVT::i32));
6434 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6435 return;
6436 }
6437 case ISD::FABS: {
6438 if (N->getValueType(0) != MVT::v2f16)
6439 break;
6440
6441 SDLoc SL(N);
6442 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6443
6444 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
6445 BC,
6446 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
6447 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6448 return;
6449 }
6450 case ISD::FSQRT: {
6451 if (N->getValueType(0) != MVT::f16)
6452 break;
6453 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
6454 break;
6455 }
6456 default:
6458 break;
6459 }
6460}
6461
6462/// Helper function for LowerBRCOND
6463static SDNode *findUser(SDValue Value, unsigned Opcode) {
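// Scan the uses of Value for a node of the given opcode that consumes this
// particular result; return the first match or nullptr.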
6464
6465 SDNode *Parent = Value.getNode();
6466 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
6467 I != E; ++I) {
6468
6469 if (I.getUse().get() != Value)
6470 continue;
6471
6472 if (I->getOpcode() == Opcode)
6473 return *I;
6474 }
6475 return nullptr;
6476}
6477
6478unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
6479 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
6480 switch (Intr->getConstantOperandVal(1)) {
6481 case Intrinsic::amdgcn_if:
6482 return AMDGPUISD::IF;
6483 case Intrinsic::amdgcn_else:
6484 return AMDGPUISD::ELSE;
6485 case Intrinsic::amdgcn_loop:
6486 return AMDGPUISD::LOOP;
6487 case Intrinsic::amdgcn_end_cf:
6488 llvm_unreachable("should not occur");
6489 default:
6490 return 0;
6491 }
6492 }
6493
6494 // break, if_break, else_break are all only used as inputs to loop, not
6495 // directly as branch conditions.
6496 return 0;
6497}
6498
6500 const Triple &TT = getTargetMachine().getTargetTriple();
6504}
6505
6507 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
6508 return false;
6509
6510 // FIXME: Either avoid relying on address space here or change the default
6511 // address space for functions to avoid the explicit check.
6512 return (GV->getValueType()->isFunctionTy() ||
6515}
6516
6518 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
6519}
6520
6522 if (!GV->hasExternalLinkage())
6523 return true;
6524
6525 const auto OS = getTargetMachine().getTargetTriple().getOS();
6526 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
6527}
6528
6529 /// This transforms the control flow intrinsics to get the branch destination as
6530 /// the last parameter; it also switches the branch target with BR if the need arises.
6531SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
6532 SelectionDAG &DAG) const {
6533 SDLoc DL(BRCOND);
6534
6535 SDNode *Intr = BRCOND.getOperand(1).getNode();
6536 SDValue Target = BRCOND.getOperand(2);
6537 SDNode *BR = nullptr;
6538 SDNode *SetCC = nullptr;
6539
6540 if (Intr->getOpcode() == ISD::SETCC) {
6541 // As long as we negate the condition everything is fine
6542 SetCC = Intr;
6543 Intr = SetCC->getOperand(0).getNode();
6544
6545 } else {
6546 // Get the target from BR if we don't negate the condition
6547 BR = findUser(BRCOND, ISD::BR);
6548 assert(BR && "brcond missing unconditional branch user");
6549 Target = BR->getOperand(1);
6550 }
6551
6552 unsigned CFNode = isCFIntrinsic(Intr);
6553 if (CFNode == 0) {
6554 // This is a uniform branch so we don't need to legalize.
6555 return BRCOND;
6556 }
6557
6558 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
6559 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
6560
6561 assert(!SetCC ||
6562 (SetCC->getConstantOperandVal(1) == 1 &&
6563 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
6564 ISD::SETNE));
6565
6566 // operands of the new intrinsic call
6568 if (HaveChain)
6569 Ops.push_back(BRCOND.getOperand(0));
6570
6571 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
6572 Ops.push_back(Target);
6573
6574 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
6575
6576 // build the new intrinsic call
6577 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
6578
6579 if (!HaveChain) {
6580 SDValue Ops[] = {
6581 SDValue(Result, 0),
6582 BRCOND.getOperand(0)
6583 };
6584
6585 Result = DAG.getMergeValues(Ops, DL).getNode();
6586 }
6587
6588 if (BR) {
6589 // Give the branch instruction our target
6590 SDValue Ops[] = {
6591 BR->getOperand(0),
6592 BRCOND.getOperand(2)
6593 };
6594 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
6595 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
6596 }
6597
6598 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
6599
6600 // Copy the intrinsic results to registers
6601 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
6603 if (!CopyToReg)
6604 continue;
6605
6606 Chain = DAG.getCopyToReg(
6607 Chain, DL,
6608 CopyToReg->getOperand(1),
6609 SDValue(Result, i - 1),
6610 SDValue());
6611
6612 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
6613 }
6614
6615 // Remove the old intrinsic from the chain
6617 SDValue(Intr, Intr->getNumValues() - 1),
6618 Intr->getOperand(0));
6619
6620 return Chain;
6621}
6622
6623SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
6624 SelectionDAG &DAG) const {
6625 MVT VT = Op.getSimpleValueType();
6626 SDLoc DL(Op);
6627 // Checking the depth
6628 if (Op.getConstantOperandVal(0) != 0)
6629 return DAG.getConstant(0, DL, VT);
6630
6633 // Check for kernel and shader functions
6634 if (Info->isEntryFunction())
6635 return DAG.getConstant(0, DL, VT);
6636
6637 MachineFrameInfo &MFI = MF.getFrameInfo();
6638 // There is a call to @llvm.returnaddress in this function
6639 MFI.setReturnAddressIsTaken(true);
6640
6642 // Get the return address reg and mark it as an implicit live-in
6643 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF), getRegClassFor(VT, Op.getNode()->isDivergent()));
6644
6645 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
6646}
6647
6648SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG,
6649 SDValue Op,
6650 const SDLoc &DL,
6651 EVT VT) const {
6652 return Op.getValueType().bitsLE(VT) ?
6653 DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
6654 DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
6655 DAG.getTargetConstant(0, DL, MVT::i32));
6656}
6657
6658SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
6659 assert(Op.getValueType() == MVT::f16 &&
6660 "Do not know how to custom lower FP_ROUND for non-f16 type");
6661
6662 SDValue Src = Op.getOperand(0);
6663 EVT SrcVT = Src.getValueType();
6664 if (SrcVT != MVT::f64)
6665 return Op;
6666
6667 // TODO: Handle strictfp
6668 if (Op.getOpcode() != ISD::FP_ROUND)
6669 return Op;
6670
6671 SDLoc DL(Op);
6672
6673 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
6674 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
6675 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
6676}
6677
6678SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
6679 SelectionDAG &DAG) const {
6680 EVT VT = Op.getValueType();
6681 const MachineFunction &MF = DAG.getMachineFunction();
6683 bool IsIEEEMode = Info->getMode().IEEE;
6684
6685 // FIXME: Assert during selection that this is only selected for
6686 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
6687 // mode functions, but this happens to be OK since it's only done in cases
6688 // where there is known no sNaN.
6689 if (IsIEEEMode)
6690 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
6691
6692 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
6693 VT == MVT::v16bf16)
6694 return splitBinaryVectorOp(Op, DAG);
6695 return Op;
6696}
6697
6698SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
6699 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
6700 EVT VT = Op.getValueType();
6701 assert(VT == MVT::f16);
6702
6703 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
6704 EVT ExpVT = Exp.getValueType();
6705 if (ExpVT == MVT::i16)
6706 return Op;
6707
6708 SDLoc DL(Op);
6709
6710 // Correct the exponent type for f16 to i16.
6711 // Clamp the range of the exponent to the instruction's range.
6712
6713 // TODO: This should be a generic narrowing legalization, and can easily be
6714 // done for GlobalISel.
6715
6716 SDValue MinExp = DAG.getConstant(minIntN(16), DL, ExpVT);
6717 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
6718
6719 SDValue MaxExp = DAG.getConstant(maxIntN(16), DL, ExpVT);
6720 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
6721
6722 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
6723
6724 if (IsStrict) {
6725 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
6726 {Op.getOperand(0), Op.getOperand(1), TruncExp});
6727 }
6728
6729 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
6730}
6731
6732// Custom lowering for vector multiplications and s_mul_u64.
6733SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
6734 EVT VT = Op.getValueType();
6735
6736 // Split vector operands.
6737 if (VT.isVector())
6738 return splitBinaryVectorOp(Op, DAG);
6739
6740 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
6741
6742 // There are four ways to lower s_mul_u64:
6743 //
6744 // 1. If all the operands are uniform, then we lower it as it is.
6745 //
6746 // 2. If the operands are divergent, then we have to split s_mul_u64 into 32-bit
6747 // multiplications because there is no vector equivalent of s_mul_u64.
6748 //
6749 // 3. If the cost model decides that it is more efficient to use vector
6750 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
6751 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
6752 //
6753 // 4. If the cost model decides to use vector registers and both of the
6754 // operands are zero-extended/sign-extended from 32-bits, then we split the
6755 // s_mul_u64 into two 32-bit multiplications. The problem is that it is not
6756 // possible to check if the operands are zero-extended or sign-extended in
6757 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
6758 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
6759 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
6760 // If the cost model decides that we have to use vector registers, then
6761 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits s_mul_u64_u32/
6762 // s_mul_i64_i32_pseudo into two vector multiplications. If the cost model
6763 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
6764 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
6765 // SIInstrInfo.cpp .
6766
6767 if (Op->isDivergent())
6768 return SDValue();
6769
6770 SDValue Op0 = Op.getOperand(0);
6771 SDValue Op1 = Op.getOperand(1);
6772 // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64
6773 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
6774 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
6775 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
6776 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
6777 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
6778 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
6779 SDLoc SL(Op);
6780 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
6781 return SDValue(
6782 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
6783 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
6784 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
6785 if (Op0SignBits >= 33 && Op1SignBits >= 33)
6786 return SDValue(
6787 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
6788 // If all the operands are uniform, then we lower s_mul_u64 as it is.
6789 return Op;
6790}
6791
6792SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
6793 EVT VT = Op.getValueType();
6794 SDLoc SL(Op);
6795 SDValue LHS = Op.getOperand(0);
6796 SDValue RHS = Op.getOperand(1);
6797 bool isSigned = Op.getOpcode() == ISD::SMULO;
6798
6799 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
6800 const APInt &C = RHSC->getAPIntValue();
6801 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
6802 if (C.isPowerOf2()) {
6803 // smulo(x, signed_min) is same as umulo(x, signed_min).
6804 bool UseArithShift = isSigned && !C.isMinSignedValue();
6805 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
6806 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
6807 SDValue Overflow = DAG.getSetCC(SL, MVT::i1,
6808 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL,
6809 SL, VT, Result, ShiftAmt),
6810 LHS, ISD::SETNE);
6811 return DAG.getMergeValues({ Result, Overflow }, SL);
6812 }
6813 }
6814
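// Generic path: compute the high half of the full product and compare it with
// the value it would have if the multiply did not overflow (the sign of the
// low result for signed, zero for unsigned).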
6815 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
6816 SDValue Top = DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU,
6817 SL, VT, LHS, RHS);
6818
6819 SDValue Sign = isSigned
6820 ? DAG.getNode(ISD::SRA, SL, VT, Result,
6821 DAG.getConstant(VT.getScalarSizeInBits() - 1, SL, MVT::i32))
6822 : DAG.getConstant(0, SL, VT);
6823 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
6824
6825 return DAG.getMergeValues({ Result, Overflow }, SL);
6826}
6827
6828SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
6829 if (Op->isDivergent()) {
6830 // Select to V_MAD_[IU]64_[IU]32.
6831 return Op;
6832 }
6833 if (Subtarget->hasSMulHi()) {
6834 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
6835 return SDValue();
6836 }
6837 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
6838 // calculate the high part, so we might as well do the whole thing with
6839 // V_MAD_[IU]64_[IU]32.
6840 return Op;
6841}
6842
6843SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
6844 if (!Subtarget->isTrapHandlerEnabled() ||
6846 return lowerTrapEndpgm(Op, DAG);
6847
6848 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) :
6849 lowerTrapHsaQueuePtr(Op, DAG);
6850}
6851
6852SDValue SITargetLowering::lowerTrapEndpgm(
6853 SDValue Op, SelectionDAG &DAG) const {
6854 SDLoc SL(Op);
6855 SDValue Chain = Op.getOperand(0);
6856 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
6857}
6858
6859SDValue SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
6860 const SDLoc &DL, Align Alignment, ImplicitParameter Param) const {
6863 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
6865 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
6868}
6869
6870SDValue SITargetLowering::lowerTrapHsaQueuePtr(
6871 SDValue Op, SelectionDAG &DAG) const {
6872 SDLoc SL(Op);
6873 SDValue Chain = Op.getOperand(0);
6874
6875 SDValue QueuePtr;
6876 // For code object version 5, QueuePtr is passed through implicit kernarg.
6877 const Module *M = DAG.getMachineFunction().getFunction().getParent();
6879 QueuePtr =
6880 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
6881 } else {
6884 Register UserSGPR = Info->getQueuePtrUserSGPR();
6885
6886 if (UserSGPR == AMDGPU::NoRegister) {
6887 // We probably are in a function incorrectly marked with
6888 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
6889 // trap, so just use a null pointer.
6890 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
6891 } else {
6892 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
6893 MVT::i64);
6894 }
6895 }
6896
6897 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
6898 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
6899 QueuePtr, SDValue());
6900
6902 SDValue Ops[] = {
6903 ToReg,
6904 DAG.getTargetConstant(TrapID, SL, MVT::i16),
6905 SGPR01,
6906 ToReg.getValue(1)
6907 };
6908 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6909}
6910
6911SDValue SITargetLowering::lowerTrapHsa(
6912 SDValue Op, SelectionDAG &DAG) const {
6913 SDLoc SL(Op);
6914 SDValue Chain = Op.getOperand(0);
6915
6916 // We need to simulate the 's_trap 2' instruction on targets that run in
6917 // PRIV=1 (where it is treated as a nop).
6918 if (Subtarget->hasPrivEnabledTrap2NopBug())
6919 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
6920
6922 SDValue Ops[] = {
6923 Chain,
6924 DAG.getTargetConstant(TrapID, SL, MVT::i16)
6925 };
6926 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6927}
6928
6929SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
6930 SDLoc SL(Op);
6931 SDValue Chain = Op.getOperand(0);
6933
6934 if (!Subtarget->isTrapHandlerEnabled() ||
6937 "debugtrap handler not supported",
6938 Op.getDebugLoc(),
6939 DS_Warning);
6940 LLVMContext &Ctx = MF.getFunction().getContext();
6941 Ctx.diagnose(NoTrap);
6942 return Chain;
6943 }
6944
6946 SDValue Ops[] = {
6947 Chain,
6948 DAG.getTargetConstant(TrapID, SL, MVT::i16)
6949 };
6950 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6951}
6952
6953SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
6954 SelectionDAG &DAG) const {
6955 if (Subtarget->hasApertureRegs()) {
6956 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
6957 ? AMDGPU::SRC_SHARED_BASE
6958 : AMDGPU::SRC_PRIVATE_BASE;
6959 // Note: this feature (register) is broken. When used as a 32-bit operand,
6960 // it returns a wrong value (all zeroes?). The real value is in the upper 32
6961 // bits.
6962 //
6963 // To work around the issue, directly emit a 64 bit mov from this register
6964 // then extract the high bits. Note that this shouldn't even result in a
6965 // shift being emitted and simply become a pair of registers (e.g.):
6966 // s_mov_b64 s[6:7], src_shared_base
6967 // v_mov_b32_e32 v1, s7
6968 //
6969 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
6970 // coalescing would kick in and it would think it's okay to use the "HI"
6971 // subregister directly (instead of extracting the HI 32 bits) which is an
6972 // artificial (unusable) register.
6973 // Register TableGen definitions would need an overhaul to get rid of the
6974 // artificial "HI" aperture registers and prevent this kind of issue from
6975 // happening.
6976 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
6977 DAG.getRegister(ApertureRegNo, MVT::i64));
6978 return DAG.getNode(
6979 ISD::TRUNCATE, DL, MVT::i32,
6980 DAG.getNode(ISD::SRL, DL, MVT::i64,
6981 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
6982 }
6983
6984 // For code object version 5, private_base and shared_base are passed through
6985 // implicit kernargs.
6986 const Module *M = DAG.getMachineFunction().getFunction().getParent();
6990 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
6991 }
6992
6995 Register UserSGPR = Info->getQueuePtrUserSGPR();
6996 if (UserSGPR == AMDGPU::NoRegister) {
6997 // We probably are in a function incorrectly marked with
6998 // amdgpu-no-queue-ptr. This is undefined.
6999 return DAG.getUNDEF(MVT::i32);
7000 }
7001
7002 SDValue QueuePtr = CreateLiveInRegister(
7003 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
7004
7005 // Offset into amd_queue_t for group_segment_aperture_base_hi /
7006 // private_segment_aperture_base_hi.
7007 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
7008
7009 SDValue Ptr =
7010 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
7011
7012 // TODO: Use custom target PseudoSourceValue.
7013 // TODO: We should use the value from the IR intrinsic call, but it might not
7014 // be available and how do we get it?
7016 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
7017 commonAlignment(Align(64), StructOffset),
7020}
7021
7022/// Return true if the value is a known valid address, such that a null check is
7023/// not necessary.
7025 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
7026 if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
7027 isa<BasicBlockSDNode>(Val))
7028 return true;
7029
7030 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
7031 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
7032
7033 // TODO: Search through arithmetic, handle arguments and loads
7034 // marked nonnull.
7035 return false;
7036}
7037
7038SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
7039 SelectionDAG &DAG) const {
7040 SDLoc SL(Op);
7041
7042 const AMDGPUTargetMachine &TM =
7043 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
7044
7045 unsigned DestAS, SrcAS;
7046 SDValue Src;
7047 bool IsNonNull = false;
7048 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
7049 SrcAS = ASC->getSrcAddressSpace();
7050 Src = ASC->getOperand(0);
7051 DestAS = ASC->getDestAddressSpace();
7052 } else {
7053 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
7054 Op.getConstantOperandVal(0) ==
7055 Intrinsic::amdgcn_addrspacecast_nonnull);
7056 Src = Op->getOperand(1);
7057 SrcAS = Op->getConstantOperandVal(2);
7058 DestAS = Op->getConstantOperandVal(3);
7059 IsNonNull = true;
7060 }
7061
7062 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
7063
7064 // flat -> local/private
7065 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
7066 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
7067 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
7068 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7069
7070 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7071 return Ptr;
7072
7073 unsigned NullVal = TM.getNullPointerValue(DestAS);
7074 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7075 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
7076
7077 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
7078 SegmentNullPtr);
7079 }
7080 }
7081
7082 // local/private -> flat
7083 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
7084 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
7085 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
7086
7087 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
7088 SDValue CvtPtr =
7089 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
7090 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
7091
7092 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7093 return CvtPtr;
7094
7095 unsigned NullVal = TM.getNullPointerValue(SrcAS);
7096 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7097
7098 SDValue NonNull
7099 = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
7100
7101 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
7102 FlatNullPtr);
7103 }
7104 }
7105
7106 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7107 Op.getValueType() == MVT::i64) {
7108 const SIMachineFunctionInfo *Info =
7109 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
7110 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
7111 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
7112 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
7113 }
7114
7115 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7116 Src.getValueType() == MVT::i64)
7117 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7118
7119 // global <-> flat are no-ops and never emitted.
7120
7121 const MachineFunction &MF = DAG.getMachineFunction();
7122 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
7123 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
7124 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
7125
7126 return DAG.getUNDEF(Op->getValueType(0));
7127}
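// For illustration, a local/private -> flat cast of %p that is not known
// non-null roughly becomes:
//   %hi  = getSegmentAperture(SrcAS)            ; upper 32 bits of the base
//   %cvt = bitcast (build_vector %p, %hi) to i64
//   %res = select (%p != segment-null), %cvt, 0
// with the segment null constant taken from TM.getNullPointerValue(). The
// flat -> local/private direction mirrors this: truncate to 32 bits and
// select against the 32-bit segment null value.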
7128
7129// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
7130// the small vector and inserting them into the big vector. That is better than
7131// the default expansion of doing it via a stack slot. Even though the use of
7132// the stack slot would be optimized away afterwards, the stack slot itself
7133// remains.
7134SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
7135 SelectionDAG &DAG) const {
7136 SDValue Vec = Op.getOperand(0);
7137 SDValue Ins = Op.getOperand(1);
7138 SDValue Idx = Op.getOperand(2);
7139 EVT VecVT = Vec.getValueType();
7140 EVT InsVT = Ins.getValueType();
7141 EVT EltVT = VecVT.getVectorElementType();
7142 unsigned InsNumElts = InsVT.getVectorNumElements();
7143 unsigned IdxVal = Idx->getAsZExtVal();
7144 SDLoc SL(Op);
7145
7146 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
7147 // Insert 32-bit registers at a time.
7148 assert(InsNumElts % 2 == 0 && "expect legal vector types");
7149
7150 unsigned VecNumElts = VecVT.getVectorNumElements();
7151 EVT NewVecVT =
7152 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
7153 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
7154 : EVT::getVectorVT(*DAG.getContext(),
7155 MVT::i32, InsNumElts / 2);
7156
7157 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
7158 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
7159
7160 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
7161 SDValue Elt;
7162 if (InsNumElts == 2) {
7163 Elt = Ins;
7164 } else {
7165 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
7166 DAG.getConstant(I, SL, MVT::i32));
7167 }
7168 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
7169 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
7170 }
7171
7172 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
7173 }
7174
7175 for (unsigned I = 0; I != InsNumElts; ++I) {
7176 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
7177 DAG.getConstant(I, SL, MVT::i32));
7178 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
7179 DAG.getConstant(IdxVal + I, SL, MVT::i32));
7180 }
7181 return Vec;
7182}
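// For illustration, inserting a v2i16 subvector at index 2 of a v4i16 takes
// the 32-bit path above and becomes a single dword insert, roughly:
//   bitcast vec to v2i32, bitcast ins to i32,
//   insert_vector_elt v2i32, i32, 1, then bitcast back to v4i16.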
7183
7184SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
7185 SelectionDAG &DAG) const {
7186 SDValue Vec = Op.getOperand(0);
7187 SDValue InsVal = Op.getOperand(1);
7188 SDValue Idx = Op.getOperand(2);
7189 EVT VecVT = Vec.getValueType();
7190 EVT EltVT = VecVT.getVectorElementType();
7191 unsigned VecSize = VecVT.getSizeInBits();
7192 unsigned EltSize = EltVT.getSizeInBits();
7193 SDLoc SL(Op);
7194
7195 // Specially handle the case of v4i16 with static indexing.
7196 unsigned NumElts = VecVT.getVectorNumElements();
7197 auto KIdx = dyn_cast<ConstantSDNode>(Idx);
7198 if (NumElts == 4 && EltSize == 16 && KIdx) {
7199 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
7200
7201 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7202 DAG.getConstant(0, SL, MVT::i32));
7203 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7204 DAG.getConstant(1, SL, MVT::i32));
7205
7206 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
7207 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
7208
7209 unsigned Idx = KIdx->getZExtValue();
7210 bool InsertLo = Idx < 2;
7211 SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
7212 InsertLo ? LoVec : HiVec,
7213 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
7214 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
7215
7216 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
7217
7218 SDValue Concat = InsertLo ?
7219 DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
7220 DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
7221
7222 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
7223 }
7224
7225 // Static indexing does not lower to stack access, and hence there is no need
7226 // for special custom lowering to avoid stack access.
7227 if (isa<ConstantSDNode>(Idx))
7228 return SDValue();
7229
7230 // Avoid stack access for dynamic indexing by custom lowering to
7231 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
7232
7233 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
7234
7235 MVT IntVT = MVT::getIntegerVT(VecSize);
7236
7237 // Convert vector index to bit-index and get the required bit mask.
7238 assert(isPowerOf2_32(EltSize));
7239 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
7240 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7241 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7242 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
7243 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
7244
7245 // 1. Create a congruent vector with the target value in each element.
7246 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
7247 DAG.getSplatBuildVector(VecVT, SL, InsVal));
7248
7249 // 2. Mask off all other indices except the required index within (1).
7250 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
7251
7252 // 3. Mask off the required index within the target vector.
7253 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7254 SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
7255 DAG.getNOT(SL, BFM, IntVT), BCVec);
7256
7257 // 4. Get (2) and (3) ORed into the target vector.
7258 SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
7259
7260 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
7261}
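// For illustration, a dynamic insert of a 16-bit element into a v2i16
// (VecSize == 32) expands to roughly:
//   BFM = 0xffff << (idx * 16)          ; mask selecting the slot
//   res = (BFM & splat(val)) | (~BFM & vec)
// which the backend can match to the v_bfm_b32 / v_bfi_b32 pair noted above.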
7262
7263SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
7264 SelectionDAG &DAG) const {
7265 SDLoc SL(Op);
7266
7267 EVT ResultVT = Op.getValueType();
7268 SDValue Vec = Op.getOperand(0);
7269 SDValue Idx = Op.getOperand(1);
7270 EVT VecVT = Vec.getValueType();
7271 unsigned VecSize = VecVT.getSizeInBits();
7272 EVT EltVT = VecVT.getVectorElementType();
7273
7274 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
7275
7276 // Make sure we do any optimizations that will make it easier to fold
7277 // source modifiers before obscuring it with bit operations.
7278
7279 // XXX - Why doesn't this get called when vector_shuffle is expanded?
7280 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
7281 return Combined;
7282
7283 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
7284 SDValue Lo, Hi;
7285 EVT LoVT, HiVT;
7286 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
7287
7288 if (VecSize == 128) {
7289 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
7290 Lo = DAG.getBitcast(LoVT,
7291 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7292 DAG.getConstant(0, SL, MVT::i32)));
7293 Hi = DAG.getBitcast(HiVT,
7294 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7295 DAG.getConstant(1, SL, MVT::i32)));
7296 } else if (VecSize == 256) {
7297 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
7298 SDValue Parts[4];
7299 for (unsigned P = 0; P < 4; ++P) {
7300 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7301 DAG.getConstant(P, SL, MVT::i32));
7302 }
7303
7304 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7305 Parts[0], Parts[1]));
7306 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7307 Parts[2], Parts[3]));
7308 } else {
7309 assert(VecSize == 512);
7310
7311 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
7312 SDValue Parts[8];
7313 for (unsigned P = 0; P < 8; ++P) {
7314 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7315 DAG.getConstant(P, SL, MVT::i32));
7316 }
7317
7318 Lo = DAG.getBitcast(LoVT,
7319 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7320 Parts[0], Parts[1], Parts[2], Parts[3]));
7321 Hi = DAG.getBitcast(HiVT,
7322 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7323 Parts[4], Parts[5],Parts[6], Parts[7]));
7324 }
7325
7326 EVT IdxVT = Idx.getValueType();
7327 unsigned NElem = VecVT.getVectorNumElements();
7328 assert(isPowerOf2_32(NElem));
7329 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
7330 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
7331 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
7332 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
7333 }
7334
7335 assert(VecSize <= 64);
7336
7337 MVT IntVT = MVT::getIntegerVT(VecSize);
7338
7339 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
7340 SDValue VecBC = peekThroughBitcasts(Vec);
7341 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
7342 SDValue Src = VecBC.getOperand(0);
7343 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
7344 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
7345 }
7346
7347 unsigned EltSize = EltVT.getSizeInBits();
7348 assert(isPowerOf2_32(EltSize));
7349
7350 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7351
7352 // Convert vector index to bit-index (* EltSize)
7353 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7354
7355 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7356 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
7357
7358 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
7359 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
7360 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
7361 }
7362
7363 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
7364}
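// For illustration, a dynamic extract from a v4i16 (VecSize == 64) becomes
// roughly:
//   elt = trunc ((bitcast vec to i64) >> (idx * 16)) to i16
// while 128/256/512-bit vectors are first split in half and the half is
// chosen by comparing the index against NElem / 2 - 1.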
7365
7366static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
7367 assert(Elt % 2 == 0);
7368 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7369}
7370
7371SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
7372 SelectionDAG &DAG) const {
7373 SDLoc SL(Op);
7374 EVT ResultVT = Op.getValueType();
7375 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
7376
7377 EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
7378 EVT EltVT = PackVT.getVectorElementType();
7379 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
7380
7381 // vector_shuffle <0,1,6,7> lhs, rhs
7382 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
7383 //
7384 // vector_shuffle <6,7,2,3> lhs, rhs
7385 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
7386 //
7387 // vector_shuffle <6,7,0,1> lhs, rhs
7388 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
7389
7390 // Avoid scalarizing when both halves are reading from consecutive elements.
7391 SmallVector<SDValue, 16> Pieces;
7392 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
7393 if (elementPairIsContiguous(SVN->getMask(), I)) {
7394 const int Idx = SVN->getMaskElt(I);
7395 int VecIdx = Idx < SrcNumElts ? 0 : 1;
7396 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
7397 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL,
7398 PackVT, SVN->getOperand(VecIdx),
7399 DAG.getConstant(EltIdx, SL, MVT::i32));
7400 Pieces.push_back(SubVec);
7401 } else {
7402 const int Idx0 = SVN->getMaskElt(I);
7403 const int Idx1 = SVN->getMaskElt(I + 1);
7404 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
7405 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
7406 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
7407 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
7408
7409 SDValue Vec0 = SVN->getOperand(VecIdx0);
7410 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7411 Vec0, DAG.getConstant(EltIdx0, SL, MVT::i32));
7412
7413 SDValue Vec1 = SVN->getOperand(VecIdx1);
7414 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7415 Vec1, DAG.getConstant(EltIdx1, SL, MVT::i32));
7416 Pieces.push_back(DAG.getBuildVector(PackVT, SL, { Elt0, Elt1 }));
7417 }
7418 }
7419
7420 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
7421}
7422
7423SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
7424 SelectionDAG &DAG) const {
7425 SDValue SVal = Op.getOperand(0);
7426 EVT ResultVT = Op.getValueType();
7427 EVT SValVT = SVal.getValueType();
7428 SDValue UndefVal = DAG.getUNDEF(SValVT);
7429 SDLoc SL(Op);
7430
7431 SmallVector<SDValue, 8> VElts;
7432 VElts.push_back(SVal);
7433 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
7434 VElts.push_back(UndefVal);
7435
7436 return DAG.getBuildVector(ResultVT, SL, VElts);
7437}
7438
7439SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
7440 SelectionDAG &DAG) const {
7441 SDLoc SL(Op);
7442 EVT VT = Op.getValueType();
7443
7444 if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
7445 VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
7446 EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
7447 VT.getVectorNumElements() / 2);
7448 MVT HalfIntVT = MVT::getIntegerVT(HalfVT.getSizeInBits());
7449
7450 // Turn into pair of packed build_vectors.
7451 // TODO: Special case for constants that can be materialized with s_mov_b64.
7452 SmallVector<SDValue, 4> LoOps, HiOps;
7453 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I != E; ++I) {
7454 LoOps.push_back(Op.getOperand(I));
7455 HiOps.push_back(Op.getOperand(I + E));
7456 }
7457 SDValue Lo = DAG.getBuildVector(HalfVT, SL, LoOps);
7458 SDValue Hi = DAG.getBuildVector(HalfVT, SL, HiOps);
7459
7460 SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Lo);
7461 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Hi);
7462
7463 SDValue Blend = DAG.getBuildVector(MVT::getVectorVT(HalfIntVT, 2), SL,
7464 { CastLo, CastHi });
7465 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7466 }
7467
7468 if (VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16) {
7469 EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
7470 VT.getVectorNumElements() / 4);
7471 MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
7472
7473 SmallVector<SDValue, 4> Parts[4];
7474 for (unsigned I = 0, E = VT.getVectorNumElements() / 4; I != E; ++I) {
7475 for (unsigned P = 0; P < 4; ++P)
7476 Parts[P].push_back(Op.getOperand(I + P * E));
7477 }
7478 SDValue Casts[4];
7479 for (unsigned P = 0; P < 4; ++P) {
7480 SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
7481 Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
7482 }
7483
7484 SDValue Blend =
7485 DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 4), SL, Casts);
7486 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7487 }
7488
7489 if (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v32bf16) {
7490 EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
7491 VT.getVectorNumElements() / 8);
7492 MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
7493
7494 SmallVector<SDValue, 8> Parts[8];
7495 for (unsigned I = 0, E = VT.getVectorNumElements() / 8; I != E; ++I) {
7496 for (unsigned P = 0; P < 8; ++P)
7497 Parts[P].push_back(Op.getOperand(I + P * E));
7498 }
7499 SDValue Casts[8];
7500 for (unsigned P = 0; P < 8; ++P) {
7501 SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
7502 Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
7503 }
7504
7505 SDValue Blend =
7506 DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 8), SL, Casts);
7507 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7508 }
7509
7510 assert(VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16);
7511 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
7512
7513 SDValue Lo = Op.getOperand(0);
7514 SDValue Hi = Op.getOperand(1);
7515
7516 // Avoid adding defined bits with the zero_extend.
7517 if (Hi.isUndef()) {
7518 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7519 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
7520 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
7521 }
7522
7523 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
7524 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
7525
7526 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
7527 DAG.getConstant(16, SL, MVT::i32));
7528 if (Lo.isUndef())
7529 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
7530
7531 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7532 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
7533
7534 SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
7535 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
7536}
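// For illustration, without VOP3P a v2i16 build_vector of lo/hi becomes
// roughly:
//   res = bitcast ((zext hi to i32) << 16) | (zext lo to i32)
// with the zero_extend/or dropped when either operand is undef.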
7537
7538 bool
7539 SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
7540 // OSes that use ELF REL relocations (instead of RELA) can only store a
7541 // 32-bit addend in the instruction, so it is not safe to allow offset folding
7542 // which can create arbitrary 64-bit addends. (This is only a problem for
7543 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
7544 // the high 32 bits of the addend.)
7545 //
7546 // This should be kept in sync with how HasRelocationAddend is initialized in
7547 // the constructor of ELFAMDGPUAsmBackend.
7548 if (!Subtarget->isAmdHsaOS())
7549 return false;
7550
7551 // We can fold offsets for anything that doesn't require a GOT relocation.
7552 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
7553 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
7554 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
7555 !shouldEmitGOTReloc(GA->getGlobal());
7556}
7557
7558static SDValue
7559 buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
7560 const SDLoc &DL, int64_t Offset, EVT PtrVT,
7561 unsigned GAFlags = SIInstrInfo::MO_NONE) {
7562 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
7563 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
7564 // lowered to the following code sequence:
7565 //
7566 // For constant address space:
7567 // s_getpc_b64 s[0:1]
7568 // s_add_u32 s0, s0, $symbol
7569 // s_addc_u32 s1, s1, 0
7570 //
7571 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7572 // a fixup or relocation is emitted to replace $symbol with a literal
7573 // constant, which is a pc-relative offset from the encoding of the $symbol
7574 // operand to the global variable.
7575 //
7576 // For global address space:
7577 // s_getpc_b64 s[0:1]
7578 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
7579 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
7580 //
7581 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7582 // fixups or relocations are emitted to replace $symbol@*@lo and
7583 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
7584 // which is a 64-bit pc-relative offset from the encoding of the $symbol
7585 // operand to the global variable.
7586 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
7587 SDValue PtrHi;
7588 if (GAFlags == SIInstrInfo::MO_NONE)
7589 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
7590 else
7591 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
7592 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
7593}
7594
7595SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
7596 SDValue Op,
7597 SelectionDAG &DAG) const {
7598 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
7599 SDLoc DL(GSD);
7600 EVT PtrVT = Op.getValueType();
7601
7602 const GlobalValue *GV = GSD->getGlobal();
7603 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
7604 shouldUseLDSConstAddress(GV)) ||
7605 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
7606 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
7607 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
7608 GV->hasExternalLinkage()) {
7609 Type *Ty = GV->getValueType();
7610 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
7611 // zero-sized type in other languages to declare dynamic shared
7612 // memory whose size is not known at compile time. Such arrays are
7613 // allocated by the runtime and placed directly after the statically
7614 // allocated ones, so they all share the same offset.
7615 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
7616 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
7617 // Adjust alignment for that dynamic shared memory array.
7618 Function &F = DAG.getMachineFunction().getFunction();
7619 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
7620 MFI->setUsesDynamicLDS(true);
7621 return SDValue(
7622 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
7623 }
7624 }
7625 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
7626 }
7627
7628 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
7629 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
7630 SIInstrInfo::MO_ABS32_LO);
7631 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
7632 }
7633
7634 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
7635 SDValue AddrLo = DAG.getTargetGlobalAddress(
7636 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
7637 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
7638
7639 SDValue AddrHi = DAG.getTargetGlobalAddress(
7640 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
7641 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
7642
7643 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
7644 }
7645
7646 if (shouldEmitFixup(GV))
7647 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
7648
7649 if (shouldEmitPCReloc(GV))
7650 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
7651 SIInstrInfo::MO_REL32);
7652
7653 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
7654 SIInstrInfo::MO_GOTPCREL32);
7655
7656 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
7657 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
7658 const DataLayout &DataLayout = DAG.getDataLayout();
7659 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
7660 MachinePointerInfo PtrInfo
7661 = MachinePointerInfo::getGOT(DAG.getMachineFunction());
7662
7663 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
7664 MachineMemOperand::MODereferenceable |
7665 MachineMemOperand::MOInvariant);
7666}
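// In summary, the routine above picks, in order: the AMDGPUISD::LDS abs32
// form for LDS globals, a pair of absolute 32-bit moves for PAL/Mesa, a
// plain pc-relative address when shouldEmitFixup() allows it, a REL32
// pc-relative relocation when shouldEmitPCReloc() allows it, and otherwise a
// load of the address from the GOT via a GOTPCREL32 relocation.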
7667
7668 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
7669 const SDLoc &DL, SDValue V) const {
7670 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
7671 // the destination register.
7672 //
7673 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
7674 // so we will end up with redundant moves to m0.
7675 //
7676 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
7677
7678 // A Null SDValue creates a glue result.
7679 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
7680 V, Chain);
7681 return SDValue(M0, 0);
7682}
7683
7684SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
7685 SDValue Op,
7686 MVT VT,
7687 unsigned Offset) const {
7688 SDLoc SL(Op);
7689 SDValue Param = lowerKernargMemParameter(
7690 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
7691 // The local size values will have the hi 16-bits as zero.
7692 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
7693 DAG.getValueType(VT));
7694}
7695
7696 static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
7697 EVT VT) {
7698 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
7699 "non-hsa intrinsic with hsa target",
7700 DL.getDebugLoc());
7701 DAG.getContext()->diagnose(BadIntrin);
7702 return DAG.getUNDEF(VT);
7703}
7704
7705 static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
7706 EVT VT) {
7707 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
7708 "intrinsic not supported on subtarget",
7709 DL.getDebugLoc());
7710 DAG.getContext()->diagnose(BadIntrin);
7711 return DAG.getUNDEF(VT);
7712}
7713
7714 static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
7715 ArrayRef<SDValue> Elts) {
7716 assert(!Elts.empty());
7717 MVT Type;
7718 unsigned NumElts = Elts.size();
7719
7720 if (NumElts <= 12) {
7721 Type = MVT::getVectorVT(MVT::f32, NumElts);
7722 } else {
7723 assert(Elts.size() <= 16);
7724 Type = MVT::v16f32;
7725 NumElts = 16;
7726 }
7727
7728 SmallVector<SDValue, 16> VecElts(NumElts);
7729 for (unsigned i = 0; i < Elts.size(); ++i) {
7730 SDValue Elt = Elts[i];
7731 if (Elt.getValueType() != MVT::f32)
7732 Elt = DAG.getBitcast(MVT::f32, Elt);
7733 VecElts[i] = Elt;
7734 }
7735 for (unsigned i = Elts.size(); i < NumElts; ++i)
7736 VecElts[i] = DAG.getUNDEF(MVT::f32);
7737
7738 if (NumElts == 1)
7739 return VecElts[0];
7740 return DAG.getBuildVector(Type, DL, VecElts);
7741}
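// For illustration: three dword operands become a v3f32 build_vector, while
// thirteen operands are padded with undef up to the v16f32 case handled
// above.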
7742
7743static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
7744 SDValue Src, int ExtraElts) {
7745 EVT SrcVT = Src.getValueType();
7746
7747 SmallVector<SDValue, 8> Elts;
7748
7749 if (SrcVT.isVector())
7750 DAG.ExtractVectorElements(Src, Elts);
7751 else
7752 Elts.push_back(Src);
7753
7754 SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
7755 while (ExtraElts--)
7756 Elts.push_back(Undef);
7757
7758 return DAG.getBuildVector(CastVT, DL, Elts);
7759}
7760
7761 // Re-construct the required return value for an image load intrinsic.
7762 // This is more complicated due to the optional use of TexFailCtrl, which
7763 // means the required return type is an aggregate.
7764 static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
7765 ArrayRef<EVT> ResultTypes, bool IsTexFail,
7766 bool Unpacked, bool IsD16, int DMaskPop,
7767 int NumVDataDwords, bool IsAtomicPacked16Bit,
7768 const SDLoc &DL) {
7769 // Determine the required return type. This is the same regardless of the IsTexFail flag.
7770 EVT ReqRetVT = ResultTypes[0];
7771 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
7772 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
7773 ? (ReqRetNumElts + 1) / 2
7774 : ReqRetNumElts;
7775
7776 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
7777
7778 MVT DataDwordVT = NumDataDwords == 1 ?
7779 MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
7780
7781 MVT MaskPopVT = MaskPopDwords == 1 ?
7782 MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
7783
7784 SDValue Data(Result, 0);
7785 SDValue TexFail;
7786
7787 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
7788 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
7789 if (MaskPopVT.isVector()) {
7790 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
7791 SDValue(Result, 0), ZeroIdx);
7792 } else {
7793 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
7794 SDValue(Result, 0), ZeroIdx);
7795 }
7796 }
7797
7798 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
7799 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
7800 NumDataDwords - MaskPopDwords);
7801
7802 if (IsD16)
7803 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
7804
7805 EVT LegalReqRetVT = ReqRetVT;
7806 if (!ReqRetVT.isVector()) {
7807 if (!Data.getValueType().isInteger())
7808 Data = DAG.getNode(ISD::BITCAST, DL,
7809 Data.getValueType().changeTypeToInteger(), Data);
7810 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
7811 } else {
7812 // We need to widen the return vector to a legal type
7813 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
7814 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
7815 LegalReqRetVT =
7816 EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
7817 ReqRetVT.getVectorNumElements() + 1);
7818 }
7819 }
7820 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
7821
7822 if (IsTexFail) {
7823 TexFail =
7824 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
7825 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
7826
7827 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
7828 }
7829
7830 if (Result->getNumValues() == 1)
7831 return Data;
7832
7833 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
7834}
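// Worked example (illustrative): a packed D16 load whose dmask covers three
// lanes with TFE enabled pops (3 + 1) / 2 = 2 data dwords from the raw
// result, widens/converts them back to the requested half type, and appends
// the TexFail status dword extracted at index MaskPopDwords.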
7835
7836static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
7837 SDValue *LWE, bool &IsTexFail) {
7838 auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
7839
7840 uint64_t Value = TexFailCtrlConst->getZExtValue();
7841 if (Value) {
7842 IsTexFail = true;
7843 }
7844
7845 SDLoc DL(TexFailCtrlConst);
7846 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
7847 Value &= ~(uint64_t)0x1;
7848 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
7849 Value &= ~(uint64_t)0x2;
7850
7851 return Value == 0;
7852}
7853
7854 static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
7855 MVT PackVectorVT,
7856 SmallVectorImpl<SDValue> &PackedAddrs,
7857 unsigned DimIdx, unsigned EndIdx,
7858 unsigned NumGradients) {
7859 SDLoc DL(Op);
7860 for (unsigned I = DimIdx; I < EndIdx; I++) {
7861 SDValue Addr = Op.getOperand(I);
7862
7863 // Gradients are packed with undef for each coordinate.
7864 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
7865 // 1D: undef,dx/dh; undef,dx/dv
7866 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
7867 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
7868 if (((I + 1) >= EndIdx) ||
7869 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
7870 I == DimIdx + NumGradients - 1))) {
7871 if (Addr.getValueType() != MVT::i16)
7872 Addr = DAG.getBitcast(MVT::i16, Addr);
7873 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
7874 } else {
7875 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
7876 I++;
7877 }
7878 Addr = DAG.getBitcast(MVT::f32, Addr);
7879 PackedAddrs.push_back(Addr);
7880 }
7881}
7882
7883SDValue SITargetLowering::lowerImage(SDValue Op,
7884 const AMDGPU::ImageDimIntrinsicInfo *Intr,
7885 SelectionDAG &DAG, bool WithChain) const {
7886 SDLoc DL(Op);
7887 MachineFunction &MF = DAG.getMachineFunction();
7888 const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
7889 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
7890 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
7891 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
7892 unsigned IntrOpcode = Intr->BaseOpcode;
7893 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
7894 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
7895 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
7896
7897 SmallVector<EVT, 3> ResultTypes(Op->values());
7898 SmallVector<EVT, 3> OrigResultTypes(Op->values());
7899 bool IsD16 = false;
7900 bool IsG16 = false;
7901 bool IsA16 = false;
7902 SDValue VData;
7903 int NumVDataDwords;
7904 bool AdjustRetType = false;
7905 bool IsAtomicPacked16Bit = false;
7906
7907 // Offset of intrinsic arguments
7908 const unsigned ArgOffset = WithChain ? 2 : 1;
7909
7910 unsigned DMask;
7911 unsigned DMaskLanes = 0;
7912
7913 if (BaseOpcode->Atomic) {
7914 VData = Op.getOperand(2);
7915
7916 IsAtomicPacked16Bit =
7917 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7918 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7919
7920 bool Is64Bit = VData.getValueSizeInBits() == 64;
7921 if (BaseOpcode->AtomicX2) {
7922 SDValue VData2 = Op.getOperand(3);
7923 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
7924 {VData, VData2});
7925 if (Is64Bit)
7926 VData = DAG.getBitcast(MVT::v4i32, VData);
7927
7928 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
7929 DMask = Is64Bit ? 0xf : 0x3;
7930 NumVDataDwords = Is64Bit ? 4 : 2;
7931 } else {
7932 DMask = Is64Bit ? 0x3 : 0x1;
7933 NumVDataDwords = Is64Bit ? 2 : 1;
7934 }
7935 } else {
7936 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
7937 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
7938
7939 if (BaseOpcode->Store) {
7940 VData = Op.getOperand(2);
7941
7942 MVT StoreVT = VData.getSimpleValueType();
7943 if (StoreVT.getScalarType() == MVT::f16) {
7944 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
7945 return Op; // D16 is unsupported for this instruction
7946
7947 IsD16 = true;
7948 VData = handleD16VData(VData, DAG, true);
7949 }
7950
7951 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
7952 } else {
7953 // Work out the num dwords based on the dmask popcount and underlying type
7954 // and whether packing is supported.
7955 MVT LoadVT = ResultTypes[0].getSimpleVT();
7956 if (LoadVT.getScalarType() == MVT::f16) {
7957 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
7958 return Op; // D16 is unsupported for this instruction
7959
7960 IsD16 = true;
7961 }
7962
7963 // Confirm that the return type is large enough for the dmask specified
7964 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
7965 (!LoadVT.isVector() && DMaskLanes > 1))
7966 return Op;
7967
7968 // The sq block of gfx8 and gfx9 does not estimate register use correctly
7969 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
7970 // instructions.
7971 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
7972 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
7973 NumVDataDwords = (DMaskLanes + 1) / 2;
7974 else
7975 NumVDataDwords = DMaskLanes;
7976
7977 AdjustRetType = true;
7978 }
7979 }
7980
7981 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
7982 SmallVector<SDValue, 4> VAddrs;
7983
7984 // Check for 16 bit addresses or derivatives and pack if true.
7985 MVT VAddrVT =
7986 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
7987 MVT VAddrScalarVT = VAddrVT.getScalarType();
7988 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
7989 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
7990
7991 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
7992 VAddrScalarVT = VAddrVT.getScalarType();
7993 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
7994 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
7995
7996 // Push back extra arguments.
7997 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
7998 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
7999 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
8000 // Special handling of bias when A16 is on. Bias is of type half but
8001 // occupies full 32-bit.
8002 SDValue Bias = DAG.getBuildVector(
8003 MVT::v2f16, DL,
8004 {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
8005 VAddrs.push_back(Bias);
8006 } else {
8007 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
8008 "Bias needs to be converted to 16 bit in A16 mode");
8009 VAddrs.push_back(Op.getOperand(ArgOffset + I));
8010 }
8011 }
8012
8013 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
8014 // 16 bit gradients are supported, but are tied to the A16 control
8015 // so both gradients and addresses must be 16 bit
8016 LLVM_DEBUG(
8017 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
8018 "require 16 bit args for both gradients and addresses");
8019 return Op;
8020 }
8021
8022 if (IsA16) {
8023 if (!ST->hasA16()) {
8024 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
8025 "support 16 bit addresses\n");
8026 return Op;
8027 }
8028 }
8029
8030 // We've dealt with incorrect input, so we know that if IsA16 or IsG16
8031 // is set then we have to compress/pack operands (either addresses,
8032 // gradients, or both).
8033 // In the case where a16 and gradients are tied (no G16 support), we have
8034 // already verified that both IsA16 and IsG16 are true.
8035 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
8036 // Activate g16
8037 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
8038 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
8039 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
8040 }
8041
8042 // Add gradients (packed or unpacked)
8043 if (IsG16) {
8044 // Pack the gradients
8045 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
8046 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
8047 ArgOffset + Intr->GradientStart,
8048 ArgOffset + Intr->CoordStart, Intr->NumGradients);
8049 } else {
8050 for (unsigned I = ArgOffset + Intr->GradientStart;
8051 I < ArgOffset + Intr->CoordStart; I++)
8052 VAddrs.push_back(Op.getOperand(I));
8053 }
8054
8055 // Add addresses (packed or unpacked)
8056 if (IsA16) {
8057 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
8058 ArgOffset + Intr->CoordStart, VAddrEnd,
8059 0 /* No gradients */);
8060 } else {
8061 // Add uncompressed address
8062 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
8063 VAddrs.push_back(Op.getOperand(I));
8064 }
8065
8066 // If the register allocator cannot place the address registers contiguously
8067 // without introducing moves, then using the non-sequential address encoding
8068 // is always preferable, since it saves VALU instructions and is usually a
8069 // wash in terms of code size or even better.
8070 //
8071 // However, we currently have no way of hinting to the register allocator that
8072 // MIMG addresses should be placed contiguously when it is possible to do so,
8073 // so force non-NSA for the common 2-address case as a heuristic.
8074 //
8075 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
8076 // allocation when possible.
8077 //
8078 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
8079 // set of the remaining addresses.
8080 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
8081 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
8082 const bool UseNSA = ST->hasNSAEncoding() &&
8083 VAddrs.size() >= ST->getNSAThreshold(MF) &&
8084 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
8085 const bool UsePartialNSA =
8086 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
8087
8088 SDValue VAddr;
8089 if (UsePartialNSA) {
8090 VAddr = getBuildDwordsVector(DAG, DL,
8091 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
8092 }
8093 else if (!UseNSA) {
8094 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
8095 }
8096
8097 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
8098 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
8099 SDValue Unorm;
8100 if (!BaseOpcode->Sampler) {
8101 Unorm = True;
8102 } else {
8103 uint64_t UnormConst =
8104 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
8105
8106 Unorm = UnormConst ? True : False;
8107 }
8108
8109 SDValue TFE;
8110 SDValue LWE;
8111 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
8112 bool IsTexFail = false;
8113 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
8114 return Op;
8115
8116 if (IsTexFail) {
8117 if (!DMaskLanes) {
8118 // Expecting to get an error flag since TFC is on - and dmask is 0
8119 // Force dmask to be at least 1 otherwise the instruction will fail
8120 DMask = 0x1;
8121 DMaskLanes = 1;
8122 NumVDataDwords = 1;
8123 }
8124 NumVDataDwords += 1;
8125 AdjustRetType = true;
8126 }
8127
8128 // Check whether something earlier tagged the return type as needing adjustment.
8129 // This happens if the instruction is a load or has set TexFailCtrl flags.
8130 if (AdjustRetType) {
8131 // NumVDataDwords reflects the true number of dwords required in the return type
8132 if (DMaskLanes == 0 && !BaseOpcode->Store) {
8133 // This is a no-op load. This can be eliminated
8134 SDValue Undef = DAG.getUNDEF(Op.getValueType());
8135 if (isa<MemSDNode>(Op))
8136 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
8137 return Undef;
8138 }
8139
8140 EVT NewVT = NumVDataDwords > 1 ?
8141 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumVDataDwords)
8142 : MVT::i32;
8143
8144 ResultTypes[0] = NewVT;
8145 if (ResultTypes.size() == 3) {
8146 // Original result was aggregate type used for TexFailCtrl results
8147 // The actual instruction returns as a vector type which has now been
8148 // created. Remove the aggregate result.
8149 ResultTypes.erase(&ResultTypes[1]);
8150 }
8151 }
8152
8153 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
8154 if (BaseOpcode->Atomic)
8155 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
8156 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
8157 AMDGPU::CPol::VOLATILE))
8158 return Op;
8159
8160 SmallVector<SDValue, 26> Ops;
8161 if (BaseOpcode->Store || BaseOpcode->Atomic)
8162 Ops.push_back(VData); // vdata
8163 if (UsePartialNSA) {
8164 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
8165 Ops.push_back(VAddr);
8166 }
8167 else if (UseNSA)
8168 append_range(Ops, VAddrs);
8169 else
8170 Ops.push_back(VAddr);
8171 Ops.push_back(Op.getOperand(ArgOffset + Intr->RsrcIndex));
8172 if (BaseOpcode->Sampler)
8173 Ops.push_back(Op.getOperand(ArgOffset + Intr->SampIndex));
8174 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
8175 if (IsGFX10Plus)
8176 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
8177 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8178 Ops.push_back(Unorm);
8179 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
8180 Ops.push_back(IsA16 && // r128, a16 for gfx9
8181 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
8182 if (IsGFX10Plus)
8183 Ops.push_back(IsA16 ? True : False);
8184 if (!Subtarget->hasGFX90AInsts()) {
8185 Ops.push_back(TFE); //tfe
8186 } else if (TFE->getAsZExtVal()) {
8187 report_fatal_error("TFE is not supported on this GPU");
8188 }
8189 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8190 Ops.push_back(LWE); // lwe
8191 if (!IsGFX10Plus)
8192 Ops.push_back(DimInfo->DA ? True : False);
8193 if (BaseOpcode->HasD16)
8194 Ops.push_back(IsD16 ? True : False);
8195 if (isa<MemSDNode>(Op))
8196 Ops.push_back(Op.getOperand(0)); // chain
8197
8198 int NumVAddrDwords =
8199 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
8200 int Opcode = -1;
8201
8202 if (IsGFX12Plus) {
8203 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
8204 NumVDataDwords, NumVAddrDwords);
8205 } else if (IsGFX11Plus) {
8206 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8207 UseNSA ? AMDGPU::MIMGEncGfx11NSA
8208 : AMDGPU::MIMGEncGfx11Default,
8209 NumVDataDwords, NumVAddrDwords);
8210 } else if (IsGFX10Plus) {
8211 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8212 UseNSA ? AMDGPU::MIMGEncGfx10NSA
8213 : AMDGPU::MIMGEncGfx10Default,
8214 NumVDataDwords, NumVAddrDwords);
8215 } else {
8216 if (Subtarget->hasGFX90AInsts()) {
8217 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
8218 NumVDataDwords, NumVAddrDwords);
8219 if (Opcode == -1)
8220 report_fatal_error(
8221 "requested image instruction is not supported on this GPU");
8222 }
8223 if (Opcode == -1 &&
8224 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8225 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
8226 NumVDataDwords, NumVAddrDwords);
8227 if (Opcode == -1)
8228 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
8229 NumVDataDwords, NumVAddrDwords);
8230 }
8231 if (Opcode == -1)
8232 return Op;
8233
8234 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
8235 if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
8236 MachineMemOperand *MemRef = MemOp->getMemOperand();
8237 DAG.setNodeMemRefs(NewNode, {MemRef});
8238 }
8239
8240 if (BaseOpcode->AtomicX2) {
8241 SmallVector<SDValue, 1> Elt;
8242 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
8243 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
8244 }
8245 if (BaseOpcode->Store)
8246 return SDValue(NewNode, 0);
8247 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
8248 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
8249 NumVDataDwords, IsAtomicPacked16Bit, DL);
8250}
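// Note on the partial-NSA case above (illustrative): with NSAMaxSize == 5
// and seven address dwords, the first four stay as separate NSA operands and
// the remaining three are packed by getBuildDwordsVector into one contiguous
// vector register operand.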
8251
8252SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
8253 SDValue Offset, SDValue CachePolicy,
8254 SelectionDAG &DAG) const {
8255 MachineFunction &MF = DAG.getMachineFunction();
8256
8257 const DataLayout &DataLayout = DAG.getDataLayout();
8258 Align Alignment =
8259 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
8260
8261 MachineMemOperand *MMO = MF.getMachineMemOperand(
8262 MachinePointerInfo(),
8263 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
8264 MachineMemOperand::MOInvariant,
8265 VT.getStoreSize(), Alignment);
8266
8267 if (!Offset->isDivergent()) {
8268 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
8269
8270 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
8271 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
8272 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
8273 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
8274 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8275 SDValue BufferLoad =
8276 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
8277 DAG.getVTList(MVT::i32), Ops, VT, MMO);
8278 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
8279 }
8280
8281 // Widen vec3 load to vec4.
8282 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
8283 !Subtarget->hasScalarDwordx3Loads()) {
8284 EVT WidenedVT =
8285 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
8286 auto WidenedOp = DAG.getMemIntrinsicNode(
8287 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
8288 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
8289 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
8290 DAG.getVectorIdxConstant(0, DL));
8291 return Subvector;
8292 }
8293
8294 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
8295 DAG.getVTList(VT), Ops, VT, MMO);
8296 }
8297
8298 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
8299 // assume that the buffer is unswizzled.
8300 SDValue Ops[] = {
8301 DAG.getEntryNode(), // Chain
8302 Rsrc, // rsrc
8303 DAG.getConstant(0, DL, MVT::i32), // vindex
8304 {}, // voffset
8305 {}, // soffset
8306 {}, // offset
8307 CachePolicy, // cachepolicy
8308 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8309 };
8310 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8311 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
8312 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
8313 }
8314
8315 SmallVector<SDValue, 4> Loads;
8316 unsigned NumLoads = 1;
8317 MVT LoadVT = VT.getSimpleVT();
8318 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
8319 assert((LoadVT.getScalarType() == MVT::i32 ||
8320 LoadVT.getScalarType() == MVT::f32));
8321
8322 if (NumElts == 8 || NumElts == 16) {
8323 NumLoads = NumElts / 4;
8324 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
8325 }
8326
8327 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
8328
8329 // Use the alignment to ensure that the required offsets will fit into the
8330 // immediate offsets.
8331 setBufferOffsets(Offset, DAG, &Ops[3],
8332 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
8333
8334 uint64_t InstOffset = Ops[5]->getAsZExtVal();
8335 for (unsigned i = 0; i < NumLoads; ++i) {
8336 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
8337 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
8338 LoadVT, MMO, DAG));
8339 }
8340
8341 if (NumElts == 8 || NumElts == 16)
8342 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
8343
8344 return Loads[0];
8345}
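// For illustration, a divergent-offset v16f32 s.buffer.load is split above
// into NumLoads == 4 MUBUF loads of v4f32 at immediate offsets +0, +16, +32
// and +48 from the 64-byte-aligned base offset, then concatenated back into
// a v16f32.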
8346
8347SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
8348 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
8349 if (!Subtarget->hasArchitectedSGPRs())
8350 return {};
8351 SDLoc SL(Op);
8352 MVT VT = MVT::i32;
8353 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
8354 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
8355 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
8356}
8357
8358SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
8359 unsigned Dim,
8360 const ArgDescriptor &Arg) const {
8361 SDLoc SL(Op);
8362 MachineFunction &MF = DAG.getMachineFunction();
8363 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
8364 if (MaxID == 0)
8365 return DAG.getConstant(0, SL, MVT::i32);
8366
8367 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
8368 SDLoc(DAG.getEntryNode()), Arg);
8369
8370 // Don't bother inserting AssertZext for packed IDs since we're emitting the
8371 // masking operations anyway.
8372 //
8373 // TODO: We could assert the top bit is 0 for the source copy.
8374 if (Arg.isMasked())
8375 return Val;
8376
8377 // Preserve the known bits after expansion to a copy.
8378 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), 32 - llvm::countl_zero(MaxID));
8379 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
8380 DAG.getValueType(SmallVT));
8381}
8382
8383SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
8384 SelectionDAG &DAG) const {
8385 MachineFunction &MF = DAG.getMachineFunction();
8386 auto MFI = MF.getInfo<SIMachineFunctionInfo>();
8387
8388 EVT VT = Op.getValueType();
8389 SDLoc DL(Op);
8390 unsigned IntrinsicID = Op.getConstantOperandVal(0);
8391
8392 // TODO: Should this propagate fast-math-flags?
8393
8394 switch (IntrinsicID) {
8395 case Intrinsic::amdgcn_implicit_buffer_ptr: {
8396 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
8397 return emitNonHSAIntrinsicError(DAG, DL, VT);
8398 return getPreloadedValue(DAG, *MFI, VT,
8399 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
8400 }
8401 case Intrinsic::amdgcn_dispatch_ptr:
8402 case Intrinsic::amdgcn_queue_ptr: {
8403 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
8404 DiagnosticInfoUnsupported BadIntrin(
8405 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
8406 DL.getDebugLoc());
8407 DAG.getContext()->diagnose(BadIntrin);
8408 return DAG.getUNDEF(VT);
8409 }
8410
8411 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
8412 AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
8413 return getPreloadedValue(DAG, *MFI, VT, RegID);
8414 }
8415 case Intrinsic::amdgcn_implicitarg_ptr: {
8416 if (MFI->isEntryFunction())
8417 return getImplicitArgPtr(DAG, DL);
8418 return getPreloadedValue(DAG, *MFI, VT,
8419 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
8420 }
8421 case Intrinsic::amdgcn_kernarg_segment_ptr: {
8422 if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
8423 // This only makes sense to call in a kernel, so just lower to null.
8424 return DAG.getConstant(0, DL, VT);
8425 }
8426
8427 return getPreloadedValue(DAG, *MFI, VT,
8428 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
8429 }
8430 case Intrinsic::amdgcn_dispatch_id: {
8431 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
8432 }
8433 case Intrinsic::amdgcn_rcp:
8434 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
8435 case Intrinsic::amdgcn_rsq:
8436 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8437 case Intrinsic::amdgcn_rsq_legacy:
8438 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8439 return emitRemovedIntrinsicError(DAG, DL, VT);
8440 return SDValue();
8441 case Intrinsic::amdgcn_rcp_legacy:
8442 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8443 return emitRemovedIntrinsicError(DAG, DL, VT);
8444 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
8445 case Intrinsic::amdgcn_rsq_clamp: {
8446 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8447 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
8448
8449 Type *Type = VT.getTypeForEVT(*DAG.getContext());
8450 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
8451 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
8452
8453 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8454 SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
8455 DAG.getConstantFP(Max, DL, VT));
8456 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
8457 DAG.getConstantFP(Min, DL, VT));
8458 }
8459 case Intrinsic::r600_read_ngroups_x:
8460 if (Subtarget->isAmdHsaOS())
8461 return emitNonHSAIntrinsicError(DAG, DL, VT);
8462
8463 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8464 SI::KernelInputOffsets::NGROUPS_X, Align(4),
8465 false);
8466 case Intrinsic::r600_read_ngroups_y:
8467 if (Subtarget->isAmdHsaOS())
8468 return emitNonHSAIntrinsicError(DAG, DL, VT);
8469
8470 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8471 SI::KernelInputOffsets::NGROUPS_Y, Align(4),
8472 false);
8473 case Intrinsic::r600_read_ngroups_z:
8474 if (Subtarget->isAmdHsaOS())
8475 return emitNonHSAIntrinsicError(DAG, DL, VT);
8476
8477 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8478 SI::KernelInputOffsets::NGROUPS_Z, Align(4),
8479 false);
8480 case Intrinsic::r600_read_global_size_x:
8481 if (Subtarget->isAmdHsaOS())
8482 return emitNonHSAIntrinsicError(DAG, DL, VT);
8483
8484 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8485 SI::KernelInputOffsets::GLOBAL_SIZE_X,
8486 Align(4), false);
8487 case Intrinsic::r600_read_global_size_y:
8488 if (Subtarget->isAmdHsaOS())
8489 return emitNonHSAIntrinsicError(DAG, DL, VT);
8490
8491 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8492 SI::KernelInputOffsets::GLOBAL_SIZE_Y,
8493 Align(4), false);
8494 case Intrinsic::r600_read_global_size_z:
8495 if (Subtarget->isAmdHsaOS())
8496 return emitNonHSAIntrinsicError(DAG, DL, VT);
8497
8498 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8499 SI::KernelInputOffsets::GLOBAL_SIZE_Z,
8500 Align(4), false);
8501 case Intrinsic::r600_read_local_size_x:
8502 if (Subtarget->isAmdHsaOS())
8503 return emitNonHSAIntrinsicError(DAG, DL, VT);
8504
8505 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8506 SI::KernelInputOffsets::LOCAL_SIZE_X);
8507 case Intrinsic::r600_read_local_size_y:
8508 if (Subtarget->isAmdHsaOS())
8509 return emitNonHSAIntrinsicError(DAG, DL, VT);
8510
8511 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8512 SI::KernelInputOffsets::LOCAL_SIZE_Y);
8513 case Intrinsic::r600_read_local_size_z:
8514 if (Subtarget->isAmdHsaOS())
8515 return emitNonHSAIntrinsicError(DAG, DL, VT);
8516
8517 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8518 SI::KernelInputOffsets::LOCAL_SIZE_Z);
8519 case Intrinsic::amdgcn_workgroup_id_x:
8520 return getPreloadedValue(DAG, *MFI, VT,
8521 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
8522 case Intrinsic::amdgcn_workgroup_id_y:
8523 return getPreloadedValue(DAG, *MFI, VT,
8524 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
8525 case Intrinsic::amdgcn_workgroup_id_z:
8526 return getPreloadedValue(DAG, *MFI, VT,
8527 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
8528 case Intrinsic::amdgcn_wave_id:
8529 return lowerWaveID(DAG, Op);
8530 case Intrinsic::amdgcn_lds_kernel_id: {
8531 if (MFI->isEntryFunction())
8532 return getLDSKernelId(DAG, DL);
8533 return getPreloadedValue(DAG, *MFI, VT,
8534 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
8535 }
8536 case Intrinsic::amdgcn_workitem_id_x:
8537 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
8538 case Intrinsic::amdgcn_workitem_id_y:
8539 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
8540 case Intrinsic::amdgcn_workitem_id_z:
8541 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
8542 case Intrinsic::amdgcn_wavefrontsize:
8543 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
8544 SDLoc(Op), MVT::i32);
8545 case Intrinsic::amdgcn_s_buffer_load: {
8546 unsigned CPol = Op.getConstantOperandVal(3);
8547 // s_buffer_load, because of how it's optimized, can't be volatile
8548 // so reject ones with the volatile bit set.
8549 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
8550 ? AMDGPU::CPol::ALL
8551 : AMDGPU::CPol::ALL_pregfx12))
8552 return Op;
8553 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8554 DAG);
8555 }
8556 case Intrinsic::amdgcn_fdiv_fast:
8557 return lowerFDIV_FAST(Op, DAG);
8558 case Intrinsic::amdgcn_sin:
8559 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
8560
8561 case Intrinsic::amdgcn_cos:
8562 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
8563
8564 case Intrinsic::amdgcn_mul_u24:
8565 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1), Op.getOperand(2));
8566 case Intrinsic::amdgcn_mul_i24:
8567 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1), Op.getOperand(2));
8568
8569 case Intrinsic::amdgcn_log_clamp: {
8570 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8571 return SDValue();
8572
8573 return emitRemovedIntrinsicError(DAG, DL, VT);
8574 }
8575 case Intrinsic::amdgcn_fract:
8576 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
8577
8578 case Intrinsic::amdgcn_class:
8579 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
8580 Op.getOperand(1), Op.getOperand(2));
8581 case Intrinsic::amdgcn_div_fmas:
8582 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
8583 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8584 Op.getOperand(4));
8585
8586 case Intrinsic::amdgcn_div_fixup:
8587 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
8588 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8589
8590 case Intrinsic::amdgcn_div_scale: {
8591 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
8592
8593 // Translate to the operands expected by the machine instruction. The
8594 // first parameter must be the same as the first instruction.
8595 SDValue Numerator = Op.getOperand(1);
8596 SDValue Denominator = Op.getOperand(2);
8597
8598 // Note this order is opposite of the machine instruction's operations,
8599 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
8600 // intrinsic has the numerator as the first operand to match a normal
8601 // division operation.
8602
8603 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
8604
8605 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
8606 Denominator, Numerator);
8607 }
8608 case Intrinsic::amdgcn_icmp: {
8609 // There is a Pat that handles this variant, so return it as-is.
8610 if (Op.getOperand(1).getValueType() == MVT::i1 &&
8611 Op.getConstantOperandVal(2) == 0 &&
8612 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
8613 return Op;
8614 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
8615 }
8616 case Intrinsic::amdgcn_fcmp: {
8617 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
8618 }
8619 case Intrinsic::amdgcn_ballot:
8620 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
8621 case Intrinsic::amdgcn_fmed3:
8622 return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
8623 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8624 case Intrinsic::amdgcn_fdot2:
8625 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
8626 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8627 Op.getOperand(4));
8628 case Intrinsic::amdgcn_fmul_legacy:
8629 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
8630 Op.getOperand(1), Op.getOperand(2));
8631 case Intrinsic::amdgcn_sffbh:
8632 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
8633 case Intrinsic::amdgcn_sbfe:
8634 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
8635 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8636 case Intrinsic::amdgcn_ubfe:
8637 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
8638 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8639 case Intrinsic::amdgcn_cvt_pkrtz:
8640 case Intrinsic::amdgcn_cvt_pknorm_i16:
8641 case Intrinsic::amdgcn_cvt_pknorm_u16:
8642 case Intrinsic::amdgcn_cvt_pk_i16:
8643 case Intrinsic::amdgcn_cvt_pk_u16: {
8644 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
8645 EVT VT = Op.getValueType();
8646 unsigned Opcode;
8647
8648 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8650 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8652 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8654 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8656 else
8658
8659 if (isTypeLegal(VT))
8660 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
8661
8662 SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
8663 Op.getOperand(1), Op.getOperand(2));
8664 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
8665 }
8666 case Intrinsic::amdgcn_fmad_ftz:
8667 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
8668 Op.getOperand(2), Op.getOperand(3));
8669
8670 case Intrinsic::amdgcn_if_break:
8671 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
8672 Op->getOperand(1), Op->getOperand(2)), 0);
8673
8674 case Intrinsic::amdgcn_groupstaticsize: {
8676 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
8677 return Op;
8678
8679 const Module *M = MF.getFunction().getParent();
8680 const GlobalValue *GV =
8681 M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
8682 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
8684 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8685 }
8686 case Intrinsic::amdgcn_is_shared:
8687 case Intrinsic::amdgcn_is_private: {
8688 SDLoc SL(Op);
8689 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ?
8691 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
8692 SDValue SrcVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32,
8693 Op.getOperand(1));
8694
8695 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
8696 DAG.getConstant(1, SL, MVT::i32));
8697 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
8698 }
8699 case Intrinsic::amdgcn_perm:
8700 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
8701 Op.getOperand(2), Op.getOperand(3));
8702 case Intrinsic::amdgcn_reloc_constant: {
8703 Module *M = const_cast<Module *>(MF.getFunction().getParent());
8704 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
8705 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
8706 auto RelocSymbol = cast<GlobalVariable>(
8707 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
8708 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
8710 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8711 }
8712 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8713 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8714 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8715 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8716 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8717 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8718 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8719 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8720 if (Op.getOperand(4).getValueType() == MVT::i32)
8721 return SDValue();
8722
8723 SDLoc SL(Op);
8724 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
8725 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8726 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8727 Op.getOperand(3), IndexKeyi32);
8728 }
8729 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8730 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8731 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8732 if (Op.getOperand(6).getValueType() == MVT::i32)
8733 return SDValue();
8734
8735 SDLoc SL(Op);
8736 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
8737 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8738 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8739 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
8740 IndexKeyi32, Op.getOperand(7)});
8741 }
8742 case Intrinsic::amdgcn_addrspacecast_nonnull:
8743 return lowerADDRSPACECAST(Op, DAG);
8744 case Intrinsic::amdgcn_readlane:
8745 case Intrinsic::amdgcn_readfirstlane:
8746 case Intrinsic::amdgcn_writelane:
8747 case Intrinsic::amdgcn_permlane16:
8748 case Intrinsic::amdgcn_permlanex16:
8749 case Intrinsic::amdgcn_permlane64:
8750 return lowerLaneOp(*this, Op.getNode(), DAG);
8751 default:
8752 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8754 return lowerImage(Op, ImageDimIntr, DAG, false);
8755
8756 return Op;
8757 }
8758}
8759
8760// On targets not supporting constant in soffset field, turn zero to
8761// SGPR_NULL to avoid generating an extra s_mov with zero.
8763 const GCNSubtarget *Subtarget) {
8764 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
8765 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
8766 return SOffset;
8767}
8768
8769SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
8770 SelectionDAG &DAG,
8771 unsigned NewOpcode) const {
8772 SDLoc DL(Op);
8773
8774 SDValue VData = Op.getOperand(2);
8775 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8776 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8777 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8778 SDValue Ops[] = {
8779 Op.getOperand(0), // Chain
8780 VData, // vdata
8781 Rsrc, // rsrc
8782 DAG.getConstant(0, DL, MVT::i32), // vindex
8783 Offsets.first, // voffset
8784 SOffset, // soffset
8785 Offsets.second, // offset
8786 Op.getOperand(6), // cachepolicy
8787 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8788 };
8789
8790 auto *M = cast<MemSDNode>(Op);
8791
8792 EVT MemVT = VData.getValueType();
8793 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8794 M->getMemOperand());
8795}
8796
8797SDValue
8798SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
8799 unsigned NewOpcode) const {
8800 SDLoc DL(Op);
8801
8802 SDValue VData = Op.getOperand(2);
8803 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8804 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
8805 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
8806 SDValue Ops[] = {
8807 Op.getOperand(0), // Chain
8808 VData, // vdata
8809 Rsrc, // rsrc
8810 Op.getOperand(4), // vindex
8811 Offsets.first, // voffset
8812 SOffset, // soffset
8813 Offsets.second, // offset
8814 Op.getOperand(7), // cachepolicy
8815 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8816 };
8817
8818 auto *M = cast<MemSDNode>(Op);
8819
8820 EVT MemVT = VData.getValueType();
8821 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8822 M->getMemOperand());
8823}
8824
8825SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
8826 SelectionDAG &DAG) const {
8827 unsigned IntrID = Op.getConstantOperandVal(1);
8828 SDLoc DL(Op);
8829
8830 switch (IntrID) {
8831 case Intrinsic::amdgcn_ds_ordered_add:
8832 case Intrinsic::amdgcn_ds_ordered_swap: {
8833 MemSDNode *M = cast<MemSDNode>(Op);
8834 SDValue Chain = M->getOperand(0);
8835 SDValue M0 = M->getOperand(2);
8836 SDValue Value = M->getOperand(3);
8837 unsigned IndexOperand = M->getConstantOperandVal(7);
8838 unsigned WaveRelease = M->getConstantOperandVal(8);
8839 unsigned WaveDone = M->getConstantOperandVal(9);
8840
8841 unsigned OrderedCountIndex = IndexOperand & 0x3f;
8842 IndexOperand &= ~0x3f;
8843 unsigned CountDw = 0;
8844
8845 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
8846 CountDw = (IndexOperand >> 24) & 0xf;
8847 IndexOperand &= ~(0xf << 24);
8848
8849 if (CountDw < 1 || CountDw > 4) {
8851 "ds_ordered_count: dword count must be between 1 and 4");
8852 }
8853 }
8854
8855 if (IndexOperand)
8856 report_fatal_error("ds_ordered_count: bad index operand");
8857
8858 if (WaveDone && !WaveRelease)
8859 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
8860
8861 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
8862 unsigned ShaderType =
8864 unsigned Offset0 = OrderedCountIndex << 2;
8865 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
8866
8867 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
8868 Offset1 |= (CountDw - 1) << 6;
8869
8870 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
8871 Offset1 |= ShaderType << 2;
8872
8873 unsigned Offset = Offset0 | (Offset1 << 8);
8874
8875 SDValue Ops[] = {
8876 Chain,
8877 Value,
8878 DAG.getTargetConstant(Offset, DL, MVT::i16),
8879 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
8880 };
8882 M->getVTList(), Ops, M->getMemoryVT(),
8883 M->getMemOperand());
8884 }
8885 case Intrinsic::amdgcn_raw_buffer_load:
8886 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8887 case Intrinsic::amdgcn_raw_buffer_load_format:
8888 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
8889 const bool IsFormat =
8890 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
8891 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
8892
8893 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8894 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
8895 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
8896 SDValue Ops[] = {
8897 Op.getOperand(0), // Chain
8898 Rsrc, // rsrc
8899 DAG.getConstant(0, DL, MVT::i32), // vindex
8900 Offsets.first, // voffset
8901 SOffset, // soffset
8902 Offsets.second, // offset
8903 Op.getOperand(5), // cachepolicy, swizzled buffer
8904 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8905 };
8906
8907 auto *M = cast<MemSDNode>(Op);
8908 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
8909 }
8910 case Intrinsic::amdgcn_struct_buffer_load:
8911 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8912 case Intrinsic::amdgcn_struct_buffer_load_format:
8913 case Intrinsic::amdgcn_struct_ptr_buffer_load_format: {
8914 const bool IsFormat =
8915 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
8916 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
8917
8918 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8919 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8920 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8921 SDValue Ops[] = {
8922 Op.getOperand(0), // Chain
8923 Rsrc, // rsrc
8924 Op.getOperand(3), // vindex
8925 Offsets.first, // voffset
8926 SOffset, // soffset
8927 Offsets.second, // offset
8928 Op.getOperand(6), // cachepolicy, swizzled buffer
8929 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8930 };
8931
8932 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
8933 }
8934 case Intrinsic::amdgcn_raw_tbuffer_load:
8935 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
8936 MemSDNode *M = cast<MemSDNode>(Op);
8937 EVT LoadVT = Op.getValueType();
8938 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8939 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
8940 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
8941
8942 SDValue Ops[] = {
8943 Op.getOperand(0), // Chain
8944 Rsrc, // rsrc
8945 DAG.getConstant(0, DL, MVT::i32), // vindex
8946 Offsets.first, // voffset
8947 SOffset, // soffset
8948 Offsets.second, // offset
8949 Op.getOperand(5), // format
8950 Op.getOperand(6), // cachepolicy, swizzled buffer
8951 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8952 };
8953
8954 if (LoadVT.getScalarType() == MVT::f16)
8955 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8956 M, DAG, Ops);
8957 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
8958 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
8959 DAG);
8960 }
8961 case Intrinsic::amdgcn_struct_tbuffer_load:
8962 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
8963 MemSDNode *M = cast<MemSDNode>(Op);
8964 EVT LoadVT = Op.getValueType();
8965 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8966 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8967 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8968
8969 SDValue Ops[] = {
8970 Op.getOperand(0), // Chain
8971 Rsrc, // rsrc
8972 Op.getOperand(3), // vindex
8973 Offsets.first, // voffset
8974 SOffset, // soffset
8975 Offsets.second, // offset
8976 Op.getOperand(6), // format
8977 Op.getOperand(7), // cachepolicy, swizzled buffer
8978 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8979 };
8980
8981 if (LoadVT.getScalarType() == MVT::f16)
8982 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8983 M, DAG, Ops);
8984 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
8985 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
8986 DAG);
8987 }
8988 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8989 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8990 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
8991 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8992 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8993 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
8994 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8995 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8996 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
8997 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8998 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8999 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
9000 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9001 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9002 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
9003 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9004 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9005 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
9006 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9007 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9008 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
9009 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9010 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9011 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9012 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9013 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9014 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9015 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9016 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9017 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
9018 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9019 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9020 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
9021 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9022 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9023 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
9024 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9025 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9026 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
9027 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9028 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9029 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9030 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9031 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9032 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9033 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9034 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9035 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9036 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9037 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9038 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9039 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9040 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9041 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9042 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9043 return lowerRawBufferAtomicIntrin(Op, DAG,
9045 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9046 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9047 return lowerStructBufferAtomicIntrin(Op, DAG,
9049 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9050 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9051 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9052 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9053 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9054 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9055 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9056 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9057 return lowerStructBufferAtomicIntrin(Op, DAG,
9059 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9060 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9061 return lowerStructBufferAtomicIntrin(Op, DAG,
9063 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9064 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9065 return lowerStructBufferAtomicIntrin(Op, DAG,
9067 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9068 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9069 return lowerStructBufferAtomicIntrin(Op, DAG,
9071 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9072 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9073 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9074 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9075 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9076 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9077 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9078 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9079 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9080 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9081 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9082 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9083 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9084 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9085 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9086 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9087 return lowerStructBufferAtomicIntrin(Op, DAG,
9089
9090 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9091 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9092 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
9093 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9094 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9095 SDValue Ops[] = {
9096 Op.getOperand(0), // Chain
9097 Op.getOperand(2), // src
9098 Op.getOperand(3), // cmp
9099 Rsrc, // rsrc
9100 DAG.getConstant(0, DL, MVT::i32), // vindex
9101 Offsets.first, // voffset
9102 SOffset, // soffset
9103 Offsets.second, // offset
9104 Op.getOperand(7), // cachepolicy
9105 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9106 };
9107 EVT VT = Op.getValueType();
9108 auto *M = cast<MemSDNode>(Op);
9109
9111 Op->getVTList(), Ops, VT, M->getMemOperand());
9112 }
9113 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9114 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9115 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
9116 auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
9117 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
9118 SDValue Ops[] = {
9119 Op.getOperand(0), // Chain
9120 Op.getOperand(2), // src
9121 Op.getOperand(3), // cmp
9122 Rsrc, // rsrc
9123 Op.getOperand(5), // vindex
9124 Offsets.first, // voffset
9125 SOffset, // soffset
9126 Offsets.second, // offset
9127 Op.getOperand(8), // cachepolicy
9128 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9129 };
9130 EVT VT = Op.getValueType();
9131 auto *M = cast<MemSDNode>(Op);
9132
9134 Op->getVTList(), Ops, VT, M->getMemOperand());
9135 }
9136 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9137 MemSDNode *M = cast<MemSDNode>(Op);
9138 SDValue NodePtr = M->getOperand(2);
9139 SDValue RayExtent = M->getOperand(3);
9140 SDValue RayOrigin = M->getOperand(4);
9141 SDValue RayDir = M->getOperand(5);
9142 SDValue RayInvDir = M->getOperand(6);
9143 SDValue TDescr = M->getOperand(7);
9144
9145 assert(NodePtr.getValueType() == MVT::i32 ||
9146 NodePtr.getValueType() == MVT::i64);
9147 assert(RayDir.getValueType() == MVT::v3f16 ||
9148 RayDir.getValueType() == MVT::v3f32);
9149
9150 if (!Subtarget->hasGFX10_AEncoding()) {
9151 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
9152 return SDValue();
9153 }
9154
9155 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
9156 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9157 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9158 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
9159 const bool Is64 = NodePtr.getValueType() == MVT::i64;
9160 const unsigned NumVDataDwords = 4;
9161 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9162 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9163 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
9164 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
9165 IsGFX12Plus;
9166 const unsigned BaseOpcodes[2][2] = {
9167 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9168 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9169 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9170 int Opcode;
9171 if (UseNSA) {
9172 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9173 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9174 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9175 : AMDGPU::MIMGEncGfx10NSA,
9176 NumVDataDwords, NumVAddrDwords);
9177 } else {
9178 assert(!IsGFX12Plus);
9179 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9180 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9181 : AMDGPU::MIMGEncGfx10Default,
9182 NumVDataDwords, NumVAddrDwords);
9183 }
9184 assert(Opcode != -1);
9185
9187
9188 auto packLanes = [&DAG, &Ops, &DL] (SDValue Op, bool IsAligned) {
9190 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
9191 if (Lanes[0].getValueSizeInBits() == 32) {
9192 for (unsigned I = 0; I < 3; ++I)
9193 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
9194 } else {
9195 if (IsAligned) {
9196 Ops.push_back(
9197 DAG.getBitcast(MVT::i32,
9198 DAG.getBuildVector(MVT::v2f16, DL,
9199 { Lanes[0], Lanes[1] })));
9200 Ops.push_back(Lanes[2]);
9201 } else {
9202 SDValue Elt0 = Ops.pop_back_val();
9203 Ops.push_back(
9204 DAG.getBitcast(MVT::i32,
9205 DAG.getBuildVector(MVT::v2f16, DL,
9206 { Elt0, Lanes[0] })));
9207 Ops.push_back(
9208 DAG.getBitcast(MVT::i32,
9209 DAG.getBuildVector(MVT::v2f16, DL,
9210 { Lanes[1], Lanes[2] })));
9211 }
9212 }
9213 };
9214
9215 if (UseNSA && IsGFX11Plus) {
9216 Ops.push_back(NodePtr);
9217 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9218 Ops.push_back(RayOrigin);
9219 if (IsA16) {
9220 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
9221 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
9222 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
9223 for (unsigned I = 0; I < 3; ++I) {
9224 MergedLanes.push_back(DAG.getBitcast(
9225 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
9226 {DirLanes[I], InvDirLanes[I]})));
9227 }
9228 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
9229 } else {
9230 Ops.push_back(RayDir);
9231 Ops.push_back(RayInvDir);
9232 }
9233 } else {
9234 if (Is64)
9235 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
9236 2);
9237 else
9238 Ops.push_back(NodePtr);
9239
9240 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9241 packLanes(RayOrigin, true);
9242 packLanes(RayDir, true);
9243 packLanes(RayInvDir, false);
9244 }
9245
9246 if (!UseNSA) {
9247 // Build a single vector containing all the operands so far prepared.
9248 if (NumVAddrDwords > 12) {
9249 SDValue Undef = DAG.getUNDEF(MVT::i32);
9250 Ops.append(16 - Ops.size(), Undef);
9251 }
9252 assert(Ops.size() >= 8 && Ops.size() <= 12);
9253 SDValue MergedOps = DAG.getBuildVector(
9254 MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
9255 Ops.clear();
9256 Ops.push_back(MergedOps);
9257 }
9258
9259 Ops.push_back(TDescr);
9260 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
9261 Ops.push_back(M->getChain());
9262
9263 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
9264 MachineMemOperand *MemRef = M->getMemOperand();
9265 DAG.setNodeMemRefs(NewNode, {MemRef});
9266 return SDValue(NewNode, 0);
9267 }
9268 case Intrinsic::amdgcn_global_atomic_fmin:
9269 case Intrinsic::amdgcn_global_atomic_fmax:
9270 case Intrinsic::amdgcn_global_atomic_fmin_num:
9271 case Intrinsic::amdgcn_global_atomic_fmax_num:
9272 case Intrinsic::amdgcn_flat_atomic_fmin:
9273 case Intrinsic::amdgcn_flat_atomic_fmax:
9274 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9275 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9276 MemSDNode *M = cast<MemSDNode>(Op);
9277 SDValue Ops[] = {
9278 M->getOperand(0), // Chain
9279 M->getOperand(2), // Ptr
9280 M->getOperand(3) // Value
9281 };
9282 unsigned Opcode = 0;
9283 switch (IntrID) {
9284 case Intrinsic::amdgcn_global_atomic_fmin:
9285 case Intrinsic::amdgcn_global_atomic_fmin_num:
9286 case Intrinsic::amdgcn_flat_atomic_fmin:
9287 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9288 Opcode = ISD::ATOMIC_LOAD_FMIN;
9289 break;
9290 }
9291 case Intrinsic::amdgcn_global_atomic_fmax:
9292 case Intrinsic::amdgcn_global_atomic_fmax_num:
9293 case Intrinsic::amdgcn_flat_atomic_fmax:
9294 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9295 Opcode = ISD::ATOMIC_LOAD_FMAX;
9296 break;
9297 }
9298 default:
9299 llvm_unreachable("unhandled atomic opcode");
9300 }
9301 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
9302 Ops, M->getMemOperand());
9303 }
9304 case Intrinsic::amdgcn_s_get_barrier_state: {
9305 SDValue Chain = Op->getOperand(0);
9307 unsigned Opc;
9308 bool IsInlinableBarID = false;
9309 int64_t BarID;
9310
9311 if (isa<ConstantSDNode>(Op->getOperand(2))) {
9312 BarID = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue();
9313 IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarID);
9314 }
9315
9316 if (IsInlinableBarID) {
9317 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9318 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
9319 Ops.push_back(K);
9320 } else {
9321 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9322 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(2));
9323 Ops.push_back(M0Val.getValue(0));
9324 }
9325
9326 auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9327 return SDValue(NewMI, 0);
9328 }
9329 default:
9330
9331 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9333 return lowerImage(Op, ImageDimIntr, DAG, true);
9334
9335 return SDValue();
9336 }
9337}
9338
9339// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
9340// dwordx4 if on SI and handle TFE loads.
9341SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
9342 SDVTList VTList,
9343 ArrayRef<SDValue> Ops, EVT MemVT,
9344 MachineMemOperand *MMO,
9345 SelectionDAG &DAG) const {
9346 LLVMContext &C = *DAG.getContext();
9348 EVT VT = VTList.VTs[0];
9349
9350 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
9351 bool IsTFE = VTList.NumVTs == 3;
9352 if (IsTFE) {
9353 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
9354 unsigned NumOpDWords = NumValueDWords + 1;
9355 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
9356 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
9357 MachineMemOperand *OpDWordsMMO =
9358 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
9359 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
9360 OpDWordsVT, OpDWordsMMO, DAG);
9362 DAG.getVectorIdxConstant(NumValueDWords, DL));
9363 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
9364 SDValue ValueDWords =
9365 NumValueDWords == 1
9366 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
9368 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
9369 ZeroIdx);
9370 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
9371 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
9372 }
9373
9374 if (!Subtarget->hasDwordx3LoadStores() &&
9375 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9376 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
9377 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
9378 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
9379 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
9380 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
9381 WidenedMemVT, WidenedMMO);
9383 DAG.getVectorIdxConstant(0, DL));
9384 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
9385 }
9386
9387 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
9388}
9389
9390SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
9391 bool ImageStore) const {
9392 EVT StoreVT = VData.getValueType();
9393
9394 // No change for f16 and legal vector D16 types.
9395 if (!StoreVT.isVector())
9396 return VData;
9397
9398 SDLoc DL(VData);
9399 unsigned NumElements = StoreVT.getVectorNumElements();
9400
9401 if (Subtarget->hasUnpackedD16VMem()) {
9402 // We need to unpack the packed data to store.
9403 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9404 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9405
9406 EVT EquivStoreVT =
9407 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
9408 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
9409 return DAG.UnrollVectorOp(ZExt.getNode());
9410 }
9411
9412 // The sq block of gfx8.1 does not estimate register use correctly for d16
9413 // image store instructions. The data operand is computed as if it were not a
9414 // d16 image instruction.
9415 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
9416 // Bitcast to i16
9417 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9418 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9419
9420 // Decompose into scalars
9422 DAG.ExtractVectorElements(IntVData, Elts);
9423
9424 // Group pairs of i16 into v2i16 and bitcast to i32
9425 SmallVector<SDValue, 4> PackedElts;
9426 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
9427 SDValue Pair =
9428 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
9429 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9430 PackedElts.push_back(IntPair);
9431 }
9432 if ((NumElements % 2) == 1) {
9433 // Handle v3i16
9434 unsigned I = Elts.size() / 2;
9435 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
9436 {Elts[I * 2], DAG.getUNDEF(MVT::i16)});
9437 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9438 PackedElts.push_back(IntPair);
9439 }
9440
9441 // Pad using UNDEF
9442 PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32));
9443
9444 // Build final vector
9445 EVT VecVT =
9446 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
9447 return DAG.getBuildVector(VecVT, DL, PackedElts);
9448 }
9449
9450 if (NumElements == 3) {
9451 EVT IntStoreVT =
9453 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9454
9455 EVT WidenedStoreVT = EVT::getVectorVT(
9456 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
9457 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
9458 WidenedStoreVT.getStoreSizeInBits());
9459 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
9460 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
9461 }
9462
9463 assert(isTypeLegal(StoreVT));
9464 return VData;
9465}
9466
9467SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
9468 SelectionDAG &DAG) const {
9469 SDLoc DL(Op);
9470 SDValue Chain = Op.getOperand(0);
9471 unsigned IntrinsicID = Op.getConstantOperandVal(1);
9473
9474 switch (IntrinsicID) {
9475 case Intrinsic::amdgcn_exp_compr: {
9476 if (!Subtarget->hasCompressedExport()) {
9477 DiagnosticInfoUnsupported BadIntrin(
9479 "intrinsic not supported on subtarget", DL.getDebugLoc());
9480 DAG.getContext()->diagnose(BadIntrin);
9481 }
9482 SDValue Src0 = Op.getOperand(4);
9483 SDValue Src1 = Op.getOperand(5);
9484 // Hack around illegal type on SI by directly selecting it.
9485 if (isTypeLegal(Src0.getValueType()))
9486 return SDValue();
9487
9488 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
9489 SDValue Undef = DAG.getUNDEF(MVT::f32);
9490 const SDValue Ops[] = {
9491 Op.getOperand(2), // tgt
9492 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
9493 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
9494 Undef, // src2
9495 Undef, // src3
9496 Op.getOperand(7), // vm
9497 DAG.getTargetConstant(1, DL, MVT::i1), // compr
9498 Op.getOperand(3), // en
9499 Op.getOperand(0) // Chain
9500 };
9501
9502 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9503 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
9504 }
9505 case Intrinsic::amdgcn_s_barrier: {
9508 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
9509 if (WGSize <= ST.getWavefrontSize())
9510 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
9511 Op.getOperand(0)), 0);
9512 }
9513
9514 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
9515 if (ST.hasSplitBarriers()) {
9516 SDValue K =
9518 SDValue BarSignal =
9519 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
9520 MVT::Other, K, Op.getOperand(0)),
9521 0);
9522 SDValue BarWait =
9523 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
9524 BarSignal.getValue(0)),
9525 0);
9526 return BarWait;
9527 }
9528
9529 return SDValue();
9530 };
9531
9532 case Intrinsic::amdgcn_struct_tbuffer_store:
9533 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9534 SDValue VData = Op.getOperand(2);
9535 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9536 if (IsD16)
9537 VData = handleD16VData(VData, DAG);
9538 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9539 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9540 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9541 SDValue Ops[] = {
9542 Chain,
9543 VData, // vdata
9544 Rsrc, // rsrc
9545 Op.getOperand(4), // vindex
9546 Offsets.first, // voffset
9547 SOffset, // soffset
9548 Offsets.second, // offset
9549 Op.getOperand(7), // format
9550 Op.getOperand(8), // cachepolicy, swizzled buffer
9551 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9552 };
9553 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9555 MemSDNode *M = cast<MemSDNode>(Op);
9556 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9557 M->getMemoryVT(), M->getMemOperand());
9558 }
9559
9560 case Intrinsic::amdgcn_raw_tbuffer_store:
9561 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9562 SDValue VData = Op.getOperand(2);
9563 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9564 if (IsD16)
9565 VData = handleD16VData(VData, DAG);
9566 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9567 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
9568 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9569 SDValue Ops[] = {
9570 Chain,
9571 VData, // vdata
9572 Rsrc, // rsrc
9573 DAG.getConstant(0, DL, MVT::i32), // vindex
9574 Offsets.first, // voffset
9575 SOffset, // soffset
9576 Offsets.second, // offset
9577 Op.getOperand(6), // format
9578 Op.getOperand(7), // cachepolicy, swizzled buffer
9579 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9580 };
9581 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9583 MemSDNode *M = cast<MemSDNode>(Op);
9584 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9585 M->getMemoryVT(), M->getMemOperand());
9586 }
9587
9588 case Intrinsic::amdgcn_raw_buffer_store:
9589 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9590 case Intrinsic::amdgcn_raw_buffer_store_format:
9591 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9592 const bool IsFormat =
9593 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9594 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9595
9596 SDValue VData = Op.getOperand(2);
9597 EVT VDataVT = VData.getValueType();
9598 EVT EltType = VDataVT.getScalarType();
9599 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9600 if (IsD16) {
9601 VData = handleD16VData(VData, DAG);
9602 VDataVT = VData.getValueType();
9603 }
9604
9605 if (!isTypeLegal(VDataVT)) {
9606 VData =
9607 DAG.getNode(ISD::BITCAST, DL,
9608 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9609 }
9610
9611 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9612 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
9613 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9614 SDValue Ops[] = {
9615 Chain,
9616 VData,
9617 Rsrc,
9618 DAG.getConstant(0, DL, MVT::i32), // vindex
9619 Offsets.first, // voffset
9620 SOffset, // soffset
9621 Offsets.second, // offset
9622 Op.getOperand(6), // cachepolicy, swizzled buffer
9623 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9624 };
9625 unsigned Opc =
9627 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9628 MemSDNode *M = cast<MemSDNode>(Op);
9629
9630 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9631 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9632 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
9633
9634 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9635 M->getMemoryVT(), M->getMemOperand());
9636 }
9637
9638 case Intrinsic::amdgcn_struct_buffer_store:
9639 case Intrinsic::amdgcn_struct_ptr_buffer_store:
9640 case Intrinsic::amdgcn_struct_buffer_store_format:
9641 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9642 const bool IsFormat =
9643 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9644 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9645
9646 SDValue VData = Op.getOperand(2);
9647 EVT VDataVT = VData.getValueType();
9648 EVT EltType = VDataVT.getScalarType();
9649 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9650
9651 if (IsD16) {
9652 VData = handleD16VData(VData, DAG);
9653 VDataVT = VData.getValueType();
9654 }
9655
9656 if (!isTypeLegal(VDataVT)) {
9657 VData =
9658 DAG.getNode(ISD::BITCAST, DL,
9659 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9660 }
9661
9662 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9663 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9664 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9665 SDValue Ops[] = {
9666 Chain,
9667 VData,
9668 Rsrc,
9669 Op.getOperand(4), // vindex
9670 Offsets.first, // voffset
9671 SOffset, // soffset
9672 Offsets.second, // offset
9673 Op.getOperand(7), // cachepolicy, swizzled buffer
9674 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9675 };
9676 unsigned Opc =
9678 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9679 MemSDNode *M = cast<MemSDNode>(Op);
9680
9681 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9682 EVT VDataType = VData.getValueType().getScalarType();
9683 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9684 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9685
9686 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9687 M->getMemoryVT(), M->getMemOperand());
9688 }
9689 case Intrinsic::amdgcn_raw_buffer_load_lds:
9690 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9691 case Intrinsic::amdgcn_struct_buffer_load_lds:
9692 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9693 assert(!AMDGPU::isGFX12Plus(*Subtarget));
9694 unsigned Opc;
9695 bool HasVIndex =
9696 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9697 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9698 unsigned OpOffset = HasVIndex ? 1 : 0;
9699 SDValue VOffset = Op.getOperand(5 + OpOffset);
9700 bool HasVOffset = !isNullConstant(VOffset);
9701 unsigned Size = Op->getConstantOperandVal(4);
9702
9703 switch (Size) {
9704 default:
9705 return SDValue();
9706 case 1:
9707 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9708 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9709 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9710 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9711 break;
9712 case 2:
9713 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9714 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9715 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9716 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9717 break;
9718 case 4:
9719 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9720 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9721 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9722 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
9723 break;
9724 }
9725
9726 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9727
9729
9730 if (HasVIndex && HasVOffset)
9731 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
9732 { Op.getOperand(5), // VIndex
9733 VOffset }));
9734 else if (HasVIndex)
9735 Ops.push_back(Op.getOperand(5));
9736 else if (HasVOffset)
9737 Ops.push_back(VOffset);
9738
9739 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9740 Ops.push_back(Rsrc);
9741 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
9742 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
9743 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
9744 Ops.push_back(
9745 DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol
9747 Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0, DL, MVT::i8)); // swz
9748 Ops.push_back(M0Val.getValue(0)); // Chain
9749 Ops.push_back(M0Val.getValue(1)); // Glue
9750
9751 auto *M = cast<MemSDNode>(Op);
9752 MachineMemOperand *LoadMMO = M->getMemOperand();
9753 // Don't set the offset value here because the pointer points to the base of
9754 // the buffer.
9755 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9756
9757 MachinePointerInfo StorePtrI = LoadPtrI;
9758 LoadPtrI.V = PoisonValue::get(
9762
9763 auto F = LoadMMO->getFlags() &
9765 LoadMMO =
9767 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9768
9770 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
9771 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9772
9773 auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
9774 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9775
9776 return SDValue(Load, 0);
9777 }
9778 case Intrinsic::amdgcn_global_load_lds: {
9779 unsigned Opc;
9780 unsigned Size = Op->getConstantOperandVal(4);
9781 switch (Size) {
9782 default:
9783 return SDValue();
9784 case 1:
9785 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
9786 break;
9787 case 2:
9788 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
9789 break;
9790 case 4:
9791 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
9792 break;
9793 }
9794
9795 auto *M = cast<MemSDNode>(Op);
9796 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9797
9799
9800 SDValue Addr = Op.getOperand(2); // Global ptr
9801 SDValue VOffset;
9802 // Try to split SAddr and VOffset. Global and LDS pointers share the same
9803 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
9804 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
9805 SDValue LHS = Addr.getOperand(0);
9806 SDValue RHS = Addr.getOperand(1);
9807
9808 if (LHS->isDivergent())
9809 std::swap(LHS, RHS);
9810
9811 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
9812 RHS.getOperand(0).getValueType() == MVT::i32) {
9813 // add (i64 sgpr), (zero_extend (i32 vgpr))
9814 Addr = LHS;
9815 VOffset = RHS.getOperand(0);
9816 }
9817 }
9818
9819 Ops.push_back(Addr);
9820 if (!Addr->isDivergent()) {
9821 Opc = AMDGPU::getGlobalSaddrOp(Opc);
9822 if (!VOffset)
9823 VOffset = SDValue(
9824 DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
9825 DAG.getTargetConstant(0, DL, MVT::i32)), 0);
9826 Ops.push_back(VOffset);
9827 }
9828
9829 Ops.push_back(Op.getOperand(5)); // Offset
9830 Ops.push_back(Op.getOperand(6)); // CPol
9831 Ops.push_back(M0Val.getValue(0)); // Chain
9832 Ops.push_back(M0Val.getValue(1)); // Glue
9833
9834 MachineMemOperand *LoadMMO = M->getMemOperand();
9835 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9836 LoadPtrI.Offset = Op->getConstantOperandVal(5);
9837 MachinePointerInfo StorePtrI = LoadPtrI;
9838 LoadPtrI.V = PoisonValue::get(
9842 auto F = LoadMMO->getFlags() &
9844 LoadMMO =
9846 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9848 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
9849 LoadMMO->getAAInfo());
9850
9851 auto Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9852 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9853
9854 return SDValue(Load, 0);
9855 }
9856 case Intrinsic::amdgcn_end_cf:
9857 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
9858 Op->getOperand(2), Chain), 0);
9859 case Intrinsic::amdgcn_s_barrier_init:
9860 case Intrinsic::amdgcn_s_barrier_join:
9861 case Intrinsic::amdgcn_s_wakeup_barrier: {
9862 SDValue Chain = Op->getOperand(0);
9864 SDValue BarOp = Op->getOperand(2);
9865 unsigned Opc;
9866 bool IsInlinableBarID = false;
9867 int64_t BarVal;
9868
9869 if (isa<ConstantSDNode>(BarOp)) {
9870 BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue();
9871 IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarVal);
9872 }
9873
9874 if (IsInlinableBarID) {
9875 switch (IntrinsicID) {
9876 default:
9877 return SDValue();
9878 case Intrinsic::amdgcn_s_barrier_init:
9879 Opc = AMDGPU::S_BARRIER_INIT_IMM;
9880 break;
9881 case Intrinsic::amdgcn_s_barrier_join:
9882 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
9883 break;
9884 case Intrinsic::amdgcn_s_wakeup_barrier:
9885 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
9886 break;
9887 }
9888
9889 SDValue K = DAG.getTargetConstant(BarVal, DL, MVT::i32);
9890 Ops.push_back(K);
9891 } else {
9892 switch (IntrinsicID) {
9893 default:
9894 return SDValue();
9895 case Intrinsic::amdgcn_s_barrier_init:
9896 Opc = AMDGPU::S_BARRIER_INIT_M0;
9897 break;
9898 case Intrinsic::amdgcn_s_barrier_join:
9899 Opc = AMDGPU::S_BARRIER_JOIN_M0;
9900 break;
9901 case Intrinsic::amdgcn_s_wakeup_barrier:
9902 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
9903 break;
9904 }
9905 }
9906
9907 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) {
9908 SDValue M0Val;
9909 // Member count will be read from M0[16:22]
9910 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, Op.getOperand(3),
9911 DAG.getShiftAmountConstant(16, MVT::i32, DL));
9912
9913 if (!IsInlinableBarID) {
9914 // If reference to barrier id is not an inline constant then it must be
9915 // referenced with M0[4:0]. Perform an OR with the member count to
9916 // include it in M0.
9917 M0Val = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32,
9918 Op.getOperand(2), M0Val),
9919 0);
9920 }
9921 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
9922 } else if (!IsInlinableBarID) {
9923 Ops.push_back(copyToM0(DAG, Chain, DL, BarOp).getValue(0));
9924 }
9925
9926 auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9927 return SDValue(NewMI, 0);
9928 }
9929 default: {
9930 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9932 return lowerImage(Op, ImageDimIntr, DAG, true);
9933
9934 return Op;
9935 }
9936 }
9937}
9938
9939// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
9940// offset (the offset that is included in bounds checking and swizzling, to be
9941// split between the instruction's voffset and immoffset fields) and soffset
9942// (the offset that is excluded from bounds checking and swizzling, to go in
9943// the instruction's soffset field). This function takes the first kind of
9944// offset and figures out how to split it between voffset and immoffset.
9945std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
9946 SDValue Offset, SelectionDAG &DAG) const {
9947 SDLoc DL(Offset);
9948 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
9949 SDValue N0 = Offset;
9950 ConstantSDNode *C1 = nullptr;
9951
9952 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
9953 N0 = SDValue();
9954 else if (DAG.isBaseWithConstantOffset(N0)) {
9955 C1 = cast<ConstantSDNode>(N0.getOperand(1));
9956 N0 = N0.getOperand(0);
9957 }
9958
9959 if (C1) {
9960 unsigned ImmOffset = C1->getZExtValue();
9961 // If the immediate value is too big for the immoffset field, put only bits
9962 // that would normally fit in the immoffset field. The remaining value that
9963 // is copied/added for the voffset field is a large power of 2, and it
9964 // stands more chance of being CSEd with the copy/add for another similar
9965 // load/store.
9966 // However, do not do that rounding down if that is a negative
9967 // number, as it appears to be illegal to have a negative offset in the
9968 // vgpr, even if adding the immediate offset makes it positive.
9969 unsigned Overflow = ImmOffset & ~MaxImm;
9970 ImmOffset -= Overflow;
9971 if ((int32_t)Overflow < 0) {
9972 Overflow += ImmOffset;
9973 ImmOffset = 0;
9974 }
9975 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
9976 if (Overflow) {
9977 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
9978 if (!N0)
9979 N0 = OverflowVal;
9980 else {
9981 SDValue Ops[] = { N0, OverflowVal };
9982 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
9983 }
9984 }
9985 }
9986 if (!N0)
9987 N0 = DAG.getConstant(0, DL, MVT::i32);
9988 if (!C1)
9989 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
9990 return {N0, SDValue(C1, 0)};
9991}
9992
9993// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
9994// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
9995// pointed to by Offsets.
9996void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
9997 SelectionDAG &DAG, SDValue *Offsets,
9998 Align Alignment) const {
10000 SDLoc DL(CombinedOffset);
10001 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
10002 uint32_t Imm = C->getZExtValue();
10003 uint32_t SOffset, ImmOffset;
10004 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10005 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
10006 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10007 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10008 return;
10009 }
10010 }
10011 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
10012 SDValue N0 = CombinedOffset.getOperand(0);
10013 SDValue N1 = CombinedOffset.getOperand(1);
10014 uint32_t SOffset, ImmOffset;
10015 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10016 if (Offset >= 0 &&
10017 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
10018 Offsets[0] = N0;
10019 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10020 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10021 return;
10022 }
10023 }
10024
10025 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
10026 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
10027 : DAG.getConstant(0, DL, MVT::i32);
10028
10029 Offsets[0] = CombinedOffset;
10030 Offsets[1] = SOffsetZero;
10031 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
10032}
10033
10034SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
10035 SelectionDAG &DAG) const {
10036 if (!MaybePointer.getValueType().isScalarInteger())
10037 return MaybePointer;
10038
10039 SDLoc DL(MaybePointer);
10040
10041 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
10042 return Rsrc;
10043}
10044
10045// Wrap a global or flat pointer into a buffer intrinsic using the flags
10046// specified in the intrinsic.
10047SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
10048 SelectionDAG &DAG) const {
10049 SDLoc Loc(Op);
10050
10051 SDValue Pointer = Op->getOperand(1);
10052 SDValue Stride = Op->getOperand(2);
10053 SDValue NumRecords = Op->getOperand(3);
10054 SDValue Flags = Op->getOperand(4);
10055
10056 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
10057 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
10058 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
10059 std::optional<uint32_t> ConstStride = std::nullopt;
10060 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
10061 ConstStride = ConstNode->getZExtValue();
10062
10063 SDValue NewHighHalf = Masked;
10064 if (!ConstStride || *ConstStride != 0) {
10065 SDValue ShiftedStride;
10066 if (ConstStride) {
10067 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
10068 } else {
10069 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
10070 ShiftedStride =
10071 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
10072 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
10073 }
10074 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
10075 }
10076
10077 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
10078 NewHighHalf, NumRecords, Flags);
10079 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
10080 return RsrcPtr;
10081}
10082
10083// Handle 8 bit and 16 bit buffer loads
10084SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
10085 EVT LoadVT, SDLoc DL,
10087 MachineMemOperand *MMO,
10088 bool IsTFE) const {
10089 EVT IntVT = LoadVT.changeTypeToInteger();
10090
10091 if (IsTFE) {
10092 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
10096 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
10097 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
10098 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
10100 DAG.getConstant(1, DL, MVT::i32));
10101 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10102 DAG.getConstant(0, DL, MVT::i32));
10103 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
10104 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
10105 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10106 }
10107
10108 unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
10110
10111 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
10112 SDValue BufferLoad =
10113 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
10114 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
10115 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
10116
10117 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
10118}
10119
10120// Handle 8 bit and 16 bit buffer stores
10121SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
10122 EVT VDataType, SDLoc DL,
10123 SDValue Ops[],
10124 MemSDNode *M) const {
10125 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
10126 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
10127
10128 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
10129 Ops[1] = BufferStoreExt;
10130 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
10132 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
10133 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
10134 M->getMemOperand());
10135}
10136
10138 ISD::LoadExtType ExtType, SDValue Op,
10139 const SDLoc &SL, EVT VT) {
10140 if (VT.bitsLT(Op.getValueType()))
10141 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
10142
10143 switch (ExtType) {
10144 case ISD::SEXTLOAD:
10145 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
10146 case ISD::ZEXTLOAD:
10147 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
10148 case ISD::EXTLOAD:
10149 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
10150 case ISD::NON_EXTLOAD:
10151 return Op;
10152 }
10153
10154 llvm_unreachable("invalid ext type");
10155}
10156
10157// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
10158// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
10159SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
10160 SelectionDAG &DAG = DCI.DAG;
10161 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
10162 return SDValue();
10163
10164 // FIXME: Constant loads should all be marked invariant.
10165 unsigned AS = Ld->getAddressSpace();
10166 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
10168 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
10169 return SDValue();
10170
10171 // Don't do this early, since it may interfere with adjacent load merging for
10172 // illegal types. We can avoid losing alignment information for exotic types
10173 // pre-legalize.
10174 EVT MemVT = Ld->getMemoryVT();
10175 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
10176 MemVT.getSizeInBits() >= 32)
10177 return SDValue();
10178
10179 SDLoc SL(Ld);
10180
10181 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
10182 "unexpected vector extload");
10183
10184 // TODO: Drop only high part of range.
10185 SDValue Ptr = Ld->getBasePtr();
10186 SDValue NewLoad = DAG.getLoad(
10187 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
10188 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
10189 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
10190 nullptr); // Drop ranges
10191
10192 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
10193 if (MemVT.isFloatingPoint()) {
10195 "unexpected fp extload");
10196 TruncVT = MemVT.changeTypeToInteger();
10197 }
10198
10199 SDValue Cvt = NewLoad;
10200 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
10201 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
10202 DAG.getValueType(TruncVT));
10203 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
10204 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
10205 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
10206 } else {
10207 assert(Ld->getExtensionType() == ISD::EXTLOAD);
10208 }
10209
10210 EVT VT = Ld->getValueType(0);
10211 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
10212
10213 DCI.AddToWorklist(Cvt.getNode());
10214
10215 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
10216 // the appropriate extension from the 32-bit load.
10217 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
10218 DCI.AddToWorklist(Cvt.getNode());
10219
10220 // Handle conversion back to floating point if necessary.
10221 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
10222
10223 return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
10224}
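// For example, with the widening above an aligned, uniform zextload of an i8
// from the constant address space becomes a full 32-bit load followed by a
// zero-extend-in-reg of the low 8 bits:
//   (zextload i8, align 4) -> (zero_extend_inreg (load i32, align 4), i8)
// and a sextload becomes the same 32-bit load followed by sign_extend_inreg.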
10225
10226 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
10227 const SIMachineFunctionInfo &Info) {
10228 // TODO: Should check if the address can definitely not access stack.
10229 if (Info.isEntryFunction())
10230 return Info.getUserSGPRInfo().hasFlatScratchInit();
10231 return true;
10232}
10233
10234SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
10235 SDLoc DL(Op);
10236 LoadSDNode *Load = cast<LoadSDNode>(Op);
10237 ISD::LoadExtType ExtType = Load->getExtensionType();
10238 EVT MemVT = Load->getMemoryVT();
10239
10240 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
10241 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
10242 return SDValue();
10243
10244 // FIXME: Copied from PPC
10245 // First, load into 32 bits, then truncate to 1 bit.
10246
10247 SDValue Chain = Load->getChain();
10248 SDValue BasePtr = Load->getBasePtr();
10249 MachineMemOperand *MMO = Load->getMemOperand();
10250
10251 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10252
10253 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
10254 BasePtr, RealMemVT, MMO);
10255
10256 if (!MemVT.isVector()) {
10257 SDValue Ops[] = {
10258 DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
10259 NewLD.getValue(1)
10260 };
10261
10262 return DAG.getMergeValues(Ops, DL);
10263 }
10264
10265 SmallVector<SDValue, 3> Elts;
10266 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
10267 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
10268 DAG.getConstant(I, DL, MVT::i32));
10269
10270 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
10271 }
10272
10273 SDValue Ops[] = {
10274 DAG.getBuildVector(MemVT, DL, Elts),
10275 NewLD.getValue(1)
10276 };
10277
10278 return DAG.getMergeValues(Ops, DL);
10279 }
10280
10281 if (!MemVT.isVector())
10282 return SDValue();
10283
10284 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
10285 "Custom lowering for non-i32 vectors hasn't been implemented.");
10286
10287 Align Alignment = Load->getAlign();
10288 unsigned AS = Load->getAddressSpace();
10289 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
10290 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
10291 return SplitVectorLoad(Op, DAG);
10292 }
10293
10294 MachineFunction &MF = DAG.getMachineFunction();
10295 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10296 // If there is a possibility that flat instructions access scratch memory
10297 // then we need to use the same legalization rules we use for private.
10298 if (AS == AMDGPUAS::FLAT_ADDRESS &&
10299 !Subtarget->hasMultiDwordFlatScratchAddressing())
10300 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI) ?
10301 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
10302
10303 unsigned NumElements = MemVT.getVectorNumElements();
10304
10305 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10306 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
10307 if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
10308 if (MemVT.isPow2VectorType() ||
10309 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10310 return SDValue();
10311 return WidenOrSplitVectorLoad(Op, DAG);
10312 }
10313 // Non-uniform loads will be selected to MUBUF instructions, so they
10314 // have the same legalization requirements as global and private
10315 // loads.
10316 //
10317 }
10318
10319 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10320 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10321 AS == AMDGPUAS::GLOBAL_ADDRESS) {
10322 if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
10323 Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) &&
10324 Alignment >= Align(4) && NumElements < 32) {
10325 if (MemVT.isPow2VectorType() ||
10326 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10327 return SDValue();
10328 return WidenOrSplitVectorLoad(Op, DAG);
10329 }
10330 // Non-uniform loads will be selected to MUBUF instructions, so they
10331 // have the same legalization requirements as global and private
10332 // loads.
10333 //
10334 }
10335 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10336 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10337 AS == AMDGPUAS::GLOBAL_ADDRESS ||
10338 AS == AMDGPUAS::FLAT_ADDRESS) {
10339 if (NumElements > 4)
10340 return SplitVectorLoad(Op, DAG);
10341 // v3 loads not supported on SI.
10342 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10343 return WidenOrSplitVectorLoad(Op, DAG);
10344
10345 // v3 and v4 loads are supported for private and global memory.
10346 return SDValue();
10347 }
10348 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10349 // Depending on the setting of the private_element_size field in the
10350 // resource descriptor, we can only make private accesses up to a certain
10351 // size.
10352 switch (Subtarget->getMaxPrivateElementSize()) {
10353 case 4: {
10354 SDValue Ops[2];
10355 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
10356 return DAG.getMergeValues(Ops, DL);
10357 }
10358 case 8:
10359 if (NumElements > 2)
10360 return SplitVectorLoad(Op, DAG);
10361 return SDValue();
10362 case 16:
10363 // Same as global/flat
10364 if (NumElements > 4)
10365 return SplitVectorLoad(Op, DAG);
10366 // v3 loads not supported on SI.
10367 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10368 return WidenOrSplitVectorLoad(Op, DAG);
10369
10370 return SDValue();
10371 default:
10372 llvm_unreachable("unsupported private_element_size");
10373 }
10374 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10375 unsigned Fast = 0;
10376 auto Flags = Load->getMemOperand()->getFlags();
10377 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
10378 Load->getAlign(), Flags, &Fast) &&
10379 Fast > 1)
10380 return SDValue();
10381
10382 if (MemVT.isVector())
10383 return SplitVectorLoad(Op, DAG);
10384 }
10385
10386 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10387 MemVT, *Load->getMemOperand())) {
10388 SDValue Ops[2];
10389 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
10390 return DAG.getMergeValues(Ops, DL);
10391 }
10392
10393 return SDValue();
10394}
10395
10396SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
10397 EVT VT = Op.getValueType();
10398 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
10399 VT.getSizeInBits() == 512)
10400 return splitTernaryVectorOp(Op, DAG);
10401
10402 assert(VT.getSizeInBits() == 64);
10403
10404 SDLoc DL(Op);
10405 SDValue Cond = Op.getOperand(0);
10406
10407 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
10408 SDValue One = DAG.getConstant(1, DL, MVT::i32);
10409
10410 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10411 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
10412
10413 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
10414 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
10415
10416 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
10417
10418 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
10419 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
10420
10421 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
10422
10423 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
10424 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
10425}
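// In effect, a 64-bit select is decomposed into two 32-bit selects on the
// split halves, e.g.
//   (select i1 %c, i64 %a, i64 %b)
//     -> lo = select %c, %a.lo, %b.lo
//        hi = select %c, %a.hi, %b.hi
//        result = bitcast (build_vector lo, hi) to i64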
10426
10427// Catch division cases where we can use shortcuts with rcp and rsq
10428// instructions.
10429SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
10430 SelectionDAG &DAG) const {
10431 SDLoc SL(Op);
10432 SDValue LHS = Op.getOperand(0);
10433 SDValue RHS = Op.getOperand(1);
10434 EVT VT = Op.getValueType();
10435 const SDNodeFlags Flags = Op->getFlags();
10436
10437 bool AllowInaccurateRcp = Flags.hasApproximateFuncs() ||
10438 DAG.getTarget().Options.UnsafeFPMath;
10439
10440 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
10441 // Without !fpmath accuracy information, we can't do more because we don't
10442 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
10443 // f16 is always accurate enough
10444 if (!AllowInaccurateRcp && VT != MVT::f16)
10445 return SDValue();
10446
10447 if (CLHS->isExactlyValue(1.0)) {
10448 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
10449 // the CI documentation they have a worst-case error of 1 ulp.
10450 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
10451 // use it as long as we aren't trying to use denormals.
10452 //
10453 // v_rcp_f16 and v_rsq_f16 DO support denormals, with a worst-case error of 0.51 ulp.
10454
10455 // 1.0 / sqrt(x) -> rsq(x)
10456
10457 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
10458 // error seems really high at 2^29 ULP.
10459 // 1.0 / x -> rcp(x)
10460 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10461 }
10462
10463 // Same as for 1.0, but expand the sign out of the constant.
10464 if (CLHS->isExactlyValue(-1.0)) {
10465 // -1.0 / x -> rcp (fneg x)
10466 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
10467 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
10468 }
10469 }
10470
10471 // For f16 require afn or arcp.
10472 // For f32 require afn.
10473 if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
10474 return SDValue();
10475
10476 // Turn into multiply by the reciprocal.
10477 // x / y -> x * (1.0 / y)
10478 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10479 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
10480}
10481
10482SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
10483 SelectionDAG &DAG) const {
10484 SDLoc SL(Op);
10485 SDValue X = Op.getOperand(0);
10486 SDValue Y = Op.getOperand(1);
10487 EVT VT = Op.getValueType();
10488 const SDNodeFlags Flags = Op->getFlags();
10489
10490 bool AllowInaccurateDiv = Flags.hasApproximateFuncs() ||
10491 DAG.getTarget().Options.UnsafeFPMath;
10492 if (!AllowInaccurateDiv)
10493 return SDValue();
10494
10495 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
10496 SDValue One = DAG.getConstantFP(1.0, SL, VT);
10497
10498 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
10499 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10500
10501 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
10502 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10503 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
10504 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
10505 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
10506 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
10507}
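// The sequence above is two Newton-Raphson refinements of the hardware
// reciprocal followed by one refinement of the quotient:
//   r = rcp(y)
//   r = r + r * (1 - y * r)        // Tmp0, first fma pair
//   r = r + r * (1 - y * r)        // Tmp1, second fma pair
//   q = x * r
//   q = q + r * (x - y * q)        // Tmp2, final fma
// which is an adequate approximation of x / y when afn/unsafe math allows it.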
10508
10509static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10510 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
10511 SDNodeFlags Flags) {
10512 if (GlueChain->getNumValues() <= 1) {
10513 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
10514 }
10515
10516 assert(GlueChain->getNumValues() == 3);
10517
10518 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10519 switch (Opcode) {
10520 default: llvm_unreachable("no chain equivalent for opcode");
10521 case ISD::FMUL:
10522 Opcode = AMDGPUISD::FMUL_W_CHAIN;
10523 break;
10524 }
10525
10526 return DAG.getNode(Opcode, SL, VTList,
10527 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
10528 Flags);
10529}
10530
10531static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10532 EVT VT, SDValue A, SDValue B, SDValue C,
10533 SDValue GlueChain, SDNodeFlags Flags) {
10534 if (GlueChain->getNumValues() <= 1) {
10535 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
10536 }
10537
10538 assert(GlueChain->getNumValues() == 3);
10539
10540 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10541 switch (Opcode) {
10542 default: llvm_unreachable("no chain equivalent for opcode");
10543 case ISD::FMA:
10544 Opcode = AMDGPUISD::FMA_W_CHAIN;
10545 break;
10546 }
10547
10548 return DAG.getNode(Opcode, SL, VTList,
10549 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
10550 Flags);
10551}
10552
10553SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
10554 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10555 return FastLowered;
10556
10557 SDLoc SL(Op);
10558 SDValue Src0 = Op.getOperand(0);
10559 SDValue Src1 = Op.getOperand(1);
10560
10561 SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
10562 SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
10563
10564 SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
10565 SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
10566
10567 SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
10568 SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
10569
10570 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
10571}
10572
10573// Faster 2.5 ULP division that does not support denormals.
10574SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
10575 SDNodeFlags Flags = Op->getFlags();
10576 SDLoc SL(Op);
10577 SDValue LHS = Op.getOperand(1);
10578 SDValue RHS = Op.getOperand(2);
10579
10580 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
10581
10582 const APFloat K0Val(0x1p+96f);
10583 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
10584
10585 const APFloat K1Val(0x1p-32f);
10586 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
10587
10588 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10589
10590 EVT SetCCVT =
10591 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
10592
10593 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
10594
10595 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
10596
10597 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
10598
10599 // rcp does not support denormals.
10600 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
10601
10602 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
10603
10604 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
10605}
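// The scaling above works around rcp's limitations: when |rhs| exceeds 2^96
// the denominator is pre-scaled by 2^-32 and the same factor multiplies the
// final product, using the identity
//   lhs / rhs = s * (lhs * rcp(rhs * s))   with s = 2^-32, otherwise s = 1.0
// so rcp's input and result stay away from the overflow and denormal ranges
// it cannot handle.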
10606
10607// Returns immediate value for setting the F32 denorm mode when using the
10608// S_DENORM_MODE instruction.
10609 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
10610 const SIMachineFunctionInfo *Info,
10611 const GCNSubtarget *ST) {
10612 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
10613 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
10614 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
10615 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
10616}
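// The immediate packs both denormal controls: the low two bits hold the
// single-precision (FP32) mode and the next two bits hold the FP64/FP16
// mode, hence SPDenormMode | (DPDenormModeDefault << 2) above.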
10617
10618SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
10619 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10620 return FastLowered;
10621
10622 // The selection matcher assumes anything with a chain selecting to a
10623 // mayRaiseFPException machine instruction. Since we're introducing a chain
10624 // here, we need to explicitly report nofpexcept for the regular fdiv
10625 // lowering.
10626 SDNodeFlags Flags = Op->getFlags();
10627 Flags.setNoFPExcept(true);
10628
10629 SDLoc SL(Op);
10630 SDValue LHS = Op.getOperand(0);
10631 SDValue RHS = Op.getOperand(1);
10632
10633 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10634
10635 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
10636
10637 SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
10638 {RHS, RHS, LHS}, Flags);
10639 SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
10640 {LHS, RHS, LHS}, Flags);
10641
10642 // Denominator is scaled to not be denormal, so using rcp is ok.
10643 SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
10644 DenominatorScaled, Flags);
10645 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
10646 DenominatorScaled, Flags);
10647
10648 using namespace AMDGPU::Hwreg;
10649 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
10650 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
10651
10652 const MachineFunction &MF = DAG.getMachineFunction();
10653 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
10654 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
10655
10656 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
10657 const bool HasDynamicDenormals =
10658 (DenormMode.Input == DenormalMode::Dynamic) ||
10659 (DenormMode.Output == DenormalMode::Dynamic);
10660
10661 SDValue SavedDenormMode;
10662
10663 if (!PreservesDenormals) {
10664 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
10665 // lowering. The chain dependence is insufficient, and we need glue. We do
10666 // not need the glue variants in a strictfp function.
10667
10668 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
10669
10670 SDValue Glue = DAG.getEntryNode();
10671 if (HasDynamicDenormals) {
10672 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
10673 DAG.getVTList(MVT::i32, MVT::Glue),
10674 {BitField, Glue});
10675 SavedDenormMode = SDValue(GetReg, 0);
10676
10677 Glue = DAG.getMergeValues(
10678 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
10679 }
10680
10681 SDNode *EnableDenorm;
10682 if (Subtarget->hasDenormModeInst()) {
10683 const SDValue EnableDenormValue =
10684 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
10685
10686 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
10687 EnableDenormValue)
10688 .getNode();
10689 } else {
10690 const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
10691 SL, MVT::i32);
10692 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
10693 {EnableDenormValue, BitField, Glue});
10694 }
10695
10696 SDValue Ops[3] = {
10697 NegDivScale0,
10698 SDValue(EnableDenorm, 0),
10699 SDValue(EnableDenorm, 1)
10700 };
10701
10702 NegDivScale0 = DAG.getMergeValues(Ops, SL);
10703 }
10704
10705 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
10706 ApproxRcp, One, NegDivScale0, Flags);
10707
10708 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
10709 ApproxRcp, Fma0, Flags);
10710
10711 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
10712 Fma1, Fma1, Flags);
10713
10714 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
10715 NumeratorScaled, Mul, Flags);
10716
10717 SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32,
10718 Fma2, Fma1, Mul, Fma2, Flags);
10719
10720 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
10721 NumeratorScaled, Fma3, Flags);
10722
10723 if (!PreservesDenormals) {
10724 SDNode *DisableDenorm;
10725 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
10726 const SDValue DisableDenormValue = getSPDenormModeValue(
10727 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
10728
10729 DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other,
10730 Fma4.getValue(1), DisableDenormValue,
10731 Fma4.getValue(2)).getNode();
10732 } else {
10733 assert(HasDynamicDenormals == (bool)SavedDenormMode);
10734 const SDValue DisableDenormValue =
10735 HasDynamicDenormals
10736 ? SavedDenormMode
10737 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
10738
10739 DisableDenorm = DAG.getMachineNode(
10740 AMDGPU::S_SETREG_B32, SL, MVT::Other,
10741 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
10742 }
10743
10744 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
10745 SDValue(DisableDenorm, 0), DAG.getRoot());
10746 DAG.setRoot(OutputChain);
10747 }
10748
10749 SDValue Scale = NumeratorScaled.getValue(1);
10750 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
10751 {Fma4, Fma1, Fma3, Scale}, Flags);
10752
10753 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
10754}
10755
10756SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
10757 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
10758 return FastLowered;
10759
10760 SDLoc SL(Op);
10761 SDValue X = Op.getOperand(0);
10762 SDValue Y = Op.getOperand(1);
10763
10764 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
10765
10766 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
10767
10768 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
10769
10770 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
10771
10772 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
10773
10774 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
10775
10776 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
10777
10778 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
10779
10780 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
10781
10782 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
10783 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
10784
10785 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
10786 NegDivScale0, Mul, DivScale1);
10787
10788 SDValue Scale;
10789
10790 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
10791 // Workaround a hardware bug on SI where the condition output from div_scale
10792 // is not usable.
10793
10794 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
10795
10796 // Figure out which scale to use for div_fmas.
10797 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
10798 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
10799 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
10800 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
10801
10802 SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
10803 SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
10804
10805 SDValue Scale0Hi
10806 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
10807 SDValue Scale1Hi
10808 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
10809
10810 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
10811 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
10812 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
10813 } else {
10814 Scale = DivScale1.getValue(1);
10815 }
10816
10817 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
10818 Fma4, Fma3, Mul, Scale);
10819
10820 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
10821}
10822
10823SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
10824 EVT VT = Op.getValueType();
10825
10826 if (VT == MVT::f32)
10827 return LowerFDIV32(Op, DAG);
10828
10829 if (VT == MVT::f64)
10830 return LowerFDIV64(Op, DAG);
10831
10832 if (VT == MVT::f16)
10833 return LowerFDIV16(Op, DAG);
10834
10835 llvm_unreachable("Unexpected type for fdiv");
10836}
10837
10838SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
10839 SDLoc dl(Op);
10840 SDValue Val = Op.getOperand(0);
10841 EVT VT = Val.getValueType();
10842 EVT ResultExpVT = Op->getValueType(1);
10843 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
10844
10845 SDValue Mant = DAG.getNode(
10846 ISD::INTRINSIC_WO_CHAIN, dl, VT,
10847 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
10848
10849 SDValue Exp = DAG.getNode(
10850 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
10851 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
10852
10853 if (Subtarget->hasFractBug()) {
10854 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
10855 SDValue Inf = DAG.getConstantFP(
10856 APFloat::getInf(SelectionDAG::EVTToAPFloatSemantics(VT)), dl, VT);
10857
10858 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
10859 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
10860 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
10861 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
10862 }
10863
10864 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
10865 return DAG.getMergeValues({Mant, CastExp}, dl);
10866}
10867
10868SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
10869 SDLoc DL(Op);
10870 StoreSDNode *Store = cast<StoreSDNode>(Op);
10871 EVT VT = Store->getMemoryVT();
10872
10873 if (VT == MVT::i1) {
10874 return DAG.getTruncStore(Store->getChain(), DL,
10875 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
10876 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
10877 }
10878
10879 assert(VT.isVector() &&
10880 Store->getValue().getValueType().getScalarType() == MVT::i32);
10881
10882 unsigned AS = Store->getAddressSpace();
10883 if (Subtarget->hasLDSMisalignedBug() &&
10884 AS == AMDGPUAS::FLAT_ADDRESS &&
10885 Store->getAlign().value() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
10886 return SplitVectorStore(Op, DAG);
10887 }
10888
10889 MachineFunction &MF = DAG.getMachineFunction();
10890 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10891 // If there is a possibility that flat instructions access scratch memory
10892 // then we need to use the same legalization rules we use for private.
10893 if (AS == AMDGPUAS::FLAT_ADDRESS &&
10894 !Subtarget->hasMultiDwordFlatScratchAddressing())
10895 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI) ?
10896 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
10897
10898 unsigned NumElements = VT.getVectorNumElements();
10899 if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
10900 AS == AMDGPUAS::FLAT_ADDRESS) {
10901 if (NumElements > 4)
10902 return SplitVectorStore(Op, DAG);
10903 // v3 stores not supported on SI.
10904 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10905 return SplitVectorStore(Op, DAG);
10906
10907 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10908 VT, *Store->getMemOperand()))
10909 return expandUnalignedStore(Store, DAG);
10910
10911 return SDValue();
10912 }
10913 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10914 switch (Subtarget->getMaxPrivateElementSize()) {
10915 case 4:
10916 return scalarizeVectorStore(Store, DAG);
10917 case 8:
10918 if (NumElements > 2)
10919 return SplitVectorStore(Op, DAG);
10920 return SDValue();
10921 case 16:
10922 if (NumElements > 4 ||
10923 (NumElements == 3 && !Subtarget->enableFlatScratch()))
10924 return SplitVectorStore(Op, DAG);
10925 return SDValue();
10926 default:
10927 llvm_unreachable("unsupported private_element_size");
10928 }
10929 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10930 unsigned Fast = 0;
10931 auto Flags = Store->getMemOperand()->getFlags();
10932 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
10933 Store->getAlign(), Flags, &Fast) &&
10934 Fast > 1)
10935 return SDValue();
10936
10937 if (VT.isVector())
10938 return SplitVectorStore(Op, DAG);
10939
10940 return expandUnalignedStore(Store, DAG);
10941 }
10942
10943 // Probably an invalid store. If so we'll end up emitting a selection error.
10944 return SDValue();
10945}
10946
10947// Avoid the full correct expansion for f32 sqrt when promoting from f16.
10948SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
10949 SDLoc SL(Op);
10950 assert(!Subtarget->has16BitInsts());
10951 SDNodeFlags Flags = Op->getFlags();
10952 SDValue Ext =
10953 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
10954
10955 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
10956 SDValue Sqrt =
10957 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
10958
10959 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
10960 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
10961}
10962
10963SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
10964 SDLoc DL(Op);
10965 SDNodeFlags Flags = Op->getFlags();
10966 MVT VT = Op.getValueType().getSimpleVT();
10967 const SDValue X = Op.getOperand(0);
10968
10969 if (allowApproxFunc(DAG, Flags)) {
10970 // Instruction is 1ulp but ignores denormals.
10971 return DAG.getNode(
10972 ISD::INTRINSIC_WO_CHAIN, DL, VT,
10973 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
10974 }
10975
10976 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
10977 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
10978
10979 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
10980
10981 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
10982
10983 SDValue SqrtX =
10984 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
10985
10986 SDValue SqrtS;
10987 if (needsDenormHandlingF32(DAG, X, Flags)) {
10988 SDValue SqrtID =
10989 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
10990 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
10991
10992 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
10993 SDValue SqrtSNextDownInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
10994 DAG.getConstant(-1, DL, MVT::i32));
10995 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
10996
10997 SDValue NegSqrtSNextDown =
10998 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
10999
11000 SDValue SqrtVP =
11001 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
11002
11003 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11004 DAG.getConstant(1, DL, MVT::i32));
11005 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
11006
11007 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
11008 SDValue SqrtVS =
11009 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
11010
11011 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
11012 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
11013
11014 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
11015 Flags);
11016
11017 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
11018 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
11019 Flags);
11020 } else {
11021 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
11022
11023 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
11024
11025 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
11026 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
11027 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
11028
11029 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
11030 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
11031 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
11032
11033 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
11034 SDValue SqrtD =
11035 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
11036 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
11037 }
11038
11039 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
11040
11041 SDValue ScaledDown =
11042 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
11043
11044 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
11045 SDValue IsZeroOrInf =
11046 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11047 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11048
11049 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
11050}
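// The scaling in the f32 path relies on the identity
//   sqrt(x) = sqrt(x * 2^32) * 2^-16
// which is applied only when x < 2^-96, so very small inputs are moved into
// a range where the approximation and its refinement stay accurate.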
11051
11052SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
11053 // For double type, the SQRT and RSQ instructions don't have required
11054 // precision, we apply Goldschmidt's algorithm to improve the result:
11055 //
11056 // y0 = rsq(x)
11057 // g0 = x * y0
11058 // h0 = 0.5 * y0
11059 //
11060 // r0 = 0.5 - h0 * g0
11061 // g1 = g0 * r0 + g0
11062 // h1 = h0 * r0 + h0
11063 //
11064 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
11065 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
11066 // h2 = h1 * r1 + h1
11067 //
11068 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
11069 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
11070 //
11071 // sqrt(x) = g3
11072
11073 SDNodeFlags Flags = Op->getFlags();
11074
11075 SDLoc DL(Op);
11076
11077 SDValue X = Op.getOperand(0);
11078 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
11079
11080 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
11081
11082 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
11083
11084 // Scale up input if it is too small.
11085 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
11086 SDValue ScaleUp =
11087 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
11088 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
11089
11090 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
11091
11092 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
11093
11094 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
11095 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
11096
11097 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
11098 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
11099
11100 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
11101
11102 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
11103
11104 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
11105 SDValue SqrtD0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
11106
11107 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
11108
11109 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
11110 SDValue SqrtD1 =
11111 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
11112
11113 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
11114
11115 SDValue ScaleDownFactor = DAG.getConstant(-128, DL, MVT::i32);
11116 SDValue ScaleDown =
11117 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
11118 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
11119
11120 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
11121 // with finite only or nsz because rsq(+/-0) = +/-inf
11122
11123 // TODO: Check for DAZ and expand to subnormals
11124 SDValue IsZeroOrInf =
11125 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11126 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11127
11128 // If x is +INF, +0, or -0, use its original value
11129 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
11130 Flags);
11131}
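// The f64 path uses the same rescaling idea via exponent manipulation:
//   sqrt(x) = sqrt(ldexp(x, 256)) * 2^-128
// applied only when x < 2^-767, so subnormal and near-subnormal inputs are
// brought into range before the Goldschmidt iteration runs.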
11132
11133SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
11134 SDLoc DL(Op);
11135 EVT VT = Op.getValueType();
11136 SDValue Arg = Op.getOperand(0);
11137 SDValue TrigVal;
11138
11139 // Propagate fast-math flags so that the multiply we introduce can be folded
11140 // if Arg is already the result of a multiply by constant.
11141 auto Flags = Op->getFlags();
11142
11143 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
11144
11145 if (Subtarget->hasTrigReducedRange()) {
11146 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11147 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
11148 } else {
11149 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11150 }
11151
11152 switch (Op.getOpcode()) {
11153 case ISD::FCOS:
11154 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
11155 case ISD::FSIN:
11156 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
11157 default:
11158 llvm_unreachable("Wrong trig opcode");
11159 }
11160}
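// The hardware trig units take their operand in units of full revolutions,
// so the argument is first multiplied by 1/(2*pi):
//   sin(x) = SIN_HW(x * 1/(2*pi))
// On subtargets with the reduced-range requirement the product is
// additionally wrapped into [0, 1) with FRACT before being handed to the
// hardware instruction.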
11161
11162SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
11163 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
11164 assert(AtomicNode->isCompareAndSwap());
11165 unsigned AS = AtomicNode->getAddressSpace();
11166
11167 // No custom lowering required for local address space
11168 if (!AMDGPU::isFlatGlobalAddrSpace(AS))
11169 return Op;
11170
11171 // Non-local address space requires custom lowering for atomic compare
11172 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
11173 SDLoc DL(Op);
11174 SDValue ChainIn = Op.getOperand(0);
11175 SDValue Addr = Op.getOperand(1);
11176 SDValue Old = Op.getOperand(2);
11177 SDValue New = Op.getOperand(3);
11178 EVT VT = Op.getValueType();
11179 MVT SimpleVT = VT.getSimpleVT();
11180 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
11181
11182 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
11183 SDValue Ops[] = { ChainIn, Addr, NewOld };
11184
11185 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
11186 Ops, VT, AtomicNode->getMemOperand());
11187}
11188
11189//===----------------------------------------------------------------------===//
11190// Custom DAG optimizations
11191//===----------------------------------------------------------------------===//
11192
11193SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
11194 DAGCombinerInfo &DCI) const {
11195 EVT VT = N->getValueType(0);
11196 EVT ScalarVT = VT.getScalarType();
11197 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11198 return SDValue();
11199
11200 SelectionDAG &DAG = DCI.DAG;
11201 SDLoc DL(N);
11202
11203 SDValue Src = N->getOperand(0);
11204 EVT SrcVT = Src.getValueType();
11205
11206 // TODO: We could try to match extracting the higher bytes, which would be
11207 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
11208 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
11209 // about in practice.
11210 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11211 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
11212 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
11213 DCI.AddToWorklist(Cvt.getNode());
11214
11215 // For the f16 case, fold to a cast to f32 and then cast back to f16.
11216 if (ScalarVT != MVT::f32) {
11217 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
11218 DAG.getTargetConstant(0, DL, MVT::i32));
11219 }
11220 return Cvt;
11221 }
11222 }
11223
11224 return SDValue();
11225}
11226
11227SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
11228 DAGCombinerInfo &DCI) const {
11229 SDValue MagnitudeOp = N->getOperand(0);
11230 SDValue SignOp = N->getOperand(1);
11231 SelectionDAG &DAG = DCI.DAG;
11232 SDLoc DL(N);
11233
11234 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
11235 // lower half with a copy.
11236 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11237 if (MagnitudeOp.getValueType() == MVT::f64) {
11238 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
11239 SDValue MagLo =
11240 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11241 DAG.getConstant(0, DL, MVT::i32));
11242 SDValue MagHi =
11243 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11244 DAG.getConstant(1, DL, MVT::i32));
11245
11246 SDValue HiOp =
11247 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
11248
11249 SDValue Vector = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
11250
11251 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
11252 }
11253
11254 if (SignOp.getValueType() != MVT::f64)
11255 return SDValue();
11256
11257 // Reduce width of sign operand, we only need the highest bit.
11258 //
11259 // fcopysign f64:x, f64:y ->
11260 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
11261 // TODO: In some cases it might make sense to go all the way to f16.
11262 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
11263 SDValue SignAsF32 =
11264 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
11265 DAG.getConstant(1, DL, MVT::i32));
11266
11267 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
11268 SignAsF32);
11269}
11270
11271// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
11272// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
11273// bits
11274
11275// This is a variant of
11276// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
11277//
11278 // The normal DAG combiner will do this, but only if the add has one use, since
11279 // doing it with multiple uses would increase the number of instructions.
11280//
11281// This prevents us from seeing a constant offset that can be folded into a
11282// memory instruction's addressing mode. If we know the resulting add offset of
11283// a pointer can be folded into an addressing offset, we can replace the pointer
11284// operand with the add of new constant offset. This eliminates one of the uses,
11285// and may allow the remaining use to also be simplified.
11286//
11287SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
11288 unsigned AddrSpace,
11289 EVT MemVT,
11290 DAGCombinerInfo &DCI) const {
11291 SDValue N0 = N->getOperand(0);
11292 SDValue N1 = N->getOperand(1);
11293
11294 // We only do this to handle cases where it's profitable when there are
11295 // multiple uses of the add, so defer to the standard combine.
11296 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
11297 N0->hasOneUse())
11298 return SDValue();
11299
11300 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
11301 if (!CN1)
11302 return SDValue();
11303
11304 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
11305 if (!CAdd)
11306 return SDValue();
11307
11308 SelectionDAG &DAG = DCI.DAG;
11309
11310 if (N0->getOpcode() == ISD::OR &&
11311 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
11312 return SDValue();
11313
11314 // If the resulting offset is too large, we can't fold it into the
11315 // addressing mode offset.
11316 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
11317 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
11318
11319 AddrMode AM;
11320 AM.HasBaseReg = true;
11321 AM.BaseOffs = Offset.getSExtValue();
11322 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
11323 return SDValue();
11324
11325 SDLoc SL(N);
11326 EVT VT = N->getValueType(0);
11327
11328 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
11329 SDValue COffset = DAG.getConstant(Offset, SL, VT);
11330
11331 SDNodeFlags Flags;
11332 Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
11333 (N0.getOpcode() == ISD::OR ||
11334 N0->getFlags().hasNoUnsignedWrap()));
11335
11336 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
11337}
11338
11339 /// MemSDNode::getBasePtr() does not work for intrinsics, whose pointer operand
11340 /// is offset past the chain and intrinsic ID. Theoretically we would also need
11341 /// to check the specific intrinsic, but they all place the pointer operand first.
11342static unsigned getBasePtrIndex(const MemSDNode *N) {
11343 switch (N->getOpcode()) {
11344 case ISD::STORE:
11345 case ISD::INTRINSIC_W_CHAIN:
11346 case ISD::INTRINSIC_VOID:
11347 return 2;
11348 default:
11349 return 1;
11350 }
11351}
11352
11353SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
11354 DAGCombinerInfo &DCI) const {
11355 SelectionDAG &DAG = DCI.DAG;
11356 SDLoc SL(N);
11357
11358 unsigned PtrIdx = getBasePtrIndex(N);
11359 SDValue Ptr = N->getOperand(PtrIdx);
11360
11361 // TODO: We could also do this for multiplies.
11362 if (Ptr.getOpcode() == ISD::SHL) {
11363 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
11364 N->getMemoryVT(), DCI);
11365 if (NewPtr) {
11366 SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
11367
11368 NewOps[PtrIdx] = NewPtr;
11369 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
11370 }
11371 }
11372
11373 return SDValue();
11374}
11375
11376static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
11377 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11378 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11379 (Opc == ISD::XOR && Val == 0);
11380}
11381
11382// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
11383// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
11384// integer combine opportunities since most 64-bit operations are decomposed
11385// this way. TODO: We won't want this for SALU especially if it is an inline
11386// immediate.
11387SDValue SITargetLowering::splitBinaryBitConstantOp(
11388 DAGCombinerInfo &DCI,
11389 const SDLoc &SL,
11390 unsigned Opc, SDValue LHS,
11391 const ConstantSDNode *CRHS) const {
11392 uint64_t Val = CRHS->getZExtValue();
11393 uint32_t ValLo = Lo_32(Val);
11394 uint32_t ValHi = Hi_32(Val);
11395 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11396
11397 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
11398 bitOpWithConstantIsReducible(Opc, ValHi)) ||
11399 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
11400 // If we need to materialize a 64-bit immediate, it will be split up later
11401 // anyway. Avoid creating the harder to understand 64-bit immediate
11402 // materialization.
11403 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
11404 }
11405
11406 return SDValue();
11407}
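// For example, (and i64:x, 0x00000000ffffffff) splits into
//   lo = (and x.lo, 0xffffffff)  -> folds away to x.lo
//   hi = (and x.hi, 0x00000000)  -> folds away to 0
// so the 64-bit constant never has to be materialized.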
11408
11409 static bool isBoolSGPR(SDValue V) {
11410 if (V.getValueType() != MVT::i1)
11411 return false;
11412 switch (V.getOpcode()) {
11413 default:
11414 break;
11415 case ISD::SETCC:
11416 case AMDGPUISD::FP_CLASS:
11417 return true;
11418 case ISD::AND:
11419 case ISD::OR:
11420 case ISD::XOR:
11421 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
11422 }
11423 return false;
11424}
11425
11426// If a constant has all zeroes or all ones within each byte return it.
11427// Otherwise return 0.
11428 static uint32_t getConstantPermuteMask(uint32_t C) {
11429 // 0xff for any zero byte in the mask
11430 uint32_t ZeroByteMask = 0;
11431 if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
11432 if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
11433 if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
11434 if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
11435 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
11436 if ((NonZeroByteMask & C) != NonZeroByteMask)
11437 return 0; // Partial bytes selected.
11438 return C;
11439}
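// For example:
//   0x00ff00ff -> returned unchanged (every byte is all-zeros or all-ones)
//   0x00000f00 -> 0 (byte 1 is only partially selected)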
11440
11441// Check if a node selects whole bytes from its operand 0 starting at a byte
11442 // boundary while masking the rest. Returns the select mask as used by
11443 // v_perm_b32, or ~0 if it did not succeed.
11444// Note byte select encoding:
11445// value 0-3 selects corresponding source byte;
11446// value 0xc selects zero;
11447// value 0xff selects 0xff.
11448 static uint32_t getPermuteMask(SDValue V) {
11449 assert(V.getValueSizeInBits() == 32);
11450
11451 if (V.getNumOperands() != 2)
11452 return ~0;
11453
11454 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
11455 if (!N1)
11456 return ~0;
11457
11458 uint32_t C = N1->getZExtValue();
11459
11460 switch (V.getOpcode()) {
11461 default:
11462 break;
11463 case ISD::AND:
11464 if (uint32_t ConstMask = getConstantPermuteMask(C))
11465 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
11466 break;
11467
11468 case ISD::OR:
11469 if (uint32_t ConstMask = getConstantPermuteMask(C))
11470 return (0x03020100 & ~ConstMask) | ConstMask;
11471 break;
11472
11473 case ISD::SHL:
11474 if (C % 8)
11475 return ~0;
11476
11477 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
11478
11479 case ISD::SRL:
11480 if (C % 8)
11481 return ~0;
11482
11483 return uint32_t(0x0c0c0c0c03020100ull >> C);
11484 }
11485
11486 return ~0;
11487}
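// Examples of the resulting v_perm_b32 selectors:
//   (and x, 0x0000ffff) -> 0x0c0c0100  (upper bytes zeroed, lower bytes kept)
//   (shl x, 16)         -> 0x01000c0c  (low bytes moved up, zeros below)
//   (srl x, 16)         -> 0x0c0c0302  (high bytes moved down, zeros above)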
11488
11489SDValue SITargetLowering::performAndCombine(SDNode *N,
11490 DAGCombinerInfo &DCI) const {
11491 if (DCI.isBeforeLegalize())
11492 return SDValue();
11493
11494 SelectionDAG &DAG = DCI.DAG;
11495 EVT VT = N->getValueType(0);
11496 SDValue LHS = N->getOperand(0);
11497 SDValue RHS = N->getOperand(1);
11498
11499
11500 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
11501 if (VT == MVT::i64 && CRHS) {
11502 if (SDValue Split
11503 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
11504 return Split;
11505 }
11506
11507 if (CRHS && VT == MVT::i32) {
11508 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
11509 // nb = number of trailing zeroes in mask
11510 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
11511 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
11512 uint64_t Mask = CRHS->getZExtValue();
11513 unsigned Bits = llvm::popcount(Mask);
11514 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
11515 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
11516 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
11517 unsigned Shift = CShift->getZExtValue();
11518 unsigned NB = CRHS->getAPIntValue().countr_zero();
11519 unsigned Offset = NB + Shift;
11520 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
11521 SDLoc SL(N);
11522 SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
11523 LHS->getOperand(0),
11524 DAG.getConstant(Offset, SL, MVT::i32),
11525 DAG.getConstant(Bits, SL, MVT::i32));
11526 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
11527 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
11528 DAG.getValueType(NarrowVT));
11529 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
11530 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
11531 return Shl;
11532 }
11533 }
11534 }
11535
11536 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
11537 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
11538 isa<ConstantSDNode>(LHS.getOperand(2))) {
11539 uint32_t Sel = getConstantPermuteMask(Mask);
11540 if (!Sel)
11541 return SDValue();
11542
11543 // Select 0xc for all zero bytes
11544 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
11545 SDLoc DL(N);
11546 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
11547 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
11548 }
11549 }
11550
11551 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
11552 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
11553 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
11554 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11555 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
11556
11557 SDValue X = LHS.getOperand(0);
11558 SDValue Y = RHS.getOperand(0);
11559 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
11560 !isTypeLegal(X.getValueType()))
11561 return SDValue();
11562
11563 if (LCC == ISD::SETO) {
11564 if (X != LHS.getOperand(1))
11565 return SDValue();
11566
11567 if (RCC == ISD::SETUNE) {
11568 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
11569 if (!C1 || !C1->isInfinity() || C1->isNegative())
11570 return SDValue();
11571
11572 const uint32_t Mask = SIInstrFlags::N_NORMAL |
11573 SIInstrFlags::N_SUBNORMAL |
11574 SIInstrFlags::N_ZERO |
11575 SIInstrFlags::P_ZERO |
11576 SIInstrFlags::P_SUBNORMAL |
11577 SIInstrFlags::P_NORMAL;
11578
11579 static_assert(((~(SIInstrFlags::S_NAN |
11580 SIInstrFlags::Q_NAN |
11581 SIInstrFlags::N_INFINITY |
11582 SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
11583 "mask not equal");
11584
11585 SDLoc DL(N);
11586 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
11587 X, DAG.getConstant(Mask, DL, MVT::i32));
11588 }
11589 }
11590 }
11591
11592 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
11593 std::swap(LHS, RHS);
11594
11595 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
11596 RHS.hasOneUse()) {
11597 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11598 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
11599 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
11600 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
11601 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
11602 (RHS.getOperand(0) == LHS.getOperand(0) &&
11603 LHS.getOperand(0) == LHS.getOperand(1))) {
11604 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
11605 unsigned NewMask = LCC == ISD::SETO ?
11606 Mask->getZExtValue() & ~OrdMask :
11607 Mask->getZExtValue() & OrdMask;
11608
11609 SDLoc DL(N);
11610 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
11611 DAG.getConstant(NewMask, DL, MVT::i32));
11612 }
11613 }
11614
11615 if (VT == MVT::i32 &&
11616 (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
11617 // and x, (sext cc from i1) => select cc, x, 0
11618 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
11619 std::swap(LHS, RHS);
11620 if (isBoolSGPR(RHS.getOperand(0)))
11621 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
11622 LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
11623 }
11624
11625 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
11626 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11627 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
11628 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
11629 uint32_t LHSMask = getPermuteMask(LHS);
11630 uint32_t RHSMask = getPermuteMask(RHS);
11631 if (LHSMask != ~0u && RHSMask != ~0u) {
11632 // Canonicalize the expression in an attempt to have fewer unique masks
11633 // and therefore fewer registers used to hold the masks.
11634 if (LHSMask > RHSMask) {
11635 std::swap(LHSMask, RHSMask);
11636 std::swap(LHS, RHS);
11637 }
11638
11639 // Select 0xc for each lane used from source operand. Zero has 0xc mask
11640 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
11641 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11642 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11643
11644 // Check if we need to combine values from two sources within a byte.
11645 if (!(LHSUsedLanes & RHSUsedLanes) &&
11646 // If we select high and lower word keep it for SDWA.
11647 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
11648 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
11649 // Each byte in each mask is either a selector value 0-3, or has higher
11650 // bits set: 0xff selects the constant 0xff and 0x0c selects zero.
11651 // If 0x0c appears in either mask that byte must stay 0x0c. Otherwise the
11652 // mask byte that is not 0xff wins. ANDing both masks gives the correct
11653 // result, except that bytes meant to be 0x0c must be forced back to 0x0c.
11654 uint32_t Mask = LHSMask & RHSMask;
11655 for (unsigned I = 0; I < 32; I += 8) {
11656 uint32_t ByteSel = 0xff << I;
11657 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
11658 Mask &= (0x0c << I) & 0xffffffff;
11659 }
11660
11661 // Add 4 to each active LHS lane. It will not affect any existing 0xff
11662 // or 0x0c.
11663 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
11664 SDLoc DL(N);
11665
11666 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
11667 LHS.getOperand(0), RHS.getOperand(0),
11668 DAG.getConstant(Sel, DL, MVT::i32));
11669 }
11670 }
11671 }
11672
11673 return SDValue();
11674}
11675
11676// A key component of v_perm is a mapping between byte position of the src
11677// operands, and the byte position of the dest. To provide such, we need: 1. the
11678// node that provides x byte of the dest of the OR, and 2. the byte of the node
11679// used to provide that x byte. calculateByteProvider finds which node provides
11680// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
11681 // and finds an ultimate src and byte position. For example, the supported
11682 // LoadCombine pattern for vector loads is as follows:
11683// t1
11684// or
11685// / \
11686// t2 t3
11687// zext shl
11688// | | \
11689// t4 t5 16
11690// or anyext
11691// / \ |
11692// t6 t7 t8
11693// srl shl or
11694// / | / \ / \
11695// t9 t10 t11 t12 t13 t14
11696// trunc* 8 trunc* 8 and and
11697// | | / | | \
11698// t15 t16 t17 t18 t19 t20
11699// trunc* 255 srl -256
11700// | / \
11701// t15 t15 16
11702//
11703// *In this example, the truncs are from i32->i16
11704//
11705// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
11706// respectively. calculateSrcByte would find (given node) -> ultimate src &
11707// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
11708// After finding the mapping, we can combine the tree into vperm t15, t16,
11709// 0x05000407
11710
11711// Find the source and byte position from a node.
11712// \p DestByte is the byte position of the dest of the or that the src
11713// ultimately provides. \p SrcIndex is the byte of the src that maps to this
11714 // byte of the dest of the or. \p Depth tracks how many recursive iterations we have
11715// performed.
11716static const std::optional<ByteProvider<SDValue>>
11717calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
11718 unsigned Depth = 0) {
11719 // We may need to recursively traverse a series of SRLs
11720 if (Depth >= 6)
11721 return std::nullopt;
11722
11723 if (Op.getValueSizeInBits() < 8)
11724 return std::nullopt;
11725
11726 if (Op.getValueType().isVector())
11727 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11728
11729 switch (Op->getOpcode()) {
11730 case ISD::TRUNCATE: {
11731 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11732 }
11733
11734 case ISD::SIGN_EXTEND:
11735 case ISD::ZERO_EXTEND:
11736 case ISD::SIGN_EXTEND_INREG: {
11737 SDValue NarrowOp = Op->getOperand(0);
11738 auto NarrowVT = NarrowOp.getValueType();
11739 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
11740 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11741 NarrowVT = VTSign->getVT();
11742 }
11743 if (!NarrowVT.isByteSized())
11744 return std::nullopt;
11745 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
11746
11747 if (SrcIndex >= NarrowByteWidth)
11748 return std::nullopt;
11749 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11750 }
11751
11752 case ISD::SRA:
11753 case ISD::SRL: {
11754 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11755 if (!ShiftOp)
11756 return std::nullopt;
11757
11758 uint64_t BitShift = ShiftOp->getZExtValue();
11759
11760 if (BitShift % 8 != 0)
11761 return std::nullopt;
11762
11763 SrcIndex += BitShift / 8;
11764
11765 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11766 }
11767
11768 default: {
11769 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11770 }
11771 }
11772 llvm_unreachable("fully handled switch");
11773}
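// Illustrative example (editorial, not from the original source): for
// Op = (srl X, 16) with SrcIndex = 0, the SRL case above advances SrcIndex
// to 2 and recurses into X; if X has no further handled opcode, the result
// is the provider {X, byte 2}.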
11774
11775// For a byte position in the result of an Or, traverse the tree and find the
11776// node (and the byte of the node) which ultimately provides this {Or,
11777// BytePosition}. \p Op is the operand we are currently examining. \p Index is
11778// the byte position of the Op that corresponds with the originally requested
11779 // byte of the Or. \p Depth tracks how many recursive iterations we have
11780// performed. \p StartingIndex is the originally requested byte of the Or
11781static const std::optional<ByteProvider<SDValue>>
11782calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
11783 unsigned StartingIndex = 0) {
11784 // Finding Src tree of RHS of or typically requires at least 1 additional
11785 // depth
11786 if (Depth > 6)
11787 return std::nullopt;
11788
11789 unsigned BitWidth = Op.getScalarValueSizeInBits();
11790 if (BitWidth % 8 != 0)
11791 return std::nullopt;
11792 if (Index > BitWidth / 8 - 1)
11793 return std::nullopt;
11794
11795 bool IsVec = Op.getValueType().isVector();
11796 switch (Op.getOpcode()) {
11797 case ISD::OR: {
11798 if (IsVec)
11799 return std::nullopt;
11800
11801 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
11802 StartingIndex);
11803 if (!RHS)
11804 return std::nullopt;
11805 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
11806 StartingIndex);
11807 if (!LHS)
11808 return std::nullopt;
11809 // A well formed Or will have two ByteProviders for each byte, one of which
11810 // is constant zero
11811 if (!LHS->isConstantZero() && !RHS->isConstantZero())
11812 return std::nullopt;
11813 if (!LHS || LHS->isConstantZero())
11814 return RHS;
11815 if (!RHS || RHS->isConstantZero())
11816 return LHS;
11817 return std::nullopt;
11818 }
11819
11820 case ISD::AND: {
11821 if (IsVec)
11822 return std::nullopt;
11823
11824 auto BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11825 if (!BitMaskOp)
11826 return std::nullopt;
11827
11828 uint32_t BitMask = BitMaskOp->getZExtValue();
11829 // Bits we expect for our StartingIndex
11830 uint32_t IndexMask = 0xFF << (Index * 8);
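// Illustrative example (editorial, not from the original source): for
// Index = 1, IndexMask is 0xFF00. An AND with 0x0000FFFF fully covers that
// byte, so we fall through to calculateSrcByte; an AND with 0x000000FF does
// not touch byte 1 at all, so that byte of the result is a constant zero.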
11831
11832 if ((IndexMask & BitMask) != IndexMask) {
11833 // If the result of the and partially provides the byte, then it
11834 // is not well formatted
11835 if (IndexMask & BitMask)
11836 return std::nullopt;
11837 return ByteProvider<SDValue>::getConstantZero();
11838 }
11839
11840 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
11841 }
11842
11843 case ISD::FSHR: {
11844 if (IsVec)
11845 return std::nullopt;
11846
11847 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
11848 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
11849 if (!ShiftOp || Op.getValueType().isVector())
11850 return std::nullopt;
11851
11852 uint64_t BitsProvided = Op.getValueSizeInBits();
11853 if (BitsProvided % 8 != 0)
11854 return std::nullopt;
11855
11856 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
11857 if (BitShift % 8)
11858 return std::nullopt;
11859
11860 uint64_t ConcatSizeInBytes = BitsProvided / 4;
11861 uint64_t ByteShift = BitShift / 8;
11862
11863 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
11864 uint64_t BytesProvided = BitsProvided / 8;
11865 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
11866 NewIndex %= BytesProvided;
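// Illustrative example (editorial, not from the original source): for
// fshr(X, Y, 8) on i32 and Index = 0, ByteShift = 1 and NewIndex = 1, which
// selects operand Y and byte 1; for Index = 3, NewIndex = 4 selects operand
// X and byte 0, matching (X << 24) | (Y >> 8).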
11867 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
11868 }
11869
11870 case ISD::SRA:
11871 case ISD::SRL: {
11872 if (IsVec)
11873 return std::nullopt;
11874
11875 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11876 if (!ShiftOp)
11877 return std::nullopt;
11878
11879 uint64_t BitShift = ShiftOp->getZExtValue();
11880 if (BitShift % 8)
11881 return std::nullopt;
11882
11883 auto BitsProvided = Op.getScalarValueSizeInBits();
11884 if (BitsProvided % 8 != 0)
11885 return std::nullopt;
11886
11887 uint64_t BytesProvided = BitsProvided / 8;
11888 uint64_t ByteShift = BitShift / 8;
11889 // The dest of the shift has good bytes in [0 : BytesProvided - ByteShift).
11890 // If the byte we are trying to provide (as tracked by Index) falls in this
11891 // range, then the SRL provides the byte, and the byte of interest of the
11892 // src of the SRL is Index + ByteShift.
11893 return BytesProvided - ByteShift > Index
11894 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
11895 Index + ByteShift)
11896 : ByteProvider<SDValue>::getConstantZero();
11897 }
11898
11899 case ISD::SHL: {
11900 if (IsVec)
11901 return std::nullopt;
11902
11903 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11904 if (!ShiftOp)
11905 return std::nullopt;
11906
11907 uint64_t BitShift = ShiftOp->getZExtValue();
11908 if (BitShift % 8 != 0)
11909 return std::nullopt;
11910 uint64_t ByteShift = BitShift / 8;
11911
11912 // If we are shifting by an amount greater than (or equal to) the index we
11913 // are trying to provide, then it provides 0s. If not, then these bytes are
11914 // not definitively 0s, and the corresponding byte of interest is
11915 // Index - ByteShift of the src.
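// Illustrative example (editorial, not from the original source): for
// (shl x, 16), ByteShift is 2, so result bytes 0-1 are constant zero and
// result byte 2 is provided by byte 0 of x.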
11916 return Index < ByteShift
11917 ? ByteProvider<SDValue>::getConstantZero()
11918 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
11919 Depth + 1, StartingIndex);
11920 }
11921 case ISD::ANY_EXTEND:
11922 case ISD::SIGN_EXTEND:
11923 case ISD::ZERO_EXTEND:
11924 case ISD::SIGN_EXTEND_INREG:
11925 case ISD::AssertZext:
11926 case ISD::AssertSext: {
11927 if (IsVec)
11928 return std::nullopt;
11929
11930 SDValue NarrowOp = Op->getOperand(0);
11931 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
11932 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
11933 Op->getOpcode() == ISD::AssertZext ||
11934 Op->getOpcode() == ISD::AssertSext) {
11935 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11936 NarrowBitWidth = VTSign->getVT().getSizeInBits();
11937 }
11938 if (NarrowBitWidth % 8 != 0)
11939 return std::nullopt;
11940 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
11941
11942 if (Index >= NarrowByteWidth)
11943 return Op.getOpcode() == ISD::ZERO_EXTEND
11944 ? std::optional<ByteProvider<SDValue>>(
11945 ByteProvider<SDValue>::getConstantZero())
11946 : std::nullopt;
11947 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
11948 }
11949
11950 case ISD::TRUNCATE: {
11951 if (IsVec)
11952 return std::nullopt;
11953
11954 uint64_t NarrowByteWidth = BitWidth / 8;
11955
11956 if (NarrowByteWidth >= Index) {
11957 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
11958 StartingIndex);
11959 }
11960
11961 return std::nullopt;
11962 }
11963
11964 case ISD::CopyFromReg: {
11965 if (BitWidth / 8 > Index)
11966 return calculateSrcByte(Op, StartingIndex, Index);
11967
11968 return std::nullopt;
11969 }
11970
11971 case ISD::LOAD: {
11972 auto L = cast<LoadSDNode>(Op.getNode());
11973
11974 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
11975 if (NarrowBitWidth % 8 != 0)
11976 return std::nullopt;
11977 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
11978
11979 // If the width of the load does not reach the byte we are trying to provide
11980 // and it is not a ZEXTLOAD, then the load does not provide the byte in
11981 // question.
11982 if (Index >= NarrowByteWidth) {
11983 return L->getExtensionType() == ISD::ZEXTLOAD
11984 ? std::optional<ByteProvider<SDValue>>(
11985 ByteProvider<SDValue>::getConstantZero())
11986 : std::nullopt;
11987 }
11988
11989 if (NarrowByteWidth > Index) {
11990 return calculateSrcByte(Op, StartingIndex, Index);
11991 }
11992
11993 return std::nullopt;
11994 }
11995
11996 case ISD::BSWAP: {
11997 if (IsVec)
11998 return std::nullopt;
11999
12000 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
12001 Depth + 1, StartingIndex);
12002 }
12003
12004 case ISD::EXTRACT_VECTOR_ELT: {
12005 auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12006 if (!IdxOp)
12007 return std::nullopt;
12008 auto VecIdx = IdxOp->getZExtValue();
12009 auto ScalarSize = Op.getScalarValueSizeInBits();
12010 if (ScalarSize < 32)
12011 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
12012 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
12013 StartingIndex, Index);
12014 }
12015
12016 case AMDGPUISD::PERM: {
12017 if (IsVec)
12018 return std::nullopt;
12019
12020 auto PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12021 if (!PermMask)
12022 return std::nullopt;
12023
12024 auto IdxMask =
12025 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
12026 if (IdxMask > 0x07 && IdxMask != 0x0c)
12027 return std::nullopt;
12028
12029 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
12030 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12031
12032 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
12033 : ByteProvider<SDValue>(
12034 ByteProvider<SDValue>::getConstantZero());
12035 }
12036
12037 default: {
12038 return std::nullopt;
12039 }
12040 }
12041
12042 llvm_unreachable("fully handled switch");
12043}
12044
12045 // Returns true if the Operand is a scalar that is extended from a 16-bit value
12046static bool isExtendedFrom16Bits(SDValue &Operand) {
12047
12048 switch (Operand.getOpcode()) {
12049 case ISD::ANY_EXTEND:
12050 case ISD::SIGN_EXTEND:
12051 case ISD::ZERO_EXTEND: {
12052 auto OpVT = Operand.getOperand(0).getValueType();
12053 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
12054 }
12055 case ISD::LOAD: {
12056 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
12057 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12058 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
12059 ExtType == ISD::EXTLOAD) {
12060 auto MemVT = L->getMemoryVT();
12061 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
12062 }
12063 return L->getMemoryVT().getSizeInBits() == 16;
12064 }
12065 default:
12066 return false;
12067 }
12068}
12069
12070 // Returns true if the mask addresses consecutive bytes, and the first byte
12071 // begins at an even (16-bit aligned) offset from the 0th byte
12072static bool addresses16Bits(int Mask) {
12073 int Low8 = Mask & 0xff;
12074 int Hi8 = (Mask & 0xff00) >> 8;
12075
12076 assert(Low8 < 8 && Hi8 < 8);
12077 // Are the bytes contiguous in the order of increasing addresses.
12078 bool IsConsecutive = (Hi8 - Low8 == 1);
12079 // Is the first byte at location that is aligned for 16 bit instructions.
12080 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
12081 // In this case, we still need code to extract the 16 bit operand, so it
12082 // is better to use i8 v_perm
12083 bool Is16Aligned = !(Low8 % 2);
12084
12085 return IsConsecutive && Is16Aligned;
12086}
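// Illustrative example (editorial, not from the original source): a 16-bit
// half-mask of 0x0302 selects bytes 2 and 3 in increasing order starting at
// an even offset, so it is accepted; 0x0201 is consecutive but starts at
// byte 1, so it is rejected.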
12087
12088// Do not lower into v_perm if the operands are actually 16 bit
12089// and the selected bits (based on PermMask) correspond with two
12090// easily addressable 16 bit operands.
12091 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
12092 SDValue &OtherOp) {
12093 int Low16 = PermMask & 0xffff;
12094 int Hi16 = (PermMask & 0xffff0000) >> 16;
12095
12096 auto TempOp = peekThroughBitcasts(Op);
12097 auto TempOtherOp = peekThroughBitcasts(OtherOp);
12098
12099 auto OpIs16Bit =
12100 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
12101 if (!OpIs16Bit)
12102 return true;
12103
12104 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12105 isExtendedFrom16Bits(TempOtherOp);
12106 if (!OtherOpIs16Bit)
12107 return true;
12108
12109 // Do we cleanly address both
12110 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
12111}
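// Illustrative example (editorial, not from the original source): with
// PermMask = 0x07060302 and both operands extended from 16-bit values, the
// two halves address the 16-bit pairs cleanly (0x0302 and 0x0706), so this
// returns false and the caller does not emit a v_perm.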
12112
12113 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
12114 unsigned DWordOffset) {
12115 SDValue Ret;
12116
12117 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12118 // ByteProvider must be at least 8 bits
12119 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12120
12121 if (TypeSize <= 32)
12122 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
12123
12124 if (Src.getValueType().isVector()) {
12125 auto ScalarTySize = Src.getScalarValueSizeInBits();
12126 auto ScalarTy = Src.getValueType().getScalarType();
12127 if (ScalarTySize == 32) {
12128 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
12129 DAG.getConstant(DWordOffset, SL, MVT::i32));
12130 }
12131 if (ScalarTySize > 32) {
12132 Ret = DAG.getNode(
12133 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
12134 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12135 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12136 if (ShiftVal)
12137 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
12138 DAG.getConstant(ShiftVal, SL, MVT::i32));
12139 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12140 }
12141
12142 assert(ScalarTySize < 32);
12143 auto NumElements = TypeSize / ScalarTySize;
12144 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12145 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12146 auto NumElementsIn32 = 32 / ScalarTySize;
12147 auto NumAvailElements = DWordOffset < Trunc32Elements
12148 ? NumElementsIn32
12149 : NumElements - NormalizedTrunc;
12150
12151 SmallVector<SDValue, 4> VecSrcs;
12152 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
12153 NumAvailElements);
12154
12155 Ret = DAG.getBuildVector(
12156 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
12157 VecSrcs);
12158 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12159 }
12160
12161 /// Scalar Type
12162 auto ShiftVal = 32 * DWordOffset;
12163 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
12164 DAG.getConstant(ShiftVal, SL, MVT::i32));
12165 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12166}
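// Illustrative example (editorial, not from the original source): for a
// v4i16 source and DWordOffset = 1, ScalarTySize is 16, so elements 2 and 3
// are extracted, rebuilt as a v2i16 and bitcast to i32.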
12167
12168 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
12169 SelectionDAG &DAG = DCI.DAG;
12170 [[maybe_unused]] EVT VT = N->getValueType(0);
12171 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
12172
12173 // VT is known to be MVT::i32, so we need to provide 4 bytes.
12174 assert(VT == MVT::i32);
12175 for (int i = 0; i < 4; i++) {
12176 // Find the ByteProvider that provides the ith byte of the result of OR
12177 std::optional<ByteProvider<SDValue>> P =
12178 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
12179 // TODO support constantZero
12180 if (!P || P->isConstantZero())
12181 return SDValue();
12182
12183 PermNodes.push_back(*P);
12184 }
12185 if (PermNodes.size() != 4)
12186 return SDValue();
12187
12188 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12189 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12190 uint64_t PermMask = 0x00000000;
12191 for (size_t i = 0; i < PermNodes.size(); i++) {
12192 auto PermOp = PermNodes[i];
12193 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
12194 // by sizeof(Src2) = 4
12195 int SrcByteAdjust = 4;
12196
12197 // If the Src uses a byte from a different DWORD, then it corresponds
12198 // with a different source
12199 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
12200 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12201 if (SecondSrc)
12202 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12203 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12204 return SDValue();
12205
12206 // Set the index of the second distinct Src node
12207 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12208 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12209 SrcByteAdjust = 0;
12210 }
12211 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12213 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12214 }
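// Illustrative example (editorial, not from the original source): if bytes
// 0 and 1 come from the first source at offsets 0 and 1, and bytes 2 and 3
// come from a second source at offsets 2 and 3, the loop above accumulates
// PermMask = 0x03020504 (first-source bytes are addressed as 4-7).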
12215 SDLoc DL(N);
12216 SDValue Op = *PermNodes[FirstSrc.first].Src;
12217 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
12218 assert(Op.getValueSizeInBits() == 32);
12219
12220 // Check that we are not just extracting the bytes in order from an op
12221 if (!SecondSrc) {
12222 int Low16 = PermMask & 0xffff;
12223 int Hi16 = (PermMask & 0xffff0000) >> 16;
12224
12225 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
12226 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
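// Illustrative example (editorial, not from the original source): a mask of
// 0x07060504 takes the four bytes of Op in order, so the or is just Op
// reassembled and the bitcast below is returned instead of a v_perm.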
12227
12228 // The perm op would really just produce Op. So combine into Op
12229 if (WellFormedLow && WellFormedHi)
12230 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
12231 }
12232
12233 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
12234
12235 if (SecondSrc) {
12236 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
12237 assert(OtherOp.getValueSizeInBits() == 32);
12238 }
12239
12240 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
12241
12242 assert(Op.getValueType().isByteSized() &&
12243 OtherOp.getValueType().isByteSized());
12244
12245 // If the ultimate src is less than 32 bits, then we will only be
12246 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
12247 // CalculateByteProvider would not have returned Op as source if we
12248 // used a byte that is outside its ValueType. Thus, we are free to
12249 // ANY_EXTEND as the extended bits are dont-cares.
12250 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
12251 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
12252
12253 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
12254 DAG.getConstant(PermMask, DL, MVT::i32));
12255 }
12256 return SDValue();
12257}
12258
12259SDValue SITargetLowering::performOrCombine(SDNode *N,
12260 DAGCombinerInfo &DCI) const {
12261 SelectionDAG &DAG = DCI.DAG;
12262 SDValue LHS = N->getOperand(0);
12263 SDValue RHS = N->getOperand(1);
12264
12265 EVT VT = N->getValueType(0);
12266 if (VT == MVT::i1) {
12267 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
12268 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12269 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
12270 SDValue Src = LHS.getOperand(0);
12271 if (Src != RHS.getOperand(0))
12272 return SDValue();
12273
12274 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
12275 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
12276 if (!CLHS || !CRHS)
12277 return SDValue();
12278
12279 // Only 10 bits are used.
12280 static const uint32_t MaxMask = 0x3ff;
12281
12282 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
12283 SDLoc DL(N);
12284 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
12285 Src, DAG.getConstant(NewMask, DL, MVT::i32));
12286 }
12287
12288 return SDValue();
12289 }
12290
12291 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12292 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
12293 LHS.getOpcode() == AMDGPUISD::PERM &&
12294 isa<ConstantSDNode>(LHS.getOperand(2))) {
12295 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
12296 if (!Sel)
12297 return SDValue();
12298
12299 Sel |= LHS.getConstantOperandVal(2);
12300 SDLoc DL(N);
12301 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12302 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
12303 }
12304
12305 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
12306 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12307 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
12308 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12309
12310 // If all the uses of an or need to extract the individual elements, do not
12311 // attempt to lower into v_perm
12312 auto usesCombinedOperand = [](SDNode *OrUse) {
12313 // If we have any non-vectorized use, then it is a candidate for v_perm
12314 if (OrUse->getOpcode() != ISD::BITCAST ||
12315 !OrUse->getValueType(0).isVector())
12316 return true;
12317
12318 // If we have any non-vectorized use, then it is a candidate for v_perm
12319 for (auto VUse : OrUse->uses()) {
12320 if (!VUse->getValueType(0).isVector())
12321 return true;
12322
12323 // If the use of a vector is a store, then combining via a v_perm
12324 // is beneficial.
12325 // TODO -- whitelist more uses
12326 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
12327 if (VUse->getOpcode() == VectorwiseOp)
12328 return true;
12329 }
12330 return false;
12331 };
12332
12333 if (!any_of(N->uses(), usesCombinedOperand))
12334 return SDValue();
12335
12336 uint32_t LHSMask = getPermuteMask(LHS);
12337 uint32_t RHSMask = getPermuteMask(RHS);
12338
12339 if (LHSMask != ~0u && RHSMask != ~0u) {
12340 // Canonicalize the expression in an attempt to have fewer unique masks
12341 // and therefore fewer registers used to hold the masks.
12342 if (LHSMask > RHSMask) {
12343 std::swap(LHSMask, RHSMask);
12344 std::swap(LHS, RHS);
12345 }
12346
12347 // Select 0xc for each lane used from the source operand. Zero bytes have the
12348 // 0xc mask set, 0xff bytes have 0xff, and actual lanes are in the 0-3 range.
12349 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12350 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12351
12352 // Check if we need to combine values from two sources within a byte.
12353 if (!(LHSUsedLanes & RHSUsedLanes) &&
12354 // If we select high and lower word keep it for SDWA.
12355 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
12356 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12357 // Kill zero bytes selected by other mask. Zero value is 0xc.
12358 LHSMask &= ~RHSUsedLanes;
12359 RHSMask &= ~LHSUsedLanes;
12360 // Add 4 to each active LHS lane
12361 LHSMask |= LHSUsedLanes & 0x04040404;
12362 // Combine masks
12363 uint32_t Sel = LHSMask | RHSMask;
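// Illustrative walk-through (editorial, example values not from the original
// source): with LHSMask = 0x030c010c and RHSMask = 0x0c020c00 (e.g. the masks
// for (y & 0xff00ff00) and (x & 0x00ff00ff)), the used lanes are 0x0c000c00
// and 0x000c000c and do not overlap. Killing zero bytes gives
// LHSMask = 0x03000100 and RHSMask = 0x00020000; adding 4 to the active LHS
// lanes gives 0x07000500, so Sel = 0x07020500: byte3 from y, byte2 from x,
// byte1 from y, byte0 from x.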
12364 SDLoc DL(N);
12365
12366 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
12367 LHS.getOperand(0), RHS.getOperand(0),
12368 DAG.getConstant(Sel, DL, MVT::i32));
12369 }
12370 }
12371 if (LHSMask == ~0u || RHSMask == ~0u) {
12372 if (SDValue Perm = matchPERM(N, DCI))
12373 return Perm;
12374 }
12375 }
12376
12377 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12378 return SDValue();
12379
12380 // TODO: This could be a generic combine with a predicate for extracting the
12381 // high half of an integer being free.
12382
12383 // (or i64:x, (zero_extend i32:y)) ->
12384 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
12385 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
12386 RHS.getOpcode() != ISD::ZERO_EXTEND)
12387 std::swap(LHS, RHS);
12388
12389 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
12390 SDValue ExtSrc = RHS.getOperand(0);
12391 EVT SrcVT = ExtSrc.getValueType();
12392 if (SrcVT == MVT::i32) {
12393 SDLoc SL(N);
12394 SDValue LowLHS, HiBits;
12395 std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
12396 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
12397
12398 DCI.AddToWorklist(LowOr.getNode());
12399 DCI.AddToWorklist(HiBits.getNode());
12400
12401 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
12402 LowOr, HiBits);
12403 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
12404 }
12405 }
12406
12407 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
12408 if (CRHS) {
12409 if (SDValue Split
12410 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
12411 N->getOperand(0), CRHS))
12412 return Split;
12413 }
12414
12415 return SDValue();
12416}
12417
12418SDValue SITargetLowering::performXorCombine(SDNode *N,
12419 DAGCombinerInfo &DCI) const {
12420 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
12421 return RV;
12422
12423 SDValue LHS = N->getOperand(0);
12424 SDValue RHS = N->getOperand(1);
12425
12426 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
12427 SelectionDAG &DAG = DCI.DAG;
12428
12429 EVT VT = N->getValueType(0);
12430 if (CRHS && VT == MVT::i64) {
12431 if (SDValue Split
12432 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
12433 return Split;
12434 }
12435
12436 // Make sure to apply the 64-bit constant splitting fold before trying to fold
12437 // fneg-like xors into 64-bit select.
12438 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
12439 // This looks like an fneg, try to fold as a source modifier.
12440 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
12441 shouldFoldFNegIntoSrc(N, LHS)) {
12442 // xor (select c, a, b), 0x80000000 ->
12443 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
12444 SDLoc DL(N);
12445 SDValue CastLHS =
12446 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
12447 SDValue CastRHS =
12448 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
12449 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
12450 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
12451 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
12452 LHS->getOperand(0), FNegLHS, FNegRHS);
12453 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
12454 }
12455 }
12456
12457 return SDValue();
12458}
12459
12460SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
12461 DAGCombinerInfo &DCI) const {
12462 if (!Subtarget->has16BitInsts() ||
12463 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
12464 return SDValue();
12465
12466 EVT VT = N->getValueType(0);
12467 if (VT != MVT::i32)
12468 return SDValue();
12469
12470 SDValue Src = N->getOperand(0);
12471 if (Src.getValueType() != MVT::i16)
12472 return SDValue();
12473
12474 return SDValue();
12475}
12476
12477SDValue
12478SITargetLowering::performSignExtendInRegCombine(SDNode *N,
12479 DAGCombinerInfo &DCI) const {
12480 SDValue Src = N->getOperand(0);
12481 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
12482
12483 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
12484 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
12485 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
12486 VTSign->getVT() == MVT::i8) ||
12487 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
12488 VTSign->getVT() == MVT::i16))) {
12489 assert(Subtarget->hasScalarSubwordLoads() &&
12490 "s_buffer_load_{u8, i8} are supported "
12491 "in GFX12 (or newer) architectures.");
12492 EVT VT = Src.getValueType();
12493 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
12494 ? AMDGPUISD::SBUFFER_LOAD_BYTE
12495 : AMDGPUISD::SBUFFER_LOAD_SHORT;
12496 SDLoc DL(N);
12497 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12498 SDValue Ops[] = {
12499 Src.getOperand(0), // source register
12500 Src.getOperand(1), // offset
12501 Src.getOperand(2) // cachePolicy
12502 };
12503 auto *M = cast<MemSDNode>(Src);
12504 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12505 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12506 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
12507 return LoadVal;
12508 }
12509 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
12510 VTSign->getVT() == MVT::i8) ||
12511 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
12512 VTSign->getVT() == MVT::i16)) &&
12513 Src.hasOneUse()) {
12514 auto *M = cast<MemSDNode>(Src);
12515 SDValue Ops[] = {
12516 Src.getOperand(0), // Chain
12517 Src.getOperand(1), // rsrc
12518 Src.getOperand(2), // vindex
12519 Src.getOperand(3), // voffset
12520 Src.getOperand(4), // soffset
12521 Src.getOperand(5), // offset
12522 Src.getOperand(6),
12523 Src.getOperand(7)
12524 };
12525 // replace with BUFFER_LOAD_BYTE/SHORT
12526 SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
12527 Src.getOperand(0).getValueType());
12528 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ?
12529 AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT;
12530 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N),
12531 ResList,
12532 Ops, M->getMemoryVT(),
12533 M->getMemOperand());
12534 return DCI.DAG.getMergeValues({BufferLoadSignExt,
12535 BufferLoadSignExt.getValue(1)}, SDLoc(N));
12536 }
12537 return SDValue();
12538}
12539
12540SDValue SITargetLowering::performClassCombine(SDNode *N,
12541 DAGCombinerInfo &DCI) const {
12542 SelectionDAG &DAG = DCI.DAG;
12543 SDValue Mask = N->getOperand(1);
12544
12545 // fp_class x, 0 -> false
12546 if (isNullConstant(Mask))
12547 return DAG.getConstant(0, SDLoc(N), MVT::i1);
12548
12549 if (N->getOperand(0).isUndef())
12550 return DAG.getUNDEF(MVT::i1);
12551
12552 return SDValue();
12553}
12554
12555SDValue SITargetLowering::performRcpCombine(SDNode *N,
12556 DAGCombinerInfo &DCI) const {
12557 EVT VT = N->getValueType(0);
12558 SDValue N0 = N->getOperand(0);
12559
12560 if (N0.isUndef()) {
12561 return DCI.DAG.getConstantFP(
12562 APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT)), SDLoc(N),
12563 VT);
12564 }
12565
12566 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
12567 N0.getOpcode() == ISD::SINT_TO_FP)) {
12568 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
12569 N->getFlags());
12570 }
12571
12572 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
12573 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
12574 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
12575 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT,
12576 N0.getOperand(0), N->getFlags());
12577 }
12578
12579 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
12580}
12581
12582 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
12583 unsigned MaxDepth) const {
12584 unsigned Opcode = Op.getOpcode();
12585 if (Opcode == ISD::FCANONICALIZE)
12586 return true;
12587
12588 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
12589 const auto &F = CFP->getValueAPF();
12590 if (F.isNaN() && F.isSignaling())
12591 return false;
12592 if (!F.isDenormal())
12593 return true;
12594
12595 DenormalMode Mode =
12596 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
12597 return Mode == DenormalMode::getIEEE();
12598 }
12599
12600 // If source is a result of another standard FP operation it is already in
12601 // canonical form.
12602 if (MaxDepth == 0)
12603 return false;
12604
12605 switch (Opcode) {
12606 // These will flush denorms if required.
12607 case ISD::FADD:
12608 case ISD::FSUB:
12609 case ISD::FMUL:
12610 case ISD::FCEIL:
12611 case ISD::FFLOOR:
12612 case ISD::FMA:
12613 case ISD::FMAD:
12614 case ISD::FSQRT:
12615 case ISD::FDIV:
12616 case ISD::FREM:
12617 case ISD::FP_ROUND:
12618 case ISD::FP_EXTEND:
12619 case ISD::FP16_TO_FP:
12620 case ISD::FP_TO_FP16:
12621 case ISD::BF16_TO_FP:
12622 case ISD::FP_TO_BF16:
12623 case ISD::FLDEXP:
12626 case AMDGPUISD::RCP:
12627 case AMDGPUISD::RSQ:
12631 case AMDGPUISD::LOG:
12632 case AMDGPUISD::EXP:
12636 case AMDGPUISD::FRACT:
12643 case AMDGPUISD::SIN_HW:
12644 case AMDGPUISD::COS_HW:
12645 return true;
12646
12647 // It can/will be lowered or combined as a bit operation.
12648 // Need to check their input recursively to handle.
12649 case ISD::FNEG:
12650 case ISD::FABS:
12651 case ISD::FCOPYSIGN:
12652 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12653
12654 case ISD::AND:
12655 if (Op.getValueType() == MVT::i32) {
12656 // Be careful as we only know it is a bitcast floating point type. It
12657 // could be f32, v2f16, we have no way of knowing. Luckily the constant
12658 // value that we optimize for, which comes up in fp32 to bf16 conversions,
12659 // is valid to optimize for all types.
12660 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
12661 if (RHS->getZExtValue() == 0xffff0000) {
12662 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12663 }
12664 }
12665 }
12666 break;
12667
12668 case ISD::FSIN:
12669 case ISD::FCOS:
12670 case ISD::FSINCOS:
12671 return Op.getValueType().getScalarType() != MVT::f16;
12672
12673 case ISD::FMINNUM:
12674 case ISD::FMAXNUM:
12675 case ISD::FMINNUM_IEEE:
12676 case ISD::FMAXNUM_IEEE:
12677 case ISD::FMINIMUM:
12678 case ISD::FMAXIMUM:
12679 case AMDGPUISD::CLAMP:
12680 case AMDGPUISD::FMED3:
12681 case AMDGPUISD::FMAX3:
12682 case AMDGPUISD::FMIN3:
12683 case AMDGPUISD::FMAXIMUM3:
12684 case AMDGPUISD::FMINIMUM3: {
12685 // FIXME: Shouldn't treat the generic operations differently based on these.
12686 // However, we aren't really required to flush the result from
12687 // minnum/maxnum.
12688
12689 // snans will be quieted, so we only need to worry about denormals.
12690 if (Subtarget->supportsMinMaxDenormModes() ||
12691 // FIXME: denormalsEnabledForType is broken for dynamic
12692 denormalsEnabledForType(DAG, Op.getValueType()))
12693 return true;
12694
12695 // Flushing may be required.
12696 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
12697 // targets need to check their input recursively.
12698
12699 // FIXME: Does this apply with clamp? It's implemented with max.
12700 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
12701 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
12702 return false;
12703 }
12704
12705 return true;
12706 }
12707 case ISD::SELECT: {
12708 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
12709 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
12710 }
12711 case ISD::BUILD_VECTOR: {
12712 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
12713 SDValue SrcOp = Op.getOperand(i);
12714 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
12715 return false;
12716 }
12717
12718 return true;
12719 }
12722 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12723 }
12725 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
12726 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
12727 }
12728 case ISD::UNDEF:
12729 // Could be anything.
12730 return false;
12731
12732 case ISD::BITCAST:
12733 // TODO: This is incorrect as it loses track of the operand's type. We may
12734 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
12735 // same bits that are canonicalized in one type need not be in the other.
12736 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12737 case ISD::TRUNCATE: {
12738 // Hack round the mess we make when legalizing extract_vector_elt
12739 if (Op.getValueType() == MVT::i16) {
12740 SDValue TruncSrc = Op.getOperand(0);
12741 if (TruncSrc.getValueType() == MVT::i32 &&
12742 TruncSrc.getOpcode() == ISD::BITCAST &&
12743 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
12744 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
12745 }
12746 }
12747 return false;
12748 }
12749 case ISD::INTRINSIC_WO_CHAIN: {
12750 unsigned IntrinsicID = Op.getConstantOperandVal(0);
12751 // TODO: Handle more intrinsics
12752 switch (IntrinsicID) {
12753 case Intrinsic::amdgcn_cvt_pkrtz:
12754 case Intrinsic::amdgcn_cubeid:
12755 case Intrinsic::amdgcn_frexp_mant:
12756 case Intrinsic::amdgcn_fdot2:
12757 case Intrinsic::amdgcn_rcp:
12758 case Intrinsic::amdgcn_rsq:
12759 case Intrinsic::amdgcn_rsq_clamp:
12760 case Intrinsic::amdgcn_rcp_legacy:
12761 case Intrinsic::amdgcn_rsq_legacy:
12762 case Intrinsic::amdgcn_trig_preop:
12763 case Intrinsic::amdgcn_log:
12764 case Intrinsic::amdgcn_exp2:
12765 case Intrinsic::amdgcn_sqrt:
12766 return true;
12767 default:
12768 break;
12769 }
12770
12771 break;
12772 }
12773 default:
12774 break;
12775 }
12776
12777 // FIXME: denormalsEnabledForType is broken for dynamic
12778 return denormalsEnabledForType(DAG, Op.getValueType()) &&
12779 DAG.isKnownNeverSNaN(Op);
12780}
12781
12782 bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
12783 unsigned MaxDepth) const {
12784 const MachineRegisterInfo &MRI = MF.getRegInfo();
12785 MachineInstr *MI = MRI.getVRegDef(Reg);
12786 unsigned Opcode = MI->getOpcode();
12787
12788 if (Opcode == AMDGPU::G_FCANONICALIZE)
12789 return true;
12790
12791 std::optional<FPValueAndVReg> FCR;
12792 // Constant splat (can be padded with undef) or scalar constant.
12793 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
12794 if (FCR->Value.isSignaling())
12795 return false;
12796 if (!FCR->Value.isDenormal())
12797 return true;
12798
12799 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
12800 return Mode == DenormalMode::getIEEE();
12801 }
12802
12803 if (MaxDepth == 0)
12804 return false;
12805
12806 switch (Opcode) {
12807 case AMDGPU::G_FADD:
12808 case AMDGPU::G_FSUB:
12809 case AMDGPU::G_FMUL:
12810 case AMDGPU::G_FCEIL:
12811 case AMDGPU::G_FFLOOR:
12812 case AMDGPU::G_FRINT:
12813 case AMDGPU::G_FNEARBYINT:
12814 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
12815 case AMDGPU::G_INTRINSIC_TRUNC:
12816 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
12817 case AMDGPU::G_FMA:
12818 case AMDGPU::G_FMAD:
12819 case AMDGPU::G_FSQRT:
12820 case AMDGPU::G_FDIV:
12821 case AMDGPU::G_FREM:
12822 case AMDGPU::G_FPOW:
12823 case AMDGPU::G_FPEXT:
12824 case AMDGPU::G_FLOG:
12825 case AMDGPU::G_FLOG2:
12826 case AMDGPU::G_FLOG10:
12827 case AMDGPU::G_FPTRUNC:
12828 case AMDGPU::G_AMDGPU_RCP_IFLAG:
12829 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
12830 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
12831 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
12832 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
12833 return true;
12834 case AMDGPU::G_FNEG:
12835 case AMDGPU::G_FABS:
12836 case AMDGPU::G_FCOPYSIGN:
12837 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
12838 case AMDGPU::G_FMINNUM:
12839 case AMDGPU::G_FMAXNUM:
12840 case AMDGPU::G_FMINNUM_IEEE:
12841 case AMDGPU::G_FMAXNUM_IEEE:
12842 case AMDGPU::G_FMINIMUM:
12843 case AMDGPU::G_FMAXIMUM: {
12844 if (Subtarget->supportsMinMaxDenormModes() ||
12845 // FIXME: denormalsEnabledForType is broken for dynamic
12846 denormalsEnabledForType(MRI.getType(Reg), MF))
12847 return true;
12848
12849 [[fallthrough]];
12850 }
12851 case AMDGPU::G_BUILD_VECTOR:
12852 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
12853 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
12854 return false;
12855 return true;
12856 case AMDGPU::G_INTRINSIC:
12857 case AMDGPU::G_INTRINSIC_CONVERGENT:
12858 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
12859 case Intrinsic::amdgcn_fmul_legacy:
12860 case Intrinsic::amdgcn_fmad_ftz:
12861 case Intrinsic::amdgcn_sqrt:
12862 case Intrinsic::amdgcn_fmed3:
12863 case Intrinsic::amdgcn_sin:
12864 case Intrinsic::amdgcn_cos:
12865 case Intrinsic::amdgcn_log:
12866 case Intrinsic::amdgcn_exp2:
12867 case Intrinsic::amdgcn_log_clamp:
12868 case Intrinsic::amdgcn_rcp:
12869 case Intrinsic::amdgcn_rcp_legacy:
12870 case Intrinsic::amdgcn_rsq:
12871 case Intrinsic::amdgcn_rsq_clamp:
12872 case Intrinsic::amdgcn_rsq_legacy:
12873 case Intrinsic::amdgcn_div_scale:
12874 case Intrinsic::amdgcn_div_fmas:
12875 case Intrinsic::amdgcn_div_fixup:
12876 case Intrinsic::amdgcn_fract:
12877 case Intrinsic::amdgcn_cvt_pkrtz:
12878 case Intrinsic::amdgcn_cubeid:
12879 case Intrinsic::amdgcn_cubema:
12880 case Intrinsic::amdgcn_cubesc:
12881 case Intrinsic::amdgcn_cubetc:
12882 case Intrinsic::amdgcn_frexp_mant:
12883 case Intrinsic::amdgcn_fdot2:
12884 case Intrinsic::amdgcn_trig_preop:
12885 return true;
12886 default:
12887 break;
12888 }
12889
12890 [[fallthrough]];
12891 default:
12892 return false;
12893 }
12894
12895 llvm_unreachable("invalid operation");
12896}
12897
12898// Constant fold canonicalize.
12899SDValue SITargetLowering::getCanonicalConstantFP(
12900 SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
12901 // Flush denormals to 0 if not enabled.
12902 if (C.isDenormal()) {
12903 DenormalMode Mode =
12904 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
12905 if (Mode == DenormalMode::getPreserveSign()) {
12906 return DAG.getConstantFP(
12907 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
12908 }
12909
12910 if (Mode != DenormalMode::getIEEE())
12911 return SDValue();
12912 }
12913
12914 if (C.isNaN()) {
12915 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
12916 if (C.isSignaling()) {
12917 // Quiet a signaling NaN.
12918 // FIXME: Is this supposed to preserve payload bits?
12919 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
12920 }
12921
12922 // Make sure it is the canonical NaN bitpattern.
12923 //
12924 // TODO: Can we use -1 as the canonical NaN value since it's an inline
12925 // immediate?
12926 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
12927 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
12928 }
12929
12930 // Already canonical.
12931 return DAG.getConstantFP(C, SL, VT);
12932}
12933
12935 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
12936}
12937
12938SDValue SITargetLowering::performFCanonicalizeCombine(
12939 SDNode *N,
12940 DAGCombinerInfo &DCI) const {
12941 SelectionDAG &DAG = DCI.DAG;
12942 SDValue N0 = N->getOperand(0);
12943 EVT VT = N->getValueType(0);
12944
12945 // fcanonicalize undef -> qnan
12946 if (N0.isUndef()) {
12947 APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
12948 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
12949 }
12950
12951 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
12952 EVT VT = N->getValueType(0);
12953 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
12954 }
12955
12956 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
12957 // (fcanonicalize k)
12958 //
12959 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
12960
12961 // TODO: This could be better with wider vectors that will be split to v2f16,
12962 // and to consider uses since there aren't that many packed operations.
12963 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
12964 isTypeLegal(MVT::v2f16)) {
12965 SDLoc SL(N);
12966 SDValue NewElts[2];
12967 SDValue Lo = N0.getOperand(0);
12968 SDValue Hi = N0.getOperand(1);
12969 EVT EltVT = Lo.getValueType();
12970
12972 for (unsigned I = 0; I != 2; ++I) {
12973 SDValue Op = N0.getOperand(I);
12974 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
12975 NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
12976 CFP->getValueAPF());
12977 } else if (Op.isUndef()) {
12978 // Handled below based on what the other operand is.
12979 NewElts[I] = Op;
12980 } else {
12981 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
12982 }
12983 }
12984
12985 // If one half is undef, and one is constant, prefer a splat vector rather
12986 // than the normal qNaN. If it's a register, prefer 0.0 since that's
12987 // cheaper to use and may be free with a packed operation.
12988 if (NewElts[0].isUndef()) {
12989 if (isa<ConstantFPSDNode>(NewElts[1]))
12990 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
12991 NewElts[1]: DAG.getConstantFP(0.0f, SL, EltVT);
12992 }
12993
12994 if (NewElts[1].isUndef()) {
12995 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
12996 NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
12997 }
12998
12999 return DAG.getBuildVector(VT, SL, NewElts);
13000 }
13001 }
13002
13003 return SDValue();
13004}
13005
13006static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
13007 switch (Opc) {
13008 case ISD::FMAXNUM:
13009 case ISD::FMAXNUM_IEEE:
13010 return AMDGPUISD::FMAX3;
13011 case ISD::FMAXIMUM:
13012 return AMDGPUISD::FMAXIMUM3;
13013 case ISD::SMAX:
13014 return AMDGPUISD::SMAX3;
13015 case ISD::UMAX:
13016 return AMDGPUISD::UMAX3;
13017 case ISD::FMINNUM:
13018 case ISD::FMINNUM_IEEE:
13019 return AMDGPUISD::FMIN3;
13020 case ISD::FMINIMUM:
13021 return AMDGPUISD::FMINIMUM3;
13022 case ISD::SMIN:
13023 return AMDGPUISD::SMIN3;
13024 case ISD::UMIN:
13025 return AMDGPUISD::UMIN3;
13026 default:
13027 llvm_unreachable("Not a min/max opcode");
13028 }
13029}
13030
13031SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
13032 const SDLoc &SL, SDValue Src,
13033 SDValue MinVal,
13034 SDValue MaxVal,
13035 bool Signed) const {
13036
13037 // med3 comes from
13038 // min(max(x, K0), K1), K0 < K1
13039 // max(min(x, K0), K1), K1 < K0
13040 //
13041 // "MinVal" and "MaxVal" respectively refer to the rhs of the
13042 // min/max op.
13043 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
13044 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
13045
13046 if (!MinK || !MaxK)
13047 return SDValue();
13048
13049 if (Signed) {
13050 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
13051 return SDValue();
13052 } else {
13053 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
13054 return SDValue();
13055 }
13056
13057 EVT VT = MinK->getValueType(0);
13058 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
13059 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
13060 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
13061
13062 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
13063 // not available, but this is unlikely to be profitable as constants
13064 // will often need to be materialized & extended, especially on
13065 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
13066 return SDValue();
13067}
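// Illustrative example (editorial, not from the original source):
// smin(smax(x, -3), 7) reaches this helper with MaxVal = -3 and MinVal = 7;
// since -3 < 7 the signed check passes and the node becomes smed3(x, -3, 7).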
13068
13069 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
13070 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
13071 return C;
13072
13073 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
13074 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
13075 return C;
13076 }
13077
13078 return nullptr;
13079}
13080
13081SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
13082 const SDLoc &SL,
13083 SDValue Op0,
13084 SDValue Op1) const {
13085 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
13086 if (!K1)
13087 return SDValue();
13088
13089 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
13090 if (!K0)
13091 return SDValue();
13092
13093 // Ordered >= (although NaN inputs should have folded away by now).
13094 if (K0->getValueAPF() > K1->getValueAPF())
13095 return SDValue();
13096
13097 const MachineFunction &MF = DAG.getMachineFunction();
13098 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13099
13100 // TODO: Check IEEE bit enabled?
13101 EVT VT = Op0.getValueType();
13102 if (Info->getMode().DX10Clamp) {
13103 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
13104 // hardware fmed3 behavior converting to a min.
13105 // FIXME: Should this be allowing -0.0?
13106 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
13107 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
13108 }
13109
13110 // med3 for f16 is only available on gfx9+, and not available for v2f16.
13111 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
13112 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
13113 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
13114 // then give the other result, which is different from med3 with a NaN
13115 // input.
13116 SDValue Var = Op0.getOperand(0);
13117 if (!DAG.isKnownNeverSNaN(Var))
13118 return SDValue();
13119
13120 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13121
13122 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
13123 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
13124 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
13125 Var, SDValue(K0, 0), SDValue(K1, 0));
13126 }
13127 }
13128
13129 return SDValue();
13130}
13131
13132/// \return true if the subtarget supports minimum3 and maximum3 with the given
13133/// base min/max opcode \p Opc for type \p VT.
13134static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
13135 EVT VT) {
13136 switch (Opc) {
13137 case ISD::FMINNUM:
13138 case ISD::FMAXNUM:
13139 case ISD::FMINNUM_IEEE:
13140 case ISD::FMAXNUM_IEEE:
13141 case AMDGPUISD::FMIN_LEGACY:
13142 case AMDGPUISD::FMAX_LEGACY:
13143 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
13144 case ISD::FMINIMUM:
13145 case ISD::FMAXIMUM:
13146 return (VT == MVT::f32 || VT == MVT::f16) && Subtarget.hasIEEEMinMax3();
13147 case ISD::SMAX:
13148 case ISD::SMIN:
13149 case ISD::UMAX:
13150 case ISD::UMIN:
13151 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
13152 default:
13153 return false;
13154 }
13155
13156 llvm_unreachable("not a min/max opcode");
13157}
13158
13159SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
13160 DAGCombinerInfo &DCI) const {
13161 SelectionDAG &DAG = DCI.DAG;
13162
13163 EVT VT = N->getValueType(0);
13164 unsigned Opc = N->getOpcode();
13165 SDValue Op0 = N->getOperand(0);
13166 SDValue Op1 = N->getOperand(1);
13167
13168 // Only do this if the inner op has one use since this will just increase
13169 // register pressure for no benefit.
13170
13171 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
13172 // max(max(a, b), c) -> max3(a, b, c)
13173 // min(min(a, b), c) -> min3(a, b, c)
13174 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
13175 SDLoc DL(N);
13176 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
13177 DL,
13178 N->getValueType(0),
13179 Op0.getOperand(0),
13180 Op0.getOperand(1),
13181 Op1);
13182 }
13183
13184 // Try commuted.
13185 // max(a, max(b, c)) -> max3(a, b, c)
13186 // min(a, min(b, c)) -> min3(a, b, c)
13187 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
13188 SDLoc DL(N);
13189 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
13190 DL,
13191 N->getValueType(0),
13192 Op0,
13193 Op1.getOperand(0),
13194 Op1.getOperand(1));
13195 }
13196 }
13197
13198 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
13199 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
13200 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
13201 if (SDValue Med3 = performIntMed3ImmCombine(
13202 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
13203 return Med3;
13204 }
13205 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
13206 if (SDValue Med3 = performIntMed3ImmCombine(
13207 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
13208 return Med3;
13209 }
13210
13211 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
13212 if (SDValue Med3 = performIntMed3ImmCombine(
13213 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
13214 return Med3;
13215 }
13216 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
13217 if (SDValue Med3 = performIntMed3ImmCombine(
13218 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
13219 return Med3;
13220 }
13221
13222 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
13223 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
13224 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
13225 (Opc == AMDGPUISD::FMIN_LEGACY &&
13226 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
13227 (VT == MVT::f32 || VT == MVT::f64 ||
13228 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
13229 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
13230 Op0.hasOneUse()) {
13231 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
13232 return Res;
13233 }
13234
13235 return SDValue();
13236}
13237
13238 static bool isClampZeroToOne(SDValue A, SDValue B) {
13239 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
13240 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
13241 // FIXME: Should this be allowing -0.0?
13242 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13243 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
13244 }
13245 }
13246
13247 return false;
13248}
13249
13250// FIXME: Should only worry about snans for version with chain.
13251SDValue SITargetLowering::performFMed3Combine(SDNode *N,
13252 DAGCombinerInfo &DCI) const {
13253 EVT VT = N->getValueType(0);
13254 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
13255 // NaNs. With a NaN input, the order of the operands may change the result.
13256
13257 SelectionDAG &DAG = DCI.DAG;
13258 SDLoc SL(N);
13259
13260 SDValue Src0 = N->getOperand(0);
13261 SDValue Src1 = N->getOperand(1);
13262 SDValue Src2 = N->getOperand(2);
13263
13264 if (isClampZeroToOne(Src0, Src1)) {
13265 // const_a, const_b, x -> clamp is safe in all cases including signaling
13266 // nans.
13267 // FIXME: Should this be allowing -0.0?
13268 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
13269 }
13270
13271 const MachineFunction &MF = DAG.getMachineFunction();
13272 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13273
13274 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
13275 // handling no dx10-clamp?
13276 if (Info->getMode().DX10Clamp) {
13277 // If NaNs are clamped to 0, we are free to reorder the inputs.
13278
13279 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13280 std::swap(Src0, Src1);
13281
13282 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
13283 std::swap(Src1, Src2);
13284
13285 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13286 std::swap(Src0, Src1);
13287
13288 if (isClampZeroToOne(Src1, Src2))
13289 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
13290 }
13291
13292 return SDValue();
13293}
13294
13295SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
13296 DAGCombinerInfo &DCI) const {
13297 SDValue Src0 = N->getOperand(0);
13298 SDValue Src1 = N->getOperand(1);
13299 if (Src0.isUndef() && Src1.isUndef())
13300 return DCI.DAG.getUNDEF(N->getValueType(0));
13301 return SDValue();
13302}
13303
13304// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
13305// expanded into a set of cmp/select instructions.
13306 bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
13307 unsigned NumElem,
13308 bool IsDivergentIdx,
13309 const GCNSubtarget *Subtarget) {
13310 if (UseDivergentRegisterIndexing)
13311 return false;
13312
13313 unsigned VecSize = EltSize * NumElem;
13314
13315 // Sub-dword vectors of 2 dwords or less have a better implementation.
13316 if (VecSize <= 64 && EltSize < 32)
13317 return false;
13318
13319 // Always expand the rest of the sub-dword instructions, otherwise they will
13320 // be lowered via memory.
13321 if (EltSize < 32)
13322 return true;
13323
13324 // Always do this if var-idx is divergent, otherwise it will become a loop.
13325 if (IsDivergentIdx)
13326 return true;
13327
13328 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
13329 unsigned NumInsts = NumElem /* Number of compares */ +
13330 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
13331
13332 // On some architectures (GFX9) movrel is not available and it's better
13333 // to expand.
13334 if (!Subtarget->hasMovrel())
13335 return NumInsts <= 16;
13336
13337 // If movrel is available, use it instead of expanding for vector of 8
13338 // elements.
13339 return NumInsts <= 15;
13340}
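// Illustrative example (editorial, not from the original source): for
// EltSize = 32 and NumElem = 8 with a uniform index, NumInsts is 8 + 8 = 16,
// so the expansion is taken only on targets without movrel (threshold 16
// rather than 15).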
13341
13342 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
13343 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
13344 if (isa<ConstantSDNode>(Idx))
13345 return false;
13346
13347 SDValue Vec = N->getOperand(0);
13348 EVT VecVT = Vec.getValueType();
13349 EVT EltVT = VecVT.getVectorElementType();
13350 unsigned EltSize = EltVT.getSizeInBits();
13351 unsigned NumElem = VecVT.getVectorNumElements();
13352
13353 return SITargetLowering::shouldExpandVectorDynExt(
13354 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
13355}
13356
13357SDValue SITargetLowering::performExtractVectorEltCombine(
13358 SDNode *N, DAGCombinerInfo &DCI) const {
13359 SDValue Vec = N->getOperand(0);
13360 SelectionDAG &DAG = DCI.DAG;
13361
13362 EVT VecVT = Vec.getValueType();
13363 EVT VecEltVT = VecVT.getVectorElementType();
13364 EVT ResVT = N->getValueType(0);
13365
13366 unsigned VecSize = VecVT.getSizeInBits();
13367 unsigned VecEltSize = VecEltVT.getSizeInBits();
13368
13369 if ((Vec.getOpcode() == ISD::FNEG ||
13371 SDLoc SL(N);
13372 SDValue Idx = N->getOperand(1);
13373 SDValue Elt =
13374 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
13375 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
13376 }
13377
13378 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
13379 // =>
13380 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
13381 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
13382 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
13383 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13384 SDLoc SL(N);
13385 SDValue Idx = N->getOperand(1);
13386 unsigned Opc = Vec.getOpcode();
13387
13388 switch(Opc) {
13389 default:
13390 break;
13391 // TODO: Support other binary operations.
13392 case ISD::FADD:
13393 case ISD::FSUB:
13394 case ISD::FMUL:
13395 case ISD::ADD:
13396 case ISD::UMIN:
13397 case ISD::UMAX:
13398 case ISD::SMIN:
13399 case ISD::SMAX:
13400 case ISD::FMAXNUM:
13401 case ISD::FMINNUM:
13402 case ISD::FMAXNUM_IEEE:
13403 case ISD::FMINNUM_IEEE:
13404 case ISD::FMAXIMUM:
13405 case ISD::FMINIMUM: {
13406 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13407 Vec.getOperand(0), Idx);
13408 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13409 Vec.getOperand(1), Idx);
13410
13411 DCI.AddToWorklist(Elt0.getNode());
13412 DCI.AddToWorklist(Elt1.getNode());
13413 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
13414 }
13415 }
13416 }
13417
13418 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
13419 if (shouldExpandVectorDynExt(N)) {
13420 SDLoc SL(N);
13421 SDValue Idx = N->getOperand(1);
13422 SDValue V;
13423 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13424 SDValue IC = DAG.getVectorIdxConstant(I, SL);
13425 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
13426 if (I == 0)
13427 V = Elt;
13428 else
13429 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
13430 }
13431 return V;
13432 }
13433
13434 if (!DCI.isBeforeLegalize())
13435 return SDValue();
13436
13437 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
13438 // elements. This exposes more load reduction opportunities by replacing
13439 // multiple small extract_vector_elements with a single 32-bit extract.
13440 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
13441 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
13442 VecSize > 32 && VecSize % 32 == 0 && Idx) {
13443 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
13444
13445 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
13446 unsigned EltIdx = BitIndex / 32;
13447 unsigned LeftoverBitIdx = BitIndex % 32;
13448 SDLoc SL(N);
13449
13450 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
13451 DCI.AddToWorklist(Cast.getNode());
13452
13453 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
13454 DAG.getConstant(EltIdx, SL, MVT::i32));
13455 DCI.AddToWorklist(Elt.getNode());
13456 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
13457 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
13458 DCI.AddToWorklist(Srl.getNode());
13459
13460 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
13461 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
13462 DCI.AddToWorklist(Trunc.getNode());
13463
13464 if (VecEltVT == ResVT) {
13465 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
13466 }
13467
13468 assert(ResVT.isScalarInteger());
13469 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
13470 }
13471
13472 return SDValue();
13473}
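// Editorial note (illustrative, not part of the upstream source): for the
// sub-dword path above, extracting element 5 of a loaded v8i8 gives
// BitIndex = 40, so EltIdx = 1 and LeftoverBitIdx = 8. The combine emits
// (trunc (srl (extract_vector_elt (bitcast v8i8 to v2i32), 1), 8)) and then
// bitcasts or any-extends the i8 back to the original result type.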
13474
13475SDValue
13476SITargetLowering::performInsertVectorEltCombine(SDNode *N,
13477 DAGCombinerInfo &DCI) const {
13478 SDValue Vec = N->getOperand(0);
13479 SDValue Idx = N->getOperand(2);
13480 EVT VecVT = Vec.getValueType();
13481 EVT EltVT = VecVT.getVectorElementType();
13482
13483 // INSERT_VECTOR_ELT (<n x e>, var-idx)
13484 // => BUILD_VECTOR n x select (e, const-idx)
13485 if (isa<ConstantSDNode>(Idx) || !shouldExpandVectorDynExt(N))
13486 return SDValue();
13487
13488 SelectionDAG &DAG = DCI.DAG;
13489 SDLoc SL(N);
13490 SDValue Ins = N->getOperand(1);
13491 EVT IdxVT = Idx.getValueType();
13492
13493 SmallVector<SDValue, 16> Ops;
13494 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13495 SDValue IC = DAG.getConstant(I, SL, IdxVT);
13496 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
13497 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
13498 Ops.push_back(V);
13499 }
13500
13501 return DAG.getBuildVector(VecVT, SL, Ops);
13502}
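// Editorial note (illustrative, not part of the upstream source): for
// (insert_vector_elt v4i32:%v, %x, %idx) with a divergent %idx, the combine
// above builds a BUILD_VECTOR whose I-th element is
// (select (seteq %idx, I), %x, (extract_vector_elt %v, I)), avoiding a
// waterfall loop or a round trip through scratch memory.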
13503
13504/// Return the source of an fp_extend from f16 to f32, or a converted FP
13505/// constant.
13506 static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
13507 if (Src.getOpcode() == ISD::FP_EXTEND &&
13508 Src.getOperand(0).getValueType() == MVT::f16) {
13509 return Src.getOperand(0);
13510 }
13511
13512 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13513 APFloat Val = CFP->getValueAPF();
13514 bool LosesInfo = true;
13515 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
13516 if (!LosesInfo)
13517 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
13518 }
13519
13520 return SDValue();
13521}
13522
13523SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
13524 DAGCombinerInfo &DCI) const {
13525 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
13526 "combine only useful on gfx8");
13527
13528 SDValue TruncSrc = N->getOperand(0);
13529 EVT VT = N->getValueType(0);
13530 if (VT != MVT::f16)
13531 return SDValue();
13532
13533 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
13534 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
13535 return SDValue();
13536
13537 SelectionDAG &DAG = DCI.DAG;
13538 SDLoc SL(N);
13539
13540 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
13541 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
13542 // casting back.
13543
13544 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
13545 // fmin(fmax(a, b), fmax(fmin(a, b), c))
13546 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
13547 if (!A)
13548 return SDValue();
13549
13550 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
13551 if (!B)
13552 return SDValue();
13553
13554 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
13555 if (!C)
13556 return SDValue();
13557
13558 // This changes signaling nan behavior. If an input is a signaling nan, it
13559 // would have been quieted by the fpext originally. We don't care because
13560 // these are unconstrained ops. If we needed to insert quieting canonicalizes
13561 // we would be worse off than just doing the promotion.
13562 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
13563 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
13564 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
13565 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
13566}
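// Editorial note (illustrative, not part of the upstream source): ignoring
// the NaN subtleties discussed above, the identity can be checked on a
// concrete triple, e.g. a=1.0, b=4.0, c=2.0: fmed3(a,b,c) = 2.0, and
// fmin(fmax(a,b), fmax(fmin(a,b), c)) = fmin(4.0, fmax(1.0, 2.0)) = 2.0.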
13567
13568unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
13569 const SDNode *N0,
13570 const SDNode *N1) const {
13571 EVT VT = N0->getValueType(0);
13572
13573 // Only do this if we are not trying to support denormals. v_mad_f32 does not
13574 // support denormals ever.
13575 if (((VT == MVT::f32 &&
13576 denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
13577 (VT == MVT::f16 && Subtarget->hasMadF16() &&
13578 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
13579 isOperationLegal(ISD::FMAD, VT))
13580 return ISD::FMAD;
13581
13582 const TargetOptions &Options = DAG.getTarget().Options;
13583 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
13584 (N0->getFlags().hasAllowContract() &&
13585 N1->getFlags().hasAllowContract())) &&
13586 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
13587 return ISD::FMA;
13588 }
13589
13590 return 0;
13591}
13592
13593// For a reassociatable opcode perform:
13594// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
13595SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
13596 SelectionDAG &DAG) const {
13597 EVT VT = N->getValueType(0);
13598 if (VT != MVT::i32 && VT != MVT::i64)
13599 return SDValue();
13600
13601 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
13602 return SDValue();
13603
13604 unsigned Opc = N->getOpcode();
13605 SDValue Op0 = N->getOperand(0);
13606 SDValue Op1 = N->getOperand(1);
13607
13608 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
13609 return SDValue();
13610
13611 if (Op0->isDivergent())
13612 std::swap(Op0, Op1);
13613
13614 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
13615 return SDValue();
13616
13617 SDValue Op2 = Op1.getOperand(1);
13618 Op1 = Op1.getOperand(0);
13619 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
13620 return SDValue();
13621
13622 if (Op1->isDivergent())
13623 std::swap(Op1, Op2);
13624
13625 SDLoc SL(N);
13626 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
13627 return DAG.getNode(Opc, SL, VT, Add1, Op2);
13628}
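// Editorial note (illustrative, not part of the upstream source): given
// i32 (add %x_uniform, (add %y_divergent, %z_uniform)), the reassociation
// above yields (add (add %x_uniform, %z_uniform), %y_divergent), so the
// uniform inner add can be selected to a scalar instruction and only the
// final add needs the VALU.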
13629
13630static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
13631 EVT VT,
13632 SDValue N0, SDValue N1, SDValue N2,
13633 bool Signed) {
13634 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
13635 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
13636 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
13637 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
13638}
13639
13640// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
13641// multiplies, if any.
13642//
13643// Full 64-bit multiplies that feed into an addition are lowered here instead
13644// of using the generic expansion. The generic expansion ends up with
13645// a tree of ADD nodes that prevents us from using the "add" part of the
13646// MAD instruction. The expansion produced here results in a chain of ADDs
13647// instead of a tree.
13648SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
13649 DAGCombinerInfo &DCI) const {
13650 assert(N->getOpcode() == ISD::ADD);
13651
13652 SelectionDAG &DAG = DCI.DAG;
13653 EVT VT = N->getValueType(0);
13654 SDLoc SL(N);
13655 SDValue LHS = N->getOperand(0);
13656 SDValue RHS = N->getOperand(1);
13657
13658 if (VT.isVector())
13659 return SDValue();
13660
13661 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
13662 // result in scalar registers for uniform values.
13663 if (!N->isDivergent() && Subtarget->hasSMulHi())
13664 return SDValue();
13665
13666 unsigned NumBits = VT.getScalarSizeInBits();
13667 if (NumBits <= 32 || NumBits > 64)
13668 return SDValue();
13669
13670 if (LHS.getOpcode() != ISD::MUL) {
13671 assert(RHS.getOpcode() == ISD::MUL);
13672 std::swap(LHS, RHS);
13673 }
13674
13675 // Avoid the fold if it would unduly increase the number of multiplies due to
13676 // multiple uses, except on hardware with full-rate multiply-add (which is
13677 // part of full-rate 64-bit ops).
13678 if (!Subtarget->hasFullRate64Ops()) {
13679 unsigned NumUsers = 0;
13680 for (SDNode *Use : LHS->uses()) {
13681 // There is a use that does not feed into addition, so the multiply can't
13682 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
13683 if (Use->getOpcode() != ISD::ADD)
13684 return SDValue();
13685
13686 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
13687 // MUL + 3xADD + 3xADDC over 3xMAD.
13688 ++NumUsers;
13689 if (NumUsers >= 3)
13690 return SDValue();
13691 }
13692 }
13693
13694 SDValue MulLHS = LHS.getOperand(0);
13695 SDValue MulRHS = LHS.getOperand(1);
13696 SDValue AddRHS = RHS;
13697
13698 // Always check whether operands are small unsigned values, since that
13699 // knowledge is useful in more cases. Check for small signed values only if
13700 // doing so can unlock a shorter code sequence.
13701 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
13702 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
13703
13704 bool MulSignedLo = false;
13705 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
13706 MulSignedLo = numBitsSigned(MulLHS, DAG) <= 32 &&
13707 numBitsSigned(MulRHS, DAG) <= 32;
13708 }
13709
13710 // The operands and final result all have the same number of bits. If
13711 // operands need to be extended, they can be extended with garbage. The
13712 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
13713 // truncated away in the end.
13714 if (VT != MVT::i64) {
13715 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
13716 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
13717 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
13718 }
13719
13720 // The basic code generated is conceptually straightforward. Pseudo code:
13721 //
13722 // accum = mad_64_32 lhs.lo, rhs.lo, accum
13723 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
13724 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
13725 //
13726 // The second and third lines are optional, depending on whether the factors
13727 // are {sign,zero}-extended or not.
13728 //
13729 // The actual DAG is noisier than the pseudo code, but only due to
13730 // instructions that disassemble values into low and high parts, and
13731 // assemble the final result.
13732 SDValue One = DAG.getConstant(1, SL, MVT::i32);
13733
13734 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
13735 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
13736 SDValue Accum =
13737 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
13738
13739 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
13740 SDValue AccumLo, AccumHi;
13741 std::tie(AccumLo, AccumHi) = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
13742
13743 if (!MulLHSUnsigned32) {
13744 auto MulLHSHi =
13745 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
13746 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
13747 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13748 }
13749
13750 if (!MulRHSUnsigned32) {
13751 auto MulRHSHi =
13752 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
13753 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
13754 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13755 }
13756
13757 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
13758 Accum = DAG.getBitcast(MVT::i64, Accum);
13759 }
13760
13761 if (VT != MVT::i64)
13762 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
13763 return Accum;
13764}
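// Editorial note (illustrative, not part of the upstream source): for a
// fully general i64 a*b + c the lowering above produces
//   accum    = mad_u64_u32(lo(a), lo(b), c)
//   accum.hi = add(mul(hi(a), lo(b)), accum.hi)
//   accum.hi = add(mul(lo(a), hi(b)), accum.hi)
// whereas, when both factors are known to fit in 32 bits (unsigned or
// signed), only the single mad_[iu]64_[iu]32 is emitted.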
13765
13766 // Collect the ultimate src of each of the mul node's operands, and confirm
13767 // each operand is 8 bits wide (a single byte).
13768static std::optional<ByteProvider<SDValue>>
13769handleMulOperand(const SDValue &MulOperand) {
13770 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
13771 if (!Byte0 || Byte0->isConstantZero()) {
13772 return std::nullopt;
13773 }
13774 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
13775 if (Byte1 && !Byte1->isConstantZero()) {
13776 return std::nullopt;
13777 }
13778 return Byte0;
13779}
13780
13781static unsigned addPermMasks(unsigned First, unsigned Second) {
13782 unsigned FirstCs = First & 0x0c0c0c0c;
13783 unsigned SecondCs = Second & 0x0c0c0c0c;
13784 unsigned FirstNoCs = First & ~0x0c0c0c0c;
13785 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
13786
13787 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
13788 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
13789 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
13790 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
13791
13792 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
13793}
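// Editorial note (illustrative, not part of the upstream source): a byte
// selector of 0x0c tells v_perm_b32 to produce constant zero, so
// addPermMasks merges two masks whose "real" byte selectors do not overlap.
// For example, addPermMasks(0x0c0c0c02, 0x0c0c010c) yields 0x0c0c0102: each
// result byte keeps the non-0x0c selector when one side supplies it, and
// stays 0x0c only where both sides are 0x0c.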
13794
13795struct DotSrc {
13796 SDValue SrcOp;
13797 int64_t PermMask;
13798 int64_t DWordOffset;
13799 };
13800
13801 static void placeSources(ByteProvider<SDValue> &Src0,
13802 ByteProvider<SDValue> &Src1,
13803 SmallVectorImpl<DotSrc> &Src0s,
13804 SmallVectorImpl<DotSrc> &Src1s, int Step) {
13805
13806 assert(Src0.Src.has_value() && Src1.Src.has_value());
13807 // Src0s and Src1s are empty, just place arbitrarily.
13808 if (Step == 0) {
13809 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
13810 Src0.SrcOffset / 4});
13811 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
13812 Src1.SrcOffset / 4});
13813 return;
13814 }
13815
13816 for (int BPI = 0; BPI < 2; BPI++) {
13817 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
13818 if (BPI == 1) {
13819 BPP = {Src1, Src0};
13820 }
13821 unsigned ZeroMask = 0x0c0c0c0c;
13822 unsigned FMask = 0xFF << (8 * (3 - Step));
13823
13824 unsigned FirstMask =
13825 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13826 unsigned SecondMask =
13827 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13828 // Attempt to find a Src vector which contains our SDValue; if so, add our
13829 // perm mask to the existing one. If we are unable to find a match for the
13830 // first SDValue, attempt to find a match for the second.
13831 int FirstGroup = -1;
13832 for (int I = 0; I < 2; I++) {
13833 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
13834 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
13835 return IterElt.SrcOp == *BPP.first.Src &&
13836 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
13837 };
13838
13839 auto Match = llvm::find_if(Srcs, MatchesFirst);
13840 if (Match != Srcs.end()) {
13841 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
13842 FirstGroup = I;
13843 break;
13844 }
13845 }
13846 if (FirstGroup != -1) {
13847 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
13848 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
13849 return IterElt.SrcOp == *BPP.second.Src &&
13850 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
13851 };
13852 auto Match = llvm::find_if(Srcs, MatchesSecond);
13853 if (Match != Srcs.end()) {
13854 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
13855 } else
13856 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
13857 return;
13858 }
13859 }
13860
13861 // If we have made it here, then we could not find a match in Src0s or Src1s
13862 // for either Src0 or Src1, so just place them arbitrarily.
13863
13864 unsigned ZeroMask = 0x0c0c0c0c;
13865 unsigned FMask = 0xFF << (8 * (3 - Step));
13866
13867 Src0s.push_back(
13868 {*Src0.Src,
13869 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13870 Src0.SrcOffset / 4});
13871 Src1s.push_back(
13872 {*Src1.Src,
13873 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13874 Src1.SrcOffset / 4});
13875
13876 return;
13877}
13878
13879 static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
13880 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
13881 bool IsAny) {
13882
13883 // If we just have one source, just permute it accordingly.
13884 if (Srcs.size() == 1) {
13885 auto Elt = Srcs.begin();
13886 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
13887
13888 // v_perm will produce the original value
13889 if (Elt->PermMask == 0x3020100)
13890 return EltOp;
13891
13892 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
13893 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
13894 }
13895
13896 auto FirstElt = Srcs.begin();
13897 auto SecondElt = std::next(FirstElt);
13898
13899 SmallVector<SDValue, 3> Perms;
13900
13901 // If we have multiple sources in the chain, combine them via perms (using
13902 // calculated perm mask) and Ors.
13903 while (true) {
13904 auto FirstMask = FirstElt->PermMask;
13905 auto SecondMask = SecondElt->PermMask;
13906
13907 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
13908 unsigned FirstPlusFour = FirstMask | 0x04040404;
13909 // 0x0c + 0x04 = 0x10, so ANDing with 0x0F will produce 0x00 for any
13910 // original 0x0C.
13911 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
13912
13913 auto PermMask = addPermMasks(FirstMask, SecondMask);
13914 auto FirstVal =
13915 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
13916 auto SecondVal =
13917 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
13918
13919 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
13920 SecondVal,
13921 DAG.getConstant(PermMask, SL, MVT::i32)));
13922
13923 FirstElt = std::next(SecondElt);
13924 if (FirstElt == Srcs.end())
13925 break;
13926
13927 SecondElt = std::next(FirstElt);
13928 // If we only have a FirstElt, then just combine that into the cumulative
13929 // source node.
13930 if (SecondElt == Srcs.end()) {
13931 auto EltOp =
13932 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
13933
13934 Perms.push_back(
13935 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
13936 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
13937 break;
13938 }
13939 }
13940
13941 assert(Perms.size() == 1 || Perms.size() == 2);
13942 return Perms.size() == 2
13943 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
13944 : Perms[0];
13945}
13946
13947static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
13948 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
13949 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
13950 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
13951 EntryMask += ZeroMask;
13952 }
13953}
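// Editorial note (illustrative, not part of the upstream source): for a
// chain of length 2 the combined mask carries real byte selectors only in
// its two high bytes, e.g. 0x01000c0c. fixMasks shifts it right by 16 to
// 0x00000100 and adds 0x0c0c0000, giving 0x0c0c0100: the selected bytes
// move to the low lanes and the unused high lanes read as zero.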
13954
13955static bool isMul(const SDValue Op) {
13956 auto Opcode = Op.getOpcode();
13957
13958 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
13959 Opcode == AMDGPUISD::MUL_I24);
13960}
13961
13962static std::optional<bool>
13963 checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
13964 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
13965 const SDValue &S1Op, const SelectionDAG &DAG) {
13966 // If both ops are i8s (pre legalize-dag), then the signedness semantics
13967 // of the dot4 is irrelevant.
13968 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
13969 return false;
13970
13971 auto Known0 = DAG.computeKnownBits(S0Op, 0);
13972 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
13973 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
13974 auto Known1 = DAG.computeKnownBits(S1Op, 0);
13975 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
13976 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
13977
13978 assert(!(S0IsUnsigned && S0IsSigned));
13979 assert(!(S1IsUnsigned && S1IsSigned));
13980
13981 // There are 9 possible permutations of
13982 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
13983
13984 // In two permutations, the sign bits are known to be the same for both Ops,
13985 // so simply return Signed / Unsigned corresponding to the MSB
13986
13987 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
13988 return S0IsSigned;
13989
13990 // In another two permutations, the sign bits are known to be opposite. In
13991 // this case return std::nullopt to indicate a bad match.
13992
13993 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
13994 return std::nullopt;
13995
13996 // In the remaining five permutations, we don't know the value of the sign
13997 // bit for at least one Op. Since we have a valid ByteProvider, we know that
13998 // the upper bits must be extension bits. Thus, the only ways for the sign
13999 // bit to be unknown are if it was sign extended from an unknown value, or
14000 // if it was any extended. In either case, it is correct to use the signed
14001 // version of the dot4 signedness semantics.
14002
14003 // In two of these permutations, we know the sign bit is set for
14004 // one op and the other is unknown. It is okay to use the signed version of
14005 // dot4.
14006 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14007 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14008 return true;
14009
14010 // In one such permutation, we don't know either of the sign bits. It is okay
14011 // to use the signed version of dot4.
14012 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14013 return true;
14014
14015 // In the remaining two permutations, we know the sign bit is unset for
14016 // one op and the other is unknown. Return std::nullopt to indicate a
14017 // bad match.
14018 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14019 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14020 return std::nullopt;
14021
14022 llvm_unreachable("Fully covered condition");
14023}
14024
14025SDValue SITargetLowering::performAddCombine(SDNode *N,
14026 DAGCombinerInfo &DCI) const {
14027 SelectionDAG &DAG = DCI.DAG;
14028 EVT VT = N->getValueType(0);
14029 SDLoc SL(N);
14030 SDValue LHS = N->getOperand(0);
14031 SDValue RHS = N->getOperand(1);
14032
14033 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
14034 if (Subtarget->hasMad64_32()) {
14035 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14036 return Folded;
14037 }
14038 }
14039
14040 if (SDValue V = reassociateScalarOps(N, DAG)) {
14041 return V;
14042 }
14043
14044 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
14045 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
14046 SDValue TempNode(N, 0);
14047 std::optional<bool> IsSigned;
14048 SmallVector<DotSrc, 4> Src0s;
14049 SmallVector<DotSrc, 4> Src1s;
14050 SmallVector<SDValue, 4> Src2s;
14051
14052 // Match the v_dot4 tree, while collecting src nodes.
14053 int ChainLength = 0;
14054 for (int I = 0; I < 4; I++) {
14055 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
14056 if (MulIdx == -1)
14057 break;
14058 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14059 if (!Src0)
14060 break;
14061 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14062 if (!Src1)
14063 break;
14064
14065 auto IterIsSigned = checkDot4MulSignedness(
14066 TempNode->getOperand(MulIdx), *Src0, *Src1,
14067 TempNode->getOperand(MulIdx)->getOperand(0),
14068 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14069 if (!IterIsSigned)
14070 break;
14071 if (!IsSigned)
14072 IsSigned = *IterIsSigned;
14073 if (*IterIsSigned != *IsSigned)
14074 break;
14075 placeSources(*Src0, *Src1, Src0s, Src1s, I);
14076 auto AddIdx = 1 - MulIdx;
14077 // Allow the special case where add (add (mul24, 0), mul24) has become
14078 // add (mul24, mul24).
14079 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14080 Src2s.push_back(TempNode->getOperand(AddIdx));
14081 auto Src0 =
14082 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
14083 if (!Src0)
14084 break;
14085 auto Src1 =
14086 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
14087 if (!Src1)
14088 break;
14089 auto IterIsSigned = checkDot4MulSignedness(
14090 TempNode->getOperand(AddIdx), *Src0, *Src1,
14091 TempNode->getOperand(AddIdx)->getOperand(0),
14092 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14093 if (!IterIsSigned)
14094 break;
14095 assert(IsSigned);
14096 if (*IterIsSigned != *IsSigned)
14097 break;
14098 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
14099 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
14100 ChainLength = I + 2;
14101 break;
14102 }
14103
14104 TempNode = TempNode->getOperand(AddIdx);
14105 Src2s.push_back(TempNode);
14106 ChainLength = I + 1;
14107 if (TempNode->getNumOperands() < 2)
14108 break;
14109 LHS = TempNode->getOperand(0);
14110 RHS = TempNode->getOperand(1);
14111 }
14112
14113 if (ChainLength < 2)
14114 return SDValue();
14115
14116 // Masks were constructed with the assumption that we would find a chain of
14117 // length 4. If not, then we need to zero out the unused high bytes (via a
14118 // perm mask byte of 0x0c) so they do not affect the dot calculation.
14119 if (ChainLength < 4) {
14120 fixMasks(Src0s, ChainLength);
14121 fixMasks(Src1s, ChainLength);
14122 }
14123
14124 SDValue Src0, Src1;
14125
14126 // If we are just using a single source for both, and have permuted the
14127 // bytes consistently, we can just use the sources without permuting
14128 // (commutation).
14129 bool UseOriginalSrc = false;
14130 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
14131 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14132 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14133 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
14134 SmallVector<unsigned, 4> SrcBytes;
14135 auto Src0Mask = Src0s.begin()->PermMask;
14136 SrcBytes.push_back(Src0Mask & 0xFF000000);
14137 bool UniqueEntries = true;
14138 for (auto I = 1; I < 4; I++) {
14139 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14140
14141 if (is_contained(SrcBytes, NextByte)) {
14142 UniqueEntries = false;
14143 break;
14144 }
14145 SrcBytes.push_back(NextByte);
14146 }
14147
14148 if (UniqueEntries) {
14149 UseOriginalSrc = true;
14150
14151 auto FirstElt = Src0s.begin();
14152 auto FirstEltOp =
14153 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14154
14155 auto SecondElt = Src1s.begin();
14156 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
14157 SecondElt->DWordOffset);
14158
14159 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
14160 MVT::getIntegerVT(32));
14161 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
14162 MVT::getIntegerVT(32));
14163 }
14164 }
14165
14166 if (!UseOriginalSrc) {
14167 Src0 = resolveSources(DAG, SL, Src0s, false, true);
14168 Src1 = resolveSources(DAG, SL, Src1s, false, true);
14169 }
14170
14171 assert(IsSigned);
14172 SDValue Src2 =
14173 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14174
14175 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
14176 : Intrinsic::amdgcn_udot4,
14177 SL, MVT::i64);
14178
14179 assert(!VT.isVector());
14180 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
14181 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
14182
14183 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
14184 }
14185
14186 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14187 return SDValue();
14188
14189 // add x, zext (setcc) => uaddo_carry x, 0, setcc
14190 // add x, sext (setcc) => usubo_carry x, 0, setcc
14191 unsigned Opc = LHS.getOpcode();
14192 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
14193 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
14194 std::swap(RHS, LHS);
14195
14196 Opc = RHS.getOpcode();
14197 switch (Opc) {
14198 default: break;
14199 case ISD::ZERO_EXTEND:
14200 case ISD::SIGN_EXTEND:
14201 case ISD::ANY_EXTEND: {
14202 auto Cond = RHS.getOperand(0);
14203 // If this won't be a real VOPC output, we would still need to insert an
14204 // extra instruction anyway.
14205 if (!isBoolSGPR(Cond))
14206 break;
14207 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14208 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
14209 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
14210 return DAG.getNode(Opc, SL, VTList, Args);
14211 }
14212 case ISD::UADDO_CARRY: {
14213 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
14214 if (!isNullConstant(RHS.getOperand(1)))
14215 break;
14216 SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
14217 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
14218 }
14219 }
14220 return SDValue();
14221}
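// Editorial note (illustrative, not part of the upstream source): the tail
// of the combine above turns e.g. (add i32 %x, (zext i1 %cc)), where %cc is
// an SGPR boolean, into (uaddo_carry %x, 0, %cc), so the boolean can be
// consumed directly as a carry-in instead of being materialized first.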
14222
14223SDValue SITargetLowering::performSubCombine(SDNode *N,
14224 DAGCombinerInfo &DCI) const {
14225 SelectionDAG &DAG = DCI.DAG;
14226 EVT VT = N->getValueType(0);
14227
14228 if (VT != MVT::i32)
14229 return SDValue();
14230
14231 SDLoc SL(N);
14232 SDValue LHS = N->getOperand(0);
14233 SDValue RHS = N->getOperand(1);
14234
14235 // sub x, zext (setcc) => usubo_carry x, 0, setcc
14236 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
14237 unsigned Opc = RHS.getOpcode();
14238 switch (Opc) {
14239 default: break;
14240 case ISD::ZERO_EXTEND:
14241 case ISD::SIGN_EXTEND:
14242 case ISD::ANY_EXTEND: {
14243 auto Cond = RHS.getOperand(0);
14244 // If this won't be a real VOPC output, we would still need to insert an
14245 // extra instruction anyway.
14246 if (!isBoolSGPR(Cond))
14247 break;
14248 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14249 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
14250 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
14251 return DAG.getNode(Opc, SL, VTList, Args);
14252 }
14253 }
14254
14255 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
14256 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
14257 if (!isNullConstant(LHS.getOperand(1)))
14258 return SDValue();
14259 SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
14260 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
14261 }
14262 return SDValue();
14263}
14264
14265SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
14266 DAGCombinerInfo &DCI) const {
14267
14268 if (N->getValueType(0) != MVT::i32)
14269 return SDValue();
14270
14271 if (!isNullConstant(N->getOperand(1)))
14272 return SDValue();
14273
14274 SelectionDAG &DAG = DCI.DAG;
14275 SDValue LHS = N->getOperand(0);
14276
14277 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
14278 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
14279 unsigned LHSOpc = LHS.getOpcode();
14280 unsigned Opc = N->getOpcode();
14281 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
14282 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
14283 SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
14284 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
14285 }
14286 return SDValue();
14287}
14288
14289SDValue SITargetLowering::performFAddCombine(SDNode *N,
14290 DAGCombinerInfo &DCI) const {
14291 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14292 return SDValue();
14293
14294 SelectionDAG &DAG = DCI.DAG;
14295 EVT VT = N->getValueType(0);
14296
14297 SDLoc SL(N);
14298 SDValue LHS = N->getOperand(0);
14299 SDValue RHS = N->getOperand(1);
14300
14301 // These should really be instruction patterns, but writing patterns with
14302 // source modifiers is a pain.
14303
14304 // fadd (fadd (a, a), b) -> mad 2.0, a, b
14305 if (LHS.getOpcode() == ISD::FADD) {
14306 SDValue A = LHS.getOperand(0);
14307 if (A == LHS.getOperand(1)) {
14308 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14309 if (FusedOp != 0) {
14310 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14311 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
14312 }
14313 }
14314 }
14315
14316 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
14317 if (RHS.getOpcode() == ISD::FADD) {
14318 SDValue A = RHS.getOperand(0);
14319 if (A == RHS.getOperand(1)) {
14320 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14321 if (FusedOp != 0) {
14322 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14323 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
14324 }
14325 }
14326 }
14327
14328 return SDValue();
14329}
14330
14331SDValue SITargetLowering::performFSubCombine(SDNode *N,
14332 DAGCombinerInfo &DCI) const {
14333 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14334 return SDValue();
14335
14336 SelectionDAG &DAG = DCI.DAG;
14337 SDLoc SL(N);
14338 EVT VT = N->getValueType(0);
14339 assert(!VT.isVector());
14340
14341 // Try to get the fneg to fold into the source modifier. This undoes generic
14342 // DAG combines and folds them into the mad.
14343 //
14344 // Only do this if we are not trying to support denormals. v_mad_f32 does
14345 // not support denormals ever.
14346 SDValue LHS = N->getOperand(0);
14347 SDValue RHS = N->getOperand(1);
14348 if (LHS.getOpcode() == ISD::FADD) {
14349 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
14350 SDValue A = LHS.getOperand(0);
14351 if (A == LHS.getOperand(1)) {
14352 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14353 if (FusedOp != 0){
14354 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14355 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
14356
14357 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
14358 }
14359 }
14360 }
14361
14362 if (RHS.getOpcode() == ISD::FADD) {
14363 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
14364
14365 SDValue A = RHS.getOperand(0);
14366 if (A == RHS.getOperand(1)) {
14367 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14368 if (FusedOp != 0){
14369 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
14370 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
14371 }
14372 }
14373 }
14374
14375 return SDValue();
14376}
14377
14378SDValue SITargetLowering::performFDivCombine(SDNode *N,
14379 DAGCombinerInfo &DCI) const {
14380 SelectionDAG &DAG = DCI.DAG;
14381 SDLoc SL(N);
14382 EVT VT = N->getValueType(0);
14383 if (VT != MVT::f16 || !Subtarget->has16BitInsts())
14384 return SDValue();
14385
14386 SDValue LHS = N->getOperand(0);
14387 SDValue RHS = N->getOperand(1);
14388
14389 SDNodeFlags Flags = N->getFlags();
14390 SDNodeFlags RHSFlags = RHS->getFlags();
14391 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
14392 !RHS->hasOneUse())
14393 return SDValue();
14394
14395 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
14396 bool IsNegative = false;
14397 if (CLHS->isExactlyValue(1.0) ||
14398 (IsNegative = CLHS->isExactlyValue(-1.0))) {
14399 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
14400 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
14401 if (RHS.getOpcode() == ISD::FSQRT) {
14402 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
14403 SDValue Rsq =
14404 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
14405 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
14406 }
14407 }
14408 }
14409
14410 return SDValue();
14411}
14412
14413SDValue SITargetLowering::performFMACombine(SDNode *N,
14414 DAGCombinerInfo &DCI) const {
14415 SelectionDAG &DAG = DCI.DAG;
14416 EVT VT = N->getValueType(0);
14417 SDLoc SL(N);
14418
14419 if (!Subtarget->hasDot7Insts() || VT != MVT::f32)
14420 return SDValue();
14421
14422 // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
14423 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
14424 SDValue Op1 = N->getOperand(0);
14425 SDValue Op2 = N->getOperand(1);
14426 SDValue FMA = N->getOperand(2);
14427
14428 if (FMA.getOpcode() != ISD::FMA ||
14429 Op1.getOpcode() != ISD::FP_EXTEND ||
14430 Op2.getOpcode() != ISD::FP_EXTEND)
14431 return SDValue();
14432
14433 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
14434 // regardless of the denorm mode setting. Therefore,
14435 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
14436 const TargetOptions &Options = DAG.getTarget().Options;
14437 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
14438 (N->getFlags().hasAllowContract() &&
14439 FMA->getFlags().hasAllowContract())) {
14440 Op1 = Op1.getOperand(0);
14441 Op2 = Op2.getOperand(0);
14442 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14443 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14444 return SDValue();
14445
14446 SDValue Vec1 = Op1.getOperand(0);
14447 SDValue Idx1 = Op1.getOperand(1);
14448 SDValue Vec2 = Op2.getOperand(0);
14449
14450 SDValue FMAOp1 = FMA.getOperand(0);
14451 SDValue FMAOp2 = FMA.getOperand(1);
14452 SDValue FMAAcc = FMA.getOperand(2);
14453
14454 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
14455 FMAOp2.getOpcode() != ISD::FP_EXTEND)
14456 return SDValue();
14457
14458 FMAOp1 = FMAOp1.getOperand(0);
14459 FMAOp2 = FMAOp2.getOperand(0);
14460 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14461 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14462 return SDValue();
14463
14464 SDValue Vec3 = FMAOp1.getOperand(0);
14465 SDValue Vec4 = FMAOp2.getOperand(0);
14466 SDValue Idx2 = FMAOp1.getOperand(1);
14467
14468 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
14469 // Idx1 and Idx2 cannot be the same.
14470 Idx1 == Idx2)
14471 return SDValue();
14472
14473 if (Vec1 == Vec2 || Vec3 == Vec4)
14474 return SDValue();
14475
14476 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
14477 return SDValue();
14478
14479 if ((Vec1 == Vec3 && Vec2 == Vec4) ||
14480 (Vec1 == Vec4 && Vec2 == Vec3)) {
14481 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
14482 DAG.getTargetConstant(0, SL, MVT::i1));
14483 }
14484 }
14485 return SDValue();
14486}
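// Editorial note (illustrative, not part of the upstream source): the
// pattern above matches code like
//   fma (fpext a.x), (fpext b.x), (fma (fpext a.y), (fpext b.y), z)
// where a and b are v2f16 vectors, and rewrites it to FDOT2 a, b, z,
// computing a.x*b.x + a.y*b.y + z in a single instruction.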
14487
14488SDValue SITargetLowering::performSetCCCombine(SDNode *N,
14489 DAGCombinerInfo &DCI) const {
14490 SelectionDAG &DAG = DCI.DAG;
14491 SDLoc SL(N);
14492
14493 SDValue LHS = N->getOperand(0);
14494 SDValue RHS = N->getOperand(1);
14495 EVT VT = LHS.getValueType();
14496 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
14497
14498 auto CRHS = dyn_cast<ConstantSDNode>(RHS);
14499 if (!CRHS) {
14500 CRHS = dyn_cast<ConstantSDNode>(LHS);
14501 if (CRHS) {
14502 std::swap(LHS, RHS);
14503 CC = getSetCCSwappedOperands(CC);
14504 }
14505 }
14506
14507 if (CRHS) {
14508 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
14509 isBoolSGPR(LHS.getOperand(0))) {
14510 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
14511 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
14512 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
14513 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
14514 if ((CRHS->isAllOnes() &&
14515 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
14516 (CRHS->isZero() &&
14517 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
14518 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14519 DAG.getConstant(-1, SL, MVT::i1));
14520 if ((CRHS->isAllOnes() &&
14521 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
14522 (CRHS->isZero() &&
14523 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
14524 return LHS.getOperand(0);
14525 }
14526
14527 const APInt &CRHSVal = CRHS->getAPIntValue();
14528 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
14529 LHS.getOpcode() == ISD::SELECT &&
14530 isa<ConstantSDNode>(LHS.getOperand(1)) &&
14531 isa<ConstantSDNode>(LHS.getOperand(2)) &&
14532 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
14533 isBoolSGPR(LHS.getOperand(0))) {
14534 // Given CT != FT:
14535 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
14536 // setcc (select cc, CT, CF), CF, ne => cc
14537 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
14538 // setcc (select cc, CT, CF), CT, eq => cc
14539 const APInt &CT = LHS.getConstantOperandAPInt(1);
14540 const APInt &CF = LHS.getConstantOperandAPInt(2);
14541
14542 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
14543 (CT == CRHSVal && CC == ISD::SETNE))
14544 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14545 DAG.getConstant(-1, SL, MVT::i1));
14546 if ((CF == CRHSVal && CC == ISD::SETNE) ||
14547 (CT == CRHSVal && CC == ISD::SETEQ))
14548 return LHS.getOperand(0);
14549 }
14550 }
14551
14552 if (VT != MVT::f32 && VT != MVT::f64 &&
14553 (!Subtarget->has16BitInsts() || VT != MVT::f16))
14554 return SDValue();
14555
14556 // Match isinf/isfinite pattern
14557 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
14558 // (fcmp one (fabs x), inf) -> (fp_class x,
14559 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
14560 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
14561 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
14562 if (!CRHS)
14563 return SDValue();
14564
14565 const APFloat &APF = CRHS->getValueAPF();
14566 if (APF.isInfinity() && !APF.isNegative()) {
14567 const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
14568 SIInstrFlags::N_INFINITY;
14569 const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
14570 SIInstrFlags::P_ZERO |
14571 SIInstrFlags::N_NORMAL |
14572 SIInstrFlags::P_NORMAL |
14573 SIInstrFlags::N_SUBNORMAL |
14574 SIInstrFlags::P_SUBNORMAL;
14575 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
14576 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
14577 DAG.getConstant(Mask, SL, MVT::i32));
14578 }
14579 }
14580
14581 return SDValue();
14582}
14583
14584SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
14585 DAGCombinerInfo &DCI) const {
14586 SelectionDAG &DAG = DCI.DAG;
14587 SDLoc SL(N);
14588 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
14589
14590 SDValue Src = N->getOperand(0);
14591 SDValue Shift = N->getOperand(0);
14592
14593 // TODO: Extend type shouldn't matter (assuming legal types).
14594 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
14595 Shift = Shift.getOperand(0);
14596
14597 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
14598 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
14599 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
14600 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
14601 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
14602 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
14603 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
14604 SDValue Shifted = DAG.getZExtOrTrunc(Shift.getOperand(0),
14605 SDLoc(Shift.getOperand(0)), MVT::i32);
14606
14607 unsigned ShiftOffset = 8 * Offset;
14608 if (Shift.getOpcode() == ISD::SHL)
14609 ShiftOffset -= C->getZExtValue();
14610 else
14611 ShiftOffset += C->getZExtValue();
14612
14613 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
14614 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
14615 MVT::f32, Shifted);
14616 }
14617 }
14618 }
14619
14620 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14621 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
14622 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
14623 // We simplified Src. If this node is not dead, visit it again so it is
14624 // folded properly.
14625 if (N->getOpcode() != ISD::DELETED_NODE)
14626 DCI.AddToWorklist(N);
14627 return SDValue(N, 0);
14628 }
14629
14630 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
14631 if (SDValue DemandedSrc =
14632 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
14633 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
14634
14635 return SDValue();
14636}
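// Editorial note (illustrative, not part of the upstream source): for
// cvt_f32_ubyte0 (srl x, 16), Offset = 0 and the right shift moves the
// wanted byte up by 16 bits, so ShiftOffset = 16 and the node is rewritten
// to cvt_f32_ubyte2 x, matching the table in the comment above.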
14637
14638SDValue SITargetLowering::performClampCombine(SDNode *N,
14639 DAGCombinerInfo &DCI) const {
14640 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
14641 if (!CSrc)
14642 return SDValue();
14643
14644 const MachineFunction &MF = DCI.DAG.getMachineFunction();
14645 const APFloat &F = CSrc->getValueAPF();
14646 APFloat Zero = APFloat::getZero(F.getSemantics());
14647 if (F < Zero ||
14648 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
14649 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
14650 }
14651
14652 APFloat One(F.getSemantics(), "1.0");
14653 if (F > One)
14654 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
14655
14656 return SDValue(CSrc, 0);
14657}
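// Editorial note (illustrative, not part of the upstream source): the
// constant folding above maps e.g. clamp(-0.5) -> 0.0, clamp(2.25) -> 1.0,
// clamp(0.75) -> 0.75, and clamp(NaN) -> 0.0 only when DX10Clamp is set;
// otherwise a NaN input is left for the clamp instruction itself to handle.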
14658
14659
14660 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
14661 DAGCombinerInfo &DCI) const {
14662 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
14663 return SDValue();
14664 switch (N->getOpcode()) {
14665 case ISD::ADD:
14666 return performAddCombine(N, DCI);
14667 case ISD::SUB:
14668 return performSubCombine(N, DCI);
14669 case ISD::UADDO_CARRY:
14670 case ISD::USUBO_CARRY:
14671 return performAddCarrySubCarryCombine(N, DCI);
14672 case ISD::FADD:
14673 return performFAddCombine(N, DCI);
14674 case ISD::FSUB:
14675 return performFSubCombine(N, DCI);
14676 case ISD::FDIV:
14677 return performFDivCombine(N, DCI);
14678 case ISD::SETCC:
14679 return performSetCCCombine(N, DCI);
14680 case ISD::FMAXNUM:
14681 case ISD::FMINNUM:
14682 case ISD::FMAXNUM_IEEE:
14683 case ISD::FMINNUM_IEEE:
14684 case ISD::FMAXIMUM:
14685 case ISD::FMINIMUM:
14686 case ISD::SMAX:
14687 case ISD::SMIN:
14688 case ISD::UMAX:
14689 case ISD::UMIN:
14690 case AMDGPUISD::FMIN_LEGACY:
14691 case AMDGPUISD::FMAX_LEGACY:
14692 return performMinMaxCombine(N, DCI);
14693 case ISD::FMA:
14694 return performFMACombine(N, DCI);
14695 case ISD::AND:
14696 return performAndCombine(N, DCI);
14697 case ISD::OR:
14698 return performOrCombine(N, DCI);
14699 case ISD::FSHR: {
14700 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14701 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
14702 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14703 return matchPERM(N, DCI);
14704 }
14705 break;
14706 }
14707 case ISD::XOR:
14708 return performXorCombine(N, DCI);
14709 case ISD::ZERO_EXTEND:
14710 return performZeroExtendCombine(N, DCI);
14711 case ISD::SIGN_EXTEND_INREG:
14712 return performSignExtendInRegCombine(N, DCI);
14713 case AMDGPUISD::FP_CLASS:
14714 return performClassCombine(N, DCI);
14715 case ISD::FCANONICALIZE:
14716 return performFCanonicalizeCombine(N, DCI);
14717 case AMDGPUISD::RCP:
14718 return performRcpCombine(N, DCI);
14719 case ISD::FLDEXP:
14720 case AMDGPUISD::FRACT:
14721 case AMDGPUISD::RSQ:
14722 case AMDGPUISD::RCP_LEGACY:
14723 case AMDGPUISD::RCP_IFLAG:
14724 case AMDGPUISD::RSQ_CLAMP: {
14725 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
14726 SDValue Src = N->getOperand(0);
14727 if (Src.isUndef())
14728 return Src;
14729 break;
14730 }
14731 case ISD::SINT_TO_FP:
14732 case ISD::UINT_TO_FP:
14733 return performUCharToFloatCombine(N, DCI);
14734 case ISD::FCOPYSIGN:
14735 return performFCopySignCombine(N, DCI);
14736 case AMDGPUISD::CVT_F32_UBYTE0:
14737 case AMDGPUISD::CVT_F32_UBYTE1:
14738 case AMDGPUISD::CVT_F32_UBYTE2:
14739 case AMDGPUISD::CVT_F32_UBYTE3:
14740 return performCvtF32UByteNCombine(N, DCI);
14741 case AMDGPUISD::FMED3:
14742 return performFMed3Combine(N, DCI);
14743 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14744 return performCvtPkRTZCombine(N, DCI);
14745 case AMDGPUISD::CLAMP:
14746 return performClampCombine(N, DCI);
14747 case ISD::SCALAR_TO_VECTOR: {
14748 SelectionDAG &DAG = DCI.DAG;
14749 EVT VT = N->getValueType(0);
14750
14751 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
14752 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
14753 SDLoc SL(N);
14754 SDValue Src = N->getOperand(0);
14755 EVT EltVT = Src.getValueType();
14756 if (EltVT != MVT::i16)
14757 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
14758
14759 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
14760 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
14761 }
14762
14763 break;
14764 }
14765 case ISD::EXTRACT_VECTOR_ELT:
14766 return performExtractVectorEltCombine(N, DCI);
14767 case ISD::INSERT_VECTOR_ELT:
14768 return performInsertVectorEltCombine(N, DCI);
14769 case ISD::FP_ROUND:
14770 return performFPRoundCombine(N, DCI);
14771 case ISD::LOAD: {
14772 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
14773 return Widened;
14774 [[fallthrough]];
14775 }
14776 default: {
14777 if (!DCI.isBeforeLegalize()) {
14778 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
14779 return performMemSDNodeCombine(MemNode, DCI);
14780 }
14781
14782 break;
14783 }
14784 }
14785
14786 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
14787 }
14788
14789/// Helper function for adjustWritemask
14790static unsigned SubIdx2Lane(unsigned Idx) {
14791 switch (Idx) {
14792 default: return ~0u;
14793 case AMDGPU::sub0: return 0;
14794 case AMDGPU::sub1: return 1;
14795 case AMDGPU::sub2: return 2;
14796 case AMDGPU::sub3: return 3;
14797 case AMDGPU::sub4: return 4; // Possible with TFE/LWE
14798 }
14799}
14800
14801/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
14802SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
14803 SelectionDAG &DAG) const {
14804 unsigned Opcode = Node->getMachineOpcode();
14805
14806 // Subtract 1 because the vdata output is not a MachineSDNode operand.
14807 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
14808 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
14809 return Node; // not implemented for D16
14810
14811 SDNode *Users[5] = { nullptr };
14812 unsigned Lane = 0;
14813 unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
14814 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
14815 unsigned NewDmask = 0;
14816 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
14817 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
14818 bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
14819 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
14820 ? true
14821 : false;
14822 unsigned TFCLane = 0;
14823 bool HasChain = Node->getNumValues() > 1;
14824
14825 if (OldDmask == 0) {
14826 // These are folded out, but in case it happens, don't assert.
14827 return Node;
14828 }
14829
14830 unsigned OldBitsSet = llvm::popcount(OldDmask);
14831 // Work out which is the TFE/LWE lane if that is enabled.
14832 if (UsesTFC) {
14833 TFCLane = OldBitsSet;
14834 }
14835
14836 // Try to figure out the used register components
14837 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
14838 I != E; ++I) {
14839
14840 // Don't look at users of the chain.
14841 if (I.getUse().getResNo() != 0)
14842 continue;
14843
14844 // Abort if we can't understand the usage
14845 if (!I->isMachineOpcode() ||
14846 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
14847 return Node;
14848
14849 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
14850 // Note that subregs are packed, i.e. Lane==0 is the first bit set
14851 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
14852 // set, etc.
14853 Lane = SubIdx2Lane(I->getConstantOperandVal(1));
14854 if (Lane == ~0u)
14855 return Node;
14856
14857 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
14858 if (UsesTFC && Lane == TFCLane) {
14859 Users[Lane] = *I;
14860 } else {
14861 // Set which texture component corresponds to the lane.
14862 unsigned Comp;
14863 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
14864 Comp = llvm::countr_zero(Dmask);
14865 Dmask &= ~(1 << Comp);
14866 }
14867
14868 // Abort if we have more than one user per component.
14869 if (Users[Lane])
14870 return Node;
14871
14872 Users[Lane] = *I;
14873 NewDmask |= 1 << Comp;
14874 }
14875 }
14876
14877 // Don't allow 0 dmask, as hardware assumes one channel enabled.
14878 bool NoChannels = !NewDmask;
14879 if (NoChannels) {
14880 if (!UsesTFC) {
14881 // No uses of the result and not using TFC. Then do nothing.
14882 return Node;
14883 }
14884 // If the original dmask has one channel - then nothing to do
14885 if (OldBitsSet == 1)
14886 return Node;
14887 // Use an arbitrary dmask - required for the instruction to work
14888 NewDmask = 1;
14889 }
14890 // Abort if there's no change
14891 if (NewDmask == OldDmask)
14892 return Node;
14893
14894 unsigned BitsSet = llvm::popcount(NewDmask);
14895
14896 // Check for TFE or LWE - increase the number of channels by one to account
14897 // for the extra return value
14898 // This will need adjustment for D16 if this is also included in
14899 // adjustWriteMask (this function), but at present D16 is excluded.
14900 unsigned NewChannels = BitsSet + UsesTFC;
14901
14902 int NewOpcode =
14903 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
14904 assert(NewOpcode != -1 &&
14905 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
14906 "failed to find equivalent MIMG op");
14907
14908 // Adjust the writemask in the node
14909 SmallVector<SDValue, 12> Ops;
14910 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
14911 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
14912 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
14913
14914 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
14915
14916 MVT ResultVT = NewChannels == 1 ?
14917 SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
14918 NewChannels == 5 ? 8 : NewChannels);
14919 SDVTList NewVTList = HasChain ?
14920 DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
14921
14922
14923 MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
14924 NewVTList, Ops);
14925
14926 if (HasChain) {
14927 // Update chain.
14928 DAG.setNodeMemRefs(NewNode, Node->memoperands());
14929 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
14930 }
14931
14932 if (NewChannels == 1) {
14933 assert(Node->hasNUsesOfValue(1, 0));
14934 SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
14935 SDLoc(Node), Users[Lane]->getValueType(0),
14936 SDValue(NewNode, 0));
14937 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
14938 return nullptr;
14939 }
14940
14941 // Update the users of the node with the new indices
14942 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
14943 SDNode *User = Users[i];
14944 if (!User) {
14945 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
14946 // Users[0] is still nullptr because channel 0 doesn't really have a use.
14947 if (i || !NoChannels)
14948 continue;
14949 } else {
14950 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
14951 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
14952 if (NewUser != User) {
14953 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
14954 DAG.RemoveDeadNode(User);
14955 }
14956 }
14957
14958 switch (Idx) {
14959 default: break;
14960 case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
14961 case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
14962 case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
14963 case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
14964 }
14965 }
14966
14967 DAG.RemoveDeadNode(Node);
14968 return nullptr;
14969}
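// Editorial note (illustrative, not part of the upstream source): if an
// image sample with dmask = 0xf only has EXTRACT_SUBREG users for sub0 and
// sub2 (components x and z), adjustWritemask shrinks the dmask to 0x5,
// switches to the two-channel variant of the opcode, and remaps the users
// so sub0 and sub1 of the new result feed the old x and z consumers.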
14970
14971 static bool isFrameIndexOp(SDValue Op) {
14972 if (Op.getOpcode() == ISD::AssertZext)
14973 Op = Op.getOperand(0);
14974
14975 return isa<FrameIndexSDNode>(Op);
14976}
14977
14978/// Legalize target independent instructions (e.g. INSERT_SUBREG)
14979/// with frame index operands.
14980 /// LLVM assumes that inputs to these instructions are registers.
14981 SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
14982 SelectionDAG &DAG) const {
14983 if (Node->getOpcode() == ISD::CopyToReg) {
14984 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
14985 SDValue SrcVal = Node->getOperand(2);
14986
14987 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
14988 // to try understanding copies to physical registers.
14989 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
14990 SDLoc SL(Node);
14991 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
14992 SDValue VReg = DAG.getRegister(
14993 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
14994
14995 SDNode *Glued = Node->getGluedNode();
14996 SDValue ToVReg
14997 = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
14998 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
14999 SDValue ToResultReg
15000 = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
15001 VReg, ToVReg.getValue(1));
15002 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
15003 DAG.RemoveDeadNode(Node);
15004 return ToResultReg.getNode();
15005 }
15006 }
15007
15008 SmallVector<SDValue, 8> Ops;
15009 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
15010 if (!isFrameIndexOp(Node->getOperand(i))) {
15011 Ops.push_back(Node->getOperand(i));
15012 continue;
15013 }
15014
15015 SDLoc DL(Node);
15016 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
15017 Node->getOperand(i).getValueType(),
15018 Node->getOperand(i)), 0));
15019 }
15020
15021 return DAG.UpdateNodeOperands(Node, Ops);
15022}
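// Illustrative sketch (assumption, not part of SIISelLowering.cpp): the
// legalization above boils down to wrapping each frame-index operand in an
// S_MOV_B32 so that register-only nodes such as REG_SEQUENCE and
// INSERT_SUBREG only ever see register operands. Hypothetical helper for a
// single operand:
static SDValue exampleWrapFrameIndexOperand(SelectionDAG &DAG, SDValue Op) {
  if (!isFrameIndexOp(Op))
    return Op; // Already register-like; nothing to do.
  SDLoc DL(Op);
  // Materialize the frame index into an SGPR, as the loop above does.
  return SDValue(
      DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, Op.getValueType(), Op), 0);
}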
15023
15024/// Fold the instructions after selecting them.
15025/// Returns null if users were already updated.
15026SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
 15027 SelectionDAG &DAG) const {
 15028 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
 15029 unsigned Opcode = Node->getMachineOpcode();
15030
15031 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
15032 !TII->isGather4(Opcode) &&
15033 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
15034 return adjustWritemask(Node, DAG);
15035 }
15036
15037 if (Opcode == AMDGPU::INSERT_SUBREG ||
15038 Opcode == AMDGPU::REG_SEQUENCE) {
 15039 legalizeTargetIndependentNode(Node, DAG);
 15040 return Node;
15041 }
15042
15043 switch (Opcode) {
15044 case AMDGPU::V_DIV_SCALE_F32_e64:
15045 case AMDGPU::V_DIV_SCALE_F64_e64: {
15046 // Satisfy the operand register constraint when one of the inputs is
15047 // undefined. Ordinarily each undef value will have its own implicit_def of
15048 // a vreg, so force these to use a single register.
15049 SDValue Src0 = Node->getOperand(1);
15050 SDValue Src1 = Node->getOperand(3);
15051 SDValue Src2 = Node->getOperand(5);
15052
15053 if ((Src0.isMachineOpcode() &&
15054 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
15055 (Src0 == Src1 || Src0 == Src2))
15056 break;
15057
15058 MVT VT = Src0.getValueType().getSimpleVT();
15059 const TargetRegisterClass *RC =
15060 getRegClassFor(VT, Src0.getNode()->isDivergent());
15061
15063 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
15064
15065 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
15066 UndefReg, Src0, SDValue());
15067
15068 // src0 must be the same register as src1 or src2, even if the value is
15069 // undefined, so make sure we don't violate this constraint.
15070 if (Src0.isMachineOpcode() &&
15071 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
15072 if (Src1.isMachineOpcode() &&
15073 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15074 Src0 = Src1;
15075 else if (Src2.isMachineOpcode() &&
15076 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15077 Src0 = Src2;
15078 else {
15079 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
15080 Src0 = UndefReg;
15081 Src1 = UndefReg;
15082 }
15083 } else
15084 break;
15085
15086 SmallVector<SDValue, 9> Ops(Node->op_begin(), Node->op_end());
15087 Ops[1] = Src0;
15088 Ops[3] = Src1;
15089 Ops[5] = Src2;
15090 Ops.push_back(ImpDef.getValue(1));
15091 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
15092 }
15093 default:
15094 break;
15095 }
15096
15097 return Node;
15098}
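// Worked example (illustrative): for V_DIV_SCALE_F32 with src0 undefined,
// src1 = %a and src2 = %b, the folding above rewrites the operands to
// (src0, src1, src2) = (%a, %a, %b), so the hardware constraint that src0
// match src1 or src2 holds without materializing a fresh IMPLICIT_DEF.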
15099
15100// Any MIMG instructions that use tfe or lwe require an initialization of the
15101// result register that will be written in the case of a memory access failure.
15102// The required code is also added to tie this init code to the result of the
15103// image instruction.
15106 const SIRegisterInfo &TRI = TII->getRegisterInfo();
15107 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
15108 MachineBasicBlock &MBB = *MI.getParent();
15109
15110 int DstIdx =
15111 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
15112 unsigned InitIdx = 0;
15113
15114 if (TII->isImage(MI)) {
15115 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
15116 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
15117 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
15118
15119 if (!TFE && !LWE) // intersect_ray
15120 return;
15121
15122 unsigned TFEVal = TFE ? TFE->getImm() : 0;
15123 unsigned LWEVal = LWE ? LWE->getImm() : 0;
15124 unsigned D16Val = D16 ? D16->getImm() : 0;
15125
15126 if (!TFEVal && !LWEVal)
15127 return;
15128
 15129 // At least one of TFE or LWE is non-zero.
15130 // We have to insert a suitable initialization of the result value and
15131 // tie this to the dest of the image instruction.
15132
15133 // Calculate which dword we have to initialize to 0.
15134 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
15135
 15136 // Check that the dmask operand is present.
15137 assert(MO_Dmask && "Expected dmask operand in instruction");
15138
15139 unsigned dmask = MO_Dmask->getImm();
15140 // Determine the number of active lanes taking into account the
15141 // Gather4 special case
15142 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
15143
15144 bool Packed = !Subtarget->hasUnpackedD16VMem();
15145
15146 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
15147
 15148 // Abandon the attempt if the dst size isn't large enough.
 15149 // This is in fact an error, but it is picked up elsewhere and
 15150 // reported correctly.
15151 uint32_t DstSize =
15152 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15153 if (DstSize < InitIdx)
15154 return;
15155 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
15156 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15157 } else {
15158 return;
15159 }
15160
15161 const DebugLoc &DL = MI.getDebugLoc();
15162
15163 // Create a register for the initialization value.
15164 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
15165 unsigned NewDst = 0; // Final initialized value will be in here
15166
15167 // If PRTStrictNull feature is enabled (the default) then initialize
15168 // all the result registers to 0, otherwise just the error indication
15169 // register (VGPRn+1)
15170 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
15171 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
15172
15173 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
15174 for (; SizeLeft; SizeLeft--, CurrIdx++) {
15175 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
15176 // Initialize dword
15177 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
15178 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
15179 .addImm(0);
15180 // Insert into the super-reg
15181 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
15182 .addReg(PrevDst)
15183 .addReg(SubReg)
 15184 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
 15185
15186 PrevDst = NewDst;
15187 }
15188
15189 // Add as an implicit operand
15190 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
15191
15192 // Tie the just added implicit operand to the dst
15193 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
15194}
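// Illustrative sketch (assumption, not part of SIISelLowering.cpp): the
// number of result dwords that get zero-initialized for a TFE/LWE load,
// mirroring the InitIdx computation above. E.g. dmask = 0b0111 gives
// ActiveLanes = 3, so packed D16 data needs ((3 + 1) >> 1) + 1 = 3 dwords
// and unpacked data needs 3 + 1 = 4 (data dwords plus the status dword).
static unsigned exampleTfeInitDwords(unsigned ActiveLanes, bool PackedD16) {
  return PackedD16 ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
}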
15195
15196/// Assign the register class depending on the number of
15197/// bits set in the writemask
15198void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
 15199 SDNode *Node) const {
 15200 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
 15201
 15202 MachineFunction *MF = MI.getParent()->getParent();
 15203 MachineRegisterInfo &MRI = MF->getRegInfo();
 15204 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
 15205
15206 if (TII->isVOP3(MI.getOpcode())) {
15207 // Make sure constant bus requirements are respected.
15208 TII->legalizeOperandsVOP3(MRI, MI);
15209
15210 // Prefer VGPRs over AGPRs in mAI instructions where possible.
 15211 // This saves a chain-copy of registers and better balances register
15212 // use between vgpr and agpr as agpr tuples tend to be big.
15213 if (!MI.getDesc().operands().empty()) {
15214 unsigned Opc = MI.getOpcode();
15215 bool HasAGPRs = Info->mayNeedAGPRs();
15216 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15217 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
15218 for (auto I :
15219 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
15220 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
15221 if (I == -1)
15222 break;
15223 if ((I == Src2Idx) && (HasAGPRs))
15224 break;
15225 MachineOperand &Op = MI.getOperand(I);
15226 if (!Op.isReg() || !Op.getReg().isVirtual())
15227 continue;
15228 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
15229 if (!TRI->hasAGPRs(RC))
15230 continue;
15231 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
15232 if (!Src || !Src->isCopy() ||
15233 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
15234 continue;
15235 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
15236 // All uses of agpr64 and agpr32 can also accept vgpr except for
15237 // v_accvgpr_read, but we do not produce agpr reads during selection,
15238 // so no use checks are needed.
15239 MRI.setRegClass(Op.getReg(), NewRC);
15240 }
15241
15242 if (!HasAGPRs)
15243 return;
15244
15245 // Resolve the rest of AV operands to AGPRs.
15246 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
15247 if (Src2->isReg() && Src2->getReg().isVirtual()) {
15248 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
15249 if (TRI->isVectorSuperClass(RC)) {
15250 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
15251 MRI.setRegClass(Src2->getReg(), NewRC);
15252 if (Src2->isTied())
15253 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
15254 }
15255 }
15256 }
15257 }
15258
15259 return;
15260 }
15261
15262 if (TII->isImage(MI))
15263 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
15264}
15265
15267 uint64_t Val) {
15268 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
15269 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
15270}
15271
15273 const SDLoc &DL,
15274 SDValue Ptr) const {
15276
 15277 // Build the constant half of the descriptor as its own sub-register before
 15278 // building the full 128-bit register. If we are building multiple resource
 15279 // descriptors, this allows the 2-component constant register to be CSE'd.
15280 const SDValue Ops0[] = {
15281 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
15282 buildSMovImm32(DAG, DL, 0),
15283 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15284 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
15285 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
15286 };
15287
15288 SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
15289 MVT::v2i32, Ops0), 0);
15290
15291 // Combine the constants and the pointer.
15292 const SDValue Ops1[] = {
15293 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15294 Ptr,
15295 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
15296 SubRegHi,
15297 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
15298 };
15299
15300 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
15301}
15302
15303/// Return a resource descriptor with the 'Add TID' bit enabled
15304/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
15305/// of the resource descriptor) to create an offset, which is added to
15306/// the resource pointer.
15308 SDValue Ptr, uint32_t RsrcDword1,
15309 uint64_t RsrcDword2And3) const {
15310 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
15311 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
15312 if (RsrcDword1) {
15313 PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
15314 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
15315 0);
15316 }
15317
15318 SDValue DataLo = buildSMovImm32(DAG, DL,
15319 RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
15320 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
15321
15322 const SDValue Ops[] = {
15323 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15324 PtrLo,
15325 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15326 PtrHi,
15327 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
15328 DataLo,
15329 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
15330 DataHi,
15331 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
15332 };
15333
15334 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
15335}
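// Worked example (illustrative, derived from the comment above): with the
// Add-TID bit set and a stride of 4 in bits [61:48] of the descriptor, lane N
// of the wave accesses resource_pointer + 4 * N (plus the normal buffer
// offsets), giving each lane its own 4-byte slot without explicit per-lane
// address arithmetic.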
15336
15337//===----------------------------------------------------------------------===//
15338// SI Inline Assembly Support
15339//===----------------------------------------------------------------------===//
15340
15341std::pair<unsigned, const TargetRegisterClass *>
15343 StringRef Constraint,
15344 MVT VT) const {
15345 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
15346
15347 const TargetRegisterClass *RC = nullptr;
15348 if (Constraint.size() == 1) {
15349 const unsigned BitWidth = VT.getSizeInBits();
15350 switch (Constraint[0]) {
15351 default:
15352 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15353 case 's':
15354 case 'r':
15355 switch (BitWidth) {
15356 case 16:
15357 RC = &AMDGPU::SReg_32RegClass;
15358 break;
15359 case 64:
15360 RC = &AMDGPU::SGPR_64RegClass;
15361 break;
15362 default:
 15363 RC = TRI->getSGPRClassForBitWidth(BitWidth);
 15364 if (!RC)
15365 return std::pair(0U, nullptr);
15366 break;
15367 }
15368 break;
15369 case 'v':
15370 switch (BitWidth) {
15371 case 16:
15372 RC = &AMDGPU::VGPR_32RegClass;
15373 break;
15374 default:
15375 RC = TRI->getVGPRClassForBitWidth(BitWidth);
15376 if (!RC)
15377 return std::pair(0U, nullptr);
15378 break;
15379 }
15380 break;
15381 case 'a':
15382 if (!Subtarget->hasMAIInsts())
15383 break;
15384 switch (BitWidth) {
15385 case 16:
15386 RC = &AMDGPU::AGPR_32RegClass;
15387 break;
15388 default:
15389 RC = TRI->getAGPRClassForBitWidth(BitWidth);
15390 if (!RC)
15391 return std::pair(0U, nullptr);
15392 break;
15393 }
15394 break;
15395 }
15396 // We actually support i128, i16 and f16 as inline parameters
15397 // even if they are not reported as legal
15398 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
15399 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
15400 return std::pair(0U, RC);
15401 }
15402
15403 if (Constraint.starts_with("{") && Constraint.ends_with("}")) {
15404 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
15405 if (RegName.consume_front("v")) {
15406 RC = &AMDGPU::VGPR_32RegClass;
15407 } else if (RegName.consume_front("s")) {
15408 RC = &AMDGPU::SGPR_32RegClass;
15409 } else if (RegName.consume_front("a")) {
15410 RC = &AMDGPU::AGPR_32RegClass;
15411 }
15412
15413 if (RC) {
15414 uint32_t Idx;
15415 if (RegName.consume_front("[")) {
15416 uint32_t End;
15417 bool Failed = RegName.consumeInteger(10, Idx);
15418 Failed |= !RegName.consume_front(":");
15419 Failed |= RegName.consumeInteger(10, End);
15420 Failed |= !RegName.consume_back("]");
15421 if (!Failed) {
15422 uint32_t Width = (End - Idx + 1) * 32;
15423 MCRegister Reg = RC->getRegister(Idx);
 15424 if (SIRegisterInfo::isVGPRClass(RC))
 15425 RC = TRI->getVGPRClassForBitWidth(Width);
15426 else if (SIRegisterInfo::isSGPRClass(RC))
15427 RC = TRI->getSGPRClassForBitWidth(Width);
15428 else if (SIRegisterInfo::isAGPRClass(RC))
15429 RC = TRI->getAGPRClassForBitWidth(Width);
15430 if (RC) {
15431 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15432 return std::pair(Reg, RC);
15433 }
15434 }
15435 } else {
15436 bool Failed = RegName.getAsInteger(10, Idx);
15437 if (!Failed && Idx < RC->getNumRegs())
15438 return std::pair(RC->getRegister(Idx), RC);
15439 }
15440 }
15441 }
15442
15443 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15444 if (Ret.first)
15445 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
15446
15447 return Ret;
15448}
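// Worked example (illustrative, not part of SIISelLowering.cpp): a register
// range constraint such as "{v[8:11]}" parses to Idx = 8 and End = 11 above,
// so the requested width is (11 - 8 + 1) * 32 = 128 bits; the returned class
// is the 128-bit VGPR class and the returned register is the super-register
// whose sub0 is v8, i.e. the v[8:11] tuple.
static unsigned exampleConstraintRangeWidthInBits(unsigned Idx, unsigned End) {
  return (End - Idx + 1) * 32; // "{v[8:11]}" -> 128
}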
15449
15450static bool isImmConstraint(StringRef Constraint) {
15451 if (Constraint.size() == 1) {
15452 switch (Constraint[0]) {
15453 default: break;
15454 case 'I':
15455 case 'J':
15456 case 'A':
15457 case 'B':
15458 case 'C':
15459 return true;
15460 }
15461 } else if (Constraint == "DA" ||
15462 Constraint == "DB") {
15463 return true;
15464 }
15465 return false;
15466}
15467
15470 if (Constraint.size() == 1) {
15471 switch (Constraint[0]) {
15472 default: break;
15473 case 's':
15474 case 'v':
15475 case 'a':
15476 return C_RegisterClass;
15477 }
15478 }
15479 if (isImmConstraint(Constraint)) {
15480 return C_Other;
15481 }
15482 return TargetLowering::getConstraintType(Constraint);
15483}
15484
15485static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
15487 Val = Val & maskTrailingOnes<uint64_t>(Size);
15488 }
15489 return Val;
15490}
15491
15493 StringRef Constraint,
15494 std::vector<SDValue> &Ops,
15495 SelectionDAG &DAG) const {
15496 if (isImmConstraint(Constraint)) {
15497 uint64_t Val;
15498 if (getAsmOperandConstVal(Op, Val) &&
15499 checkAsmConstraintVal(Op, Constraint, Val)) {
15500 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
15501 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
15502 }
15503 } else {
15504 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
15505 }
15506}
15507
15509 unsigned Size = Op.getScalarValueSizeInBits();
15510 if (Size > 64)
15511 return false;
15512
15513 if (Size == 16 && !Subtarget->has16BitInsts())
15514 return false;
15515
15516 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
15517 Val = C->getSExtValue();
15518 return true;
15519 }
15520 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
15521 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15522 return true;
15523 }
15524 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
15525 if (Size != 16 || Op.getNumOperands() != 2)
15526 return false;
15527 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
15528 return false;
15529 if (ConstantSDNode *C = V->getConstantSplatNode()) {
15530 Val = C->getSExtValue();
15531 return true;
15532 }
15533 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
15534 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15535 return true;
15536 }
15537 }
15538
15539 return false;
15540}
15541
15543 uint64_t Val) const {
15544 if (Constraint.size() == 1) {
15545 switch (Constraint[0]) {
15546 case 'I':
 15547 return AMDGPU::isInlinableIntLiteral(Val);
 15548 case 'J':
15549 return isInt<16>(Val);
15550 case 'A':
15551 return checkAsmConstraintValA(Op, Val);
15552 case 'B':
15553 return isInt<32>(Val);
15554 case 'C':
15555 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
 15556 isInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits()));
 15557 default:
15558 break;
15559 }
15560 } else if (Constraint.size() == 2) {
15561 if (Constraint == "DA") {
15562 int64_t HiBits = static_cast<int32_t>(Val >> 32);
15563 int64_t LoBits = static_cast<int32_t>(Val);
15564 return checkAsmConstraintValA(Op, HiBits, 32) &&
15565 checkAsmConstraintValA(Op, LoBits, 32);
15566 }
15567 if (Constraint == "DB") {
15568 return true;
15569 }
15570 }
15571 llvm_unreachable("Invalid asm constraint");
15572}
15573
15575 unsigned MaxSize) const {
15576 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
15577 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
15578 if (Size == 16) {
15579 MVT VT = Op.getSimpleValueType();
15580 switch (VT.SimpleTy) {
15581 default:
15582 return false;
15583 case MVT::i16:
15584 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
15585 case MVT::f16:
15586 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
15587 case MVT::bf16:
15588 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
15589 case MVT::v2i16:
15590 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
15591 case MVT::v2f16:
15592 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
15593 case MVT::v2bf16:
15594 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
15595 }
15596 }
15597 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
15598 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
15599 return true;
15600 return false;
15601}
15602
15603static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
15604 switch (UnalignedClassID) {
15605 case AMDGPU::VReg_64RegClassID:
15606 return AMDGPU::VReg_64_Align2RegClassID;
15607 case AMDGPU::VReg_96RegClassID:
15608 return AMDGPU::VReg_96_Align2RegClassID;
15609 case AMDGPU::VReg_128RegClassID:
15610 return AMDGPU::VReg_128_Align2RegClassID;
15611 case AMDGPU::VReg_160RegClassID:
15612 return AMDGPU::VReg_160_Align2RegClassID;
15613 case AMDGPU::VReg_192RegClassID:
15614 return AMDGPU::VReg_192_Align2RegClassID;
15615 case AMDGPU::VReg_224RegClassID:
15616 return AMDGPU::VReg_224_Align2RegClassID;
15617 case AMDGPU::VReg_256RegClassID:
15618 return AMDGPU::VReg_256_Align2RegClassID;
15619 case AMDGPU::VReg_288RegClassID:
15620 return AMDGPU::VReg_288_Align2RegClassID;
15621 case AMDGPU::VReg_320RegClassID:
15622 return AMDGPU::VReg_320_Align2RegClassID;
15623 case AMDGPU::VReg_352RegClassID:
15624 return AMDGPU::VReg_352_Align2RegClassID;
15625 case AMDGPU::VReg_384RegClassID:
15626 return AMDGPU::VReg_384_Align2RegClassID;
15627 case AMDGPU::VReg_512RegClassID:
15628 return AMDGPU::VReg_512_Align2RegClassID;
15629 case AMDGPU::VReg_1024RegClassID:
15630 return AMDGPU::VReg_1024_Align2RegClassID;
15631 case AMDGPU::AReg_64RegClassID:
15632 return AMDGPU::AReg_64_Align2RegClassID;
15633 case AMDGPU::AReg_96RegClassID:
15634 return AMDGPU::AReg_96_Align2RegClassID;
15635 case AMDGPU::AReg_128RegClassID:
15636 return AMDGPU::AReg_128_Align2RegClassID;
15637 case AMDGPU::AReg_160RegClassID:
15638 return AMDGPU::AReg_160_Align2RegClassID;
15639 case AMDGPU::AReg_192RegClassID:
15640 return AMDGPU::AReg_192_Align2RegClassID;
15641 case AMDGPU::AReg_256RegClassID:
15642 return AMDGPU::AReg_256_Align2RegClassID;
15643 case AMDGPU::AReg_512RegClassID:
15644 return AMDGPU::AReg_512_Align2RegClassID;
15645 case AMDGPU::AReg_1024RegClassID:
15646 return AMDGPU::AReg_1024_Align2RegClassID;
15647 default:
15648 return -1;
15649 }
15650}
15651
15652// Figure out which registers should be reserved for stack access. Only after
15653// the function is legalized do we know all of the non-spill stack objects or if
15654// calls are present.
15658 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
15659 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15660 const SIInstrInfo *TII = ST.getInstrInfo();
15661
15662 if (Info->isEntryFunction()) {
15663 // Callable functions have fixed registers used for stack access.
15665 }
15666
15667 // TODO: Move this logic to getReservedRegs()
15668 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
15669 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
15670 Register SReg = ST.isWave32()
15671 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
15672 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
15673 &AMDGPU::SGPR_64RegClass);
15674 Info->setSGPRForEXECCopy(SReg);
15675
15676 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
15677 Info->getStackPtrOffsetReg()));
15678 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
15679 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
15680
15681 // We need to worry about replacing the default register with itself in case
15682 // of MIR testcases missing the MFI.
15683 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
15684 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
15685
15686 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
15687 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
15688
15689 Info->limitOccupancy(MF);
15690
15691 if (ST.isWave32() && !MF.empty()) {
15692 for (auto &MBB : MF) {
15693 for (auto &MI : MBB) {
15694 TII->fixImplicitOperands(MI);
15695 }
15696 }
15697 }
15698
15699 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
15700 // classes if required. Ideally the register class constraints would differ
15701 // per-subtarget, but there's no easy way to achieve that right now. This is
15702 // not a problem for VGPRs because the correctly aligned VGPR class is implied
15703 // from using them as the register class for legal types.
15704 if (ST.needsAlignedVGPRs()) {
15705 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
15706 const Register Reg = Register::index2VirtReg(I);
15707 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
15708 if (!RC)
15709 continue;
15710 int NewClassID = getAlignedAGPRClassID(RC->getID());
15711 if (NewClassID != -1)
15712 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
15713 }
15714 }
15715
15717}
15718
15719void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
 15720 KnownBits &Known,
15721 const APInt &DemandedElts,
15722 const SelectionDAG &DAG,
15723 unsigned Depth) const {
15724 Known.resetAll();
15725 unsigned Opc = Op.getOpcode();
15726 switch (Opc) {
 15727 case ISD::INTRINSIC_WO_CHAIN: {
 15728 unsigned IID = Op.getConstantOperandVal(0);
15729 switch (IID) {
15730 case Intrinsic::amdgcn_mbcnt_lo:
15731 case Intrinsic::amdgcn_mbcnt_hi: {
15732 const GCNSubtarget &ST =
15734 // These return at most the (wavefront size - 1) + src1
15735 // As long as src1 is an immediate we can calc known bits
15736 KnownBits Src1Known = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
15737 unsigned Src1ValBits = Src1Known.countMaxActiveBits();
15738 unsigned MaxActiveBits = std::max(Src1ValBits, ST.getWavefrontSizeLog2());
15739 // Cater for potential carry
15740 MaxActiveBits += Src1ValBits ? 1 : 0;
15741 unsigned Size = Op.getValueType().getSizeInBits();
15742 if (MaxActiveBits < Size)
15743 Known.Zero.setHighBits(Size - MaxActiveBits);
15744 return;
15745 }
15746 }
15747 break;
15748 }
15749 }
 15750 AMDGPUTargetLowering::computeKnownBitsForTargetNode(
 15751 Op, Known, DemandedElts, DAG, Depth);
15752}
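// Illustrative sketch (assumption, not part of SIISelLowering.cpp): the mbcnt
// known-bits reasoning above. On a wave64 target getWavefrontSizeLog2() is 6,
// so with a src1 whose value fits in 4 bits the result fits in
// max(4, 6) + 1 = 7 bits and the upper 32 - 7 = 25 bits are known zero.
static unsigned exampleMbcntKnownHighZeroBits(unsigned Src1ActiveBits,
                                              unsigned WavefrontSizeLog2) {
  unsigned MaxActiveBits = std::max(Src1ActiveBits, WavefrontSizeLog2);
  if (Src1ActiveBits)
    MaxActiveBits += 1; // room for the carry out of the addition
  return MaxActiveBits < 32 ? 32 - MaxActiveBits : 0;
}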
15753
15755 const int FI, KnownBits &Known, const MachineFunction &MF) const {
15757
15758 // Set the high bits to zero based on the maximum allowed scratch size per
15759 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
15760 // calculation won't overflow, so assume the sign bit is never set.
15761 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
15762}
15763
15765 KnownBits &Known, unsigned Dim) {
15766 unsigned MaxValue =
15767 ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
15768 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
15769}
15770
15772 GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
15773 const MachineRegisterInfo &MRI, unsigned Depth) const {
15774 const MachineInstr *MI = MRI.getVRegDef(R);
15775 switch (MI->getOpcode()) {
15776 case AMDGPU::G_INTRINSIC:
15777 case AMDGPU::G_INTRINSIC_CONVERGENT: {
15778 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
15779 case Intrinsic::amdgcn_workitem_id_x:
15780 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
15781 break;
15782 case Intrinsic::amdgcn_workitem_id_y:
15783 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
15784 break;
15785 case Intrinsic::amdgcn_workitem_id_z:
15786 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
15787 break;
15788 case Intrinsic::amdgcn_mbcnt_lo:
15789 case Intrinsic::amdgcn_mbcnt_hi: {
15790 // These return at most the wavefront size - 1.
15791 unsigned Size = MRI.getType(R).getSizeInBits();
15792 Known.Zero.setHighBits(Size - getSubtarget()->getWavefrontSizeLog2());
15793 break;
15794 }
15795 case Intrinsic::amdgcn_groupstaticsize: {
15796 // We can report everything over the maximum size as 0. We can't report
15797 // based on the actual size because we don't know if it's accurate or not
15798 // at any given point.
15799 Known.Zero.setHighBits(
15800 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
15801 break;
15802 }
15803 }
15804 break;
15805 }
15806 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
15807 Known.Zero.setHighBits(24);
15808 break;
15809 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
15810 Known.Zero.setHighBits(16);
15811 break;
15812 case AMDGPU::G_AMDGPU_SMED3:
15813 case AMDGPU::G_AMDGPU_UMED3: {
15814 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
15815
15816 KnownBits Known2;
15817 KB.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
15818 if (Known2.isUnknown())
15819 break;
15820
15821 KnownBits Known1;
15822 KB.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
15823 if (Known1.isUnknown())
15824 break;
15825
15826 KnownBits Known0;
15827 KB.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
15828 if (Known0.isUnknown())
15829 break;
15830
15831 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
15832 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
15833 Known.One = Known0.One & Known1.One & Known2.One;
15834 break;
15835 }
15836 }
15837}
15838
15841 unsigned Depth) const {
15842 const MachineInstr *MI = MRI.getVRegDef(R);
15843 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
15844 // FIXME: Can this move to generic code? What about the case where the call
15845 // site specifies a lower alignment?
15846 Intrinsic::ID IID = GI->getIntrinsicID();
15848 AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
15849 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
15850 return *RetAlign;
15851 }
15852 return Align(1);
15853}
15854
15857 const Align CacheLineAlign = Align(64);
15858
 15859 // Pre-GFX10 targets did not benefit from loop alignment.
15860 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
15861 getSubtarget()->hasInstFwdPrefetchBug())
15862 return PrefAlign;
15863
 15864 // On GFX10 the I$ consists of 4 x 64-byte cache lines.
 15865 // By default the prefetcher keeps one cache line behind and reads two ahead.
 15866 // We can modify this with S_INST_PREFETCH so that larger loops keep two lines
 15867 // behind and one ahead.
 15868 // Therefore we benefit from aligning loop headers if the loop fits in 192 bytes.
 15869 // If the loop fits in 64 bytes it always spans no more than two cache lines and
 15870 // does not need alignment.
 15871 // Otherwise, if the loop is at most 128 bytes we do not need to modify the prefetch;
 15872 // if it is at most 192 bytes we need to keep two lines behind the PC.
15873
15875 const MachineBasicBlock *Header = ML->getHeader();
15876 if (Header->getAlignment() != PrefAlign)
15877 return Header->getAlignment(); // Already processed.
15878
15879 unsigned LoopSize = 0;
15880 for (const MachineBasicBlock *MBB : ML->blocks()) {
 15881 // If an inner loop block is aligned, assume on average half of the
 15882 // alignment size is added as nops.
15883 if (MBB != Header)
15884 LoopSize += MBB->getAlignment().value() / 2;
15885
15886 for (const MachineInstr &MI : *MBB) {
15887 LoopSize += TII->getInstSizeInBytes(MI);
15888 if (LoopSize > 192)
15889 return PrefAlign;
15890 }
15891 }
15892
15893 if (LoopSize <= 64)
15894 return PrefAlign;
15895
15896 if (LoopSize <= 128)
15897 return CacheLineAlign;
15898
 15899 // If any of the parent loops is surrounded by prefetch instructions, do not
 15900 // insert new ones for the inner loop, as that would reset the parent's settings.
15901 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
15902 if (MachineBasicBlock *Exit = P->getExitBlock()) {
15903 auto I = Exit->getFirstNonDebugInstr();
15904 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
15905 return CacheLineAlign;
15906 }
15907 }
15908
15909 MachineBasicBlock *Pre = ML->getLoopPreheader();
15910 MachineBasicBlock *Exit = ML->getExitBlock();
15911
15912 if (Pre && Exit) {
15913 auto PreTerm = Pre->getFirstTerminator();
15914 if (PreTerm == Pre->begin() ||
15915 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
15916 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
15917 .addImm(1); // prefetch 2 lines behind PC
15918
15919 auto ExitHead = Exit->getFirstNonDebugInstr();
15920 if (ExitHead == Exit->end() ||
15921 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
15922 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
15923 .addImm(2); // prefetch 1 line behind PC
15924 }
15925
15926 return CacheLineAlign;
15927}
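// Illustrative sketch (assumption, not part of SIISelLowering.cpp): the size
// thresholds used above, with PrefAlign standing for the generic preferred
// loop alignment.
static Align examplePrefLoopAlign(unsigned LoopSizeInBytes, Align PrefAlign) {
  if (LoopSizeInBytes <= 64 || LoopSizeInBytes > 192)
    return PrefAlign; // Fits one I$ line, or too large for prefetch tuning.
  // 64 < size <= 192: align the loop header to a 64-byte cache line. For
  // 128 < size <= 192 the code above also emits S_INST_PREFETCH so that two
  // lines are kept behind the PC instead of one.
  return Align(64);
}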
15928
15929LLVM_ATTRIBUTE_UNUSED
15930static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
15931 assert(N->getOpcode() == ISD::CopyFromReg);
15932 do {
15933 // Follow the chain until we find an INLINEASM node.
15934 N = N->getOperand(0).getNode();
15935 if (N->getOpcode() == ISD::INLINEASM ||
15936 N->getOpcode() == ISD::INLINEASM_BR)
15937 return true;
15938 } while (N->getOpcode() == ISD::CopyFromReg);
15939 return false;
15940}
15941
15944 UniformityInfo *UA) const {
15945 switch (N->getOpcode()) {
15946 case ISD::CopyFromReg: {
15947 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
15948 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
15949 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15950 Register Reg = R->getReg();
15951
15952 // FIXME: Why does this need to consider isLiveIn?
15953 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
15954 return !TRI->isSGPRReg(MRI, Reg);
15955
15956 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
15957 return UA->isDivergent(V);
15958
15960 return !TRI->isSGPRReg(MRI, Reg);
15961 }
15962 case ISD::LOAD: {
15963 const LoadSDNode *L = cast<LoadSDNode>(N);
15964 unsigned AS = L->getAddressSpace();
15965 // A flat load may access private memory.
15967 }
15968 case ISD::CALLSEQ_END:
15969 return true;
15971 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
15973 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
15992 // Target-specific read-modify-write atomics are sources of divergence.
15993 return true;
15994 default:
15995 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
15996 // Generic read-modify-write atomics are sources of divergence.
15997 return A->readMem() && A->writeMem();
15998 }
15999 return false;
16000 }
16001}
16002
16004 EVT VT) const {
16005 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
16006 case MVT::f32:
16008 case MVT::f64:
16009 case MVT::f16:
16011 default:
16012 return false;
16013 }
16014}
16015
16017 LLT Ty, const MachineFunction &MF) const {
16018 switch (Ty.getScalarSizeInBits()) {
16019 case 32:
16020 return !denormalModeIsFlushAllF32(MF);
16021 case 64:
16022 case 16:
16023 return !denormalModeIsFlushAllF64F16(MF);
16024 default:
16025 return false;
16026 }
16027}
16028
16030 const SelectionDAG &DAG,
16031 bool SNaN,
16032 unsigned Depth) const {
16033 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
16034 const MachineFunction &MF = DAG.getMachineFunction();
16036
16037 if (Info->getMode().DX10Clamp)
16038 return true; // Clamped to 0.
16039 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
16040 }
16041
16043 SNaN, Depth);
16044}
16045
16046#if 0
16047// FIXME: This should be checked before unsafe fp atomics are enabled
16048// Global FP atomic instructions have a hardcoded FP mode and do not support
16049// FP32 denormals, and only support v2f16 denormals.
16050static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
16052 auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
16053 if (&Flt == &APFloat::IEEEsingle())
16054 return DenormMode == DenormalMode::getPreserveSign();
16055 return DenormMode == DenormalMode::getIEEE();
16056}
16057#endif
16058
16059// The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
16060// floating point atomic instructions. May generate more efficient code,
16061// but may not respect rounding and denormal modes, and may give incorrect
16062// results for certain memory destinations.
16063static bool unsafeFPAtomicsDisabled(Function *F) {
 16064 return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
16065 "true";
16066}
16067
16069 LLVMContext &Ctx = RMW->getContext();
16071 Ctx.getSyncScopeNames(SSNs);
16072 StringRef MemScope = SSNs[RMW->getSyncScopeID()].empty()
16073 ? "system"
16074 : SSNs[RMW->getSyncScopeID()];
16075
16076 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
16077 << "Hardware instruction generated for atomic "
16078 << RMW->getOperationName(RMW->getOperation())
16079 << " operation at memory scope " << MemScope;
16080}
16081
16082static bool isHalf2OrBFloat2(Type *Ty) {
16083 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
16084 Type *EltTy = VT->getElementType();
16085 return VT->getNumElements() == 2 &&
16086 (EltTy->isHalfTy() || EltTy->isBFloatTy());
16087 }
16088
16089 return false;
16090}
16091
16092static bool isHalf2(Type *Ty) {
16093 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
16094 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
16095}
16096
16097static bool isBFloat2(Type *Ty) {
16098 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
16099 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
16100}
16101
16104 unsigned AS = RMW->getPointerAddressSpace();
16105 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
16107
16108 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
16110 ORE.emit([=]() {
16111 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
16112 });
16113 return Kind;
16114 };
16115
16116 auto SSID = RMW->getSyncScopeID();
16117 bool HasSystemScope =
16118 SSID == SyncScope::System ||
16119 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
16120
16121 switch (RMW->getOperation()) {
16122 case AtomicRMWInst::Sub:
16123 case AtomicRMWInst::Or:
16124 case AtomicRMWInst::Xor: {
16125 // Atomic sub/or/xor do not work over PCI express, but atomic add
16126 // does. InstCombine transforms these with 0 to or, so undo that.
16127 if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
16128 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
16129 ConstVal && ConstVal->isNullValue())
16131 }
16132
16133 break;
16134 }
16135 case AtomicRMWInst::FAdd: {
16136 Type *Ty = RMW->getType();
16137
16138 // TODO: Handle REGION_ADDRESS
16139 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16140 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
16141 // is fixed to round-to-nearest-even.
16142 //
16143 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
16144 // round-to-nearest-even.
16145 //
16146 // We ignore the rounding mode problem, even in strictfp. The C++ standard
16147 // suggests it is OK if the floating-point mode may not match the calling
16148 // thread.
16149 if (Ty->isFloatTy()) {
16152 }
16153
16154 if (Ty->isDoubleTy()) {
16155 // Ignores denormal mode, but we don't consider flushing mandatory.
16158 }
16159
16160 if (Subtarget->hasAtomicDsPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16162
16164 }
16165
16169
16170 if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
16172
16173 if (AS == AMDGPUAS::FLAT_ADDRESS) {
16174 // gfx940, gfx12
16175 // FIXME: Needs to account for no fine-grained memory
16176 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16178 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16179 // gfx90a, gfx940, gfx12
16180 // FIXME: Needs to account for no fine-grained memory
16181 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16183
16184 // gfx940, gfx12
16185 // FIXME: Needs to account for no fine-grained memory
16186 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
16188 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16189 // gfx90a, gfx940, gfx12
16190 // FIXME: Needs to account for no fine-grained memory
16191 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16193
16194 // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16195 // buffer. gfx12 does have the buffer version.
16196 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
16198 }
16199
16202
16203 // Always expand system scope fp atomics.
16204 if (HasSystemScope)
16206
16207 // global and flat atomic fadd f64: gfx90a, gfx940.
16208 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
16209 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16210
16211 if (AS != AMDGPUAS::FLAT_ADDRESS) {
16212 if (Ty->isFloatTy()) {
16213 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
16214 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16215 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16216 // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16217 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16218 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16219 } else {
16220 // gfx908
16221 if (RMW->use_empty() &&
16223 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16224 }
16225 }
16226
16227 // flat atomic fadd f32: gfx940, gfx11+.
16228 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16229 if (Subtarget->hasFlatAtomicFaddF32Inst())
16230 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16231
 16232 // If it is in the flat address space and the type is float, we will try to
 16233 // expand it if the target supports both global and LDS atomic fadd. The
 16234 // reason is that the expansion emits an address-space check: if the address
 16235 // is in the global address space we emit the global atomic fadd, and if it
 16236 // is in the shared address space we emit the LDS atomic fadd.
16237 if (Subtarget->hasLDSFPAtomicAddF32()) {
16238 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16240 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16242 }
16243 }
16244
16246 }
16248 case AtomicRMWInst::FMax: {
16249 Type *Ty = RMW->getType();
16250
16251 // LDS float and double fmin/fmax were always supported.
16252 if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy()))
16254
16257
16258 // Always expand system scope fp atomics.
16259 if (HasSystemScope)
16261
16262 // For flat and global cases:
16263 // float, double in gfx7. Manual claims denormal support.
16264 // Removed in gfx8.
16265 // float, double restored in gfx10.
16266 // double removed again in gfx11, so only f32 for gfx11/gfx12.
16267 //
16268 // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but no
16269 // f32.
16270 //
16271 // FIXME: Check scope and fine grained memory
16272 if (AS == AMDGPUAS::FLAT_ADDRESS) {
16273 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
16274 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16275 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
16276 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16277 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
16279 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
16280 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16281 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
16282 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16283 }
16284
16286 }
16287 case AtomicRMWInst::Min:
16288 case AtomicRMWInst::Max:
16290 case AtomicRMWInst::UMax: {
16293 // Always expand system scope min/max atomics.
16294 if (HasSystemScope)
16296 }
16297 break;
16298 }
16299 default:
16300 break;
16301 }
16302
16304}
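// Illustrative example (assumption, mirrors the sub/or/xor case above): over
// PCIe only atomic add is supported, so the no-op "or with 0" that
// InstCombine forms is expanded back into an "add 0" that remote memory can
// execute:
//
//   atomicrmw or  ptr addrspace(1) %p, i32 0 seq_cst
//     -->
//   atomicrmw add ptr addrspace(1) %p, i32 0 seq_cst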
16305
16311}
16312
16315 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16318}
16319
16325}
16326
16327const TargetRegisterClass *
16328SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
16330 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16331 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16332 return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
16333 : &AMDGPU::SReg_32RegClass;
16334 if (!TRI->isSGPRClass(RC) && !isDivergent)
16335 return TRI->getEquivalentSGPRClass(RC);
16336 if (TRI->isSGPRClass(RC) && isDivergent)
16337 return TRI->getEquivalentVGPRClass(RC);
16338
16339 return RC;
16340}
16341
16342// FIXME: This is a workaround for DivergenceAnalysis not understanding always
16343// uniform values (as produced by the mask results of control flow intrinsics)
16344// used outside of divergent blocks. The phi users need to also be treated as
16345// always uniform.
16346//
16347// FIXME: DA is no longer in use. Does this still apply to UniformityAnalysis?
16348static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
16349 unsigned WaveSize) {
16350 // FIXME: We assume we never cast the mask results of a control flow
16351 // intrinsic.
16352 // Early exit if the type won't be consistent as a compile time hack.
16353 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
16354 if (!IT || IT->getBitWidth() != WaveSize)
16355 return false;
16356
16357 if (!isa<Instruction>(V))
16358 return false;
16359 if (!Visited.insert(V).second)
16360 return false;
16361 bool Result = false;
16362 for (const auto *U : V->users()) {
16363 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
16364 if (V == U->getOperand(1)) {
16365 switch (Intrinsic->getIntrinsicID()) {
16366 default:
16367 Result = false;
16368 break;
16369 case Intrinsic::amdgcn_if_break:
16370 case Intrinsic::amdgcn_if:
16371 case Intrinsic::amdgcn_else:
16372 Result = true;
16373 break;
16374 }
16375 }
16376 if (V == U->getOperand(0)) {
16377 switch (Intrinsic->getIntrinsicID()) {
16378 default:
16379 Result = false;
16380 break;
16381 case Intrinsic::amdgcn_end_cf:
16382 case Intrinsic::amdgcn_loop:
16383 Result = true;
16384 break;
16385 }
16386 }
16387 } else {
16388 Result = hasCFUser(U, Visited, WaveSize);
16389 }
16390 if (Result)
16391 break;
16392 }
16393 return Result;
16394}
16395
16397 const Value *V) const {
16398 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
16399 if (CI->isInlineAsm()) {
16400 // FIXME: This cannot give a correct answer. This should only trigger in
16401 // the case where inline asm returns mixed SGPR and VGPR results, used
16402 // outside the defining block. We don't have a specific result to
16403 // consider, so this assumes if any value is SGPR, the overall register
16404 // also needs to be SGPR.
16405 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
16407 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
16408 for (auto &TC : TargetConstraints) {
16409 if (TC.Type == InlineAsm::isOutput) {
16412 SIRI, TC.ConstraintCode, TC.ConstraintVT).second;
16413 if (RC && SIRI->isSGPRClass(RC))
16414 return true;
16415 }
16416 }
16417 }
16418 }
16420 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
16421}
16422
16424 SDNode::use_iterator I = N->use_begin(), E = N->use_end();
16425 for (; I != E; ++I) {
16426 if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) {
16427 if (getBasePtrIndex(M) == I.getOperandNo())
16428 return true;
16429 }
16430 }
16431 return false;
16432}
16433
16435 SDValue N1) const {
16436 if (!N0.hasOneUse())
16437 return false;
 16438 // Preserve the opportunity to keep N0 uniform.
16439 if (N0->isDivergent() || !N1->isDivergent())
16440 return true;
16441 // Check if we have a good chance to form the memory access pattern with the
16442 // base and offset
16443 return (DAG.isBaseWithConstantOffset(N0) &&
16444 hasMemSDNodeUser(*N0->use_begin()));
16445}
16446
16448 Register N0, Register N1) const {
16449 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
16450}
16451
16454 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
16456 if (I.getMetadata("amdgpu.noclobber"))
16457 Flags |= MONoClobber;
16458 if (I.getMetadata("amdgpu.last.use"))
16459 Flags |= MOLastUse;
16460 return Flags;
16461}
16462
16464 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
16465 const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const {
16466 if (User->getOpcode() != ISD::CopyToReg)
16467 return false;
16468 if (!Def->isMachineOpcode())
16469 return false;
16470 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
16471 if (!MDef)
16472 return false;
16473
16474 unsigned ResNo = User->getOperand(Op).getResNo();
16475 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
16476 return false;
16477 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
16478 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
16479 PhysReg = AMDGPU::SCC;
16480 const TargetRegisterClass *RC =
16481 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
16482 Cost = RC->getCopyCost();
16483 return true;
16484 }
16485 return false;
16486}
16487
16490
16493 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16494 assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
16495 "this cannot be replaced with add");
 16496 AI->setOperation(AtomicRMWInst::Add);
 16497 return;
16498 }
16499
16500 assert(Subtarget->hasAtomicFaddInsts() &&
16501 "target should have atomic fadd instructions");
16502 assert(AI->getType()->isFloatTy() &&
16504 "generic atomicrmw expansion only supports FP32 operand in flat "
16505 "address space");
16506 assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
16507
16508 // Given: atomicrmw fadd ptr %addr, float %val ordering
16509 //
16510 // With this expansion we produce the following code:
16511 // [...]
16512 // br label %atomicrmw.check.shared
16513 //
16514 // atomicrmw.check.shared:
16515 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
16516 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
16517 //
16518 // atomicrmw.shared:
16519 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
16520 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
16521 // float %val ordering
16522 // br label %atomicrmw.phi
16523 //
16524 // atomicrmw.check.private:
16525 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
16526 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
16527 //
16528 // atomicrmw.private:
16529 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
16530 // %loaded.private = load float, ptr addrspace(5) %cast.private
16531 // %val.new = fadd float %loaded.private, %val
16532 // store float %val.new, ptr addrspace(5) %cast.private
16533 // br label %atomicrmw.phi
16534 //
16535 // atomicrmw.global:
16536 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
16537 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
16538 // float %val ordering
16539 // br label %atomicrmw.phi
16540 //
16541 // atomicrmw.phi:
16542 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
16543 // [ %loaded.private, %atomicrmw.private ],
16544 // [ %loaded.global, %atomicrmw.global ]
16545 // br label %atomicrmw.end
16546 //
16547 // atomicrmw.end:
16548 // [...]
16549
16550 IRBuilder<> Builder(AI);
16551 LLVMContext &Ctx = Builder.getContext();
16552
16553 BasicBlock *BB = Builder.GetInsertBlock();
16554 Function *F = BB->getParent();
16555 BasicBlock *ExitBB =
16556 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
16557 BasicBlock *CheckSharedBB =
16558 BasicBlock::Create(Ctx, "atomicrmw.check.shared", F, ExitBB);
16559 BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
16560 BasicBlock *CheckPrivateBB =
16561 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
16562 BasicBlock *PrivateBB =
16563 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
16564 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
16565 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
16566
16567 Value *Val = AI->getValOperand();
16568 Type *ValTy = Val->getType();
16569 Value *Addr = AI->getPointerOperand();
16570
16571 auto CreateNewAtomicRMW = [AI](IRBuilder<> &Builder, Value *Addr,
16572 Value *Val) -> Value * {
16573 AtomicRMWInst *OldVal =
16574 Builder.CreateAtomicRMW(AI->getOperation(), Addr, Val, AI->getAlign(),
16575 AI->getOrdering(), AI->getSyncScopeID());
16577 AI->getAllMetadata(MDs);
16578 for (auto &P : MDs)
16579 OldVal->setMetadata(P.first, P.second);
16580 return OldVal;
16581 };
16582
16583 std::prev(BB->end())->eraseFromParent();
16584 Builder.SetInsertPoint(BB);
16585 Builder.CreateBr(CheckSharedBB);
16586
16587 Builder.SetInsertPoint(CheckSharedBB);
16588 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
16589 {Addr}, nullptr, "is.shared");
16590 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
16591
16592 Builder.SetInsertPoint(SharedBB);
16593 Value *CastToLocal = Builder.CreateAddrSpaceCast(
16595 Value *LoadedShared = CreateNewAtomicRMW(Builder, CastToLocal, Val);
16596 Builder.CreateBr(PhiBB);
16597
16598 Builder.SetInsertPoint(CheckPrivateBB);
16599 CallInst *IsPrivate = Builder.CreateIntrinsic(
16600 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
16601 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
16602
16603 Builder.SetInsertPoint(PrivateBB);
16604 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
16606 Value *LoadedPrivate =
16607 Builder.CreateLoad(ValTy, CastToPrivate, "loaded.private");
16608 Value *NewVal = Builder.CreateFAdd(LoadedPrivate, Val, "val.new");
16609 Builder.CreateStore(NewVal, CastToPrivate);
16610 Builder.CreateBr(PhiBB);
16611
16612 Builder.SetInsertPoint(GlobalBB);
16613 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
16615 Value *LoadedGlobal = CreateNewAtomicRMW(Builder, CastToGlobal, Val);
16616 Builder.CreateBr(PhiBB);
16617
16618 Builder.SetInsertPoint(PhiBB);
16619 PHINode *Loaded = Builder.CreatePHI(ValTy, 3, "loaded.phi");
16620 Loaded->addIncoming(LoadedShared, SharedBB);
16621 Loaded->addIncoming(LoadedPrivate, PrivateBB);
16622 Loaded->addIncoming(LoadedGlobal, GlobalBB);
16623 Builder.CreateBr(ExitBB);
16624
16625 AI->replaceAllUsesWith(Loaded);
16626 AI->eraseFromParent();
16627}
16628
16629LoadInst *
16631 IRBuilder<> Builder(AI);
16632 auto Order = AI->getOrdering();
16633
 16634 // The optimization removes the store aspect of the atomicrmw, so the cache
 16635 // must be flushed if the atomic ordering has release semantics. This does not
 16636 // necessarily require a fence; a release fence just happens to do that flush.
 16637 // Therefore avoid replacing an atomicrmw that has release semantics.
16638 if (isReleaseOrStronger(Order))
16639 return nullptr;
16640
16641 LoadInst *LI = Builder.CreateAlignedLoad(
16642 AI->getType(), AI->getPointerOperand(), AI->getAlign());
16643 LI->setAtomic(Order, AI->getSyncScopeID());
16644 LI->copyMetadata(*AI);
16645 LI->takeName(AI);
16646 AI->replaceAllUsesWith(LI);
16647 AI->eraseFromParent();
16648 return LI;
16649}
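// Illustrative example (assumption, not part of SIISelLowering.cpp): the
// rewrite performed above for an idempotent RMW whose ordering is weaker than
// release, e.g.
//
//   %old = atomicrmw or ptr addrspace(1) %p, i32 0 monotonic
//     -->
//   %old = load atomic i32, ptr addrspace(1) %p monotonic, align 4
//
// For release (or stronger) orderings the function bails out and keeps the
// atomicrmw, since dropping the store side would also drop the cache flush
// implied by the release.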
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isHalf2OrBFloat2(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static bool isHalf2(Type *Ty)
bool unsafeFPAtomicsDisabled(Function *F)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool isBFloat2(Type *Ty)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
raw_pwrite_stream & OS
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
LLVM IR instance of the generic uniformity analysis.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
static constexpr int Concat[]
Value * RHS
Value * LHS
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool hasMadMacF32Insts() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known never to be any NaN; if SNaN is true, returns true if Op is known never to be a signaling NaN.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition: APFloat.h:1026
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition: APFloat.cpp:5317
bool isNegative() const
Definition: APFloat.h:1354
APInt bitcastToAPInt() const
Definition: APFloat.h:1260
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition: APFloat.h:1044
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:1004
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition: APFloat.h:988
bool isInfinity() const
Definition: APFloat.h:1351
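The APFloat entries above are the pieces the lowering code leans on to materialize and convert FP immediates. Below is a minimal standalone sketch of how those factories and queries fit together, assuming only the public APFloat/APInt headers; the chosen semantics (IEEEsingle/IEEEhalf) and rounding mode are illustrative, not anything this file prescribes.

// Sketch: APFloat factories, queries, bit reinterpretation, and conversion.
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include <cassert>

using namespace llvm;

int main() {
  // Factories: negative infinity, quiet NaN, largest finite f32.
  APFloat NegInf = APFloat::getInf(APFloat::IEEEsingle(), /*Negative=*/true);
  APFloat QNaN   = APFloat::getQNaN(APFloat::IEEEsingle());
  APFloat Big    = APFloat::getLargest(APFloat::IEEEsingle());
  assert(NegInf.isInfinity() && NegInf.isNegative());
  (void)QNaN;

  // Reinterpret the f32 bits as a 32-bit APInt (e.g. to feed an integer move).
  APInt Bits = Big.bitcastToAPInt();
  assert(Bits.getBitWidth() == 32);

  // Convert f32 -> f16, tracking whether precision was lost.
  bool LosesInfo = false;
  APFloat Half = Big;
  APFloat::opStatus St =
      Half.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
  (void)St;
  return LosesInfo ? 1 : 0;
}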
Class for arbitrary precision integers.
Definition: APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1372
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:238
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition: APInt.h:446
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1598
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:276
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1217
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1201
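A minimal sketch exercising the APInt helpers listed above (mask factories, trailing-zero count, in-place bit setting, signed vs. unsigned comparison). The concrete widths and values are illustrative only.

#include "llvm/ADT/APInt.h"
#include <cassert>

using namespace llvm;

int main() {
  // A 32-bit value with the top 8 bits set: 0xFF000000.
  APInt Mask = APInt::getHighBitsSet(/*numBits=*/32, /*hiBitsSet=*/8);
  assert(Mask.countr_zero() == 24);

  // A block of bits [8, 16) set: 0x0000FF00.
  APInt Block = APInt::getBitsSet(/*numBits=*/32, /*loBit=*/8, /*hiBit=*/16);

  // Mutate in place: set the top 16 bits of an existing value -> 0xFFFF0001.
  APInt V(32, 0x1);
  V.setHighBits(16);

  assert(V.uge(Block));    // unsigned >= : 0xFFFF0001 >= 0x0000FF00
  assert(!V.sge(Block));   // signed   >= : 0xFFFF0001 is negative
  assert(!V.isSignMask()); // not exactly 0x80000000
  return 0;
}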
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:495
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:632
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:696
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:809
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:708
@ Add
*p = old + v
Definition: Instructions.h:712
@ FAdd
*p = old + v
Definition: Instructions.h:733
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:726
@ Or
*p = old | v
Definition: Instructions.h:720
@ Sub
*p = old - v
Definition: Instructions.h:714
@ Xor
*p = old ^ v
Definition: Instructions.h:722
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:724
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:730
@ FMin
*p = minnum(old, v); minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:744
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:728
@ FMax
*p = maxnum(old, v); maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:740
Value * getPointerOperand()
Definition: Instructions.h:852
void setOperation(BinOp Operation)
Definition: Instructions.h:803
BinOp getOperation() const
Definition: Instructions.h:787
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
Definition: Instructions.h:843
Value * getValOperand()
Definition: Instructions.h:856
static StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
Definition: Instructions.h:829
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:860
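A minimal sketch of how the AtomicRMWInst accessors above are typically reached: build an atomicrmw fadd with IRBuilder::CreateAtomicRMW (signature as listed further down) and query its operation and pointer address space. The module, function name, and address-space choice are made up for illustration.

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/AtomicOrdering.h"
#include <cassert>

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("atomic_demo", Ctx);

  // void @kernel(ptr addrspace(1) %p) { atomicrmw fadd %p, float 1.0 monotonic }
  auto *FTy = FunctionType::get(
      Type::getVoidTy(Ctx),
      {PointerType::get(Type::getFloatTy(Ctx), /*AddressSpace=*/1)},
      /*isVarArg=*/false);
  Function *F = Function::Create(FTy, Function::ExternalLinkage, "kernel", M);
  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);

  IRBuilder<> B(BB);
  Value *Ptr = F->getArg(0);
  Value *One = ConstantFP::get(Type::getFloatTy(Ctx), 1.0);
  AtomicRMWInst *RMW = B.CreateAtomicRMW(AtomicRMWInst::FAdd, Ptr, One,
                                         MaybeAlign(4),
                                         AtomicOrdering::Monotonic);
  B.CreateRetVoid();

  // The accessors used throughout atomic lowering/expansion decisions.
  assert(RMW->getOperation() == AtomicRMWInst::FAdd);
  assert(RMW->getPointerAddressSpace() == 1);
  (void)RMW->getValOperand();
  return 0;
}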
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator end()
Definition: BasicBlock.h:451
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:202
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:575
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:209
BitVector & set()
Definition: BitVector.h:351
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
Definition: ByteProvider.h:30
static ByteProvider getConstantZero()
Definition: ByteProvider.h:73
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
Definition: ByteProvider.h:66
std::optional< ISelOp > Src
Definition: ByteProvider.h:57
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool isMemLoc() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1465
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
Definition: InstrTypes.h:1551
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1410
unsigned arg_size() const
Definition: InstrTypes.h:1408
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:757
@ ICMP_NE
not equal
Definition: InstrTypes.h:779
bool isSigned() const
Definition: InstrTypes.h:1007
bool isFPPredicate() const
Definition: InstrTypes.h:864
bool isIntPredicate() const
Definition: InstrTypes.h:865
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:206
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition: Constant.h:42
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
Definition: Constants.cpp:90
This class represents an Operation in the Expression.
A parsed version of the target data layout string and methods for querying it.
Definition: DataLayout.h:110
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:865
bool isBigEndian() const
Definition: DataLayout.h:239
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
unsigned getNumElements() const
Definition: DerivedTypes.h:582
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowering::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Definition: DerivedTypes.h:103
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:135
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:207
iterator_range< arg_iterator > args()
Definition: Function.h:855
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:274
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:358
bool hasPrefetch() const
Definition: GCNSubtarget.h:939
bool hasD16Images() const
Definition: GCNSubtarget.h:695
bool hasAtomicDsPkAdd16Insts() const
Definition: GCNSubtarget.h:844
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:476
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
Definition: GCNSubtarget.h:467
bool hasAtomicFMinFMaxF64FlatInsts() const
Definition: GCNSubtarget.h:840
bool hasDot7Insts() const
Definition: GCNSubtarget.h:794
bool hasApertureRegs() const
Definition: GCNSubtarget.h:596
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:626
bool hasAtomicFMinFMaxF32FlatInsts() const
Definition: GCNSubtarget.h:836
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasDLInsts() const
Definition: GCNSubtarget.h:764
bool hasBCNT(unsigned Size) const
Definition: GCNSubtarget.h:410
bool hasMAIInsts() const
Definition: GCNSubtarget.h:814
bool hasMultiDwordFlatScratchAddressing() const
Definition: GCNSubtarget.h:675
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
Definition: GCNSubtarget.h:526
bool hasPrivEnabledTrap2NopBug() const
bool hasUnalignedDSAccessEnabled() const
Definition: GCNSubtarget.h:584
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:265
bool hasDot1Insts() const
Definition: GCNSubtarget.h:770
bool hasAtomicFaddRtnInsts() const
Definition: GCNSubtarget.h:852
Align getStackAlignment() const
Definition: GCNSubtarget.h:952
bool hasScalarSubwordLoads() const
Definition: GCNSubtarget.h:454
bool enableFlatScratch() const
Definition: GCNSubtarget.h:651
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
Definition: GCNSubtarget.h:622
bool supportsGetDoorbellID() const
Definition: GCNSubtarget.h:460
bool hasFlatAtomicFaddF32Inst() const
Definition: GCNSubtarget.h:872
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:277
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasMad64_32() const
Definition: GCNSubtarget.h:740
bool useDS128() const
Definition: GCNSubtarget.h:536
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
Definition: GCNSubtarget.h:456
const SIFrameLowering * getFrameLowering() const override
Definition: GCNSubtarget.h:269
bool hasUnalignedScratchAccess() const
Definition: GCNSubtarget.h:588
bool hasAtomicFMinFMaxF32GlobalInsts() const
Definition: GCNSubtarget.h:828
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
Definition: GCNSubtarget.h:426
bool hasIntClamp() const
Definition: GCNSubtarget.h:356
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
bool hasGFX940Insts() const
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:376
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:600
bool hasLDSFPAtomicAddF64() const
bool hasFlatGlobalInsts() const
Definition: GCNSubtarget.h:630
bool getScalarizeGlobalBehavior() const
Definition: GCNSubtarget.h:965
bool hasScalarSMulU64() const
Definition: GCNSubtarget.h:729
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
Definition: GCNSubtarget.h:335
bool hasShaderCyclesHiLoRegisters() const
Definition: GCNSubtarget.h:919
bool hasFFBL() const
Definition: GCNSubtarget.h:414
bool hasNSAEncoding() const
bool hasSMemRealTime() const
Definition: GCNSubtarget.h:984
bool usePRTStrictNull() const
Definition: GCNSubtarget.h:558
bool hasAtomicFMinFMaxF64GlobalInsts() const
Definition: GCNSubtarget.h:832
bool hasMed3_16() const
Definition: GCNSubtarget.h:422
bool hasMovrel() const
Definition: GCNSubtarget.h:988
bool hasAtomicFlatPkAdd16Insts() const
Definition: GCNSubtarget.h:846
bool hasBFI() const
Definition: GCNSubtarget.h:402
bool hasUnalignedBufferAccessEnabled() const
Definition: GCNSubtarget.h:576
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:343
bool hasImageGather4D16Bug() const
bool supportsMinMaxDenormModes() const
Definition: GCNSubtarget.h:521
bool hasFFBH() const
Definition: GCNSubtarget.h:418
bool hasAtomicFaddInsts() const
Definition: GCNSubtarget.h:848
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const
Definition: GCNSubtarget.h:856
bool hasAtomicBufferPkAddBF16Inst() const
Definition: GCNSubtarget.h:868
bool hasAtomicFaddNoRtnInsts() const
Definition: GCNSubtarget.h:854
bool hasFlatBufferGlobalAtomicFaddF64Inst() const
Definition: GCNSubtarget.h:876
bool hasScalarDwordx3Loads() const
bool hasLDSFPAtomicAddF32() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
Definition: GCNSubtarget.h:546
bool hasDot8Insts() const
Definition: GCNSubtarget.h:798
bool hasDS96AndDS128() const
Definition: GCNSubtarget.h:541
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:530
Generation getGeneration() const
Definition: GCNSubtarget.h:316
bool hasAtomicBufferGlobalPkAddF16Insts() const
Definition: GCNSubtarget.h:860
bool hasScalarAddSub64() const
Definition: GCNSubtarget.h:727
bool hasIEEEMinMax3() const
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:731
bool hasAtomicGlobalPkAddBF16Inst() const
Definition: GCNSubtarget.h:864
bool hasAddr64() const
Definition: GCNSubtarget.h:380
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
Definition: GCNSubtarget.h:430
bool hasPackedTID() const
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:723
bool hasFractBug() const
Definition: GCNSubtarget.h:394
bool hasGDS() const
bool hasBFE() const
Definition: GCNSubtarget.h:398
bool hasGWSAutoReplay() const
Definition: GCNSubtarget.h:710
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
Definition: GlobalValue.h:511
unsigned getAddressSpace() const
Definition: GlobalValue.h:205
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
Type * getValueType() const
Definition: GlobalValue.h:296
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1812
Value * CreateFAdd(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1538
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:933
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:172
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:171
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2402
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1125
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1795
LLVMContext & getContext() const
Definition: IRBuilder.h:173
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1808
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1859
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1119
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:177
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2137
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2671
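A minimal sketch of the IRBuilder control-flow helpers listed above (CreateCondBr, CreateBr, CreatePHI, SetInsertPoint), wired into a tiny diamond-shaped CFG; the function shape and names are illustrative, not taken from this file.

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("cfg_demo", Ctx);
  auto *FTy = FunctionType::get(Type::getInt32Ty(Ctx),
                                {Type::getInt1Ty(Ctx)}, /*isVarArg=*/false);
  Function *F =
      Function::Create(FTy, Function::ExternalLinkage, "select_like", M);

  BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", F);
  BasicBlock *Then  = BasicBlock::Create(Ctx, "then", F);
  BasicBlock *Exit  = BasicBlock::Create(Ctx, "exit", F);

  IRBuilder<> B(Entry);
  B.CreateCondBr(F->getArg(0), Then, Exit);

  B.SetInsertPoint(Then);
  B.CreateBr(Exit);

  // Merge the two paths with a PHI and return its value.
  B.SetInsertPoint(Exit);
  PHINode *Phi = B.CreatePHI(Type::getInt32Ty(Ctx), /*NumReservedValues=*/2);
  Phi->addIncoming(B.getInt32(1), Then);
  Phi->addIncoming(B.getInt32(0), Entry);
  B.CreateRet(Phi);
  return 0;
}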
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
Definition: Instruction.h:363
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:92
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:70
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1635
void getAllMetadata(SmallVectorImpl< std::pair< unsigned, MDNode * > > &MDs) const
Get all metadata attached to this Instruction.
Definition: Instruction.h:399
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
Definition: DerivedTypes.h:40
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:267
constexpr bool isScalar() const
Definition: LowLevelType.h:146
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
Definition: LowLevelType.h:57
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:193
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
Definition: LowLevelType.h:221
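A minimal sketch of the LLT queries above. The include path below is the assumed LLVM 19 location (the header has moved between releases), and the fixed_vector helper is used purely for illustration.

#include "llvm/CodeGenTypes/LowLevelType.h"  // assumed LLVM 19 path
#include <cassert>

using namespace llvm;

int main() {
  LLT S16   = LLT::scalar(16);
  LLT P1    = LLT::pointer(/*AddressSpace=*/1, /*SizeInBits=*/64);
  LLT V2S16 = LLT::fixed_vector(2, S16);

  assert(S16.isScalar() && S16.getScalarSizeInBits() == 16);
  assert(P1.getSizeInBits() == 64);

  // Widen the element size while keeping the element count: <2 x s16> -> <2 x s32>.
  LLT V2S32 = V2S16.changeElementSize(32);
  assert(V2S32.getSizeInBits() == 64);
  return 0;
}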
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
void getSyncScopeNames(SmallVectorImpl< StringRef > &SSNs) const
getSyncScopeNames - Populates client supplied SmallVector with synchronization scope names registered...
An instruction for reading from memory.
Definition: Instructions.h:174
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:259
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:239
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
Metadata node.
Definition: Metadata.h:1067
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:230
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
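A minimal sketch of the MVT queries above; the include path is the assumed LLVM 19 location and the chosen types are illustrative.

#include "llvm/CodeGenTypes/MachineValueType.h"  // assumed LLVM 19 path
#include <cassert>

using namespace llvm;

int main() {
  MVT V4F32 = MVT::getVectorVT(MVT::f32, 4);
  assert(V4F32.isVector() && V4F32.getVectorNumElements() == 4);
  assert(V4F32.getScalarType() == MVT::f32);
  assert(V4F32.getSizeInBits() == 128);
  assert(V4F32.getStoreSize() == 16);   // bytes written by a store of v4f32

  MVT I64 = MVT::getIntegerVT(64);
  assert(I64.getScalarSizeInBits() == 64);
  return 0;
}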
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Align getAlignment() const
Return alignment of the basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
Definition: MachineInstr.h:69
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:579
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition: ModRef.h:198
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition: ModRef.h:192
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition: ModRef.h:195
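A minimal sketch of the MemoryEffects queries above, using the static factories from llvm/Support/ModRef.h; the factory choices are illustrative and not tied to this file.

#include "llvm/Support/ModRef.h"
#include <cassert>

using namespace llvm;

int main() {
  MemoryEffects None = MemoryEffects::none();
  MemoryEffects RO   = MemoryEffects::readOnly();

  assert(None.doesNotAccessMemory());
  assert(RO.onlyReadsMemory());        // no writes anywhere
  assert(!RO.doesNotAccessMemory());   // but it may read
  assert(!RO.onlyWritesMemory());
  return 0;
}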
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1852
Register getReg() const
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition: Register.h:84
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node can be combined to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isMemOpUniform(const SDNode *N) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns whether Op is known never to be any NaN; if SNaN is true, whether it is known never to be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns whether it is reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
bool getAddrModeArguments(IntrinsicInst *, SmallVectorImpl< Value * > &, Type *&) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:227
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:734
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
Definition: SelectionDAG.h:966
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:565
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const Pass * getPass() const
Definition: SelectionDAG.h:482
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
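A minimal usage sketch of getLoad/getStore (illustrative, not taken from this file); the helper name loadIncStore is hypothetical, and DAG, DL, Chain and Ptr are assumed to come from the surrounding lowering code:

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Hypothetical helper: load an i32 through Ptr, add 1, and store it back.
// The load's output chain is its result #1 and feeds the store.
static SDValue loadIncStore(SelectionDAG &DAG, const SDLoc &DL,
                            SDValue Chain, SDValue Ptr) {
  SDValue Val = DAG.getLoad(MVT::i32, DL, Chain, Ptr,
                            MachinePointerInfo(), Align(4));
  SDValue Inc = DAG.getNode(ISD::ADD, DL, MVT::i32, Val,
                            DAG.getConstant(1, DL, MVT::i32));
  return DAG.getStore(Val.getValue(1), DL, Inc, Ptr,
                      MachinePointerInfo(), Align(4));
}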
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:492
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
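A minimal sketch of the split helpers above (GetSplitDestVTs and SplitVectorOperand), as used by splitBinaryVectorOp-style lowering; the function splitHalves is hypothetical, and N/DAG are assumed to come from a custom-lowering hook:

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Hypothetical sketch: compute the low/high half types of a node's result and
// split its first operand to match.
static void splitHalves(SelectionDAG &DAG, SDNode *N) {
  auto [LoVT, HiVT] = DAG.GetSplitDestVTs(N->getValueType(0));
  auto [Lo, Hi] = DAG.SplitVectorOperand(N, /*OpNo=*/0);
  (void)LoVT; (void)HiVT; (void)Lo; (void)Hi; // consumed by the real lowering
}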
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:840
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
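A minimal sketch combining getSetCC and getSelect (illustrative only); clampBelow is a hypothetical helper, and using MVT::i1 for the comparison result is an assumption rather than this target's configured setcc result type:

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Hypothetical sketch: clamp X to Bound with an unsigned compare and a select.
static SDValue clampBelow(SelectionDAG &DAG, const SDLoc &DL,
                          SDValue X, SDValue Bound) {
  SDValue IsBelow = DAG.getSetCC(DL, MVT::i1, X, Bound, ISD::SETULT);
  return DAG.getSelect(DL, MVT::i32, IsBelow, X, Bound);
}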
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:486
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
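A minimal sketch of materializing immediates with getConstant, getConstantFP and getTargetConstant (the helper buildImms and the specific values are illustrative assumptions):

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Hypothetical sketch: getConstant builds a ConstantSDNode, getConstantFP a
// ConstantFPSDNode, and getTargetConstant an operand for machine nodes.
static SDValue buildImms(SelectionDAG &DAG, const SDLoc &DL) {
  SDValue I = DAG.getConstant(42, DL, MVT::i32);
  SDValue F = DAG.getConstantFP(0.5, DL, MVT::f32);
  (void)F;
  return DAG.getNode(ISD::ADD, DL, MVT::i32, I,
                     DAG.getConstant(1, DL, MVT::i32));
}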
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, which starts a new call frame in which InSize bytes are set up inside ...
SDValue getRegister(unsigned Reg, EVT VT)
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:487
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:785
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:688
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:481
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:811
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:857
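A minimal sketch contrasting getBuildVector with getSplatBuildVector (makeVectors is a hypothetical helper; the v4i32 type is an illustrative assumption):

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Hypothetical sketch: build a v4i32 from explicit elements or by splatting.
static SDValue makeVectors(SelectionDAG &DAG, const SDLoc &DL, SDValue Elt) {
  SDValue Ops[] = {Elt, Elt, Elt, Elt};
  SDValue V0 = DAG.getBuildVector(MVT::v4i32, DL, Ops);
  SDValue V1 = DAG.getSplatBuildVector(MVT::v4i32, DL, Elt);
  return DAG.getNode(ISD::ADD, DL, MVT::v4i32, V0, V1);
}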
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
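A minimal sketch of computeKnownBits and MaskedValueIsZero used together; isKnown4ByteAligned is a hypothetical helper, and treating 4-byte alignment as the property of interest is an assumption for illustration:

#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;

// Hypothetical sketch: prove the low two bits of Ptr are zero.
static bool isKnown4ByteAligned(SelectionDAG &DAG, SDValue Ptr) {
  KnownBits Known = DAG.computeKnownBits(Ptr);
  if (Known.countMinTrailingZeros() >= 2)
    return true;
  APInt LowTwo = APInt::getLowBitsSet(Known.getBitWidth(), 2);
  return DAG.MaskedValueIsZero(Ptr, LowTwo);
}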
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:499
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:574
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:568
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:344
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:479
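A minimal sketch of SmallPtrSet::insert's {iterator, inserted} return value (markVisited is a hypothetical helper):

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/Value.h"
using namespace llvm;

// Hypothetical sketch: use a SmallPtrSet as a "visited" set in a worklist walk.
static bool markVisited(SmallPtrSetImpl<const Value *> &Visited,
                        const Value *V) {
  return Visited.insert(V).second; // true only the first time V is seen
}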
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
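A minimal sketch of the SmallVector operations listed above (gather is a hypothetical helper; the inline capacity of 8 is arbitrary):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

// Hypothetical sketch: push_back a value, append a range, then resize.
static SmallVector<int, 8> gather(ArrayRef<int> Extra) {
  SmallVector<int, 8> Vals;
  Vals.push_back(1);
  Vals.append(Extra.begin(), Extra.end());
  Vals.resize(8); // value-initializes (zeroes) any newly created ints
  return Vals;
}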
An instruction for storing to memory.
Definition: Instructions.h:290
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:838
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:250
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition: StringRef.h:131
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:262
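A minimal sketch of the StringRef queries listed above; looksInteresting is a hypothetical predicate and the prefix/suffix it checks are arbitrary examples:

#include "llvm/ADT/StringRef.h"
using namespace llvm;

// Hypothetical sketch: prefix/suffix/size/data queries on a StringRef.
static bool looksInteresting(StringRef Name) {
  return Name.starts_with("llvm.") && !Name.ends_with(".p0") &&
         Name.size() < 128 && Name.data() != nullptr;
}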
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
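A minimal sketch of StringSwitch, the typical pattern in constraint and attribute parsing; classifyConstraint and its return codes are hypothetical, and the constraint letters are only examples:

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
using namespace llvm;

// Hypothetical sketch: map a constraint-like string to a small code.
static unsigned classifyConstraint(StringRef C) {
  return StringSwitch<unsigned>(C)
      .Case("s", 0)   // e.g. an SGPR-style constraint
      .Case("v", 1)   // e.g. a VGPR-style constraint
      .Case("a", 2)   // e.g. an AGPR-style constraint
      .Default(~0u);
}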
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction in which the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a type is legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
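A minimal sketch of how setOperationAction, AddPromotedToType and setTruncStoreAction are typically combined in a TargetLowering constructor; DemoTLI is a hypothetical class and the opcodes/types shown are illustrative, not this target's actual configuration:

#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

namespace {
// Hypothetical sketch: register legalization actions at construction time.
struct DemoTLI : TargetLowering {
  explicit DemoTLI(const TargetMachine &TM) : TargetLowering(TM) {
    setOperationAction(ISD::SELECT, MVT::i1, Promote);
    AddPromotedToType(ISD::SELECT, MVT::i1, MVT::i32);
    setTruncStoreAction(MVT::i64, MVT::i16, Expand);
    setOperationAction(ISD::FSIN, MVT::f32, Custom);
  }
};
} // namespace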
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:382
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:154
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:146
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:302
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:143
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:157
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition: Type.h:246
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:140
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
iterator_range< use_iterator > uses()
Definition: Value.h:376
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
Type * getElementType() const
Definition: DerivedTypes.h:436
constexpr bool isZero() const
Definition: TypeSize.h:156
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: Lint.cpp:86
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:415
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:422
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ MaxID
The highest possible ID. Must be some 2^k - 1.
Definition: CallingConv.h:274
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:778
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:243
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1163
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:751
@ ATOMIC_LOAD_FMAX
Definition: ISDOpcodes.h:1317
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ SET_FPENV
Sets the current floating-point environment.
Definition: ISDOpcodes.h:1039
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1310
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:573
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:742
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1312
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1282
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1313
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:501
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1072
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:811
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:497
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:205
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:1295
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:818
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:557
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:716
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:941
@ FPTRUNC_ROUND
Definition: ISDOpcodes.h:494
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1308
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:931
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1309
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:974
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1451
@ ATOMIC_LOAD_FADD
Definition: ISDOpcodes.h:1315
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:913
@ CONVERGENCECTRL_GLUE
Definition: ISDOpcodes.h:1437
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:802
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:634
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readfixedcounter intrinsic.
Definition: ISDOpcodes.h:1229
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1088
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:750
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1262
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1029
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:958
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1118
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1311
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:514
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition: ISDOpcodes.h:521
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:755
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1278
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:218
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:229
@ ATOMIC_LOAD_FMIN
Definition: ISDOpcodes.h:1318
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:215
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
Definition: ISDOpcodes.h:908
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:673
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1057
@ GET_FPENV
Gets the current floating-point environment.
Definition: ISDOpcodes.h:1034
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:733
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:614
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1306
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:587
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1019
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:549
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:808
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1252
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:770
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1289
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1314
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1006
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:338
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1082
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:826
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:696
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:916
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1138
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:950
@ ATOMIC_LOAD_UDEC_WRAP
Definition: ISDOpcodes.h:1320
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1304
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:479
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1025
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1305
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:864
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1223
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:484
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:708
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1249
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:538
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1303
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:979
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:897
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:421
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:935
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1135
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:814
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1111
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:791
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ ATOMIC_LOAD_UINC_WRAP
Definition: ISDOpcodes.h:1319
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:507
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ AssertZext
Definition: ISDOpcodes.h:62
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:529
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1574
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1554
StringRef getName(ID id)
Return the LLVM name for an intrinsic, such as "llvm.ppc.altivec.lvx".
Definition: Function.cpp:1071
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Offsets
Offsets in bytes from the start of the input buffer.
Definition: SIInstrInfo.h:1581
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:57
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
constexpr double inv_pi
Definition: MathExtras.h:54
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Offset
Definition: DWP.cpp:480
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition: Analysis.cpp:233
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition: MathExtras.h:244
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
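A minimal sketch of the BuildMI builder interface, the usual custom-inserter pattern; emitBefore is a hypothetical helper, and MBB, MI, DL, TII, Opcode, DstReg and SrcReg are assumed to be provided by the surrounding code:

#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;

// Hypothetical sketch: emit "DstReg = <Opcode> SrcReg, 0" immediately before MI.
static void emitBefore(MachineBasicBlock &MBB, MachineInstr &MI,
                       const DebugLoc &DL, const TargetInstrInfo *TII,
                       unsigned Opcode, Register DstReg, Register SrcReg) {
  BuildMI(MBB, MI, DL, TII->get(Opcode), DstReg)
      .addReg(SrcReg)
      .addImm(0);
}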
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
@ Done
Definition: Threading.h:61
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2067
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is Skew mod Align.
Definition: MathExtras.h:547
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:394
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:285
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:41
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition: bit.h:281
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:154
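A minimal sketch of the bit and alignment helpers listed above, evaluated on a 64-bit immediate; bitFacts is a hypothetical helper and the constants are arbitrary:

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>
using namespace llvm;

// Hypothetical sketch: common bit queries on an immediate.
static void bitFacts(uint64_t Imm) {
  bool Pow2   = isPowerOf2_32(Lo_32(Imm));  // low half a power of two?
  unsigned Lg = Log2_32(Lo_32(Imm));        // floor(log2) of the low half
  int TZ      = countr_zero(Imm);           // trailing zero bits
  uint64_t Up = PowerOf2Ceil(Imm);          // round up to a power of two
  uint64_t Dn = alignDown(Imm, 16);         // round down to a multiple of 16
  (void)Pow2; (void)Lg; (void)TZ; (void)Up; (void)Dn;
}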
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition: Analysis.cpp:199
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:159
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition: DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
DWARFExpression::Operation Op
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
Definition: VE.h:375
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition: MathExtras.h:235
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
@ DS_Warning
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:45
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
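A minimal sketch of the range helpers listed above (any_of, find_if, append_range, drop_begin, is_contained); demoRanges and the "register-class sizes" interpretation are hypothetical:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

// Hypothetical sketch: range-based queries over a small list of sizes.
static bool demoRanges(const SmallVectorImpl<unsigned> &Sizes) {
  bool HasWide = any_of(Sizes, [](unsigned S) { return S > 128; });
  auto It = find_if(Sizes, [](unsigned S) { return S % 32 != 0; });
  SmallVector<unsigned, 8> Tail;
  append_range(Tail, drop_begin(Sizes)); // everything but the first element
  return HasWide && It == Sizes.end() && is_contained(Tail, 64u);
}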
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
SDValue SrcOp
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:276
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:250
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:274
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition: SCCPSolver.h:41
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
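A sketch of how the DenormalMode queries above can be used (illustrative only; the function name is hypothetical):
#include "llvm/ADT/FloatingPointMode.h"

// Illustrative only: classify a denormal-handling mode.
static bool flushesDenormals(llvm::DenormalMode Mode) {
  if (Mode == llvm::DenormalMode::getPreserveSign())
    return true;   // denormal values are flushed to +/-0 with the sign kept
  if (Mode == llvm::DenormalMode::getIEEE())
    return false;  // full IEEE behaviour, denormals preserved
  return false;    // Dynamic or other: treatment unknown at compile time
}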
Extended Value Type.
Definition: ValueTypes.h:34
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector of NumElements elements, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:290
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:233
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
bool isPow2VectorType() const
Returns true if the vector's number of elements is a power of 2.
Definition: ValueTypes.h:455
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:246
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:203
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:151
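A sketch exercising the EVT queries documented above (illustrative only; the wrapper name is hypothetical):
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

// Illustrative only: build an i32 and a v4f32 EVT and query their properties.
static void evtSketch(llvm::LLVMContext &Ctx) {
  llvm::EVT I32 = llvm::EVT::getIntegerVT(Ctx, 32);
  llvm::EVT V4F32 = llvm::EVT::getVectorVT(Ctx, llvm::MVT::f32, 4);

  assert(I32.isScalarInteger() && I32.getSizeInBits() == 32);
  assert(V4F32.isVector() && V4F32.getVectorNumElements() == 4);
  assert(V4F32.getScalarType() == llvm::MVT::f32);
  assert(V4F32.getStoreSize() == 16); // 4 elements x 4 bytes

  // changeTypeToInteger keeps the layout but switches to integer elements.
  llvm::EVT V4I32 = V4F32.changeTypeToInteger();
  assert(V4I32.getVectorElementType().isInteger());
}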
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument or incoming (from the perspective of the caller) return value virtual register.
unsigned getOrigArgIndex() const
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:62
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:70
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known bits.
Definition: KnownBits.h:285
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:237
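A sketch of the KnownBits queries above (illustrative only; the wrapper name and the chosen bit pattern are hypothetical):
#include "llvm/Support/KnownBits.h"
#include <cassert>

// Illustrative only: a 32-bit value whose top 16 bits are proven zero.
static void knownBitsSketch() {
  llvm::KnownBits Known(32);
  assert(Known.isUnknown());                 // nothing known yet

  Known.Zero.setHighBits(16);                // top half known to be zero
  assert(Known.countMinLeadingZeros() == 16);
  assert(Known.countMaxActiveBits() == 16);  // the value fits in 16 bits

  Known.resetAll();                          // forget everything again
  assert(Known.isUnknown());
}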
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
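A sketch of the MachinePointerInfo factories above (illustrative only; the wrapper name, offset and frame index are hypothetical):
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"

// Illustrative only: describe where a memory access points without an IR Value.
static void pointerInfoSketch(llvm::MachineFunction &MF, int FI) {
  // An access relative to the stack pointer, 8 bytes in.
  llvm::MachinePointerInfo SP = llvm::MachinePointerInfo::getStack(MF, /*Offset=*/8);
  // An access to a specific frame index (for example a spill slot).
  llvm::MachinePointerInfo Fixed = llvm::MachinePointerInfo::getFixedStack(MF, FI);
  // An access through the GOT.
  llvm::MachinePointerInfo GOT = llvm::MachinePointerInfo::getGOT(MF);
  (void)SP; (void)Fixed; (void)GOT;
}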
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise, pass NaN through.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals