LLVM 19.0.0git
SIISelLowering.cpp
1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/IRBuilder.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
44#include "llvm/Support/ModRef.h"
45#include <optional>
46
47using namespace llvm;
48
49#define DEBUG_TYPE "si-lower"
50
51STATISTIC(NumTailCalls, "Number of tail calls");
52
54 "amdgpu-disable-loop-alignment",
55 cl::desc("Do not align and prefetch loops"),
56 cl::init(false));
57
59 "amdgpu-use-divergent-register-indexing",
61 cl::desc("Use indirect register addressing for divergent indexes"),
62 cl::init(false));
63
66 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
67}
68
71 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
72}
73
74static unsigned findFirstFreeSGPR(CCState &CCInfo) {
75 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
76 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
77 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
78 return AMDGPU::SGPR0 + Reg;
79 }
80 }
81 llvm_unreachable("Cannot allocate sgpr");
82}
83
85 const GCNSubtarget &STI)
87 Subtarget(&STI) {
88 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
89 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
90
91 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
92 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
93
94 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
95
96 const SIRegisterInfo *TRI = STI.getRegisterInfo();
97 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
98
99 addRegisterClass(MVT::f64, V64RegClass);
100 addRegisterClass(MVT::v2f32, V64RegClass);
101 addRegisterClass(MVT::Untyped, V64RegClass);
102
103 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
104 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
105
106 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
107 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
108
109 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
110 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
111
112 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
113 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
114
115 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
116 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
117
118 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
119 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
120
121 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
122 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
123
124 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
125 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
126
127 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
128 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
129
130 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
131 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
132
133 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
134 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
135
136 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
137 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
138
139 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
140 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
141
142 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
143 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
144
145 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
146 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
147
148 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
149 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
150
151 if (Subtarget->has16BitInsts()) {
152 if (Subtarget->useRealTrue16Insts()) {
153 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
154 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
155 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
156 } else {
157 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
158 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
159 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
160 }
161
162 // Unless there are also VOP3P operations, no operations are really legal.
163 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
164 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
165 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
166 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
167 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
168 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
169 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
170 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
171 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
172 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
173 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
174 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
175 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
176 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
177 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
178 }
179
180 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
181 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
182
184
185 // The boolean content concept here is too inflexible. Compares only ever
186 // really produce a 1-bit result. Any copy/extend from these will turn into a
187 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
188 // it's what most targets use.
191
192 // We need to custom lower vector stores from local memory
194 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
195 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
196 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
197 MVT::i1, MVT::v32i32},
198 Custom);
199
201 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
202 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
203 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
204 MVT::i1, MVT::v32i32},
205 Custom);
206
207 if (isTypeLegal(MVT::bf16)) {
208 for (unsigned Opc :
217 ISD::SETCC}) {
218 // FIXME: The promoted-to type shouldn't need to be explicit
219 setOperationAction(Opc, MVT::bf16, Promote);
220 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
221 }
222
224
226 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
227
231
232 // We only need to custom lower because we can't specify an action for bf16
233 // sources.
236 }
237
238 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
239 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
240 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
241 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
242 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
243 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
244 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
245 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
246 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
247 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
248 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
249 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
250 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
251 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
252 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
253 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
254
255 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
256 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
257 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
258 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
259 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
260 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
261 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
262
263 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
264
268 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
269
270 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
271
273 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
274
276 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
277 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
278
280 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
281 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
282 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
283 Expand);
285 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
286 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
287 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
288 Expand);
289
291 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
292 MVT::v3i16, MVT::v4i16, MVT::Other},
293 Custom);
294
297 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
298
300
302
304 Expand);
305
306#if 0
308#endif
309
310 // We only support LOAD/STORE and vector manipulation ops for vectors
311 // with > 4 elements.
312 for (MVT VT :
313 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
314 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
315 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
316 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
317 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
318 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
319 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
320 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
321 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
322 switch (Op) {
323 case ISD::LOAD:
324 case ISD::STORE:
326 case ISD::BITCAST:
327 case ISD::UNDEF:
331 case ISD::IS_FPCLASS:
332 break;
337 break;
338 default:
340 break;
341 }
342 }
343 }
344
346
347 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
348 // is expanded to avoid having two separate loops in case the index is a VGPR.
349
350 // Most operations are naturally 32-bit vector operations. We only support
351 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
352 for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
354 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
355
357 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
358
360 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
361
363 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
364 }
365
366 for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
368 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
369
371 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
372
374 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
375
377 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
378 }
379
380 for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
382 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
383
385 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
386
388 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
389
391 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
392 }
393
394 for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
396 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
397
399 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
400
402 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
403
405 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
406 }
407
408 for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
410 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
411
413 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
414
416 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
417
419 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
420 }
421
423 {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
424 Expand);
425
426 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
427 Custom);
428
429 // Avoid stack access for these.
430 // TODO: Generalize to more vector types.
432 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
433 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
434 Custom);
435
436 // Deal with vec3 vector operations when widened to vec4.
438 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
439
440 // Deal with vec5/6/7 vector operations when widened to vec8.
442 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
443 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
444 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
445 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
446 Custom);
447
448 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
449 // and output demarshalling
450 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
451
452 // We can't return success/failure, only the old value,
453 // let LLVM add the comparison
455 Expand);
456
457 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
458
459 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
460
461 // FIXME: This should be narrowed to i32, but that only happens if i64 is
462 // illegal.
463 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
464 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
465
466 // On SI this is s_memtime and s_memrealtime on VI.
468
469 if (Subtarget->hasSMemRealTime() ||
473
474 if (Subtarget->has16BitInsts()) {
477 } else {
479 }
480
481 if (Subtarget->hasMadMacF32Insts())
483
484 if (!Subtarget->hasBFI())
485 // fcopysign can be done in a single instruction with BFI.
486 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
487
488 if (!Subtarget->hasBCNT(32))
490
491 if (!Subtarget->hasBCNT(64))
493
494 if (Subtarget->hasFFBH())
496
497 if (Subtarget->hasFFBL())
499
500 // We only really have 32-bit BFE instructions (and 16-bit on VI).
501 //
502 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
503 // effort to match them now. We want this to be false for i64 cases when the
504 // extraction isn't restricted to the upper or lower half. Ideally we would
505 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
506 // span the midpoint are probably relatively rare, so don't worry about them
507 // for now.
508 if (Subtarget->hasBFE())
510
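// Illustrative sketch (not part of this file; all names here are made up for
// the example): what "reduce 64-bit extracts to 32-bit" in the comment above
// would mean in practice. An extract confined to one dword maps onto a single
// 32-bit BFE; one that spans the midpoint does not. Standalone C++, no LLVM
// dependencies.
#include <cassert>
#include <cstdint>
#include <cstdio>

static uint32_t bfe32(uint32_t Src, unsigned Offset, unsigned Width) {
  // Unsigned 32-bit bitfield extract: Width bits starting at bit Offset.
  assert(Width > 0 && Offset < 32 && Offset + Width <= 32);
  uint32_t Mask = Width == 32 ? ~0u : (1u << Width) - 1u;
  return (Src >> Offset) & Mask;
}

// Succeeds (writing the result to Out) when the 64-bit extract fits entirely
// in the low or high dword and can therefore use one 32-bit BFE.
static bool tryBfe64AsBfe32(uint64_t Src, unsigned Offset, unsigned Width,
                            uint32_t &Out) {
  if (Offset + Width <= 32) {
    Out = bfe32(uint32_t(Src), Offset, Width);
    return true;
  }
  if (Offset >= 32) {
    Out = bfe32(uint32_t(Src >> 32), Offset - 32, Width);
    return true;
  }
  return false; // Spans the midpoint: needs two extracts plus a shift/or.
}

int main() {
  uint32_t R;
  if (tryBfe64AsBfe32(0x1122334455667788ull, 40, 8, R))
    std::printf("0x%x\n", R); // 0x33
}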
511 // Clamp modifier on add/sub
512 if (Subtarget->hasIntClamp())
514
515 if (Subtarget->hasAddNoCarry())
516 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
517 Legal);
518
519 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
520 Custom);
521
522 // These are really only legal for ieee_mode functions. We should be avoiding
523 // them for functions that don't have ieee_mode enabled, so just say they are
524 // legal.
526 {MVT::f32, MVT::f64}, Legal);
527
528 if (Subtarget->haveRoundOpsF64())
530 Legal);
531 else
533 MVT::f64, Custom);
534
536 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
537 Legal);
538 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
539
542
543 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
544 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
545
546 // Custom lower these because we can't specify a rule based on an illegal
547 // source bf16.
550
551 if (Subtarget->has16BitInsts()) {
554 MVT::i16, Legal);
555
556 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
557
559 MVT::i16, Expand);
560
564 ISD::CTPOP},
565 MVT::i16, Promote);
566
568
569 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
570
572 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
574 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
575
579
581
582 // F16 - Constant Actions.
585
586 // F16 - Load/Store Actions.
588 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
590 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
591
592 // BF16 - Load/Store Actions.
594 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
596 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
597
598 // F16 - VOP1 Actions.
601 MVT::f16, Custom);
602
605
606 // F16 - VOP2 Actions.
607 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
608 Expand);
612
613 // F16 - VOP3 Actions.
615 if (STI.hasMadF16())
617
618 for (MVT VT :
619 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
620 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
621 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
622 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
623 switch (Op) {
624 case ISD::LOAD:
625 case ISD::STORE:
627 case ISD::BITCAST:
628 case ISD::UNDEF:
634 case ISD::IS_FPCLASS:
635 break;
638 break;
639 default:
641 break;
642 }
643 }
644 }
645
646 // v_perm_b32 can handle either of these.
647 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
649
650 // XXX - Do these do anything? Vector constants turn into build_vector.
651 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
652
653 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
654 Legal);
655
657 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
659 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
660
662 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
664 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
665
666 setOperationAction(ISD::AND, MVT::v2i16, Promote);
667 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
668 setOperationAction(ISD::OR, MVT::v2i16, Promote);
669 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
670 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
671 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
672
674 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
676 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
677 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
678 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
679
681 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
683 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
685 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
686
688 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
690 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
691 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
692 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
693
695 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
697 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
698
700 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
702 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
704 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
705
706 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
707 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
708 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
709 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
710 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
711 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
712
714 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
716 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
717 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
718 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
719
720 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
721 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
722 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
723 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
724 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
725 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
726
728 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
730 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
731 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
732 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
733
735 MVT::v2i32, Expand);
737
739 MVT::v4i32, Expand);
740
742 MVT::v8i32, Expand);
743
744 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
745 Subtarget->hasVOP3PInsts() ? Legal : Custom);
746
747 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
748 // This isn't really legal, but this avoids the legalizer unrolling it (and
749 // allows matching fneg (fabs x) patterns)
750 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
751
754
756 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
757 Custom);
758
760 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
761 Expand);
762
763 for (MVT Vec16 :
764 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
765 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
768 Vec16, Custom);
770 }
771 }
772
773 if (Subtarget->hasVOP3PInsts()) {
777 MVT::v2i16, Legal);
778
781 MVT::v2f16, Legal);
782
783 setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
784 Custom);
785
787 {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
788 MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
789 Custom);
790
791 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
792 // Split vector operations.
797 VT, Custom);
798
799 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
800 // Split vector operations.
802 VT, Custom);
803
804 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
805 Custom);
806
807 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
808 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
809 Custom);
810
811 if (Subtarget->hasPackedFP32Ops()) {
813 MVT::v2f32, Legal);
815 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
816 Custom);
817 }
818 }
819
821
822 if (Subtarget->has16BitInsts()) {
824 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
826 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
827 } else {
828 // Legalization hack.
829 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
830
832 }
833
835 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
836 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
837 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
838 MVT::v32f16, MVT::v32bf16},
839 Custom);
840
842
843 if (Subtarget->hasScalarSMulU64())
845
846 if (Subtarget->hasMad64_32())
848
849 if (Subtarget->hasPrefetch())
851
852 if (Subtarget->hasIEEEMinMax()) {
854 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
856 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
857 Custom);
858 }
859
861 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
862 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
863 MVT::i8},
864 Custom);
865
867 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
868 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
869 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
870 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
871 Custom);
872
874 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
875 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
876 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
877 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
878 Custom);
879
885
886 // TODO: Could move this to custom lowering, could benefit from combines on
887 // extract of relevant bits.
889
891
894 ISD::SUB,
896 ISD::FADD,
897 ISD::FSUB,
898 ISD::FDIV,
905 ISD::FMA,
906 ISD::SMIN,
907 ISD::SMAX,
908 ISD::UMIN,
909 ISD::UMAX,
911 ISD::AND,
912 ISD::OR,
913 ISD::XOR,
914 ISD::FSHR,
924
925 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
927
928 // All memory operations. Some folding on the pointer operand is done to help
929 // match the constant offsets in the addressing modes.
954
955 // FIXME: In other contexts we pretend this is a per-function property.
957
959}
960
962 return Subtarget;
963}
964
966 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
967 return RCRegs;
968}
969
970//===----------------------------------------------------------------------===//
971// TargetLowering queries
972//===----------------------------------------------------------------------===//
973
974// v_mad_mix* support a conversion from f16 to f32.
975//
976 // There is only one special case, when denormals are enabled, where this is
977 // OK to use; we don't currently handle it.
978bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
979 EVT DestVT, EVT SrcVT) const {
980 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
981 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
982 DestVT.getScalarType() == MVT::f32 &&
983 SrcVT.getScalarType() == MVT::f16 &&
984 // TODO: This probably only requires no input flushing?
986}
987
989 LLT DestTy, LLT SrcTy) const {
990 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
991 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
992 DestTy.getScalarSizeInBits() == 32 &&
993 SrcTy.getScalarSizeInBits() == 16 &&
994 // TODO: This probably only requires no input flushing?
996}
997
999 // SI has some legal vector types, but no legal vector operations. Say no
1000 // shuffles are legal in order to prefer scalarizing some vector operations.
1001 return false;
1002}
1003
1006 EVT VT) const {
1009
1010 if (VT.isVector()) {
1011 EVT ScalarVT = VT.getScalarType();
1012 unsigned Size = ScalarVT.getSizeInBits();
1013 if (Size == 16) {
1014 if (Subtarget->has16BitInsts()) {
1015 if (VT.isInteger())
1016 return MVT::v2i16;
1017 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1018 }
1019 return VT.isInteger() ? MVT::i32 : MVT::f32;
1020 }
1021
1022 if (Size < 16)
1023 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1024 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1025 }
1026
1027 if (VT.getSizeInBits() > 32)
1028 return MVT::i32;
1029
1031}
1032
1035 EVT VT) const {
1038
1039 if (VT.isVector()) {
1040 unsigned NumElts = VT.getVectorNumElements();
1041 EVT ScalarVT = VT.getScalarType();
1042 unsigned Size = ScalarVT.getSizeInBits();
1043
1044 // FIXME: Should probably promote 8-bit vectors to i16.
1045 if (Size == 16 && Subtarget->has16BitInsts())
1046 return (NumElts + 1) / 2;
1047
1048 if (Size <= 32)
1049 return NumElts;
1050
1051 if (Size > 32)
1052 return NumElts * ((Size + 31) / 32);
1053 } else if (VT.getSizeInBits() > 32)
1054 return (VT.getSizeInBits() + 31) / 32;
1055
1057}
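// Illustrative sketch (not part of this file): the register-count arithmetic
// used in getNumRegistersForCallingConv above, restated as standalone C++.
// The helper name and the printed cases are made up for the example.
#include <cstdio>

static unsigned numRegsForVectorArg(unsigned NumElts, unsigned EltBits,
                                    bool Has16BitInsts) {
  if (EltBits == 16 && Has16BitInsts)
    return (NumElts + 1) / 2;             // two 16-bit elements per register
  if (EltBits <= 32)
    return NumElts;                       // one element per 32-bit register
  return NumElts * ((EltBits + 31) / 32); // wide elements split into dwords
}

int main() {
  std::printf("v3f16 -> %u registers\n", numRegsForVectorArg(3, 16, true)); // 2
  std::printf("v5i32 -> %u registers\n", numRegsForVectorArg(5, 32, true)); // 5
  std::printf("v2f64 -> %u registers\n", numRegsForVectorArg(2, 64, true)); // 4
}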
1058
1060 LLVMContext &Context, CallingConv::ID CC,
1061 EVT VT, EVT &IntermediateVT,
1062 unsigned &NumIntermediates, MVT &RegisterVT) const {
1063 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1064 unsigned NumElts = VT.getVectorNumElements();
1065 EVT ScalarVT = VT.getScalarType();
1066 unsigned Size = ScalarVT.getSizeInBits();
1067 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1068 // support, but unless we can properly handle 3-vectors, it will still be
1069 // inconsistent.
1070 if (Size == 16 && Subtarget->has16BitInsts()) {
1071 if (ScalarVT == MVT::bf16) {
1072 RegisterVT = MVT::i32;
1073 IntermediateVT = MVT::v2bf16;
1074 } else {
1075 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1076 IntermediateVT = RegisterVT;
1077 }
1078 NumIntermediates = (NumElts + 1) / 2;
1079 return NumIntermediates;
1080 }
1081
1082 if (Size == 32) {
1083 RegisterVT = ScalarVT.getSimpleVT();
1084 IntermediateVT = RegisterVT;
1085 NumIntermediates = NumElts;
1086 return NumIntermediates;
1087 }
1088
1089 if (Size < 16 && Subtarget->has16BitInsts()) {
1090 // FIXME: Should probably form v2i16 pieces
1091 RegisterVT = MVT::i16;
1092 IntermediateVT = ScalarVT;
1093 NumIntermediates = NumElts;
1094 return NumIntermediates;
1095 }
1096
1097
1098 if (Size != 16 && Size <= 32) {
1099 RegisterVT = MVT::i32;
1100 IntermediateVT = ScalarVT;
1101 NumIntermediates = NumElts;
1102 return NumIntermediates;
1103 }
1104
1105 if (Size > 32) {
1106 RegisterVT = MVT::i32;
1107 IntermediateVT = RegisterVT;
1108 NumIntermediates = NumElts * ((Size + 31) / 32);
1109 return NumIntermediates;
1110 }
1111 }
1112
1114 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1115}
1116
1118 const DataLayout &DL, Type *Ty,
1119 unsigned MaxNumLanes) {
1120 assert(MaxNumLanes != 0);
1121
1122 LLVMContext &Ctx = Ty->getContext();
1123 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1124 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1125 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1126 NumElts);
1127 }
1128
1129 return TLI.getValueType(DL, Ty);
1130}
1131
1132// Peek through TFE struct returns to only use the data size.
1134 const DataLayout &DL, Type *Ty,
1135 unsigned MaxNumLanes) {
1136 auto *ST = dyn_cast<StructType>(Ty);
1137 if (!ST)
1138 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1139
1140 // TFE intrinsics return an aggregate type.
1141 assert(ST->getNumContainedTypes() == 2 &&
1142 ST->getContainedType(1)->isIntegerTy(32));
1143 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1144}
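// Illustrative sketch (not part of this file): how a caller-supplied
// MaxNumLanes (e.g. the popcount of an image dmask, as done further down in
// getTgtMemIntrinsic) bounds the memory type produced by the helpers above.
// Standalone C++20; the string output merely names the resulting vector type.
#include <algorithm>
#include <bit>
#include <cstdio>
#include <string>

static std::string memVTForImageLoad(unsigned DMask, unsigned IRNumElts) {
  // A dmask of 0 still reads one lane; otherwise each set bit is one lane.
  unsigned MaxNumLanes = DMask == 0 ? 1 : std::popcount(DMask);
  // The IR return type may declare more elements than are actually loaded.
  unsigned NumElts = std::min(MaxNumLanes, IRNumElts);
  return NumElts == 1 ? "f32" : "v" + std::to_string(NumElts) + "f32";
}

int main() {
  std::printf("%s\n", memVTForImageLoad(0b0011, 4).c_str()); // v2f32
  std::printf("%s\n", memVTForImageLoad(0b0000, 4).c_str()); // f32
}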
1145
1146/// Map address space 7 to MVT::v5i32 because that's its in-memory
1147/// representation. This return value is vector-typed because there is no
1148/// MVT::i160 and it is not clear if one can be added. While this could
1149/// cause issues during codegen, these address space 7 pointers will be
1150/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
1151/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1152/// modeling, to work.
1154 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1155 return MVT::v5i32;
1157 DL.getPointerSizeInBits(AS) == 192)
1158 return MVT::v6i32;
1160}
1161/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1162/// v8i32 when padding is added.
1163/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1164/// also v8i32 with padding.
1166 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1167 DL.getPointerSizeInBits(AS) == 160) ||
1169 DL.getPointerSizeInBits(AS) == 192))
1170 return MVT::v8i32;
1172}
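// Illustrative sketch (not part of this file): the size arithmetic behind the
// two helpers above. The in-register width is simply the pointer width in
// dwords (v5i32 for 160-bit p7, v6i32 for 192-bit p9); the in-memory width is
// assumed here to pad out to the 128-bit alignment of the contained p8
// resource, giving v8i32 for both.
#include <cstdio>

static unsigned inRegDwords(unsigned PtrBits) { return PtrBits / 32; }

static unsigned inMemoryDwords(unsigned PtrBits) {
  unsigned Bytes = (PtrBits / 8 + 15) & ~15u; // round up to 16-byte alignment
  return Bytes / 4;
}

int main() {
  std::printf("p7: v%ui32 in regs, v%ui32 in memory\n", inRegDwords(160),
              inMemoryDwords(160)); // v5i32, v8i32
  std::printf("p9: v%ui32 in regs, v%ui32 in memory\n", inRegDwords(192),
              inMemoryDwords(192)); // v6i32, v8i32
}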
1173
1175 const CallInst &CI,
1176 MachineFunction &MF,
1177 unsigned IntrID) const {
1179 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1181
1182 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1185 (Intrinsic::ID)IntrID);
1186 MemoryEffects ME = Attr.getMemoryEffects();
1187 if (ME.doesNotAccessMemory())
1188 return false;
1189
1190 // TODO: Should images get their own address space?
1191 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1192
1193 if (RsrcIntr->IsImage)
1194 Info.align.reset();
1195
1196 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1197 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1198 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1199 // We conservatively set the memory operand of a buffer intrinsic to the
1200 // base resource pointer, so that we can access alias information about
1201 // those pointers. Cases like "this points at the same value
1202 // but with a different offset" are handled in
1203 // areMemAccessesTriviallyDisjoint.
1204 Info.ptrVal = RsrcArg;
1205 }
1206
1207 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1208 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1211 if (ME.onlyReadsMemory()) {
1212 if (RsrcIntr->IsImage) {
1213 unsigned MaxNumLanes = 4;
1214
1217 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1219
1220 if (!BaseOpcode->Gather4) {
1221 // If this isn't a gather, we may have excess loaded elements in the
1222 // IR type. Check the dmask for the real number of elements loaded.
1223 unsigned DMask
1224 = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1225 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1226 }
1227
1228 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1229 CI.getType(), MaxNumLanes);
1230 } else {
1231 Info.memVT =
1233 std::numeric_limits<unsigned>::max());
1234 }
1235
1236 // FIXME: What does alignment mean for an image?
1239 } else if (ME.onlyWritesMemory()) {
1241
1242 Type *DataTy = CI.getArgOperand(0)->getType();
1243 if (RsrcIntr->IsImage) {
1244 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1245 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1246 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1247 DMaskLanes);
1248 } else
1249 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1250
1252 } else {
1253 // Atomic
1254 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
1259
1260 switch (IntrID) {
1261 default:
1262 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1263 // XXX - Should this be volatile without known ordering?
1265 break;
1266 case Intrinsic::amdgcn_raw_buffer_load_lds:
1267 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1268 case Intrinsic::amdgcn_struct_buffer_load_lds:
1269 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1270 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1271 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1272 Info.ptrVal = CI.getArgOperand(1);
1273 return true;
1274 }
1275 }
1276 }
1277 return true;
1278 }
1279
1280 switch (IntrID) {
1281 case Intrinsic::amdgcn_ds_ordered_add:
1282 case Intrinsic::amdgcn_ds_ordered_swap: {
1284 Info.memVT = MVT::getVT(CI.getType());
1285 Info.ptrVal = CI.getOperand(0);
1286 Info.align.reset();
1288
1289 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1290 if (!Vol->isZero())
1292
1293 return true;
1294 }
1295 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1296 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1298 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1299 Info.ptrVal = nullptr;
1300 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1302 return true;
1303 }
1304 case Intrinsic::amdgcn_ds_append:
1305 case Intrinsic::amdgcn_ds_consume: {
1307 Info.memVT = MVT::getVT(CI.getType());
1308 Info.ptrVal = CI.getOperand(0);
1309 Info.align.reset();
1311
1312 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1313 if (!Vol->isZero())
1315
1316 return true;
1317 }
1318 case Intrinsic::amdgcn_global_atomic_csub: {
1320 Info.memVT = MVT::getVT(CI.getType());
1321 Info.ptrVal = CI.getOperand(0);
1322 Info.align.reset();
1326 return true;
1327 }
1328 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1330 Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1331
1332 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1333 Info.align.reset();
1336 return true;
1337 }
1338 case Intrinsic::amdgcn_global_atomic_fadd:
1339 case Intrinsic::amdgcn_global_atomic_fmin:
1340 case Intrinsic::amdgcn_global_atomic_fmax:
1341 case Intrinsic::amdgcn_global_atomic_fmin_num:
1342 case Intrinsic::amdgcn_global_atomic_fmax_num:
1343 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1344 case Intrinsic::amdgcn_flat_atomic_fadd:
1345 case Intrinsic::amdgcn_flat_atomic_fmin:
1346 case Intrinsic::amdgcn_flat_atomic_fmax:
1347 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1348 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1349 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1350 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1351 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
1353 Info.memVT = MVT::getVT(CI.getType());
1354 Info.ptrVal = CI.getOperand(0);
1355 Info.align.reset();
1360 return true;
1361 }
1362 case Intrinsic::amdgcn_global_load_tr_b64:
1363 case Intrinsic::amdgcn_global_load_tr_b128: {
1365 Info.memVT = MVT::getVT(CI.getType());
1366 Info.ptrVal = CI.getOperand(0);
1367 Info.align.reset();
1369 return true;
1370 }
1371 case Intrinsic::amdgcn_ds_gws_init:
1372 case Intrinsic::amdgcn_ds_gws_barrier:
1373 case Intrinsic::amdgcn_ds_gws_sema_v:
1374 case Intrinsic::amdgcn_ds_gws_sema_br:
1375 case Intrinsic::amdgcn_ds_gws_sema_p:
1376 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1378
1379 const GCNTargetMachine &TM =
1380 static_cast<const GCNTargetMachine &>(getTargetMachine());
1381
1383 Info.ptrVal = MFI->getGWSPSV(TM);
1384
1385 // This is an abstract access, but we need to specify a type and size.
1386 Info.memVT = MVT::i32;
1387 Info.size = 4;
1388 Info.align = Align(4);
1389
1390 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1392 else
1394 return true;
1395 }
1396 case Intrinsic::amdgcn_global_load_lds: {
1398 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1399 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1400 Info.ptrVal = CI.getArgOperand(1);
1402 return true;
1403 }
1404 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1406
1407 const GCNTargetMachine &TM =
1408 static_cast<const GCNTargetMachine &>(getTargetMachine());
1409
1411 Info.ptrVal = MFI->getGWSPSV(TM);
1412
1413 // This is an abstract access, but we need to specify a type and size.
1414 Info.memVT = MVT::i32;
1415 Info.size = 4;
1416 Info.align = Align(4);
1417
1419 return true;
1420 }
1421 default:
1422 return false;
1423 }
1424}
1425
1427 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1428 switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
1429 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1430 // The DAG's ValueType loses the addrspaces.
1431 // Add them as 2 extra Constant operands "from" and "to".
1432 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1433 unsigned DstAS = I.getType()->getPointerAddressSpace();
1434 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1435 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1436 break;
1437 }
1438 default:
1439 break;
1440 }
1441}
1442
1445 Type *&AccessTy) const {
1446 Value *Ptr = nullptr;
1447 switch (II->getIntrinsicID()) {
1448 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1449 case Intrinsic::amdgcn_ds_append:
1450 case Intrinsic::amdgcn_ds_consume:
1451 case Intrinsic::amdgcn_ds_ordered_add:
1452 case Intrinsic::amdgcn_ds_ordered_swap:
1453 case Intrinsic::amdgcn_flat_atomic_fadd:
1454 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
1455 case Intrinsic::amdgcn_flat_atomic_fmax:
1456 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1457 case Intrinsic::amdgcn_flat_atomic_fmin:
1458 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1459 case Intrinsic::amdgcn_global_atomic_csub:
1460 case Intrinsic::amdgcn_global_atomic_fadd:
1461 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1462 case Intrinsic::amdgcn_global_atomic_fmax:
1463 case Intrinsic::amdgcn_global_atomic_fmax_num:
1464 case Intrinsic::amdgcn_global_atomic_fmin:
1465 case Intrinsic::amdgcn_global_atomic_fmin_num:
1466 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1467 case Intrinsic::amdgcn_global_load_tr_b64:
1468 case Intrinsic::amdgcn_global_load_tr_b128:
1469 Ptr = II->getArgOperand(0);
1470 break;
1471 case Intrinsic::amdgcn_global_load_lds:
1472 Ptr = II->getArgOperand(1);
1473 break;
1474 default:
1475 return false;
1476 }
1477 AccessTy = II->getType();
1478 Ops.push_back(Ptr);
1479 return true;
1480}
1481
1483 unsigned AddrSpace) const {
1484 if (!Subtarget->hasFlatInstOffsets()) {
1485 // Flat instructions do not have offsets, and only have the register
1486 // address.
1487 return AM.BaseOffs == 0 && AM.Scale == 0;
1488 }
1489
1490 decltype(SIInstrFlags::FLAT) FlatVariant =
1494
1495 return AM.Scale == 0 &&
1496 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1497 AM.BaseOffs, AddrSpace, FlatVariant));
1498}
1499
1501 if (Subtarget->hasFlatGlobalInsts())
1503
1504 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1505 // Assume that we will use FLAT for all global memory accesses
1506 // on VI.
1507 // FIXME: This assumption is currently wrong. On VI we still use
1508 // MUBUF instructions for the r + i addressing mode. As currently
1509 // implemented, the MUBUF instructions only work on buffers < 4GB.
1510 // It may be possible to support > 4GB buffers with MUBUF instructions,
1511 // by setting the stride value in the resource descriptor which would
1512 // increase the size limit to (stride * 4GB). However, this is risky,
1513 // because it has never been validated.
1515 }
1516
1517 return isLegalMUBUFAddressingMode(AM);
1518}
1519
1520bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1521 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1522 // additionally can do r + r + i with addr64. 32-bit has more addressing
1523 // mode options. Depending on the resource constant, it can also do
1524 // (i64 r0) + (i32 r1) * (i14 i).
1525 //
1526 // Private arrays end up using a scratch buffer most of the time, so also
1527 // assume those use MUBUF instructions. Scratch loads / stores are currently
1528 // implemented as mubuf instructions with offen bit set, so slightly
1529 // different than the normal addr64.
1530 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1531 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1532 return false;
1533
1534 // FIXME: Since we can split immediate into soffset and immediate offset,
1535 // would it make sense to allow any immediate?
1536
1537 switch (AM.Scale) {
1538 case 0: // r + i or just i, depending on HasBaseReg.
1539 return true;
1540 case 1:
1541 return true; // We have r + r or r + i.
1542 case 2:
1543 if (AM.HasBaseReg) {
1544 // Reject 2 * r + r.
1545 return false;
1546 }
1547
1548 // Allow 2 * r as r + r
1549 // Or 2 * r + i is allowed as r + r + i.
1550 return true;
1551 default: // Don't allow n * r
1552 return false;
1553 }
1554}
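// Illustrative sketch (not part of this file): the MUBUF addressing-mode rules
// above over a simplified AddrMode. The 12-bit unsigned immediate bound is
// taken from the comment; the real check is TII->isLegalMUBUFImmOffset().
#include <cstdint>
#include <cstdio>

struct SimpleAddrMode {
  int64_t BaseOffs = 0;
  bool HasBaseReg = false;
  int64_t Scale = 0;
};

static bool isLegalMUBUFAddrModeSketch(const SimpleAddrMode &AM) {
  if (AM.BaseOffs < 0 || AM.BaseOffs >= (1 << 12))
    return false;  // immediate offset out of range
  switch (AM.Scale) {
  case 0:          // r + i or just i
  case 1:          // r + r or r + i
    return true;
  case 2:          // 2 * r folds to r + r, but 2 * r + r does not
    return !AM.HasBaseReg;
  default:         // n * r is not addressable
    return false;
  }
}

int main() {
  std::printf("%d\n", isLegalMUBUFAddrModeSketch({4, true, 1})); // 1
  std::printf("%d\n", isLegalMUBUFAddrModeSketch({0, true, 2})); // 0
}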
1555
1557 const AddrMode &AM, Type *Ty,
1558 unsigned AS, Instruction *I) const {
1559 // No global is ever allowed as a base.
1560 if (AM.BaseGV)
1561 return false;
1562
1563 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1564 return isLegalGlobalAddressingMode(AM);
1565
1566 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1570 // If the offset isn't a multiple of 4, it probably isn't going to be
1571 // correctly aligned.
1572 // FIXME: Can we get the real alignment here?
1573 if (AM.BaseOffs % 4 != 0)
1574 return isLegalMUBUFAddressingMode(AM);
1575
1576 if (!Subtarget->hasScalarSubwordLoads()) {
1577 // There are no SMRD extloads, so if we have to do a small type access we
1578 // will use a MUBUF load.
1579 // FIXME?: We also need to do this if unaligned, but we don't know the
1580 // alignment here.
1581 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1582 return isLegalGlobalAddressingMode(AM);
1583 }
1584
1586 // SMRD instructions have an 8-bit, dword offset on SI.
1587 if (!isUInt<8>(AM.BaseOffs / 4))
1588 return false;
1589 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1590 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1591 // in 8-bits, it can use a smaller encoding.
1592 if (!isUInt<32>(AM.BaseOffs / 4))
1593 return false;
1594 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1595 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1596 if (!isUInt<20>(AM.BaseOffs))
1597 return false;
1598 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1599 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1600 // for S_BUFFER_* instructions).
1601 if (!isInt<21>(AM.BaseOffs))
1602 return false;
1603 } else {
1604 // On GFX12, all offsets are signed 24-bit in bytes.
1605 if (!isInt<24>(AM.BaseOffs))
1606 return false;
1607 }
1608
1609 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1611 AM.BaseOffs < 0) {
1612 // Scalar (non-buffer) loads can only use a negative offset if
1613 // soffset+offset is non-negative. Since the compiler can only prove that
1614 // in a few special cases, it is safer to claim that negative offsets are
1615 // not supported.
1616 return false;
1617 }
1618
1619 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1620 return true;
1621
1622 if (AM.Scale == 1 && AM.HasBaseReg)
1623 return true;
1624
1625 return false;
1626 }
1627
1628 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1629 return Subtarget->enableFlatScratch()
1631 : isLegalMUBUFAddressingMode(AM);
1632
1633 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1634 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1635 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1636 // field.
1637 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1638 // an 8-bit dword offset but we don't know the alignment here.
1639 if (!isUInt<16>(AM.BaseOffs))
1640 return false;
1641
1642 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1643 return true;
1644
1645 if (AM.Scale == 1 && AM.HasBaseReg)
1646 return true;
1647
1648 return false;
1649 }
1650
1652 // For an unknown address space, this usually means that this is for some
1653 // reason being used for pure arithmetic, and not based on some addressing
1654 // computation. We don't have instructions that compute pointers with any
1655 // addressing modes, so treat them as having no offset like flat
1656 // instructions.
1658 }
1659
1660 // Assume a user alias of global for unknown address spaces.
1661 return isLegalGlobalAddressingMode(AM);
1662}
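// Illustrative sketch (not part of this file): the per-generation SMRD offset
// ranges applied in the constant-address case above, as a standalone C++
// predicate. The generation enum is a stand-in for AMDGPUSubtarget's, and
// BaseOffs is assumed to already be a multiple of 4 on SI/CI (the earlier
// check falls back to MUBUF otherwise).
#include <cstdint>
#include <cstdio>

enum class Gen { SI, CI, VI, GFX9, GFX12 };

static bool isLegalSMRDImmOffset(Gen G, int64_t BaseOffs) {
  auto isUIntN = [](unsigned N, int64_t V) {
    return V >= 0 && V < (int64_t(1) << N);
  };
  auto isIntN = [](unsigned N, int64_t V) {
    return V >= -(int64_t(1) << (N - 1)) && V < (int64_t(1) << (N - 1));
  };
  switch (G) {
  case Gen::SI:    return isUIntN(8, BaseOffs / 4);  // 8-bit dword offset
  case Gen::CI:    return isUIntN(32, BaseOffs / 4); // 32-bit literal, in dwords
  case Gen::VI:    return isUIntN(20, BaseOffs);     // 20-bit byte offset
  case Gen::GFX9:  return isIntN(21, BaseOffs);      // signed 21-bit bytes
  case Gen::GFX12: return isIntN(24, BaseOffs);      // signed 24-bit bytes
  }
  return false;
}

int main() {
  std::printf("%d\n", isLegalSMRDImmOffset(Gen::SI, 1020));    // 1 (255 dwords)
  std::printf("%d\n", isLegalSMRDImmOffset(Gen::VI, 1 << 20)); // 0
}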
1663
1665 const MachineFunction &MF) const {
1667 return (MemVT.getSizeInBits() <= 4 * 32);
1668 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1669 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1670 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1671 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1672 return (MemVT.getSizeInBits() <= 2 * 32);
1673 }
1674 return true;
1675}
1676
1678 unsigned Size, unsigned AddrSpace, Align Alignment,
1679 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1680 if (IsFast)
1681 *IsFast = 0;
1682
1683 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1684 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1685 // Check if alignment requirements for ds_read/write instructions are
1686 // disabled.
1687 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1688 return false;
1689
1690 Align RequiredAlignment(PowerOf2Ceil(Size/8)); // Natural alignment.
1691 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1692 Alignment < RequiredAlignment)
1693 return false;
1694
1695 // Either the alignment requirements are "enabled", or there is an
1696 // unaligned-LDS-access-related hardware bug even though alignment requirements
1697 // are "disabled". In either case, we need to check for proper alignment
1698 // requirements.
1699 //
1700 switch (Size) {
1701 case 64:
1702 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1703 // address is negative, then the instruction is incorrectly treated as
1704 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1705 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1706 // load later in the SILoadStoreOptimizer.
1707 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1708 return false;
1709
1710 // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
1711 // can do a 4-byte-aligned, 8-byte access in a single operation using
1712 // ds_read2/write2_b32 with adjacent offsets.
1713 RequiredAlignment = Align(4);
1714
1715 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1716 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1717 // ds_write2_b32 depending on the alignment. In either case with either
1718 // alignment there is no faster way of doing this.
1719
1720 // The numbers returned here and below are not additive; they form a 'speed
1721 // rank'. They are just meant to be compared to decide if a certain way
1722 // of lowering an operation is faster than another. For that purpose a
1723 // naturally aligned operation gets its bit size to indicate that "it
1724 // operates with a speed comparable to an N-bit wide load". With full
1725 // alignment ds128 is slower than ds96, for example. If underaligned, it
1726 // is comparable to the speed of a single dword access, which would then
1727 // mean 32 < 128 and it is faster to issue a wide load regardless.
1728 // 1 simply means "slow, don't do it". I.e., when comparing an aligned load
1729 // to a wider load that will no longer be aligned, the latter is slower.
1730 if (IsFast)
1731 *IsFast = (Alignment >= RequiredAlignment) ? 64
1732 : (Alignment < Align(4)) ? 32
1733 : 1;
1734 return true;
1735 }
1736
1737 break;
1738 case 96:
1739 if (!Subtarget->hasDS96AndDS128())
1740 return false;
1741
1742 // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
1743 // gfx8 and older.
1744
1745 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1746 // Naturally aligned access is fastest. However, also report it is Fast
1747 // if memory is aligned less than DWORD. A narrow load or store will be
1748 // as slow as a single ds_read_b96/ds_write_b96, but there will
1749 // be more of them, so overall we will pay less penalty issuing a single
1750 // instruction.
1751
1752 // See comment on the values above.
1753 if (IsFast)
1754 *IsFast = (Alignment >= RequiredAlignment) ? 96
1755 : (Alignment < Align(4)) ? 32
1756 : 1;
1757 return true;
1758 }
1759
1760 break;
1761 case 128:
1762 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1763 return false;
1764
1765 // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
1766 // gfx8 and older, but we can do an 8-byte-aligned, 16-byte access in a
1767 // single operation using ds_read2/write2_b64.
1768 RequiredAlignment = Align(8);
1769
1770 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1771 // Naturally aligned access is fastest. However, also report it is Fast
1772 // if memory is aligned less than DWORD. A narrow load or store will be
1773 // as slow as a single ds_read_b128/ds_write_b128, but there
1774 // will be more of them, so overall we will pay less penalty issuing a
1775 // single instruction.
1776
1777 // See comment on the values above.
1778 if (IsFast)
1779 *IsFast = (Alignment >= RequiredAlignment) ? 128
1780 : (Alignment < Align(4)) ? 32
1781 : 1;
1782 return true;
1783 }
1784
1785 break;
1786 default:
1787 if (Size > 32)
1788 return false;
1789
1790 break;
1791 }
1792
1793 // See comment on the values above.
1794 // Note that we have a single-dword or sub-dword access here, so if it is
1795 // underaligned it is the slowest possible access, hence the returned value is 0.
1796 if (IsFast)
1797 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1798
1799 return Alignment >= RequiredAlignment ||
1800 Subtarget->hasUnalignedDSAccessEnabled();
1801 }
1802
1803 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
1804 bool AlignedBy4 = Alignment >= Align(4);
1805 if (IsFast)
1806 *IsFast = AlignedBy4;
1807
1808 return AlignedBy4 ||
1809 Subtarget->enableFlatScratch() ||
1810 Subtarget->hasUnalignedScratchAccess();
1811 }
1812
1813 // FIXME: We have to be conservative here and assume that flat operations
1814 // will access scratch. If we had access to the IR function, then we
1815 // could determine if any private memory was used in the function.
1816 if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
1817 !Subtarget->hasUnalignedScratchAccess()) {
1818 bool AlignedBy4 = Alignment >= Align(4);
1819 if (IsFast)
1820 *IsFast = AlignedBy4;
1821
1822 return AlignedBy4;
1823 }
1824
1825 // So long as they are correct, wide global memory operations perform better
1826 // than multiple smaller memory ops -- even when misaligned
1827 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
1828 if (IsFast)
1829 *IsFast = Size;
1830
1831 return Alignment >= Align(4) ||
1833 }
1834
1835 // Smaller than dword value must be aligned.
1836 if (Size < 32)
1837 return false;
1838
1839 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1840 // byte-address are ignored, thus forcing Dword alignment.
1841 // This applies to private, global, and constant memory.
1842 if (IsFast)
1843 *IsFast = 1;
1844
1845 return Size >= 32 && Alignment >= Align(4);
1846}
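// Illustrative sketch (not part of this file): the "speed rank" scheme the LDS
// cases above report through *IsFast when unaligned DS access is enabled,
// restated as standalone C++. The required alignments mirror the choices
// above: 4 bytes for b64 (ds_read2_b32 covers it), 8 bytes for b128
// (ds_read2_b64), and the natural 16 bytes for b96.
#include <cstdio>

static unsigned ldsSpeedRank(unsigned SizeBits, unsigned AlignBytes) {
  unsigned RequiredAlign = SizeBits == 64 ? 4 : SizeBits == 128 ? 8 : 16;
  if (AlignBytes >= RequiredAlign)
    return SizeBits; // "operates with a speed comparable to an N-bit access"
  if (AlignBytes < 4)
    return 32;       // underaligned: roughly single-dword speed
  return 1;          // "slow, don't do it"
}

int main() {
  // Ranks are only compared, never added: at dword alignment a ds_read_b64 is
  // preferred (rank 64) while a ds_read_b128 is discouraged (rank 1).
  std::printf("b64@4:  %u\n", ldsSpeedRank(64, 4));  // 64
  std::printf("b128@4: %u\n", ldsSpeedRank(128, 4)); // 1
  std::printf("b128@2: %u\n", ldsSpeedRank(128, 2)); // 32
}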
1847
1849 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1850 unsigned *IsFast) const {
1852 Alignment, Flags, IsFast);
1853}
1854
1856 const MemOp &Op, const AttributeList &FuncAttributes) const {
1857 // FIXME: Should account for address space here.
1858
1859 // The default fallback uses the private pointer size as a guess for a type to
1860 // use. Make sure we switch these to 64-bit accesses.
1861
1862 if (Op.size() >= 16 &&
1863 Op.isDstAligned(Align(4))) // XXX: Should only do for global
1864 return MVT::v4i32;
1865
1866 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1867 return MVT::v2i32;
1868
1869 // Use the default.
1870 return MVT::Other;
1871}
1872
1874 const MemSDNode *MemNode = cast<MemSDNode>(N);
1875 return MemNode->getMemOperand()->getFlags() & MONoClobber;
1876}
1877
1879 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1881}
1882
1884 unsigned DestAS) const {
1885 // Flat -> private/local is a simple truncate.
1886 // Flat -> global is no-op
1887 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1888 return true;
1889
1890 const GCNTargetMachine &TM =
1891 static_cast<const GCNTargetMachine &>(getTargetMachine());
1892 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1893}
1894
1896 const MemSDNode *MemNode = cast<MemSDNode>(N);
1897
1899}
1900
1903 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1904 VT.getScalarType().bitsLE(MVT::i16))
1907}
1908
1910 Type *Ty) const {
1911 // FIXME: Could be smarter if called for vector constants.
1912 return true;
1913}
1914
1916 unsigned Index) const {
1918 return false;
1919
1920 // TODO: Add more cases that are cheap.
1921 return Index == 0;
1922}
1923
1925 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1926 switch (Op) {
1927 case ISD::LOAD:
1928 case ISD::STORE:
1929
1930 // These operations are done with 32-bit instructions anyway.
1931 case ISD::AND:
1932 case ISD::OR:
1933 case ISD::XOR:
1934 case ISD::SELECT:
1935 // TODO: Extensions?
1936 return true;
1937 default:
1938 return false;
1939 }
1940 }
1941
1942 // SimplifySetCC uses this function to determine whether or not it should
1943 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1944 if (VT == MVT::i1 && Op == ISD::SETCC)
1945 return false;
1946
1948}
1949
1950SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1951 const SDLoc &SL,
1952 SDValue Chain,
1953 uint64_t Offset) const {
1954 const DataLayout &DL = DAG.getDataLayout();
1957
1958 const ArgDescriptor *InputPtrReg;
1959 const TargetRegisterClass *RC;
1960 LLT ArgTy;
1962
1963 std::tie(InputPtrReg, RC, ArgTy) =
1965
1966 // We may not have the kernarg segment argument if we have no kernel
1967 // arguments.
1968 if (!InputPtrReg)
1969 return DAG.getConstant(Offset, SL, PtrVT);
1970
1972 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1973 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1974
1975 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
1976}
1977
1978SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1979 const SDLoc &SL) const {
1982 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1983}
1984
1985SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
1986 const SDLoc &SL) const {
1987
1989 std::optional<uint32_t> KnownSize =
1991 if (KnownSize.has_value())
1992 return DAG.getConstant(*KnownSize, SL, MVT::i32);
1993 return SDValue();
1994}
1995
1996SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1997 const SDLoc &SL, SDValue Val,
1998 bool Signed,
1999 const ISD::InputArg *Arg) const {
2000 // First, if it is a widened vector, narrow it.
2001 if (VT.isVector() &&
2003 EVT NarrowedVT =
2006 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2007 DAG.getConstant(0, SL, MVT::i32));
2008 }
2009
2010 // Then convert the vector elements or scalar value.
2011 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
2012 VT.bitsLT(MemVT)) {
2013 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2014 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2015 }
2016
2017 if (MemVT.isFloatingPoint())
2018 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2019 else if (Signed)
2020 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2021 else
2022 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2023
2024 return Val;
2025}
2026
2027SDValue SITargetLowering::lowerKernargMemParameter(
2028 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2029 uint64_t Offset, Align Alignment, bool Signed,
2030 const ISD::InputArg *Arg) const {
2031 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2032
2033 // Try to avoid using an extload by loading earlier than the argument address,
2034 // and extracting the relevant bits. The load should hopefully be merged with
2035 // the previous argument.
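 // For example, a 2-byte argument at offset 6 is covered by the 4-byte load at
 // offset 4 below and recovered with an OffsetDiff * 8 = 16-bit right shift.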
2036 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2037 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2038 int64_t AlignDownOffset = alignDown(Offset, 4);
2039 int64_t OffsetDiff = Offset - AlignDownOffset;
2040
2041 EVT IntVT = MemVT.changeTypeToInteger();
2042
2043 // TODO: If we passed in the base kernel offset we could have a better
2044 // alignment than 4, but we don't really need it.
2045 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2046 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2047 MachineMemOperand::MODereferenceable |
2048 MachineMemOperand::MOInvariant);
2049
2050 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2051 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2052
2053 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2054 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2055 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2056
2057
2058 return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
2059 }
2060
2061 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2062 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2063 MachineMemOperand::MODereferenceable |
2064 MachineMemOperand::MOInvariant);
2065
2066 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2067 return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
2068}
2069
2070SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
2071 const SDLoc &SL, SDValue Chain,
2072 const ISD::InputArg &Arg) const {
2073 MachineFunction &MF = DAG.getMachineFunction();
2074 MachineFrameInfo &MFI = MF.getFrameInfo();
2075
2076 if (Arg.Flags.isByVal()) {
2077 unsigned Size = Arg.Flags.getByValSize();
2078 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2079 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2080 }
2081
2082 unsigned ArgOffset = VA.getLocMemOffset();
2083 unsigned ArgSize = VA.getValVT().getStoreSize();
2084
2085 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2086
2087 // Create load nodes to retrieve arguments from the stack.
2088 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2089 SDValue ArgValue;
2090
2091 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2092 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
2093 MVT MemVT = VA.getValVT();
2094
2095 switch (VA.getLocInfo()) {
2096 default:
2097 break;
2098 case CCValAssign::BCvt:
2099 MemVT = VA.getLocVT();
2100 break;
2101 case CCValAssign::SExt:
2102 ExtType = ISD::SEXTLOAD;
2103 break;
2104 case CCValAssign::ZExt:
2105 ExtType = ISD::ZEXTLOAD;
2106 break;
2107 case CCValAssign::AExt:
2108 ExtType = ISD::EXTLOAD;
2109 break;
2110 }
2111
2112 ArgValue = DAG.getExtLoad(
2113 ExtType, SL, VA.getLocVT(), Chain, FIN,
2114 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
2115 MemVT);
2116 return ArgValue;
2117}
2118
2119SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
2120 const SIMachineFunctionInfo &MFI,
2121 EVT VT,
2122 AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
2123 const ArgDescriptor *Reg = nullptr;
2124 const TargetRegisterClass *RC;
2125 LLT Ty;
2126
2127 CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv();
2128 const ArgDescriptor WorkGroupIDX =
2129 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2130 // If GridZ is not programmed in an entry function then the hardware will set
2131 // it to all zeros, so there is no need to mask the GridY value in the low
2132 // order bits.
2133 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2134 AMDGPU::TTMP7,
2135 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2136 const ArgDescriptor WorkGroupIDZ =
2137 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
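 // Together these descriptors treat TTMP7 as a packed pair: workgroup ID Y in
 // bits [15:0] and workgroup ID Z in bits [31:16], while TTMP9 holds workgroup ID X.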
2138 if (Subtarget->hasArchitectedSGPRs() &&
2139 (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
2140 switch (PVID) {
2141 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
2142 Reg = &WorkGroupIDX;
2143 RC = &AMDGPU::SReg_32RegClass;
2144 Ty = LLT::scalar(32);
2145 break;
2146 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
2147 Reg = &WorkGroupIDY;
2148 RC = &AMDGPU::SReg_32RegClass;
2149 Ty = LLT::scalar(32);
2150 break;
2151 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
2152 Reg = &WorkGroupIDZ;
2153 RC = &AMDGPU::SReg_32RegClass;
2154 Ty = LLT::scalar(32);
2155 break;
2156 default:
2157 break;
2158 }
2159 }
2160
2161 if (!Reg)
2162 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2163 if (!Reg) {
2164 if (PVID == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
2165 // It's possible for a kernarg intrinsic call to appear in a kernel with
2166 // no allocated segment, in which case we do not add the user sgpr
2167 // argument, so just return null.
2168 return DAG.getConstant(0, SDLoc(), VT);
2169 }
2170
2171 // It's undefined behavior if a function marked with the amdgpu-no-*
2172 // attributes uses the corresponding intrinsic.
2173 return DAG.getUNDEF(VT);
2174 }
2175
2176 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2177}
2178
2179 static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
2180 CallingConv::ID CallConv,
2181 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2182 FunctionType *FType,
2183 SIMachineFunctionInfo *Info) {
2184 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2185 const ISD::InputArg *Arg = &Ins[I];
2186
2187 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2188 "vector type argument should have been split");
2189
2190 // First check if it's a PS input addr.
2191 if (CallConv == CallingConv::AMDGPU_PS &&
2192 !Arg->Flags.isInReg() && PSInputNum <= 15) {
2193 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2194
2195 // Inconveniently only the first part of the split is marked as isSplit,
2196 // so skip to the end. We only want to increment PSInputNum once for the
2197 // entire split argument.
2198 if (Arg->Flags.isSplit()) {
2199 while (!Arg->Flags.isSplitEnd()) {
2200 assert((!Arg->VT.isVector() ||
2201 Arg->VT.getScalarSizeInBits() == 16) &&
2202 "unexpected vector split in ps argument type");
2203 if (!SkipArg)
2204 Splits.push_back(*Arg);
2205 Arg = &Ins[++I];
2206 }
2207 }
2208
2209 if (SkipArg) {
2210 // We can safely skip PS inputs.
2211 Skipped.set(Arg->getOrigArgIndex());
2212 ++PSInputNum;
2213 continue;
2214 }
2215
2216 Info->markPSInputAllocated(PSInputNum);
2217 if (Arg->Used)
2218 Info->markPSInputEnabled(PSInputNum);
2219
2220 ++PSInputNum;
2221 }
2222
2223 Splits.push_back(*Arg);
2224 }
2225}
2226
2227// Allocate special inputs passed in VGPRs.
2228 void SITargetLowering::allocateSpecialEntryInputVGPRs(CCState &CCInfo,
2229 MachineFunction &MF,
2230 const SIRegisterInfo &TRI,
2231 SIMachineFunctionInfo &Info) const {
2232 const LLT S32 = LLT::scalar(32);
2233 MachineRegisterInfo &MRI = MF.getRegInfo();
2234
2235 if (Info.hasWorkItemIDX()) {
2236 Register Reg = AMDGPU::VGPR0;
2237 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2238
2239 CCInfo.AllocateReg(Reg);
2240 unsigned Mask = (Subtarget->hasPackedTID() &&
2241 Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2242 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2243 }
2244
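 // With packed TID the three workitem IDs share VGPR0 (X in bits [9:0], Y in
 // bits [19:10], Z in bits [29:20]); otherwise Y and Z get their own VGPR1/VGPR2.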
2245 if (Info.hasWorkItemIDY()) {
2246 assert(Info.hasWorkItemIDX());
2247 if (Subtarget->hasPackedTID()) {
2248 Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2249 0x3ff << 10));
2250 } else {
2251 unsigned Reg = AMDGPU::VGPR1;
2252 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2253
2254 CCInfo.AllocateReg(Reg);
2255 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2256 }
2257 }
2258
2259 if (Info.hasWorkItemIDZ()) {
2260 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2261 if (Subtarget->hasPackedTID()) {
2262 Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2263 0x3ff << 20));
2264 } else {
2265 unsigned Reg = AMDGPU::VGPR2;
2266 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2267
2268 CCInfo.AllocateReg(Reg);
2269 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2270 }
2271 }
2272}
2273
2274 // Try to allocate a VGPR at the end of the argument list, or if no argument
2275 // VGPRs are left, allocate a stack slot instead.
2276 // If \p Mask is given, it indicates the bitfield position in the register.
2277 // If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2278static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2279 ArgDescriptor Arg = ArgDescriptor()) {
2280 if (Arg.isSet())
2281 return ArgDescriptor::createArg(Arg, Mask);
2282
2283 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2284 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2285 if (RegIdx == ArgVGPRs.size()) {
2286 // Spill to stack required.
2287 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2288
2289 return ArgDescriptor::createStack(Offset, Mask);
2290 }
2291
2292 unsigned Reg = ArgVGPRs[RegIdx];
2293 Reg = CCInfo.AllocateReg(Reg);
2294 assert(Reg != AMDGPU::NoRegister);
2295
2296 MachineFunction &MF = CCInfo.getMachineFunction();
2297 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2298 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2299 return ArgDescriptor::createRegister(Reg, Mask);
2300}
2301
2302 static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
2303 const TargetRegisterClass *RC,
2304 unsigned NumArgRegs) {
2305 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2306 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2307 if (RegIdx == ArgSGPRs.size())
2308 report_fatal_error("ran out of SGPRs for arguments");
2309
2310 unsigned Reg = ArgSGPRs[RegIdx];
2311 Reg = CCInfo.AllocateReg(Reg);
2312 assert(Reg != AMDGPU::NoRegister);
2313
2314 MachineFunction &MF = CCInfo.getMachineFunction();
2315 MF.addLiveIn(Reg, RC);
2316 return ArgDescriptor::createRegister(Reg);
2317 }
2318
2319// If this has a fixed position, we still should allocate the register in the
2320// CCInfo state. Technically we could get away with this for values passed
2321// outside of the normal argument range.
2322 static void allocateFixedSGPRInputImpl(CCState &CCInfo,
2323 const TargetRegisterClass *RC,
2324 MCRegister Reg) {
2325 Reg = CCInfo.AllocateReg(Reg);
2326 assert(Reg != AMDGPU::NoRegister);
2327 MachineFunction &MF = CCInfo.getMachineFunction();
2328 MF.addLiveIn(Reg, RC);
2329}
2330
2331static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2332 if (Arg) {
2333 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2334 Arg.getRegister());
2335 } else
2336 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2337}
2338
2339static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2340 if (Arg) {
2341 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2342 Arg.getRegister());
2343 } else
2344 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2345}
2346
2347/// Allocate implicit function VGPR arguments at the end of allocated user
2348/// arguments.
2349 void SITargetLowering::allocateSpecialInputVGPRs(
2350 CCState &CCInfo, MachineFunction &MF,
2351 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2352 const unsigned Mask = 0x3ff;
2353 ArgDescriptor Arg;
2354
2355 if (Info.hasWorkItemIDX()) {
2356 Arg = allocateVGPR32Input(CCInfo, Mask);
2357 Info.setWorkItemIDX(Arg);
2358 }
2359
2360 if (Info.hasWorkItemIDY()) {
2361 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2362 Info.setWorkItemIDY(Arg);
2363 }
2364
2365 if (Info.hasWorkItemIDZ())
2366 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2367}
2368
2369/// Allocate implicit function VGPR arguments in fixed registers.
2370 void SITargetLowering::allocateSpecialInputVGPRsFixed(
2371 CCState &CCInfo, MachineFunction &MF,
2372 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2373 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2374 if (!Reg)
2375 report_fatal_error("failed to allocate VGPR for implicit arguments");
2376
2377 const unsigned Mask = 0x3ff;
2378 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2379 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2380 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2381}
2382
2383 void SITargetLowering::allocateSpecialInputSGPRs(
2384 CCState &CCInfo,
2385 MachineFunction &MF,
2386 const SIRegisterInfo &TRI,
2387 SIMachineFunctionInfo &Info) const {
2388 auto &ArgInfo = Info.getArgInfo();
2389 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2390
2391 // TODO: Unify handling with private memory pointers.
2392 if (UserSGPRInfo.hasDispatchPtr())
2393 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2394
2395 const Module *M = MF.getFunction().getParent();
2396 if (UserSGPRInfo.hasQueuePtr() &&
2397 AMDGPU::getAMDHSACodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5)
2398 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2399
2400 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2401 // constant offset from the kernarg segment.
2402 if (Info.hasImplicitArgPtr())
2403 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2404
2405 if (UserSGPRInfo.hasDispatchID())
2406 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2407
2408 // flat_scratch_init is not applicable for non-kernel functions.
2409
2410 if (Info.hasWorkGroupIDX())
2411 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2412
2413 if (Info.hasWorkGroupIDY())
2414 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2415
2416 if (Info.hasWorkGroupIDZ())
2417 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2418
2419 if (Info.hasLDSKernelId())
2420 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2421}
2422
2423// Allocate special inputs passed in user SGPRs.
2424 void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
2425 MachineFunction &MF,
2426 const SIRegisterInfo &TRI,
2427 SIMachineFunctionInfo &Info) const {
2428 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2429 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2430 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2431 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2432 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2433 }
2434
2435 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2436 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2437 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2438 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2439 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2440 }
2441
2442 if (UserSGPRInfo.hasDispatchPtr()) {
2443 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2444 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2445 CCInfo.AllocateReg(DispatchPtrReg);
2446 }
2447
2448 const Module *M = MF.getFunction().getParent();
2449 if (UserSGPRInfo.hasQueuePtr() &&
2450 AMDGPU::getAMDHSACodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) {
2451 Register QueuePtrReg = Info.addQueuePtr(TRI);
2452 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2453 CCInfo.AllocateReg(QueuePtrReg);
2454 }
2455
2456 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2457 MachineRegisterInfo &MRI = MF.getRegInfo();
2458 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2459 CCInfo.AllocateReg(InputPtrReg);
2460
2461 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2462 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2463 }
2464
2465 if (UserSGPRInfo.hasDispatchID()) {
2466 Register DispatchIDReg = Info.addDispatchID(TRI);
2467 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2468 CCInfo.AllocateReg(DispatchIDReg);
2469 }
2470
2471 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2472 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2473 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2474 CCInfo.AllocateReg(FlatScratchInitReg);
2475 }
2476
2477 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2478 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2479 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2480 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2481 }
2482
2483 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2484 // these from the dispatch pointer.
2485}
2486
2487 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2488 // sequential starting from the first argument.
2489 void SITargetLowering::allocatePreloadKernArgSGPRs(
2490 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2491 const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
2492 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2493 Function &F = MF.getFunction();
2494 unsigned LastExplicitArgOffset =
2495 MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset();
2496 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2497 bool InPreloadSequence = true;
2498 unsigned InIdx = 0;
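 // Preloading stops at the first argument that cannot be preloaded (not marked
 // 'inreg', out of sequence, or out of free user SGPRs); later arguments fall
 // back to the normal kernarg-segment loads in lowerKernargMemParameter().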
2499 for (auto &Arg : F.args()) {
2500 if (!InPreloadSequence || !Arg.hasInRegAttr())
2501 break;
2502
2503 int ArgIdx = Arg.getArgNo();
2504 // Don't preload non-original args or parts not in the current preload
2505 // sequence.
2506 if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
2507 (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
2508 break;
2509
2510 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2511 (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
2512 InIdx++) {
2513 assert(ArgLocs[ArgIdx].isMemLoc());
2514 auto &ArgLoc = ArgLocs[InIdx];
2515 const Align KernelArgBaseAlign = Align(16);
2516 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2517 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2518 unsigned NumAllocSGPRs =
2519 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2520
2521 // Arg is preloaded into the previous SGPR.
2522 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2523 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2524 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2525 continue;
2526 }
2527
2528 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2529 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
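 // Alignment gaps between consecutive preloaded arguments still consume whole
 // user SGPRs, so the padding is charged against the free-SGPR budget below.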
2530 // Check for free user SGPRs for preloading.
2531 if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
2532 SGPRInfo.getNumFreeUserSGPRs()) {
2533 InPreloadSequence = false;
2534 break;
2535 }
2536
2537 // Preload this argument.
2538 const TargetRegisterClass *RC =
2539 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2540 SmallVectorImpl<MCRegister> *PreloadRegs =
2541 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2542
2543 if (PreloadRegs->size() > 1)
2544 RC = &AMDGPU::SGPR_32RegClass;
2545 for (auto &Reg : *PreloadRegs) {
2546 assert(Reg);
2547 MF.addLiveIn(Reg, RC);
2548 CCInfo.AllocateReg(Reg);
2549 }
2550
2551 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2552 }
2553 }
2554}
2555
2556 void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
2557 const SIRegisterInfo &TRI,
2558 SIMachineFunctionInfo &Info) const {
2559 // Always allocate this last since it is a synthetic preload.
2560 if (Info.hasLDSKernelId()) {
2561 Register Reg = Info.addLDSKernelId();
2562 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2563 CCInfo.AllocateReg(Reg);
2564 }
2565}
2566
2567// Allocate special input registers that are initialized per-wave.
2568 void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
2569 MachineFunction &MF,
2570 SIMachineFunctionInfo &Info,
2571 CallingConv::ID CallConv,
2572 bool IsShader) const {
2573 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2574 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2575 // Note: user SGPRs are handled by the front-end for graphics shaders
2576 // Pad up the used user SGPRs with dead inputs.
2577
2578 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2579 // before enabling architected SGPRs for workgroup IDs.
2580 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2581
2582 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2583 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2584 // rely on it to reach 16 since if we end up having no stack usage, it will
2585 // not really be added.
2586 unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() +
2587 Info.hasWorkGroupIDY() +
2588 Info.hasWorkGroupIDZ() +
2589 Info.hasWorkGroupInfo();
2590 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2591 Register Reg = Info.addReservedUserSGPR();
2592 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2593 CCInfo.AllocateReg(Reg);
2594 }
2595 }
2596
2597 if (!HasArchitectedSGPRs) {
2598 if (Info.hasWorkGroupIDX()) {
2599 Register Reg = Info.addWorkGroupIDX();
2600 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2601 CCInfo.AllocateReg(Reg);
2602 }
2603
2604 if (Info.hasWorkGroupIDY()) {
2605 Register Reg = Info.addWorkGroupIDY();
2606 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2607 CCInfo.AllocateReg(Reg);
2608 }
2609
2610 if (Info.hasWorkGroupIDZ()) {
2611 Register Reg = Info.addWorkGroupIDZ();
2612 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2613 CCInfo.AllocateReg(Reg);
2614 }
2615 }
2616
2617 if (Info.hasWorkGroupInfo()) {
2618 Register Reg = Info.addWorkGroupInfo();
2619 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2620 CCInfo.AllocateReg(Reg);
2621 }
2622
2623 if (Info.hasPrivateSegmentWaveByteOffset()) {
2624 // Scratch wave offset passed in system SGPR.
2625 unsigned PrivateSegmentWaveByteOffsetReg;
2626
2627 if (IsShader) {
2628 PrivateSegmentWaveByteOffsetReg =
2629 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2630
2631 // This is true if the scratch wave byte offset doesn't have a fixed
2632 // location.
2633 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2634 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2635 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2636 }
2637 } else
2638 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2639
2640 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2641 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2642 }
2643
2644 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2645 Info.getNumPreloadedSGPRs() >= 16);
2646}
2647
2648 static void reservePrivateMemoryRegs(const TargetMachine &TM,
2649 MachineFunction &MF,
2650 const SIRegisterInfo &TRI,
2651 SIMachineFunctionInfo &Info) {
2652 // Now that we've figured out where the scratch register inputs are, see if
2653 // we should reserve the arguments and use them directly.
2654 MachineFrameInfo &MFI = MF.getFrameInfo();
2655 bool HasStackObjects = MFI.hasStackObjects();
2656 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2657
2658 // Record that we know we have non-spill stack objects so we don't need to
2659 // check all stack objects later.
2660 if (HasStackObjects)
2661 Info.setHasNonSpillStackObjects(true);
2662
2663 // Everything live out of a block is spilled with fast regalloc, so it's
2664 // almost certain that spilling will be required.
2665 if (TM.getOptLevel() == CodeGenOptLevel::None)
2666 HasStackObjects = true;
2667
2668 // For now assume stack access is needed in any callee functions, so we need
2669 // the scratch registers to pass in.
2670 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2671
2672 if (!ST.enableFlatScratch()) {
2673 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2674 // If we have stack objects, we unquestionably need the private buffer
2675 // resource. For the Code Object V2 ABI, this will be the first 4 user
2676 // SGPR inputs. We can reserve those and use them directly.
2677
2678 Register PrivateSegmentBufferReg =
2679 Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
2680 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2681 } else {
2682 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2683 // We tentatively reserve the last registers (skipping the last registers
2684 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
2685 // we'll replace these with the ones immediately after those which were
2686 // really allocated. In the prologue copies will be inserted from the
2687 // argument to these reserved registers.
2688
2689 // Without HSA, relocations are used for the scratch pointer and the
2690 // buffer resource setup is always inserted in the prologue. Scratch wave
2691 // offset is still in an input SGPR.
2692 Info.setScratchRSrcReg(ReservedBufferReg);
2693 }
2694 }
2695
2696 MachineRegisterInfo &MRI = MF.getRegInfo();
2697
2698 // For entry functions we have to set up the stack pointer if we use it,
2699 // whereas non-entry functions get this "for free". This means there is no
2700 // intrinsic advantage to using S32 over S34 in cases where we do not have
2701 // calls but do need a frame pointer (i.e. if we are requested to have one
2702 // because frame pointer elimination is disabled). To keep things simple we
2703 // only ever use S32 as the call ABI stack pointer, and so using it does not
2704 // imply we need a separate frame pointer.
2705 //
2706 // Try to use s32 as the SP, but move it if it would interfere with input
2707 // arguments. This won't work with calls though.
2708 //
2709 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2710 // registers.
2711 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2712 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2713 } else {
2714 assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
2715
2716 if (MFI.hasCalls())
2717 report_fatal_error("call in graphics shader with too many input SGPRs");
2718
2719 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2720 if (!MRI.isLiveIn(Reg)) {
2721 Info.setStackPtrOffsetReg(Reg);
2722 break;
2723 }
2724 }
2725
2726 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2727 report_fatal_error("failed to find register for SP");
2728 }
2729
2730 // hasFP should be accurate for entry functions even before the frame is
2731 // finalized, because it does not rely on the known stack size, only
2732 // properties like whether variable sized objects are present.
2733 if (ST.getFrameLowering()->hasFP(MF)) {
2734 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2735 }
2736}
2737
2738 bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
2739 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
2740 return !Info->isEntryFunction();
2741 }
2742
2743 void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
2744
2745 }
2746
2747 void SITargetLowering::insertCopiesSplitCSR(
2748 MachineBasicBlock *Entry,
2749 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2750 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2751
2752 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2753 if (!IStart)
2754 return;
2755
2756 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2757 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2758 MachineBasicBlock::iterator MBBI = Entry->begin();
2759 for (const MCPhysReg *I = IStart; *I; ++I) {
2760 const TargetRegisterClass *RC = nullptr;
2761 if (AMDGPU::SReg_64RegClass.contains(*I))
2762 RC = &AMDGPU::SGPR_64RegClass;
2763 else if (AMDGPU::SReg_32RegClass.contains(*I))
2764 RC = &AMDGPU::SGPR_32RegClass;
2765 else
2766 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2767
2768 Register NewVR = MRI->createVirtualRegister(RC);
2769 // Create copy from CSR to a virtual register.
2770 Entry->addLiveIn(*I);
2771 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2772 .addReg(*I);
2773
2774 // Insert the copy-back instructions right before the terminator.
2775 for (auto *Exit : Exits)
2776 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2777 TII->get(TargetOpcode::COPY), *I)
2778 .addReg(NewVR);
2779 }
2780}
2781
2782 SDValue SITargetLowering::LowerFormalArguments(
2783 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2784 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2785 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2786 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2787
2788 MachineFunction &MF = DAG.getMachineFunction();
2789 const Function &Fn = MF.getFunction();
2790 FunctionType *FType = MF.getFunction().getFunctionType();
2791 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2792
2793 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2794 DiagnosticInfoUnsupported NoGraphicsHSA(
2795 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2796 DAG.getContext()->diagnose(NoGraphicsHSA);
2797 return DAG.getEntryNode();
2798 }
2799
2800 SmallVector<ISD::InputArg, 16> Splits;
2801 SmallVector<CCValAssign, 16> ArgLocs;
2802 BitVector Skipped(Ins.size());
2803 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2804 *DAG.getContext());
2805
2806 bool IsGraphics = AMDGPU::isGraphics(CallConv);
2807 bool IsKernel = AMDGPU::isKernel(CallConv);
2808 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2809
2810 if (IsGraphics) {
2811 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2812 assert(!UserSGPRInfo.hasDispatchPtr() &&
2813 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2814 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2815 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2816 (void)UserSGPRInfo;
2817 if (!Subtarget->enableFlatScratch())
2818 assert(!UserSGPRInfo.hasFlatScratchInit());
2819 if ((CallConv != CallingConv::AMDGPU_CS &&
2820 CallConv != CallingConv::AMDGPU_Gfx) ||
2821 !Subtarget->hasArchitectedSGPRs())
2822 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2823 !Info->hasWorkGroupIDZ());
2824 }
2825
2826 if (CallConv == CallingConv::AMDGPU_PS) {
2827 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2828
2829 // At least one interpolation mode must be enabled or else the GPU will
2830 // hang.
2831 //
2832 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2833 // set PSInputAddr, the user wants to enable some bits after the compilation
2834 // based on run-time states. Since we can't know what the final PSInputEna
2835 // will look like, we shouldn't do anything here and the user should take
2836 // responsibility for the correct programming.
2837 //
2838 // Otherwise, the following restrictions apply:
2839 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2840 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2841 // enabled too.
2842 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2843 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2844 CCInfo.AllocateReg(AMDGPU::VGPR0);
2845 CCInfo.AllocateReg(AMDGPU::VGPR1);
2846 Info->markPSInputAllocated(0);
2847 Info->markPSInputEnabled(0);
2848 }
2849 if (Subtarget->isAmdPalOS()) {
2850 // For isAmdPalOS, the user does not enable some bits after compilation
2851 // based on run-time states; the register values being generated here are
2852 // the final ones set in hardware. Therefore we need to apply the
2853 // workaround to PSInputAddr and PSInputEnable together. (The case where
2854 // a bit is set in PSInputAddr but not PSInputEnable is where the
2855 // frontend set up an input arg for a particular interpolation mode, but
2856 // nothing uses that input arg. Really we should have an earlier pass
2857 // that removes such an arg.)
2858 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2859 if ((PsInputBits & 0x7F) == 0 ||
2860 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2861 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
2862 }
2863 } else if (IsKernel) {
2864 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2865 } else {
2866 Splits.append(Ins.begin(), Ins.end());
2867 }
2868
2869 if (IsKernel)
2870 analyzeFormalArgumentsCompute(CCInfo, Ins);
2871
2872 if (IsEntryFunc) {
2873 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2874 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2875 if (IsKernel && Subtarget->hasKernargPreload())
2876 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
2877
2878 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
2879 } else if (!IsGraphics) {
2880 // For the fixed ABI, pass workitem IDs in the last argument register.
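 // The fixed ABI reserves VGPR31 for the packed workitem IDs (see
 // allocateSpecialInputVGPRsFixed above).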
2881 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2882
2883 // FIXME: Sink this into allocateSpecialInputSGPRs
2884 if (!Subtarget->enableFlatScratch())
2885 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2886
2887 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2888 }
2889
2890 if (!IsKernel) {
2891 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2892 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2893 }
2894
2895 SmallVector<SDValue, 16> Chains;
2896
2897 // FIXME: This is the minimum kernel argument alignment. We should improve
2898 // this to the maximum alignment of the arguments.
2899 //
2900 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2901 // kern arg offset.
2902 const Align KernelArgBaseAlign = Align(16);
2903
2904 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2905 const ISD::InputArg &Arg = Ins[i];
2906 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2907 InVals.push_back(DAG.getUNDEF(Arg.VT));
2908 continue;
2909 }
2910
2911 CCValAssign &VA = ArgLocs[ArgIdx++];
2912 MVT VT = VA.getLocVT();
2913
2914 if (IsEntryFunc && VA.isMemLoc()) {
2915 VT = Ins[i].VT;
2916 EVT MemVT = VA.getLocVT();
2917
2918 const uint64_t Offset = VA.getLocMemOffset();
2919 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2920
2921 if (Arg.Flags.isByRef()) {
2922 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2923
2924 const GCNTargetMachine &TM =
2925 static_cast<const GCNTargetMachine &>(getTargetMachine());
2926 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2927 Arg.Flags.getPointerAddrSpace())) {
2928 Ptr = DAG.getAddrSpaceCast(DL, VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS,
2929 Arg.Flags.getPointerAddrSpace());
2930 }
2931
2932 InVals.push_back(Ptr);
2933 continue;
2934 }
2935
2936 SDValue NewArg;
2937 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
2938 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2939 // In this case the argument is packed into the previous preload SGPR.
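 // Shift the already-preloaded 32-bit SGPR value right by the byte offset
 // within the dword and truncate to the argument's memory type, mirroring the
 // extload avoidance done in lowerKernargMemParameter().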
2940 int64_t AlignDownOffset = alignDown(Offset, 4);
2941 int64_t OffsetDiff = Offset - AlignDownOffset;
2942 EVT IntVT = MemVT.changeTypeToInteger();
2943
2947 Register Reg =
2948 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2949
2950 assert(Reg);
2951 Register VReg = MRI.getLiveInVirtReg(Reg);
2952 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2953
2954 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
2955 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
2956
2957 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
2958 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
2959 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
2960 Ins[i].Flags.isSExt(), &Ins[i]);
2961
2962 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
2963 } else {
2967 const SmallVectorImpl<MCRegister> &PreloadRegs =
2968 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
2969
2970 SDValue Copy;
2971 if (PreloadRegs.size() == 1) {
2972 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
2973 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
2974 NewArg = DAG.getCopyFromReg(
2975 Chain, DL, VReg,
2976 EVT::getIntegerVT(*DAG.getContext(),
2977 TRI->getRegSizeInBits(*RC)));
2978
2979 } else {
2980 // If the kernarg alignment does not match the alignment of the SGPR
2981 // tuple RC that can accommodate this argument, it will be built up
2982 // via copies from the individual SGPRs that the argument was
2983 // preloaded to.
2984 SmallVector<SDValue, 4> Elts;
2985 for (auto Reg : PreloadRegs) {
2986 Register VReg = MRI.getLiveInVirtReg(Reg);
2987 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2988 Elts.push_back(Copy);
2989 }
2990 NewArg =
2991 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
2992 PreloadRegs.size()),
2993 DL, Elts);
2994 }
2995
2996 // If the argument was preloaded to multiple consecutive 32-bit
2997 // registers because of misalignment between addressable SGPR tuples
2998 // and the argument size, we can still assume that because of kernarg
2999 // segment alignment restrictions that NewArg's size is the same as
3000 // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
3001 // truncate since we cannot preload to less than a single SGPR and the
3002 // MemVT may be smaller.
3003 EVT MemVTInt =
3004 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
3005 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3006 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3007
3008 NewArg = DAG.getBitcast(MemVT, NewArg);
3009 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3010 Ins[i].Flags.isSExt(), &Ins[i]);
3011 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3012 }
3013 } else {
3014 NewArg =
3015 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3016 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3017 }
3018 Chains.push_back(NewArg.getValue(1));
3019
3020 auto *ParamTy =
3021 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3022 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3023 ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3024 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3025 // On SI local pointers are just offsets into LDS, so they are always
3026 // less than 16-bits. On CI and newer they could potentially be
3027 // real pointers, so we can't guarantee their size.
3028 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3029 DAG.getValueType(MVT::i16));
3030 }
3031
3032 InVals.push_back(NewArg);
3033 continue;
3034 } else if (!IsEntryFunc && VA.isMemLoc()) {
3035 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3036 InVals.push_back(Val);
3037 if (!Arg.Flags.isByVal())
3038 Chains.push_back(Val.getValue(1));
3039 continue;
3040 }
3041
3042 assert(VA.isRegLoc() && "Parameter must be in a register!");
3043
3044 Register Reg = VA.getLocReg();
3045 const TargetRegisterClass *RC = nullptr;
3046 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3047 RC = &AMDGPU::VGPR_32RegClass;
3048 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3049 RC = &AMDGPU::SGPR_32RegClass;
3050 else
3051 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3052 EVT ValVT = VA.getValVT();
3053
3054 Reg = MF.addLiveIn(Reg, RC);
3055 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3056
3057 if (Arg.Flags.isSRet()) {
3058 // The return object should be reasonably addressable.
3059
3060 // FIXME: This helps when the return is a real sret. If it is a
3061 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3062 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3063 unsigned NumBits
3064 = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
3065 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
3066 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3067 }
3068
3069 // If this is an 8 or 16-bit value, it is really passed promoted
3070 // to 32 bits. Insert an assert[sz]ext to capture this, then
3071 // truncate to the right size.
3072 switch (VA.getLocInfo()) {
3073 case CCValAssign::Full:
3074 break;
3075 case CCValAssign::BCvt:
3076 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3077 break;
3078 case CCValAssign::SExt:
3079 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
3080 DAG.getValueType(ValVT));
3081 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3082 break;
3083 case CCValAssign::ZExt:
3084 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
3085 DAG.getValueType(ValVT));
3086 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3087 break;
3088 case CCValAssign::AExt:
3089 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3090 break;
3091 default:
3092 llvm_unreachable("Unknown loc info!");
3093 }
3094
3095 InVals.push_back(Val);
3096 }
3097
3098 // Start adding system SGPRs.
3099 if (IsEntryFunc)
3100 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3101
3102 // DAG.getPass() returns nullptr when using new pass manager.
3103 // TODO: Use DAG.getMFAM() to access analysis result.
3104 if (DAG.getPass()) {
3105 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3106 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3107 }
3108
3109 unsigned StackArgSize = CCInfo.getStackSize();
3110 Info->setBytesInStackArgArea(StackArgSize);
3111
3112 return Chains.empty() ? Chain :
3113 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3114}
3115
3116// TODO: If return values can't fit in registers, we should return as many as
3117// possible in registers before passing on stack.
3118 bool SITargetLowering::CanLowerReturn(
3119 CallingConv::ID CallConv,
3120 MachineFunction &MF, bool IsVarArg,
3121 const SmallVectorImpl<ISD::OutputArg> &Outs,
3122 LLVMContext &Context) const {
3123 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3124 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3125 // for shaders. Vector types should be explicitly handled by CC.
3126 if (AMDGPU::isEntryFunctionCC(CallConv))
3127 return true;
3128
3129 SmallVector<CCValAssign, 16> RVLocs;
3130 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3131 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3132 return false;
3133
3134 // We must use the stack if return would require unavailable registers.
3135 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3136 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3137 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3138 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3139 return false;
3140
3141 return true;
3142}
3143
3144SDValue
3145 SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3146 bool isVarArg,
3147 const SmallVectorImpl<ISD::OutputArg> &Outs,
3148 const SmallVectorImpl<SDValue> &OutVals,
3149 const SDLoc &DL, SelectionDAG &DAG) const {
3150 MachineFunction &MF = DAG.getMachineFunction();
3151 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3152
3153 if (AMDGPU::isKernel(CallConv)) {
3154 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3155 OutVals, DL, DAG);
3156 }
3157
3158 bool IsShader = AMDGPU::isShader(CallConv);
3159
3160 Info->setIfReturnsVoid(Outs.empty());
3161 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3162
3163 // CCValAssign - represent the assignment of the return value to a location.
3164 SmallVector<CCValAssign, 48> RVLocs;
3166
3167 // CCState - Info about the registers and stack slots.
3168 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3169 *DAG.getContext());
3170
3171 // Analyze outgoing return values.
3172 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3173
3174 SDValue Glue;
3175 SmallVector<SDValue, 48> RetOps;
3176 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3177
3178 // Copy the result values into the output registers.
3179 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3180 ++I, ++RealRVLocIdx) {
3181 CCValAssign &VA = RVLocs[I];
3182 assert(VA.isRegLoc() && "Can only return in registers!");
3183 // TODO: Partially return in registers if return values don't fit.
3184 SDValue Arg = OutVals[RealRVLocIdx];
3185
3186 // Copied from other backends.
3187 switch (VA.getLocInfo()) {
3188 case CCValAssign::Full:
3189 break;
3190 case CCValAssign::BCvt:
3191 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3192 break;
3193 case CCValAssign::SExt:
3194 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3195 break;
3196 case CCValAssign::ZExt:
3197 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3198 break;
3199 case CCValAssign::AExt:
3200 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3201 break;
3202 default:
3203 llvm_unreachable("Unknown loc info!");
3204 }
3205
3206 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3207 Glue = Chain.getValue(1);
3208 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3209 }
3210
3211 // FIXME: Does sret work properly?
3212 if (!Info->isEntryFunction()) {
3213 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3214 const MCPhysReg *I =
3215 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3216 if (I) {
3217 for (; *I; ++I) {
3218 if (AMDGPU::SReg_64RegClass.contains(*I))
3219 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3220 else if (AMDGPU::SReg_32RegClass.contains(*I))
3221 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3222 else
3223 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3224 }
3225 }
3226 }
3227
3228 // Update chain and glue.
3229 RetOps[0] = Chain;
3230 if (Glue.getNode())
3231 RetOps.push_back(Glue);
3232
3233 unsigned Opc = AMDGPUISD::ENDPGM;
3234 if (!IsWaveEnd)
3235 Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_GLUE;
3236 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3237}
3238
3240 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3241 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3242 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3243 SDValue ThisVal) const {
3244 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3245
3246 // Assign locations to each value returned by this call.
3247 SmallVector<CCValAssign, 16> RVLocs;
3248 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3249 *DAG.getContext());
3250 CCInfo.AnalyzeCallResult(Ins, RetCC);
3251
3252 // Copy all of the result registers out of their specified physreg.
3253 for (unsigned i = 0; i != RVLocs.size(); ++i) {
3254 CCValAssign VA = RVLocs[i];
3255 SDValue Val;
3256
3257 if (VA.isRegLoc()) {
3258 Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3259 Chain = Val.getValue(1);
3260 InGlue = Val.getValue(2);
3261 } else if (VA.isMemLoc()) {
3262 report_fatal_error("TODO: return values in memory");
3263 } else
3264 llvm_unreachable("unknown argument location type");
3265
3266 switch (VA.getLocInfo()) {
3267 case CCValAssign::Full:
3268 break;
3269 case CCValAssign::BCvt:
3270 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3271 break;
3272 case CCValAssign::ZExt:
3273 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3274 DAG.getValueType(VA.getValVT()));
3275 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3276 break;
3277 case CCValAssign::SExt:
3278 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3279 DAG.getValueType(VA.getValVT()));
3280 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3281 break;
3282 case CCValAssign::AExt:
3283 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3284 break;
3285 default:
3286 llvm_unreachable("Unknown loc info!");
3287 }
3288
3289 InVals.push_back(Val);
3290 }
3291
3292 return Chain;
3293}
3294
3295// Add code to pass special inputs required depending on used features separate
3296// from the explicit user arguments present in the IR.
3297 void SITargetLowering::passSpecialInputs(
3298 CallLoweringInfo &CLI,
3299 CCState &CCInfo,
3300 const SIMachineFunctionInfo &Info,
3301 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3302 SmallVectorImpl<SDValue> &MemOpChains,
3303 SDValue Chain) const {
3304 // If we don't have a call site, this was a call inserted by
3305 // legalization. These can never use special inputs.
3306 if (!CLI.CB)
3307 return;
3308
3309 SelectionDAG &DAG = CLI.DAG;
3310 const SDLoc &DL = CLI.DL;
3311 const Function &F = DAG.getMachineFunction().getFunction();
3312
3313 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3314 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3315
3316 const AMDGPUFunctionArgInfo *CalleeArgInfo
3317 = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
3318 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3319 // DAG.getPass() returns nullptr when using new pass manager.
3320 // TODO: Use DAG.getMFAM() to access analysis result.
3321 if (DAG.getPass()) {
3322 auto &ArgUsageInfo =
3323 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3324 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3325 }
3326 }
3327
3328 // TODO: Unify with private memory register handling. This is complicated by
3329 // the fact that at least in kernels, the input argument is not necessarily
3330 // in the same location as the input.
3331 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3332 StringLiteral> ImplicitAttrs[] = {
3333 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3334 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
3335 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3336 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3337 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3338 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
3339 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
3340 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
3341 };
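 // Each entry pairs an ABI input with the "amdgpu-no-*" attribute that marks it
 // unused; inputs whose attribute is present on the call site are skipped below.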
3342
3343 for (auto Attr : ImplicitAttrs) {
3344 const ArgDescriptor *OutgoingArg;
3345 const TargetRegisterClass *ArgRC;
3346 LLT ArgTy;
3347
3348 AMDGPUFunctionArgInfo::PreloadedValue InputID = Attr.first;
3349
3350 // If the callee does not use the attribute value, skip copying the value.
3351 if (CLI.CB->hasFnAttr(Attr.second))
3352 continue;
3353
3354 std::tie(OutgoingArg, ArgRC, ArgTy) =
3355 CalleeArgInfo->getPreloadedValue(InputID);
3356 if (!OutgoingArg)
3357 continue;
3358
3359 const ArgDescriptor *IncomingArg;
3360 const TargetRegisterClass *IncomingArgRC;
3361 LLT Ty;
3362 std::tie(IncomingArg, IncomingArgRC, Ty) =
3363 CallerArgInfo.getPreloadedValue(InputID);
3364 assert(IncomingArgRC == ArgRC);
3365
3366 // All special arguments are ints for now.
3367 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3368 SDValue InputReg;
3369
3370 if (IncomingArg) {
3371 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3372 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3373 // The implicit arg ptr is special because it doesn't have a corresponding
3374 // input for kernels, and is computed from the kernarg segment pointer.
3375 InputReg = getImplicitArgPtr(DAG, DL);
3376 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3377 std::optional<uint32_t> Id =
3378 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
3379 if (Id.has_value()) {
3380 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3381 } else {
3382 InputReg = DAG.getUNDEF(ArgVT);
3383 }
3384 } else {
3385 // We may have proven the input wasn't needed, although the ABI is
3386 // requiring it. We just need to allocate the register appropriately.
3387 InputReg = DAG.getUNDEF(ArgVT);
3388 }
3389
3390 if (OutgoingArg->isRegister()) {
3391 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3392 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3393 report_fatal_error("failed to allocate implicit input argument");
3394 } else {
3395 unsigned SpecialArgOffset =
3396 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3397 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
3398 SpecialArgOffset);
3399 MemOpChains.push_back(ArgStore);
3400 }
3401 }
3402
3403 // Pack workitem IDs into a single register or pass it as is if already
3404 // packed.
3405 const ArgDescriptor *OutgoingArg;
3406 const TargetRegisterClass *ArgRC;
3407 LLT Ty;
3408
3409 std::tie(OutgoingArg, ArgRC, Ty) =
3410 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3411 if (!OutgoingArg)
3412 std::tie(OutgoingArg, ArgRC, Ty) =
3413 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3414 if (!OutgoingArg)
3415 std::tie(OutgoingArg, ArgRC, Ty) =
3416 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3417 if (!OutgoingArg)
3418 return;
3419
3420 const ArgDescriptor *IncomingArgX = std::get<0>(
3421 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X));
3422 const ArgDescriptor *IncomingArgY = std::get<0>(
3423 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
3424 const ArgDescriptor *IncomingArgZ = std::get<0>(
3425 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z));
3426
3427 SDValue InputReg;
3428 SDLoc SL;
3429
3430 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3431 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3432 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3433
3434 // If incoming ids are not packed we need to pack them.
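 // The packed layout matches the entry-point convention: X in bits [9:0],
 // Y in bits [19:10], Z in bits [29:20] of a single 32-bit register.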
3435 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3436 NeedWorkItemIDX) {
3437 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3438 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3439 } else {
3440 InputReg = DAG.getConstant(0, DL, MVT::i32);
3441 }
3442 }
3443
3444 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3445 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3446 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3447 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3448 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3449 InputReg = InputReg.getNode() ?
3450 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
3451 }
3452
3453 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3454 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3455 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3456 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3457 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3458 InputReg = InputReg.getNode() ?
3459 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
3460 }
3461
3462 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3463 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3464 // We're in a situation where the outgoing function requires the workitem
3465 // ID, but the calling function does not have it (e.g a graphics function
3466 // calling a C calling convention function). This is illegal, but we need
3467 // to produce something.
3468 InputReg = DAG.getUNDEF(MVT::i32);
3469 } else {
3470 // Workitem ids are already packed; any of the present incoming arguments
3471 // will carry all required fields.
3472 ArgDescriptor IncomingArg = ArgDescriptor::createArg(
3473 IncomingArgX ? *IncomingArgX :
3474 IncomingArgY ? *IncomingArgY :
3475 *IncomingArgZ, ~0u);
3476 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3477 }
3478 }
3479
3480 if (OutgoingArg->isRegister()) {
3481 if (InputReg)
3482 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3483
3484 CCInfo.AllocateReg(OutgoingArg->getRegister());
3485 } else {
3486 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3487 if (InputReg) {
3488 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
3489 SpecialArgOffset);
3490 MemOpChains.push_back(ArgStore);
3491 }
3492 }
3493}
3494
3495 static bool canGuaranteeTCO(CallingConv::ID CC) {
3496 return CC == CallingConv::Fast;
3497}
3498
3499/// Return true if we might ever do TCO for calls with this calling convention.
3500 static bool mayTailCallThisCC(CallingConv::ID CC) {
3501 switch (CC) {
3502 case CallingConv::C:
3503 case CallingConv::AMDGPU_Gfx:
3504 return true;
3505 default:
3506 return canGuaranteeTCO(CC);
3507 }
3508}
3509
3510 bool SITargetLowering::isEligibleForTailCallOptimization(
3511 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3512 const SmallVectorImpl<ISD::OutputArg> &Outs,
3513 const SmallVectorImpl<SDValue> &OutVals,
3514 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3515 if (AMDGPU::isChainCC(CalleeCC))
3516 return true;
3517
3518 if (!mayTailCallThisCC(CalleeCC))
3519 return false;
3520
3521 // For a divergent call target, we need to do a waterfall loop over the
3522 // possible callees which precludes us from using a simple jump.
3523 if (Callee->isDivergent())
3524 return false;
3525
3526 MachineFunction &MF = DAG.getMachineFunction();
3527 const Function &CallerF = MF.getFunction();
3528 CallingConv::ID CallerCC = CallerF.getCallingConv();
3529 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
3530 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3531
3532 // Kernels aren't callable, and don't have a live in return address so it
3533 // doesn't make sense to do a tail call with entry functions.
3534 if (!CallerPreserved)
3535 return false;
3536
3537 bool CCMatch = CallerCC == CalleeCC;
3538
3539 if (MF.getTarget().Options.GuaranteedTailCallOpt) {
3540 if (canGuaranteeTCO(CalleeCC) && CCMatch)
3541 return true;
3542 return false;
3543 }
3544
3545 // TODO: Can we handle var args?
3546 if (IsVarArg)
3547 return false;
3548
3549 for (const Argument &Arg : CallerF.args()) {
3550 if (Arg.hasByValAttr())
3551 return false;
3552 }
3553
3554 LLVMContext &Ctx = *DAG.getContext();
3555
3556 // Check that the call results are passed in the same way.
3557 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3558 CCAssignFnForCall(CalleeCC, IsVarArg),
3559 CCAssignFnForCall(CallerCC, IsVarArg)))
3560 return false;
3561
3562 // The callee has to preserve all registers the caller needs to preserve.
3563 if (!CCMatch) {
3564 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3565 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3566 return false;
3567 }
3568
3569 // Nothing more to check if the callee is taking no arguments.
3570 if (Outs.empty())
3571 return true;
3572
3574 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3575
3576 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3577
3578 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3579 // If the stack arguments for this call do not fit into our own save area then
3580 // the call cannot be made tail.
3581 // TODO: Is this really necessary?
3582 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3583 return false;
3584
3585 const MachineRegisterInfo &MRI = MF.getRegInfo();
3586 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3587}
3588
3589 bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3590 if (!CI->isTailCall())
3591 return false;
3592
3593 const Function *ParentFn = CI->getParent()->getParent();
3594 if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
3595 return false;
3596 return true;
3597}
3598
3599// The wave scratch offset register is used as the global base pointer.
3600 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
3601 SmallVectorImpl<SDValue> &InVals) const {
3602 CallingConv::ID CallConv = CLI.CallConv;
3603 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3604
3605 SelectionDAG &DAG = CLI.DAG;
3606
3607 TargetLowering::ArgListEntry RequestedExec;
3608 if (IsChainCallConv) {
3609 // The last argument should be the value that we need to put in EXEC.
3610 // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
3611 // don't treat it like the rest of the arguments.
3612 RequestedExec = CLI.Args.back();
3613 assert(RequestedExec.Node && "No node for EXEC");
3614
3615 if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3616 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3617
3618 assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
3619 CLI.Outs.pop_back();
3620 CLI.OutVals.pop_back();
3621
3622 if (RequestedExec.Ty->isIntegerTy(64)) {
3623 assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
3624 CLI.Outs.pop_back();
3625 CLI.OutVals.pop_back();
3626 }
3627
3628 assert(CLI.Outs.back().OrigArgIndex != 2 &&
3629 "Haven't popped all the pieces of the EXEC mask");
3630 }
3631
3632 const SDLoc &DL = CLI.DL;
3633 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
3634 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3635 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
3636 SDValue Chain = CLI.Chain;
3637 SDValue Callee = CLI.Callee;
3638 bool &IsTailCall = CLI.IsTailCall;
3639 bool IsVarArg = CLI.IsVarArg;
3640 bool IsSibCall = false;
3641 MachineFunction &MF = DAG.getMachineFunction();
3642
3643 if (Callee.isUndef() || isNullConstant(Callee)) {
3644 if (!CLI.IsTailCall) {
3645 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
3646 InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
3647 }
3648
3649 return Chain;
3650 }
3651
3652 if (IsVarArg) {
3653 return lowerUnhandledCall(CLI, InVals,
3654 "unsupported call to variadic function ");
3655 }
3656
3657 if (!CLI.CB)
3658 report_fatal_error("unsupported libcall legalization");
3659
3660 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3661 return lowerUnhandledCall(CLI, InVals,
3662 "unsupported required tail call to function ");
3663 }
3664
3665 if (IsTailCall) {
3666 IsTailCall = isEligibleForTailCallOptimization(
3667 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3668 if (!IsTailCall &&
3669 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3670 report_fatal_error("failed to perform tail call elimination on a call "
3671 "site marked musttail or on llvm.amdgcn.cs.chain");
3672 }
3673
3674 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3675
3676 // A sibling call is one where we're under the usual C ABI and not planning
3677 // to change that but can still do a tail call:
3678 if (!TailCallOpt && IsTailCall)
3679 IsSibCall = true;
3680
3681 if (IsTailCall)
3682 ++NumTailCalls;
3683 }
3684
3687 SmallVector<SDValue, 8> MemOpChains;
3688
3689 // Analyze operands of the call, assigning locations to each operand.
3690 SmallVector<CCValAssign, 16> ArgLocs;
3691 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3692 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3693
3694 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
3695 // With a fixed ABI, allocate fixed registers before user arguments.
3696 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3697 }
3698
3699 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3700
3701 // Get a count of how many bytes are to be pushed on the stack.
3702 unsigned NumBytes = CCInfo.getStackSize();
3703
3704 if (IsSibCall) {
3705 // Since we're not changing the ABI to make this a tail call, the memory
3706 // operands are already available in the caller's incoming argument space.
3707 NumBytes = 0;
3708 }
3709
3710 // FPDiff is the byte offset of the call's argument area from the callee's.
3711 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3712 // by this amount for a tail call. In a sibling call it must be 0 because the
3713 // caller will deallocate the entire stack and the callee still expects its
3714 // arguments to begin at SP+0. Completely unused for non-tail calls.
3715 int32_t FPDiff = 0;
3716 MachineFrameInfo &MFI = MF.getFrameInfo();
3717
3718 // Adjust the stack pointer for the new arguments...
3719 // These operations are automatically eliminated by the prolog/epilog pass
3720 if (!IsSibCall)
3721 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3722
3723 if (!IsSibCall || IsChainCallConv) {
3724 if (!Subtarget->enableFlatScratch()) {
3725 SmallVector<SDValue, 4> CopyFromChains;
3726
3727 // In the HSA case, this should be an identity copy.
3728 SDValue ScratchRSrcReg
3729 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3730 RegsToPass.emplace_back(IsChainCallConv
3731 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3732 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3733 ScratchRSrcReg);
3734 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3735 Chain = DAG.getTokenFactor(DL, CopyFromChains);
3736 }
3737 }
3738
3739 MVT PtrVT = MVT::i32;
3740
3741 // Walk the register/memloc assignments, inserting copies/loads.
3742 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3743 CCValAssign &VA = ArgLocs[i];
3744 SDValue Arg = OutVals[i];
3745
3746 // Promote the value if needed.
3747 switch (VA.getLocInfo()) {
3748 case CCValAssign::Full:
3749 break;
3750 case CCValAssign::BCvt:
3751 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3752 break;
3753 case CCValAssign::ZExt:
3754 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3755 break;
3756 case CCValAssign::SExt:
3757 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3758 break;
3759 case CCValAssign::AExt:
3760 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3761 break;
3762 case CCValAssign::FPExt:
3763 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3764 break;
3765 default:
3766 llvm_unreachable("Unknown loc info!");
3767 }
3768
3769 if (VA.isRegLoc()) {
3770 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
3771 } else {
3772 assert(VA.isMemLoc());
3773
3774 SDValue DstAddr;
3775 MachinePointerInfo DstInfo;
3776
3777 unsigned LocMemOffset = VA.getLocMemOffset();
3778 int32_t Offset = LocMemOffset;
3779
3780 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3781 MaybeAlign Alignment;
3782
3783 if (IsTailCall) {
3784 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3785 unsigned OpSize = Flags.isByVal() ?
3786 Flags.getByValSize() : VA.getValVT().getStoreSize();
3787
3788 // FIXME: We can have better than the minimum byval required alignment.
3789 Alignment =
3790 Flags.isByVal()
3791 ? Flags.getNonZeroByValAlign()
3792 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3793
3794 Offset = Offset + FPDiff;
3795 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3796
3797 DstAddr = DAG.getFrameIndex(FI, PtrVT);
3798 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3799
3800 // Make sure any stack arguments overlapping with where we're storing
3801 // are loaded before this eventual operation. Otherwise they'll be
3802 // clobbered.
3803
3804 // FIXME: Why is this really necessary? This seems to just result in a
3805 // lot of code to copy the stack arguments and write them back to the same
3806 // locations, which are supposed to be immutable?
3807 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3808 } else {
3809 // Stores to the argument stack area are relative to the stack pointer.
3810 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3811 MVT::i32);
3812 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3813 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3814 Alignment =
3815 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3816 }
3817
3818 if (Outs[i].Flags.isByVal()) {
3819 SDValue SizeNode =
3820 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3821 SDValue Cpy =
3822 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3823 Outs[i].Flags.getNonZeroByValAlign(),
3824 /*isVol = */ false, /*AlwaysInline = */ true,
3825 /*isTailCall = */ false, DstInfo,
3827
3828 MemOpChains.push_back(Cpy);
3829 } else {
3830 SDValue Store =
3831 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3832 MemOpChains.push_back(Store);
3833 }
3834 }
3835 }
3836
3837 if (!MemOpChains.empty())
3838 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3839
3840 // Build a sequence of copy-to-reg nodes chained together with token chain
3841 // and flag operands which copy the outgoing args into the appropriate regs.
3842 SDValue InGlue;
3843 for (auto &RegToPass : RegsToPass) {
3844 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3845 RegToPass.second, InGlue);
3846 InGlue = Chain.getValue(1);
3847 }
3848
3849
3850 // We don't usually want to end the call-sequence here because we would tidy
3851 // the frame up *after* the call, however in the ABI-changing tail-call case
3852 // we've carefully laid out the parameters so that when sp is reset they'll be
3853 // in the correct location.
3854 if (IsTailCall && !IsSibCall) {
3855 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
3856 InGlue = Chain.getValue(1);
3857 }
3858
3859 std::vector<SDValue> Ops;
3860 Ops.push_back(Chain);
3861 Ops.push_back(Callee);
3862 // Add a redundant copy of the callee global which will not be legalized, as
3863 // we need direct access to the callee later.
3864 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3865 const GlobalValue *GV = GSD->getGlobal();
3866 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3867 } else {
3868 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3869 }
3870
3871 if (IsTailCall) {
3872 // Each tail call may have to adjust the stack by a different amount, so
3873 // this information must travel along with the operation for eventual
3874 // consumption by emitEpilogue.
3875 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3876 }
3877
3878 if (IsChainCallConv)
3879 Ops.push_back(RequestedExec.Node);
3880
3881 // Add argument registers to the end of the list so that they are known live
3882 // into the call.
3883 for (auto &RegToPass : RegsToPass) {
3884 Ops.push_back(DAG.getRegister(RegToPass.first,
3885 RegToPass.second.getValueType()));
3886 }
3887
3888 // Add a register mask operand representing the call-preserved registers.
3889 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3890 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3891 assert(Mask && "Missing call preserved mask for calling convention");
3892 Ops.push_back(DAG.getRegisterMask(Mask));
3893
3894 if (SDValue Token = CLI.ConvergenceControlToken) {
3896 GlueOps.push_back(Token);
3897 if (InGlue)
3898 GlueOps.push_back(InGlue);
3899
3900 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
3901 MVT::Glue, GlueOps),
3902 0);
3903 }
3904
3905 if (InGlue)
3906 Ops.push_back(InGlue);
3907
3908 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3909
3910 // If we're doing a tail call, use a TC_RETURN here rather than an
3911 // actual call instruction.
3912 if (IsTailCall) {
3913 MFI.setHasTailCall();
3914 unsigned OPC = AMDGPUISD::TC_RETURN;
3915 switch (CallConv) {
3916 case CallingConv::AMDGPU_Gfx:
3917 OPC = AMDGPUISD::TC_RETURN_GFX;
3918 break;
3919 case CallingConv::AMDGPU_CS_Chain:
3920 case CallingConv::AMDGPU_CS_ChainPreserve:
3921 OPC = AMDGPUISD::TC_RETURN_CHAIN;
3922 break;
3923 }
3924
3925 return DAG.getNode(OPC, DL, NodeTys, Ops);
3926 }
3927
3928 // Returns a chain and a flag for retval copy to use.
3929 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
3930 Chain = Call.getValue(0);
3931 InGlue = Call.getValue(1);
3932
3933 uint64_t CalleePopBytes = NumBytes;
3934 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
3935 if (!Ins.empty())
3936 InGlue = Chain.getValue(1);
3937
3938 // Handle result values, copying them out of physregs into vregs that we
3939 // return.
3940 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
3941 InVals, /*IsThisReturn=*/false, SDValue());
3942}
3943
3944// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
3945// except for applying the wave size scale to the increment amount.
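// The stack pointer here tracks the per-wave scratch offset, so the per-lane
// allocation size (and any extra alignment) is scaled by the wavefront size
// before being applied, i.e. SP is adjusted by Size << log2(wavefront size).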
3946 SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(
3947 SDValue Op, SelectionDAG &DAG) const {
3948 const MachineFunction &MF = DAG.getMachineFunction();
3949 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3950
3951 SDLoc dl(Op);
3952 EVT VT = Op.getValueType();
3953 SDValue Tmp1 = Op;
3954 SDValue Tmp2 = Op.getValue(1);
3955 SDValue Tmp3 = Op.getOperand(2);
3956 SDValue Chain = Tmp1.getOperand(0);
3957
3958 Register SPReg = Info->getStackPtrOffsetReg();
3959
3960 // Chain the dynamic stack allocation so that it doesn't modify the stack
3961 // pointer when other instructions are using the stack.
3962 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
3963
3964 SDValue Size = Tmp2.getOperand(1);
3965 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
3966 Chain = SP.getValue(1);
3967 MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
3968 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
3969 unsigned Opc =
3970 TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp
3971 ? ISD::ADD : ISD::SUB;
3972
3973 SDValue ScaledSize = DAG.getNode(
3974 ISD::SHL, dl, VT, Size,
3975 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
3976
3977 Align StackAlign = TFL->getStackAlign();
3978 Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
3979 if (Alignment && *Alignment > StackAlign) {
3980 Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
3981 DAG.getConstant(-(uint64_t)Alignment->value()
3982 << Subtarget->getWavefrontSizeLog2(),
3983 dl, VT));
3984 }
3985
3986 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
3987 Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
3988
3989 return DAG.getMergeValues({Tmp1, Tmp2}, dl);
3990}
3991
3992 SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
3993 SelectionDAG &DAG) const {
3994 // We only handle constant sizes here to allow non-entry block, static sized
3995 // allocas. A truly dynamic value is more difficult to support because we
3996 // don't know if the size value is uniform or not. If the size isn't uniform,
3997 // we would need to do a wave reduction to get the maximum size to know how
3998 // much to increment the uniform stack pointer.
3999 SDValue Size = Op.getOperand(1);
4000 if (isa<ConstantSDNode>(Size))
4001 return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
4002
4003 return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG);
4004}
4005
4006 SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
4007 if (Op.getValueType() != MVT::i32)
4008 return Op; // Defer to cannot select error.
4009
4011 SDLoc SL(Op);
4012
4013 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4014
4015 // Convert from wave uniform to swizzled vector address. This should protect
4016 // from any edge cases where the stacksave result isn't directly used with
4017 // stackrestore.
4018 SDValue VectorAddress =
4019 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4020 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4021}
4022
4023 SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
4024 SelectionDAG &DAG) const {
4025 SDLoc SL(Op);
4026 assert(Op.getValueType() == MVT::i32);
4027
4028 uint32_t BothRoundHwReg =
4029 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
4030 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4031
4032 SDValue IntrinID =
4033 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4034 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4035 Op.getOperand(0), IntrinID, GetRoundBothImm);
4036
4037 // There are two rounding modes, one for f32 and one for f64/f16. We only
4038 // report in the standard value range if both are the same.
4039 //
4040 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4041 // ties away from zero is not supported, and the other values are rotated by
4042 // 1.
4043 //
4044 // If the two rounding modes are not the same, report a target defined value.
4045
4046 // Mode register rounding mode fields:
4047 //
4048 // [1:0] Single-precision round mode.
4049 // [3:2] Double/Half-precision round mode.
4050 //
4051 // 0 = nearest even, 1 = +infinity, 2 = -infinity, 3 = toward zero.
4052 //
4053 // Hardware Spec
4054 // Toward-0 3 0
4055 // Nearest Even 0 1
4056 // +Inf 1 2
4057 // -Inf 2 3
4058 // NearestAway0 N/A 4
4059 //
4060 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4061 // table we can index by the raw hardware mode.
4062 //
4063 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
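// For example, if both fields are 0 (round to nearest even for f32 and for
// f64/f16), the raw mode is 0 and table entry 0 is 1, the standard FLT_ROUNDS
// value for round-to-nearest. If the two fields disagree, the table entry is
// >= 4 and is offset by 4 below, landing in the target-defined range that
// starts at 8.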
4064
4065 SDValue BitTable =
4066 DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64);
4067
4068 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4069 SDValue RoundModeTimesNumBits =
4070 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4071
4072 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4073 // knew only one mode was demanded.
4074 SDValue TableValue =
4075 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4076 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4077
4078 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4079 SDValue TableEntry =
4080 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4081
4082 // There's a gap in the 4-bit encoded table and actual enum values, so offset
4083 // if it's an extended value.
4084 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4085 SDValue IsStandardValue =
4086 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4087 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4088 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4089 TableEntry, EnumOffset);
4090
4091 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4092}
4093
4094 SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
4095 SelectionDAG &DAG) const {
4096 SDLoc SL(Op);
4097
4098 SDValue NewMode = Op.getOperand(1);
4099 assert(NewMode.getValueType() == MVT::i32);
4100
4101 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4102 // hardware MODE.fp_round values.
4103 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4104 uint32_t ClampedVal = std::min(
4105 static_cast<uint32_t>(ConstMode->getZExtValue()),
4107 NewMode = DAG.getConstant(
4108 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4109 } else {
4110 // If we know the input can only be one of the supported standard modes in
4111 // the range 0-3, we can use a simplified mapping to hardware values.
4112 KnownBits KB = DAG.computeKnownBits(NewMode);
4113 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4114 // The supported standard values are 0-3. The extended values start at 8. We
4115 // need to offset by 4 if the value is in the extended range.
4116
4117 if (UseReducedTable) {
4118 // Truncate to the low 32-bits.
4119 SDValue BitTable = DAG.getConstant(
4120 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4121
4122 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4123 SDValue RoundModeTimesNumBits =
4124 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4125
4126 NewMode =
4127 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4128
4129 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4130 // the table extracted bits into inline immediates.
4131 } else {
4132 // table_index = umin(value, value - 4)
4133 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
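// The umin works because for the standard values 0-3 the subtraction wraps
// around to a huge unsigned number, so umin picks the value itself, while for
// the extended values starting at 8 it picks value - 4, packing both ranges
// into the eight 4-bit table entries at indexes 0-7.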
4134 SDValue BitTable =
4135 DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
4136
4137 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4138 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4139 SDValue IndexVal =
4140 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4141
4142 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4143 SDValue RoundModeTimesNumBits =
4144 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4145
4146 SDValue TableValue =
4147 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4148 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4149
4150 // No need to mask out the high bits since the setreg will ignore them
4151 // anyway.
4152 NewMode = TruncTable;
4153 }
4154
4155 // Insert a readfirstlane in case the value is a VGPR. We could do this
4156 // earlier and keep more operations scalar, but that interferes with
4157 // combining the source.
4158 SDValue ReadFirstLaneID =
4159 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4160 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4161 ReadFirstLaneID, NewMode);
4162 }
4163
4164 // N.B. The setreg will be later folded into s_round_mode on supported
4165 // targets.
4166 SDValue IntrinID =
4167 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4168 uint32_t BothRoundHwReg =
4169 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
4170 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4171
4172 SDValue SetReg =
4173 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4174 IntrinID, RoundBothImm, NewMode);
4175
4176 return SetReg;
4177}
4178
4180 if (Op->isDivergent())
4181 return SDValue();
4182
4183 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4188 break;
4189 default:
4190 return SDValue();
4191 }
4192
4193 return Op;
4194}
4195
4196// Work around DAG legality rules only based on the result type.
4198 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4199 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4200 EVT SrcVT = Src.getValueType();
4201
4202 if (SrcVT.getScalarType() != MVT::bf16)
4203 return Op;
4204
4205 SDLoc SL(Op);
4206 SDValue BitCast =
4207 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4208
4209 EVT DstVT = Op.getValueType();
4210 if (IsStrict)
4211 llvm_unreachable("Need STRICT_BF16_TO_FP");
4212
4213 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4214}
4215
4217 SDLoc SL(Op);
4218 if (Op.getValueType() != MVT::i64)
4219 return Op;
4220
4221 uint32_t ModeHwReg =
4223 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4224 uint32_t TrapHwReg =
4226 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4227
4228 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4229 SDValue IntrinID =
4230 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4231 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4232 Op.getOperand(0), IntrinID, ModeHwRegImm);
4233 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4234 Op.getOperand(0), IntrinID, TrapHwRegImm);
4235 SDValue TokenReg =
4236 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4237 GetTrapReg.getValue(1));
4238
4239 SDValue CvtPtr =
4240 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4241 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4242
4243 return DAG.getMergeValues({Result, TokenReg}, SL);
4244}
4245
4247 SDLoc SL(Op);
4248 if (Op.getOperand(1).getValueType() != MVT::i64)
4249 return Op;
4250
4251 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4252 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4253 DAG.getConstant(0, SL, MVT::i32));
4254 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4255 DAG.getConstant(1, SL, MVT::i32));
4256
4257 SDValue ReadFirstLaneID =
4258 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4259 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4260 ReadFirstLaneID, NewModeReg);
4261 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4262 ReadFirstLaneID, NewTrapReg);
4263
4264 unsigned ModeHwReg =
4266 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4267 unsigned TrapHwReg =
4269 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4270
4271 SDValue IntrinID =
4272 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4273 SDValue SetModeReg =
4274 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4275 IntrinID, ModeHwRegImm, NewModeReg);
4276 SDValue SetTrapReg =
4277 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4278 IntrinID, TrapHwRegImm, NewTrapReg);
4279 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4280}
4281
4282 Register SITargetLowering::getRegisterByName(const char *RegName, LLT VT,
4283 const MachineFunction &MF) const {
4284 Register Reg = StringSwitch<Register>(RegName)
4285 .Case("m0", AMDGPU::M0)
4286 .Case("exec", AMDGPU::EXEC)
4287 .Case("exec_lo", AMDGPU::EXEC_LO)
4288 .Case("exec_hi", AMDGPU::EXEC_HI)
4289 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4290 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4291 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4292 .Default(Register());
4293
4294 if (Reg == AMDGPU::NoRegister) {
4295 report_fatal_error(Twine("invalid register name \""
4296 + StringRef(RegName) + "\"."));
4297
4298 }
4299
4300 if (!Subtarget->hasFlatScrRegister() &&
4301 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4302 report_fatal_error(Twine("invalid register \""
4303 + StringRef(RegName) + "\" for subtarget."));
4304 }
4305
4306 switch (Reg) {
4307 case AMDGPU::M0:
4308 case AMDGPU::EXEC_LO:
4309 case AMDGPU::EXEC_HI:
4310 case AMDGPU::FLAT_SCR_LO:
4311 case AMDGPU::FLAT_SCR_HI:
4312 if (VT.getSizeInBits() == 32)
4313 return Reg;
4314 break;
4315 case AMDGPU::EXEC:
4316 case AMDGPU::FLAT_SCR:
4317 if (VT.getSizeInBits() == 64)
4318 return Reg;
4319 break;
4320 default:
4321 llvm_unreachable("missing register type checking");
4322 }
4323
4324 report_fatal_error(Twine("invalid type for register \""
4325 + StringRef(RegName) + "\"."));
4326}
4327
4328// If kill is not the last instruction, split the block so kill is always a
4329// proper terminator.
4330 MachineBasicBlock *
4331 SITargetLowering::splitKillBlock(MachineInstr &MI,
4332 MachineBasicBlock *BB) const {
4333 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
4335 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4336 return SplitBB;
4337}
4338
4339// Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
4340// \p MI will be the only instruction in the loop body block. Otherwise, it will
4341// be the first instruction in the remainder block.
4342//
4343/// \returns { LoopBody, Remainder }
4344static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4345 splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
4346 MachineFunction *MF = MBB.getParent();
4347 MachineBasicBlock::iterator I(&MI);
4348
4349 // To insert the loop we need to split the block. Move everything after this
4350 // point to a new block, and insert a new empty block between the two.
4351 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
4352 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4354 ++MBBI;
4355
4356 MF->insert(MBBI, LoopBB);
4357 MF->insert(MBBI, RemainderBB);
4358
4359 LoopBB->addSuccessor(LoopBB);
4360 LoopBB->addSuccessor(RemainderBB);
4361
4362 // Move the rest of the block into a new block.
4363 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4364
4365 if (InstInLoop) {
4366 auto Next = std::next(I);
4367
4368 // Move instruction to loop body.
4369 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4370
4371 // Move the rest of the block.
4372 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4373 } else {
4374 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4375 }
4376
4377 MBB.addSuccessor(LoopBB);
4378
4379 return std::pair(LoopBB, RemainderBB);
4380}
4381
4382/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4384 MachineBasicBlock *MBB = MI.getParent();
4386 auto I = MI.getIterator();
4387 auto E = std::next(I);
4388
4389 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4390 .addImm(0);
4391
4392 MIBundleBuilder Bundler(*MBB, I, E);
4393 finalizeBundle(*MBB, Bundler.begin());
4394}
4395
4398 MachineBasicBlock *BB) const {
4399 const DebugLoc &DL = MI.getDebugLoc();
4400
4402
4403 MachineBasicBlock *LoopBB;
4404 MachineBasicBlock *RemainderBB;
4406
4407 // Apparently kill flags are only valid if the def is in the same block?
4408 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4409 Src->setIsKill(false);
4410
4411 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
4412
4413 MachineBasicBlock::iterator I = LoopBB->end();
4414
4415 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4417
4418 // Clear TRAP_STS.MEM_VIOL
4419 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4420 .addImm(0)
4421 .addImm(EncodedReg);
4422
4424
4425 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4426
4427 // Load and check TRAP_STS.MEM_VIOL
4428 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4429 .addImm(EncodedReg);
4430
4431 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4432 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4433 .addReg(Reg, RegState::Kill)
4434 .addImm(0);
4435 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4436 .addMBB(LoopBB);
4437
4438 return RemainderBB;
4439}
4440
4441// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4442// wavefront. If the value is uniform and just happens to be in a VGPR, this
4443// will only do one iteration. In the worst case, this will loop 64 times.
4444//
4445// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
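// Roughly, each iteration of the emitted loop does:
//   CurrentIdx = v_readfirstlane_b32(Idx)       // index from one active lane
//   Cond       = v_cmp_eq_u32(CurrentIdx, Idx)  // all lanes with that index
//   s_and_saveexec(Cond)                        // run only those lanes
//   M0 (or SGPRIdxReg) = CurrentIdx + Offset
//   <the indexed access is inserted at the returned point>
//   s_xor_term / s_cbranch_execnz               // drop handled lanes, repeat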
4446 static MachineBasicBlock::iterator
4447 emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
4448 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4449 const DebugLoc &DL, const MachineOperand &Idx,
4450 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4451 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4452 Register &SGPRIdxReg) {
4453
4454 MachineFunction *MF = OrigBB.getParent();
4455 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4456 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4458
4459 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4460 Register PhiExec = MRI.createVirtualRegister(BoolRC);
4461 Register NewExec = MRI.createVirtualRegister(BoolRC);
4462 Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4463 Register CondReg = MRI.createVirtualRegister(BoolRC);
4464
4465 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4466 .addReg(InitReg)
4467 .addMBB(&OrigBB)
4468 .addReg(ResultReg)
4469 .addMBB(&LoopBB);
4470
4471 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4472 .addReg(InitSaveExecReg)
4473 .addMBB(&OrigBB)
4474 .addReg(NewExec)
4475 .addMBB(&LoopBB);
4476
4477 // Read the next variant <- also loop target.
4478 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4479 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
4480
4481 // Compare the just read M0 value to all possible Idx values.
4482 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4483 .addReg(CurrentIdxReg)
4484 .addReg(Idx.getReg(), 0, Idx.getSubReg());
4485
4486 // Update EXEC, save the original EXEC value to VCC.
4487 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4488 : AMDGPU::S_AND_SAVEEXEC_B64),
4489 NewExec)
4490 .addReg(CondReg, RegState::Kill);
4491
4492 MRI.setSimpleHint(NewExec, CondReg);
4493
4494 if (UseGPRIdxMode) {
4495 if (Offset == 0) {
4496 SGPRIdxReg = CurrentIdxReg;
4497 } else {
4498 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4499 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4500 .addReg(CurrentIdxReg, RegState::Kill)
4501 .addImm(Offset);
4502 }
4503 } else {
4504 // Move index from VCC into M0
4505 if (Offset == 0) {
4506 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4507 .addReg(CurrentIdxReg, RegState::Kill);
4508 } else {
4509 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4510 .addReg(CurrentIdxReg, RegState::Kill)
4511 .addImm(Offset);
4512 }
4513 }
4514
4515 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4516 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4517 MachineInstr *InsertPt =
4518 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4519 : AMDGPU::S_XOR_B64_term), Exec)
4520 .addReg(Exec)
4521 .addReg(NewExec);
4522
4523 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4524 // s_cbranch_scc0?
4525
4526 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4527 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4528 .addMBB(&LoopBB);
4529
4530 return InsertPt->getIterator();
4531}
4532
4533 // This has slightly sub-optimal register allocation when the source vector is
4534 // killed by the read. The register allocator does not understand that the kill
4535 // is per-workitem, so the source is kept live for the whole loop and we cannot
4536 // reuse a subregister from it, costing one more VGPR than necessary. That VGPR
4537 // was saved back when this was expanded after register allocation.
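// The overall shape is: save EXEC, split the block, emit the waterfall loop
// via emitLoadM0FromVGPRLoop, then restore EXEC in a small landing-pad block
// that falls through to the remainder block.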
4538 static MachineBasicBlock::iterator
4539 loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
4540 unsigned InitResultReg, unsigned PhiReg, int Offset,
4541 bool UseGPRIdxMode, Register &SGPRIdxReg) {
4542 MachineFunction *MF = MBB.getParent();
4543 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4544 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4546 const DebugLoc &DL = MI.getDebugLoc();
4548
4549 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4550 Register DstReg = MI.getOperand(0).getReg();
4551 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4552 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
4553 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4554 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4555
4556 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4557
4558 // Save the EXEC mask
4559 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
4560 .addReg(Exec);
4561
4562 MachineBasicBlock *LoopBB;
4563 MachineBasicBlock *RemainderBB;
4564 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
4565
4566 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4567
4568 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
4569 InitResultReg, DstReg, PhiReg, TmpExec,
4570 Offset, UseGPRIdxMode, SGPRIdxReg);
4571
4572 MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock();
4574 ++MBBI;
4575 MF->insert(MBBI, LandingPad);
4576 LoopBB->removeSuccessor(RemainderBB);
4577 LandingPad->addSuccessor(RemainderBB);
4578 LoopBB->addSuccessor(LandingPad);
4579 MachineBasicBlock::iterator First = LandingPad->begin();
4580 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
4581 .addReg(SaveExec);
4582
4583 return InsPt;
4584}
4585
4586// Returns subreg index, offset
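// For example, indexing a 128-bit (4 x 32-bit) register with a constant
// offset of 2 yields {sub2, 0}; an out-of-range offset is returned unchanged
// with sub0 so it stays folded into the runtime index instead of naming a
// nonexistent subregister.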
4587static std::pair<unsigned, int>
4588 computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
4589 const TargetRegisterClass *SuperRC,
4590 unsigned VecReg,
4591 int Offset) {
4592 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
4593
4594 // Skip out of bounds offsets, or else we would end up using an undefined
4595 // register.
4596 if (Offset >= NumElts || Offset < 0)
4597 return std::pair(AMDGPU::sub0, Offset);
4598
4599 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
4600}
4601
4604 int Offset) {
4605 MachineBasicBlock *MBB = MI.getParent();
4606 const DebugLoc &DL = MI.getDebugLoc();
4608
4609 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4610
4611 assert(Idx->getReg() != AMDGPU::NoRegister);
4612
4613 if (Offset == 0) {
4614 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx);
4615 } else {
4616 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4617 .add(*Idx)
4618 .addImm(Offset);
4619 }
4620}
4621
4624 int Offset) {
4625 MachineBasicBlock *MBB = MI.getParent();
4626 const DebugLoc &DL = MI.getDebugLoc();
4628
4629 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4630
4631 if (Offset == 0)
4632 return Idx->getReg();
4633
4634 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4635 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
4636 .add(*Idx)
4637 .addImm(Offset);
4638 return Tmp;
4639}
4640
4643 const GCNSubtarget &ST) {
4644 const SIInstrInfo *TII = ST.getInstrInfo();
4645 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4648
4649 Register Dst = MI.getOperand(0).getReg();
4650 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4651 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
4652 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4653
4654 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
4655 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4656
4657 unsigned SubReg;
4658 std::tie(SubReg, Offset)
4659 = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
4660
4661 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4662
4663 // Check for a SGPR index.
4664 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4666 const DebugLoc &DL = MI.getDebugLoc();
4667
4668 if (UseGPRIdxMode) {
4669 // TODO: Look at the uses to avoid the copy. This may require rescheduling
4670 // to avoid interfering with other uses, so probably requires a new
4671 // optimization pass.
4673
4674 const MCInstrDesc &GPRIDXDesc =
4675 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4676 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4677 .addReg(SrcReg)
4678 .addReg(Idx)
4679 .addImm(SubReg);
4680 } else {
4682
4683 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4684 .addReg(SrcReg, 0, SubReg)
4685 .addReg(SrcReg, RegState::Implicit);
4686 }
4687
4688 MI.eraseFromParent();
4689
4690 return &MBB;
4691 }
4692
4693 // Control flow needs to be inserted if indexing with a VGPR.
4694 const DebugLoc &DL = MI.getDebugLoc();
4696
4697 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4698 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4699
4700 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
4701
4702 Register SGPRIdxReg;
4703 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
4704 UseGPRIdxMode, SGPRIdxReg);
4705
4706 MachineBasicBlock *LoopBB = InsPt->getParent();
4707
4708 if (UseGPRIdxMode) {
4709 const MCInstrDesc &GPRIDXDesc =
4710 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4711
4712 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4713 .addReg(SrcReg)
4714 .addReg(SGPRIdxReg)
4715 .addImm(SubReg);
4716 } else {
4717 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4718 .addReg(SrcReg, 0, SubReg)
4719 .addReg(SrcReg, RegState::Implicit);
4720 }
4721
4722 MI.eraseFromParent();
4723
4724 return LoopBB;
4725}
4726
4729 const GCNSubtarget &ST) {
4730 const SIInstrInfo *TII = ST.getInstrInfo();
4731 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4734
4735 Register Dst = MI.getOperand(0).getReg();
4736 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
4737 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4738 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
4739 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4740 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
4741 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4742
4743 // This can be an immediate, but will be folded later.
4744 assert(Val->getReg());
4745
4746 unsigned SubReg;
4747 std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
4748 SrcVec->getReg(),
4749 Offset);
4750 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4751
4752 if (Idx->getReg() == AMDGPU::NoRegister) {
4754 const DebugLoc &DL = MI.getDebugLoc();
4755
4756 assert(Offset == 0);
4757
4758 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
4759 .add(*SrcVec)
4760 .add(*Val)
4761 .addImm(SubReg);
4762
4763 MI.eraseFromParent();
4764 return &MBB;
4765 }
4766
4767 // Check for a SGPR index.
4768 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4770 const DebugLoc &DL = MI.getDebugLoc();
4771
4772 if (UseGPRIdxMode) {
4774
4775 const MCInstrDesc &GPRIDXDesc =
4776 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4777 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4778 .addReg(SrcVec->getReg())
4779 .add(*Val)
4780 .addReg(Idx)
4781 .addImm(SubReg);
4782 } else {
4784
4785 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4786 TRI.getRegSizeInBits(*VecRC), 32, false);
4787 BuildMI(MBB, I, DL, MovRelDesc, Dst)
4788 .addReg(SrcVec->getReg())
4789 .add(*Val)
4790 .addImm(SubReg);
4791 }
4792 MI.eraseFromParent();
4793 return &MBB;
4794 }
4795
4796 // Control flow needs to be inserted if indexing with a VGPR.
4797 if (Val->isReg())
4798 MRI.clearKillFlags(Val->getReg());
4799
4800 const DebugLoc &DL = MI.getDebugLoc();
4801
4802 Register PhiReg = MRI.createVirtualRegister(VecRC);
4803
4804 Register SGPRIdxReg;
4805 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
4806 UseGPRIdxMode, SGPRIdxReg);
4807 MachineBasicBlock *LoopBB = InsPt->getParent();
4808
4809 if (UseGPRIdxMode) {
4810 const MCInstrDesc &GPRIDXDesc =
4811 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4812
4813 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4814 .addReg(PhiReg)
4815 .add(*Val)
4816 .addReg(SGPRIdxReg)
4817 .addImm(AMDGPU::sub0);
4818 } else {
4819 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4820 TRI.getRegSizeInBits(*VecRC), 32, false);
4821 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
4822 .addReg(PhiReg)
4823 .add(*Val)
4824 .addImm(AMDGPU::sub0);
4825 }
4826
4827 MI.eraseFromParent();
4828 return LoopBB;
4829}
4830
4833 const GCNSubtarget &ST,
4834 unsigned Opc) {
4836 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4837 const DebugLoc &DL = MI.getDebugLoc();
4838 const SIInstrInfo *TII = ST.getInstrInfo();
4839
4840 // Reduction operations depend on whether the input operand is SGPR or VGPR.
4841 Register SrcReg = MI.getOperand(1).getReg();
4842 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
4843 Register DstReg = MI.getOperand(0).getReg();
4844 MachineBasicBlock *RetBB = nullptr;
4845 if (isSGPR) {
4846 // These operations are idempotent on a uniform value (i.e. an SGPR), so the
4847 // reduced value is the same as the given SGPR.
4848 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
4849 RetBB = &BB;
4850 } else {
4851 // TODO: Implement the DPP strategy and switch based on the immediate strategy
4852 // operand. For now, for all the cases (default, Iterative and DPP) we use
4853 // the iterative approach by default.
4854
4855 // To reduce the VGPR with the iterative approach, we need to iterate over
4856 // all the active lanes. Lowering consists of a ComputeLoop block, which
4857 // iterates over only the active lanes. We use a copy of the EXEC register
4858 // as the induction variable; each processed lane clears its bit with
4859 // bitset0 so that we get the next active lane for the next iteration.
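// Roughly:
//   ActiveBits = EXEC
//   Accum      = identity (UINT32_MAX for umin, 0 for umax)
//   do {
//     Lane       = s_ff1(ActiveBits)            // lowest remaining active lane
//     Accum      = op(Accum, v_readlane(Src, Lane))
//     ActiveBits = s_bitset0(ActiveBits, Lane)  // retire that lane
//   } while (ActiveBits != 0)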
4861 Register SrcReg = MI.getOperand(1).getReg();
4862
4863 // Create Control flow for loop
4864 // Split MI's Machine Basic block into For loop
4865 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
4866
4867 // Create virtual registers required for lowering.
4868 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4869 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
4870 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
4871 Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
4872
4873 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
4874 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4875 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4876
4877 Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
4878 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
4879
4880 bool IsWave32 = ST.isWave32();
4881 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4882 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4883
4884 // Create initial values of the induction variable from EXEC and of the
4885 // accumulator, and insert a branch to the newly created ComputeLoop block.
4886 uint32_t InitalValue =
4887 (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
4888 auto TmpSReg =
4889 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
4890 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
4891 .addImm(InitalValue);
4892 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
4893
4894 // Start constructing ComputeLoop
4895 I = ComputeLoop->end();
4896 auto Accumulator =
4897 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
4898 .addReg(InitalValReg)
4899 .addMBB(&BB);
4900 auto ActiveBits =
4901 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
4902 .addReg(TmpSReg->getOperand(0).getReg())
4903 .addMBB(&BB);
4904
4905 // Perform the computations
4906 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
4907 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
4908 .addReg(ActiveBits->getOperand(0).getReg());
4909 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
4910 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
4911 .addReg(SrcReg)
4912 .addReg(FF1->getOperand(0).getReg());
4913 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
4914 .addReg(Accumulator->getOperand(0).getReg())
4915 .addReg(LaneValue->getOperand(0).getReg());
4916
4917 // Manipulate the iterator to get the next active lane
4918 unsigned BITSETOpc =
4919 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
4920 auto NewActiveBits =
4921 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
4922 .addReg(FF1->getOperand(0).getReg())
4923 .addReg(ActiveBits->getOperand(0).getReg());
4924
4925 // Add phi nodes
4926 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
4927 .addMBB(ComputeLoop);
4928 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
4929 .addMBB(ComputeLoop);
4930
4931 // Creating branching
4932 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
4933 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
4934 .addReg(NewActiveBits->getOperand(0).getReg())
4935 .addImm(0);
4936 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4937 .addMBB(ComputeLoop);
4938
4939 RetBB = ComputeEnd;
4940 }
4941 MI.eraseFromParent();
4942 return RetBB;
4943}
4944
4945 MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
4946 MachineInstr &MI, MachineBasicBlock *BB) const {
4947
4949 MachineFunction *MF = BB->getParent();
4951
4952 switch (MI.getOpcode()) {
4953 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
4954 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
4955 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
4956 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
4957 case AMDGPU::S_UADDO_PSEUDO:
4958 case AMDGPU::S_USUBO_PSEUDO: {
4959 const DebugLoc &DL = MI.getDebugLoc();
4960 MachineOperand &Dest0 = MI.getOperand(0);
4961 MachineOperand &Dest1 = MI.getOperand(1);
4962 MachineOperand &Src0 = MI.getOperand(2);
4963 MachineOperand &Src1 = MI.getOperand(3);
4964
4965 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
4966 ? AMDGPU::S_ADD_I32
4967 : AMDGPU::S_SUB_I32;
4968 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()).add(Src0).add(Src1);
4969
4970 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
4971 .addImm(1)
4972 .addImm(0);
4973
4974 MI.eraseFromParent();
4975 return BB;
4976 }
4977 case AMDGPU::S_ADD_U64_PSEUDO:
4978 case AMDGPU::S_SUB_U64_PSEUDO: {
4979 // For targets older than GFX12, we emit a sequence of 32-bit operations.
4980 // For GFX12, we emit s_add_u64 and s_sub_u64.
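// Without native 64-bit scalar add/sub, the expansion below computes the low
// half with s_add_u32 / s_sub_u32 and the high half with s_addc_u32 /
// s_subb_u32, propagating the carry/borrow through SCC.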
4981 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4983 const DebugLoc &DL = MI.getDebugLoc();
4984 MachineOperand &Dest = MI.getOperand(0);
4985 MachineOperand &Src0 = MI.getOperand(1);
4986 MachineOperand &Src1 = MI.getOperand(2);
4987 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
4988 if (Subtarget->hasScalarAddSub64()) {
4989 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
4990 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
4991 .add(Src0)
4992 .add(Src1);
4993 } else {
4994 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4995 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4996
4997 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4998 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4999
5000 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5001 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5002 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5003 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5004
5005 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5006 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5007 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5008 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5009
5010 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5011 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5012 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5013 .add(Src0Sub0)
5014 .add(Src1Sub0);
5015 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5016 .add(Src0Sub1)
5017 .add(Src1Sub1);
5018 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5019 .addReg(DestSub0)
5020 .addImm(AMDGPU::sub0)
5021 .addReg(DestSub1)
5022 .addImm(AMDGPU::sub1);
5023 }
5024 MI.eraseFromParent();
5025 return BB;
5026 }
5027 case AMDGPU::V_ADD_U64_PSEUDO:
5028 case AMDGPU::V_SUB_U64_PSEUDO: {
5030 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5031 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5032 const DebugLoc &DL = MI.getDebugLoc();
5033
5034 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5035
5036 MachineOperand &Dest = MI.getOperand(0);
5037 MachineOperand &Src0 = MI.getOperand(1);
5038 MachineOperand &Src1 = MI.getOperand(2);
5039
5040 if (IsAdd && ST.hasLshlAddB64()) {
5041 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
5042 Dest.getReg())
5043 .add(Src0)
5044 .addImm(0)
5045 .add(Src1);
5046 TII->legalizeOperands(*Add);
5047 MI.eraseFromParent();
5048 return BB;
5049 }
5050
5051 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5052
5053 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5054 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5055
5056 Register CarryReg = MRI.createVirtualRegister(CarryRC);
5057 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
5058
5059 const TargetRegisterClass *Src0RC = Src0.isReg()
5060 ? MRI.getRegClass(Src0.getReg())
5061 : &AMDGPU::VReg_64RegClass;
5062 const TargetRegisterClass *Src1RC = Src1.isReg()
5063 ? MRI.getRegClass(Src1.getReg())
5064 : &AMDGPU::VReg_64RegClass;
5065
5066 const TargetRegisterClass *Src0SubRC =
5067 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5068 const TargetRegisterClass *Src1SubRC =
5069 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5070
5071 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
5072 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5073 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
5074 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5075
5076 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
5077 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5078 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
5079 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5080
5081 unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5082 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5083 .addReg(CarryReg, RegState::Define)
5084 .add(SrcReg0Sub0)
5085 .add(SrcReg1Sub0)
5086 .addImm(0); // clamp bit
5087
5088 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5089 MachineInstr *HiHalf =
5090 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5091 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
5092 .add(SrcReg0Sub1)
5093 .add(SrcReg1Sub1)
5094 .addReg(CarryReg, RegState::Kill)
5095 .addImm(0); // clamp bit
5096
5097 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5098 .addReg(DestSub0)
5099 .addImm(AMDGPU::sub0)
5100 .addReg(DestSub1)
5101 .addImm(AMDGPU::sub1);
5102 TII->legalizeOperands(*LoHalf);
5103 TII->legalizeOperands(*HiHalf);
5104 MI.eraseFromParent();
5105 return BB;
5106 }
5107 case AMDGPU::S_ADD_CO_PSEUDO:
5108 case AMDGPU::S_SUB_CO_PSEUDO: {
5109 // This pseudo can only be selected from a uniform add/subcarry node.
5110 // All the VGPR operands are therefore assumed to be splat values, so it is
5111 // safe to read the first lane.
5113 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5114 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5116 const DebugLoc &DL = MI.getDebugLoc();
5117 MachineOperand &Dest = MI.getOperand(0);
5118 MachineOperand &CarryDest = MI.getOperand(1);
5119 MachineOperand &Src0 = MI.getOperand(2);
5120 MachineOperand &Src1 = MI.getOperand(3);
5121 MachineOperand &Src2 = MI.getOperand(4);
5122 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5123 ? AMDGPU::S_ADDC_U32
5124 : AMDGPU::S_SUBB_U32;
5125 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5126 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5127 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5128 .addReg(Src0.getReg());
5129 Src0.setReg(RegOp0);
5130 }
5131 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5132 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5133 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5134 .addReg(Src1.getReg());
5135 Src1.setReg(RegOp1);
5136 }
5137 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5138 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
5139 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5140 .addReg(Src2.getReg());
5141 Src2.setReg(RegOp2);
5142 }
5143
5144 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
5145 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
5146 assert(WaveSize == 64 || WaveSize == 32);
5147
5148 if (WaveSize == 64) {
5149 if (ST.hasScalarCompareEq64()) {
5150 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
5151 .addReg(Src2.getReg())
5152 .addImm(0);
5153 } else {
5154 const TargetRegisterClass *SubRC =
5155 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5156 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5157 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5158 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5159 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5160 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5161
5162 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
5163 .add(Src2Sub0)
5164 .add(Src2Sub1);
5165
5166 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5167 .addReg(Src2_32, RegState::Kill)
5168 .addImm(0);
5169 }
5170 } else {
5171 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5172 .addReg(Src2.getReg())
5173 .addImm(0);
5174 }
5175
5176 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
5177
5178 unsigned SelOpc =
5179 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5180
5181 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
5182 .addImm(-1)
5183 .addImm(0);
5184
5185 MI.eraseFromParent();
5186 return BB;
5187 }
5188 case AMDGPU::SI_INIT_M0: {
5189 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5190 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5191 .add(MI.getOperand(0));
5192 MI.eraseFromParent();
5193 return BB;
5194 }
5195 case AMDGPU::GET_GROUPSTATICSIZE: {
5196 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
5197 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
5198 DebugLoc DL = MI.getDebugLoc();
5199 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
5200 .add(MI.getOperand(0))
5201 .addImm(MFI->getLDSSize());
5202 MI.eraseFromParent();
5203 return BB;
5204 }
5205 case AMDGPU::GET_SHADERCYCLESHILO: {
5208 const DebugLoc &DL = MI.getDebugLoc();
5209 // The algorithm is:
5210 //
5211 // hi1 = getreg(SHADER_CYCLES_HI)
5212 // lo1 = getreg(SHADER_CYCLES_LO)
5213 // hi2 = getreg(SHADER_CYCLES_HI)
5214 //
5215 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
5216 // Otherwise there was overflow and the result is hi2:0. In both cases the
5217 // result should represent the actual time at some point during the sequence
5218 // of three getregs.
5219 using namespace AMDGPU::Hwreg;
5220 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5221 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
5222 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5223 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5224 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
5225 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
5226 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5227 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
5228 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5229 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
5230 .addReg(RegHi1)
5231 .addReg(RegHi2);
5232 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5233 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
5234 .addReg(RegLo1)
5235 .addImm(0);
5236 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
5237 .add(MI.getOperand(0))
5238 .addReg(RegLo)
5239 .addImm(AMDGPU::sub0)
5240 .addReg(RegHi2)
5241 .addImm(AMDGPU::sub1);
5242 MI.eraseFromParent();
5243 return BB;
5244 }
5245 case AMDGPU::SI_INDIRECT_SRC_V1:
5246 case AMDGPU::SI_INDIRECT_SRC_V2:
5247 case AMDGPU::SI_INDIRECT_SRC_V4:
5248 case AMDGPU::SI_INDIRECT_SRC_V8:
5249 case AMDGPU::SI_INDIRECT_SRC_V9:
5250 case AMDGPU::SI_INDIRECT_SRC_V10:
5251 case AMDGPU::SI_INDIRECT_SRC_V11:
5252 case AMDGPU::SI_INDIRECT_SRC_V12:
5253 case AMDGPU::SI_INDIRECT_SRC_V16:
5254 case AMDGPU::SI_INDIRECT_SRC_V32:
5255 return emitIndirectSrc(MI, *BB, *getSubtarget());
5256 case AMDGPU::SI_INDIRECT_DST_V1:
5257 case AMDGPU::SI_INDIRECT_DST_V2:
5258 case AMDGPU::SI_INDIRECT_DST_V4:
5259 case AMDGPU::SI_INDIRECT_DST_V8:
5260 case AMDGPU::SI_INDIRECT_DST_V9:
5261 case AMDGPU::SI_INDIRECT_DST_V10:
5262 case AMDGPU::SI_INDIRECT_DST_V11:
5263 case AMDGPU::SI_INDIRECT_DST_V12:
5264 case AMDGPU::SI_INDIRECT_DST_V16:
5265 case AMDGPU::SI_INDIRECT_DST_V32:
5266 return emitIndirectDst(MI, *BB, *getSubtarget());
5267 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5268 case AMDGPU::SI_KILL_I1_PSEUDO:
5269 return splitKillBlock(MI, BB);
5270 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5272 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5273 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5274
5275 Register Dst = MI.getOperand(0).getReg();
5276 const MachineOperand &Src0 = MI.getOperand(1);
5277 const MachineOperand &Src1 = MI.getOperand(2);
5278 const DebugLoc &DL = MI.getDebugLoc();
5279 Register SrcCond = MI.getOperand(3).getReg();
5280
5281 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5282 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5283 const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5284 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5285
5286 const TargetRegisterClass *Src0RC = Src0.isReg()
5287 ? MRI.getRegClass(Src0.getReg())
5288 : &AMDGPU::VReg_64RegClass;
5289 const TargetRegisterClass *Src1RC = Src1.isReg()
5290 ? MRI.getRegClass(Src1.getReg())
5291 : &AMDGPU::VReg_64RegClass;
5292
5293 const TargetRegisterClass *Src0SubRC =
5294 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5295 const TargetRegisterClass *Src1SubRC =
5296 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5297
5298 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5299 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5300 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5301 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5302
5303 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5304 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5305 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5306 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5307
5308 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
5309 .addReg(SrcCond);
5310 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5311 .addImm(0)
5312 .add(Src0Sub0)
5313 .addImm(0)
5314 .add(Src1Sub0)
5315 .addReg(SrcCondCopy);
5316 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5317 .addImm(0)
5318 .add(Src0Sub1)
5319 .addImm(0)
5320 .add(Src1Sub1)
5321 .addReg(SrcCondCopy);
5322
5323 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5324 .addReg(DstLo)
5325 .addImm(AMDGPU::sub0)
5326 .addReg(DstHi)
5327 .addImm(AMDGPU::sub1);
5328 MI.eraseFromParent();
5329 return BB;
5330 }
5331 case AMDGPU::SI_BR_UNDEF: {
5333 const DebugLoc &DL = MI.getDebugLoc();
5334 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5335 .add(MI.getOperand(0));
5336 Br->getOperand(1).setIsUndef(); // read undef SCC
5337 MI.eraseFromParent();
5338 return BB;
5339 }
5340 case AMDGPU::ADJCALLSTACKUP:
5341 case AMDGPU::ADJCALLSTACKDOWN: {
5343 MachineInstrBuilder MIB(*MF, &MI);
5344 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
5345 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
5346 return BB;
5347 }
5348 case AMDGPU::SI_CALL_ISEL: {
5350 const DebugLoc &DL = MI.getDebugLoc();
5351
5352 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
5353
5355 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5356
5357 for (const MachineOperand &MO : MI.operands())
5358 MIB.add(MO);
5359
5360 MIB.cloneMemRefs(MI);
5361 MI.eraseFromParent();
5362 return BB;
5363 }
5364 case AMDGPU::V_ADD_CO_U32_e32:
5365 case AMDGPU::V_SUB_CO_U32_e32:
5366 case AMDGPU::V_SUBREV_CO_U32_e32: {
5367 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
5368 const DebugLoc &DL = MI.getDebugLoc();
5369 unsigned Opc = MI.getOpcode();
5370
5371 bool NeedClampOperand = false;
5372 if (TII->pseudoToMCOpcode(Opc) == -1) {
5373 Opc = AMDGPU::getVOPe64(Opc);
5374 NeedClampOperand = true;
5375 }
5376
5377 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
5378 if (TII->isVOP3(*I)) {
5379 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5380 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5381 I.addReg(TRI->getVCC(), RegState::Define);
5382 }
5383 I.add(MI.getOperand(1))
5384 .add(MI.getOperand(2));
5385 if (NeedClampOperand)
5386 I.addImm(0); // clamp bit for e64 encoding
5387
5388 TII->legalizeOperands(*I);
5389
5390 MI.eraseFromParent();
5391 return BB;
5392 }
5393 case AMDGPU::V_ADDC_U32_e32:
5394 case AMDGPU::V_SUBB_U32_e32:
5395 case AMDGPU::V_SUBBREV_U32_e32:
5396 // These instructions have an implicit use of vcc which counts towards the
5397 // constant bus limit.
5398 TII->legalizeOperands(MI);
5399 return BB;
5400 case AMDGPU::DS_GWS_INIT:
5401 case AMDGPU::DS_GWS_SEMA_BR:
5402 case AMDGPU::DS_GWS_BARRIER:
5403 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
5404 [[fallthrough]];
5405 case AMDGPU::DS_GWS_SEMA_V:
5406 case AMDGPU::DS_GWS_SEMA_P:
5407 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5408 // An s_waitcnt 0 is required to be the instruction immediately following.
5409 if (getSubtarget()->hasGWSAutoReplay()) {
5411 return BB;
5412 }
5413
5414 return emitGWSMemViolTestLoop(MI, BB);
5415 case AMDGPU::S_SETREG_B32: {
5416 // Try to optimize cases that only set the denormal mode or rounding mode.
5417 //
5418 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
5419 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
5420 // instead.
5421 //
5422 // FIXME: This could be predicated on the immediate, but tablegen doesn't
5423 // allow a no-side-effect instruction in the output of a
5424 // side-effecting pattern.
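//
// For example (illustrative): if the source operand is a move of the constant
// 0x23 into MODE bits [7:0], the full-mask path below emits
//   s_round_mode 0x3
//   s_denorm_mode 0x2
// instead of the s_setreg_b32.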
5425 auto [ID, Offset, Width] =
5426 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
5427 if (ID != AMDGPU::Hwreg::ID_MODE)
5428 return BB;
5429
5430 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5431 const unsigned SetMask = WidthMask << Offset;
5432
5433 if (getSubtarget()->hasDenormModeInst()) {
5434 unsigned SetDenormOp = 0;
5435 unsigned SetRoundOp = 0;
5436
5437 // The dedicated instructions can only set the whole denorm or round mode
5438 // at once, not a subset of bits in either.
5439 if (SetMask ==
5440 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
5441 // If this fully sets both the round and denorm mode, emit the two
5442 // dedicated instructions for these.
5443 SetRoundOp = AMDGPU::S_ROUND_MODE;
5444 SetDenormOp = AMDGPU::S_DENORM_MODE;
5445 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
5446 SetRoundOp = AMDGPU::S_ROUND_MODE;
5447 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
5448 SetDenormOp = AMDGPU::S_DENORM_MODE;
5449 }
5450
5451 if (SetRoundOp || SetDenormOp) {
5453 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
5454 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5455 unsigned ImmVal = Def->getOperand(1).getImm();
5456 if (SetRoundOp) {
5457 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
5458 .addImm(ImmVal & 0xf);
5459
5460 // If we also have the denorm mode, get just the denorm mode bits.
5461 ImmVal >>= 4;
5462 }
5463
5464 if (SetDenormOp) {
5465 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
5466 .addImm(ImmVal & 0xf);
5467 }
5468
5469 MI.eraseFromParent();
5470 return BB;
5471 }
5472 }
5473 }
5474
5475 // If only FP bits are touched, use the no-side-effects pseudo.
5476 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
5477 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
5478 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
5479
5480 return BB;
5481 }
5482 case AMDGPU::S_INVERSE_BALLOT_U32:
5483 case AMDGPU::S_INVERSE_BALLOT_U64: {
5485 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5486 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5487 const DebugLoc &DL = MI.getDebugLoc();
5488 const Register DstReg = MI.getOperand(0).getReg();
5489 Register MaskReg = MI.getOperand(1).getReg();
5490
5491 const bool IsVALU = TRI->isVectorRegister(MRI, MaskReg);
5492
5493 if (IsVALU) {
5494 MaskReg = TII->readlaneVGPRToSGPR(MaskReg, MI, MRI);
5495 }
5496
5497 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::COPY), DstReg).addReg(MaskReg);
5498 MI.eraseFromParent();
5499 return BB;
5500 }
5501 case AMDGPU::ENDPGM_TRAP: {
5502 const DebugLoc &DL = MI.getDebugLoc();
5503 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
5504 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
5505 MI.addOperand(MachineOperand::CreateImm(0));
5506 return BB;
5507 }
5508
5509 // We need a block split to make the real endpgm a terminator. We also don't
5510 // want to break phis in successor blocks, so we can't just delete to the
5511 // end of the block.
5512
5513 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
5514 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
5515 MF->push_back(TrapBB);
5516 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
5517 .addImm(0);
5518 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5519 .addMBB(TrapBB);
5520
5521 BB->addSuccessor(TrapBB);
5522 MI.eraseFromParent();
5523 return SplitBB;
5524 }
5525 case AMDGPU::SIMULATED_TRAP: {
5526 assert(Subtarget->hasPrivEnabledTrap2NopBug());
5528 MachineBasicBlock *SplitBB =
5529 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
5530 MI.eraseFromParent();
5531 return SplitBB;
5532 }
5533 default:
5534 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
5535 if (!MI.mayStore())
5537 return BB;
5538 }
5540 }
5541}
5542
5544 // This currently forces unfolding various combinations of fsub into fma with
5545 // free fneg'd operands. As long as we have fast FMA (controlled by
5546 // isFMAFasterThanFMulAndFAdd), we should perform these.
5547
5548 // When fma is quarter rate, for f64 where add / sub are at best half rate,
5549 // most of these combines appear to be cycle neutral but save on instruction
5550 // count / code size.
5551 return true;
5552}
5553
5555
5557 EVT VT) const {
5558 if (!VT.isVector()) {
5559 return MVT::i1;
5560 }
5561 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
5562}
5563
5565 // TODO: Should i16 be used always if legal? For now it would force VALU
5566 // shifts.
5567 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5568}
5569
5571 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
5572 ? Ty.changeElementSize(16)
5573 : Ty.changeElementSize(32);
5574}
5575
5576 // Answering this is somewhat tricky and depends on the specific device, since
5577 // different devices have different rates for fma and for f64 operations.
5578//
5579// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
5580// regardless of which device (although the number of cycles differs between
5581// devices), so it is always profitable for f64.
5582//
5583// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
5584// only on full rate devices. Normally, we should prefer selecting v_mad_f32
5585// which we can always do even without fused FP ops since it returns the same
5586// result as the separate operations and since it is always full
5587// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
5588// however does not support denormals, so we do report fma as faster if we have
5589// a fast fma device and require denormals.
5590//
5592 EVT VT) const {
5593 VT = VT.getScalarType();
5594
5595 switch (VT.getSimpleVT().SimpleTy) {
5596 case MVT::f32: {
5597 // If mad is not available this depends only on if f32 fma is full rate.
5598 if (!Subtarget->hasMadMacF32Insts())
5599 return Subtarget->hasFastFMAF32();
5600
5601 // Otherwise f32 mad is always full rate and returns the same result as
5602 // the separate operations so should be preferred over fma.
5603 // However, mad does not support denormals.
5604 if (!denormalModeIsFlushAllF32(MF))
5605 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5606
5607 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
5608 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5609 }
5610 case MVT::f64:
5611 return true;
5612 case MVT::f16:
5613 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
5614 default:
5615 break;
5616 }
5617
5618 return false;
5619}
5620
5622 LLT Ty) const {
5623 switch (Ty.getScalarSizeInBits()) {
5624 case 16:
5625 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
5626 case 32:
5627 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
5628 case 64:
5629 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
5630 default:
5631 break;
5632 }
5633
5634 return false;
5635}
5636
5638 if (!Ty.isScalar())
5639 return false;
5640
5641 if (Ty.getScalarSizeInBits() == 16)
5642 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
5643 if (Ty.getScalarSizeInBits() == 32)
5644 return Subtarget->hasMadMacF32Insts() &&
5645 denormalModeIsFlushAllF32(*MI.getMF());
5646
5647 return false;
5648}
5649
5651 const SDNode *N) const {
5652 // TODO: Check future ftz flag
5653 // v_mad_f32/v_mac_f32 do not support denormals.
5654 EVT VT = N->getValueType(0);
5655 if (VT == MVT::f32)
5656 return Subtarget->hasMadMacF32Insts() &&
5657 denormalModeIsFlushAllF32(DAG.getMachineFunction());
5658 if (VT == MVT::f16) {
5659 return Subtarget->hasMadF16() &&
5660 denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
5661 }
5662
5663 return false;
5664}
5665
5666//===----------------------------------------------------------------------===//
5667// Custom DAG Lowering Operations
5668//===----------------------------------------------------------------------===//
5669
5670// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5671// wider vector type is legal.
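//
// For example (conceptually): an fneg of a v8f16 value is rebuilt here as
//   concat_vectors (fneg lo_v4f16), (fneg hi_v4f16)
// so LegalizeDAG sees two already-legal halves instead of scalarizing.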
5673 SelectionDAG &DAG) const {
5674 unsigned Opc = Op.getOpcode();
5675 EVT VT = Op.getValueType();
5676 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5677 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5678 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5679 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5680
5681 SDValue Lo, Hi;
5682 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
5683
5684 SDLoc SL(Op);
5685 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
5686 Op->getFlags());
5687 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
5688 Op->getFlags());
5689
5690 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5691}
5692
5693// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5694// wider vector type is legal.
5696 SelectionDAG &DAG) const {
5697 unsigned Opc = Op.getOpcode();
5698 EVT VT = Op.getValueType();
5699 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5700 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5701 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5702 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5703
5704 SDValue Lo0, Hi0;
5705 std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
5706 SDValue Lo1, Hi1;
5707 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
5708
5709 SDLoc SL(Op);
5710
5711 SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
5712 Op->getFlags());
5713 SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
5714 Op->getFlags());
5715
5716 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5717}
5718
5720 SelectionDAG &DAG) const {
5721 unsigned Opc = Op.getOpcode();
5722 EVT VT = Op.getValueType();
5723 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
5724 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
5725 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5726 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
5727 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
5728 VT == MVT::v32bf16);
5729
5730 SDValue Lo0, Hi0;
5731 SDValue Op0 = Op.getOperand(0);
5732 std::tie(Lo0, Hi0) = Op0.getValueType().isVector()
5733 ? DAG.SplitVectorOperand(Op.getNode(), 0)
5734 : std::pair(Op0, Op0);
5735 SDValue Lo1, Hi1;
5736 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
5737 SDValue Lo2, Hi2;
5738 std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2);
5739
5740 SDLoc SL(Op);
5741 auto ResVT = DAG.GetSplitDestVTs(VT);
5742
5743 SDValue OpLo = DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2,
5744 Op->getFlags());
5745 SDValue OpHi = DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2,
5746 Op->getFlags());
5747
5748 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5749}
5750
5751
5753 switch (Op.getOpcode()) {
5754 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
5755 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
5756 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
5757 case ISD::LOAD: {
5758 SDValue Result = LowerLOAD(Op, DAG);
5759 assert((!Result.getNode() ||
5760 Result.getNode()->getNumValues() == 2) &&
5761 "Load should return a value and a chain");
5762 return Result;
5763 }
5764 case ISD::FSQRT: {
5765 EVT VT = Op.getValueType();
5766 if (VT == MVT::f32)
5767 return lowerFSQRTF32(Op, DAG);
5768 if (VT == MVT::f64)
5769 return lowerFSQRTF64(Op, DAG);
5770 return SDValue();
5771 }
5772 case ISD::FSIN:
5773 case ISD::FCOS:
5774 return LowerTrig(Op, DAG);
5775 case ISD::SELECT: return LowerSELECT(Op, DAG);
5776 case ISD::FDIV: return LowerFDIV(Op, DAG);
5777 case ISD::FFREXP: return LowerFFREXP(Op, DAG);
5778 case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
5779 case ISD::STORE: return LowerSTORE(Op, DAG);
5780 case ISD::GlobalAddress: {
5783 return LowerGlobalAddress(MFI, Op, DAG);
5784 }
5785 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
5786 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
5787 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
5788 case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
5790 return lowerINSERT_SUBVECTOR(Op, DAG);
5792 return lowerINSERT_VECTOR_ELT(Op, DAG);
5794 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
5796 return lowerVECTOR_SHUFFLE(Op, DAG);
5798 return lowerSCALAR_TO_VECTOR(Op, DAG);
5799 case ISD::BUILD_VECTOR:
5800 return lowerBUILD_VECTOR(Op, DAG);
5801 case ISD::FP_ROUND:
5803 return lowerFP_ROUND(Op, DAG);
5804 case ISD::FPTRUNC_ROUND: {
5805 unsigned Opc;
5806 SDLoc DL(Op);
5807
5808 if (Op.getOperand(0)->getValueType(0) != MVT::f32)
5809 return SDValue();
5810
5811 // Get the rounding mode from the last operand
5812 int RoundMode = Op.getConstantOperandVal(1);
5813 if (RoundMode == (int)RoundingMode::TowardPositive)
5815 else if (RoundMode == (int)RoundingMode::TowardNegative)
5817 else
5818 return SDValue();
5819
5820 return DAG.getNode(Opc, DL, Op.getNode()->getVTList(), Op->getOperand(0));
5821 }
5822 case ISD::TRAP:
5823 return lowerTRAP(Op, DAG);
5824 case ISD::DEBUGTRAP:
5825 return lowerDEBUGTRAP(Op, DAG);
5826 case ISD::ABS:
5827 case ISD::FABS:
5828 case ISD::FNEG:
5829 case ISD::FCANONICALIZE:
5830 case ISD::BSWAP:
5831 return splitUnaryVectorOp(Op, DAG);
5832 case ISD::FMINNUM:
5833 case ISD::FMAXNUM:
5834 return lowerFMINNUM_FMAXNUM(Op, DAG);
5835 case ISD::FLDEXP:
5836 case ISD::STRICT_FLDEXP:
5837 return lowerFLDEXP(Op, DAG);
5838 case ISD::FMA:
5839 return splitTernaryVectorOp(Op, DAG);
5840 case ISD::FP_TO_SINT:
5841 case ISD::FP_TO_UINT:
5842 return LowerFP_TO_INT(Op, DAG);
5843 case ISD::SHL:
5844 case ISD::SRA:
5845 case ISD::SRL:
5846 case ISD::ADD:
5847 case ISD::SUB:
5848 case ISD::SMIN:
5849 case ISD::SMAX:
5850 case ISD::UMIN:
5851 case ISD::UMAX:
5852 case ISD::FADD:
5853 case ISD::FMUL:
5854 case ISD::FMINNUM_IEEE:
5855 case ISD::FMAXNUM_IEEE:
5856 case ISD::FMINIMUM:
5857 case ISD::FMAXIMUM:
5858 case ISD::UADDSAT:
5859 case ISD::USUBSAT:
5860 case ISD::SADDSAT:
5861 case ISD::SSUBSAT:
5862 return splitBinaryVectorOp(Op, DAG);
5863 case ISD::MUL:
5864 return lowerMUL(Op, DAG);
5865 case ISD::SMULO:
5866 case ISD::UMULO:
5867 return lowerXMULO(Op, DAG);
5868 case ISD::SMUL_LOHI:
5869 case ISD::UMUL_LOHI:
5870 return lowerXMUL_LOHI(Op, DAG);
5872 return LowerDYNAMIC_STACKALLOC(Op, DAG);
5873 case ISD::STACKSAVE:
5874 return LowerSTACKSAVE(Op, DAG);
5875 case ISD::GET_ROUNDING:
5876 return lowerGET_ROUNDING(Op, DAG);
5877 case ISD::SET_ROUNDING:
5878 return lowerSET_ROUNDING(Op, DAG);
5879 case ISD::PREFETCH:
5880 return lowerPREFETCH(Op, DAG);
5881 case ISD::FP_EXTEND:
5883 return lowerFP_EXTEND(Op, DAG);
5884 case ISD::GET_FPENV:
5885 return lowerGET_FPENV(Op, DAG);
5886 case ISD::SET_FPENV:
5887 return lowerSET_FPENV(Op, DAG);
5888 }
5889 return SDValue();
5890}
5891
5892// Used for D16: Casts the result of an instruction into the right vector,
5893// packs values if loads return unpacked values.
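//
// For example (illustrative): with unpacked D16, a v3f16 load comes back from
// the instruction as a vector of 32-bit elements; each element is truncated to
// i16, an undef lane is appended to widen the illegal v3i16 to v4i16, and the
// result is bitcast back to v4f16.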
5895 const SDLoc &DL,
5896 SelectionDAG &DAG, bool Unpacked) {
5897 if (!LoadVT.isVector())
5898 return Result;
5899
5900 // Cast back to the original packed type or to a larger type that is a
5901 // multiple of 32 bits for D16. Widening the return type is required for
5902 // legalization.
5903 EVT FittingLoadVT = LoadVT;
5904 if ((LoadVT.getVectorNumElements() % 2) == 1) {
5905 FittingLoadVT =
5907 LoadVT.getVectorNumElements() + 1);
5908 }
5909
5910 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
5911 // Truncate to v2i16/v4i16.
5912 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
5913
5914 // Workaround legalizer not scalarizing truncate after vector op
5915 // legalization but not creating intermediate vector trunc.
5917 DAG.ExtractVectorElements(Result, Elts);
5918 for (SDValue &Elt : Elts)
5919 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
5920
5921 // Pad illegal v1i16/v3f16 to v4i16
5922 if ((LoadVT.getVectorNumElements() % 2) == 1)
5923 Elts.push_back(DAG.getUNDEF(MVT::i16));
5924
5925 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
5926
5927 // Bitcast to original type (v2f16/v4f16).
5928 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
5929 }
5930
5931 // Cast back to the original packed type.
5932 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
5933}
5934
5935SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
5936 MemSDNode *M,
5937 SelectionDAG &DAG,
5939 bool IsIntrinsic) const {
5940 SDLoc DL(M);
5941
5942 bool Unpacked = Subtarget->hasUnpackedD16VMem();
5943 EVT LoadVT = M->getValueType(0);
5944
5945 EVT EquivLoadVT = LoadVT;
5946 if (LoadVT.isVector()) {
5947 if (Unpacked) {
5948 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
5949 LoadVT.getVectorNumElements());
5950 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
5951 // Widen v3f16 to legal type
5952 EquivLoadVT =
5954 LoadVT.getVectorNumElements() + 1);
5955 }
5956 }
5957
5958 // Change from v4f16/v2f16 to EquivLoadVT.
5959 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
5960
5962 SDValue Load = DAG.getMemIntrinsicNode(
5963 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
5964 VTList, Ops, M->getMemoryVT(),
5965 M->getMemOperand());
5966
5967 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
5968
5969 return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
5970}
5971
5972SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
5973 SelectionDAG &DAG,
5974 ArrayRef<SDValue> Ops) const {
5975 SDLoc DL(M);
5976 EVT LoadVT = M->getValueType(0);
5977 EVT EltType = LoadVT.getScalarType();
5978 EVT IntVT = LoadVT.changeTypeToInteger();
5979
5980 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
5981
5982 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
5983 bool IsTFE = M->getNumValues() == 3;
5984
5985 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
5986 : AMDGPUISD::BUFFER_LOAD_FORMAT)
5987 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
5988 : AMDGPUISD::BUFFER_LOAD;
5989
5990 if (IsD16) {
5991 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
5992 }
5993
5994 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
5995 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
5996 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
5997 IsTFE);
5998
5999 if (isTypeLegal(LoadVT)) {
6000 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6001 M->getMemOperand(), DAG);
6002 }
6003
6004 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
6005 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6006 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
6007 M->getMemOperand(), DAG);
6008 return DAG.getMergeValues(
6009 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
6010 DL);
6011}
6012
6014 SDNode *N, SelectionDAG &DAG) {
6015 EVT VT = N->getValueType(0);
6016 unsigned CondCode = N->getConstantOperandVal(3);
6017 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
6018 return DAG.getUNDEF(VT);
6019
6020 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6021
6022 SDValue LHS = N->getOperand(1);
6023 SDValue RHS = N->getOperand(2);
6024
6025 SDLoc DL(N);
6026
6027 EVT CmpVT = LHS.getValueType();
6028 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
6029 unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
6031 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
6032 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
6033 }
6034
6035 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
6036
6037 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6038 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6039
6040 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
6041 DAG.getCondCode(CCOpcode));
6042 if (VT.bitsEq(CCVT))
6043 return SetCC;
6044 return DAG.getZExtOrTrunc(SetCC, DL, VT);
6045}
6046
6048 SDNode *N, SelectionDAG &DAG) {
6049 EVT VT = N->getValueType(0);
6050
6051 unsigned CondCode = N->getConstantOperandVal(3);
6052 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
6053 return DAG.getUNDEF(VT);
6054
6055 SDValue Src0 = N->getOperand(1);
6056 SDValue Src1 = N->getOperand(2);
6057 EVT CmpVT = Src0.getValueType();
6058 SDLoc SL(N);
6059
6060 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
6061 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6062 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6063 }
6064
6065 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
6066 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
6067 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6068 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6069 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0,
6070 Src1, DAG.getCondCode(CCOpcode));
6071 if (VT.bitsEq(CCVT))
6072 return SetCC;
6073 return DAG.getZExtOrTrunc(SetCC, SL, VT);
6074}
6075
6077 SelectionDAG &DAG) {
6078 EVT VT = N->getValueType(0);
6079 SDValue Src = N->getOperand(1);
6080 SDLoc SL(N);
6081
6082 if (Src.getOpcode() == ISD::SETCC) {
6083 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
6084 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
6085 Src.getOperand(1), Src.getOperand(2));
6086 }
6087 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
6088 // (ballot 0) -> 0
6089 if (Arg->isZero())
6090 return DAG.getConstant(0, SL, VT);
6091
6092 // (ballot 1) -> EXEC/EXEC_LO
6093 if (Arg->isOne()) {
6094 Register Exec;
6095 if (VT.getScalarSizeInBits() == 32)
6096 Exec = AMDGPU::EXEC_LO;
6097 else if (VT.getScalarSizeInBits() == 64)
6098 Exec = AMDGPU::EXEC;
6099 else
6100 return SDValue();
6101
6102 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
6103 }
6104 }
6105
6106 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
6107 // ISD::SETNE)
6108 return DAG.getNode(
6109 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
6110 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
6111}
6112
6114 SelectionDAG &DAG) {
6115 EVT VT = N->getValueType(0);
6116 unsigned ValSize = VT.getSizeInBits();
6117 unsigned IID = N->getConstantOperandVal(0);
6118 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6119 IID == Intrinsic::amdgcn_permlanex16;
6120 SDLoc SL(N);
6121 MVT IntVT = MVT::getIntegerVT(ValSize);
6122
6123 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
6124 SDValue Src2, MVT ValT) -> SDValue {
6126 switch (IID) {
6127 case Intrinsic::amdgcn_permlane16:
6128 case Intrinsic::amdgcn_permlanex16:
6129 Operands.push_back(N->getOperand(6));
6130 Operands.push_back(N->getOperand(5));
6131 Operands.push_back(N->getOperand(4));
6132 [[fallthrough]];
6133 case Intrinsic::amdgcn_writelane:
6134 Operands.push_back(Src2);
6135 [[fallthrough]];
6136 case Intrinsic::amdgcn_readlane:
6137 Operands.push_back(Src1);
6138 [[fallthrough]];
6139 case Intrinsic::amdgcn_readfirstlane:
6140 case Intrinsic::amdgcn_permlane64:
6141 Operands.push_back(Src0);
6142 break;
6143 default:
6144 llvm_unreachable("unhandled lane op");
6145 }
6146
6147 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
6148 std::reverse(Operands.begin(), Operands.end());
6149
6150 if (SDNode *GL = N->getGluedNode()) {
6151 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6152 GL = GL->getOperand(0).getNode();
6153 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6154 SDValue(GL, 0)));
6155 }
6156
6157 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
6158 };
6159
6160 SDValue Src0 = N->getOperand(1);
6161 SDValue Src1, Src2;
6162 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6163 IsPermLane16) {
6164 Src1 = N->getOperand(2);
6165 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
6166 Src2 = N->getOperand(3);
6167 }
6168
6169 if (ValSize == 32) {
6170 // Already legal
6171 return SDValue();
6172 }
6173
6174 if (ValSize < 32) {
6175 bool IsFloat = VT.isFloatingPoint();
6176 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
6177 SL, MVT::i32);
6178
6179 if (IsPermLane16) {
6180 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
6181 SL, MVT::i32);
6182 }
6183
6184 if (IID == Intrinsic::amdgcn_writelane) {
6185 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
6186 SL, MVT::i32);
6187 }
6188
6189 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
6190 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
6191 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
6192 }
6193
6194 if (ValSize % 32 != 0)
6195 return SDValue();
6196
6197 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
6198 EVT VT = N->getValueType(0);
6199 unsigned NE = VT.getVectorNumElements();
6200 EVT EltVT = VT.getVectorElementType();
6202 unsigned NumOperands = N->getNumOperands();
6203 SmallVector<SDValue, 4> Operands(NumOperands);
6204 SDNode *GL = N->getGluedNode();
6205
6206 // only handle convergencectrl_glue
6208
6209 for (unsigned i = 0; i != NE; ++i) {
6210 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
6211 ++j) {
6212 SDValue Operand = N->getOperand(j);
6213 EVT OperandVT = Operand.getValueType();
6214 if (OperandVT.isVector()) {
6215 // A vector operand; extract a single element.
6216 EVT OperandEltVT = OperandVT.getVectorElementType();
6217 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
6218 Operand, DAG.getVectorIdxConstant(i, SL));
6219 } else {
6220 // A scalar operand; just use it as is.
6221 Operands[j] = Operand;
6222 }
6223 }
6224
6225 if (GL)
6226 Operands[NumOperands - 1] =
6227 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6228 SDValue(GL->getOperand(0).getNode(), 0));
6229
6230 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
6231 }
6232
6233 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
6234 return DAG.getBuildVector(VecVT, SL, Scalars);
6235 };
6236
6237 if (VT.isVector()) {
6238 switch (MVT::SimpleValueType EltTy =
6240 case MVT::i32:
6241 case MVT::f32: {
6242 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
6243 return unrollLaneOp(LaneOp.getNode());
6244 }
6245 case MVT::i16:
6246 case MVT::f16:
6247 case MVT::bf16: {
6248 MVT SubVecVT = MVT::getVectorVT(EltTy, 2);
6250 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
6251 for (unsigned i = 0, EltIdx = 0; i < ValSize / 32; i++) {
6252 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
6253 DAG.getConstant(EltIdx, SL, MVT::i32));
6254
6255 if (IsPermLane16)
6256 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
6257 DAG.getConstant(EltIdx, SL, MVT::i32));
6258
6259 if (IID == Intrinsic::amdgcn_writelane)
6260 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
6261 DAG.getConstant(EltIdx, SL, MVT::i32));
6262
6263 Pieces.push_back(
6264 IsPermLane16
6265 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
6266 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
6267 EltIdx += 2;
6268 }
6269 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
6270 }
6271 default:
6272 // Handle all other cases by bitcasting to i32 vectors
6273 break;
6274 }
6275 }
6276
6277 MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
6278 Src0 = DAG.getBitcast(VecVT, Src0);
6279
6280 if (IsPermLane16)
6281 Src1 = DAG.getBitcast(VecVT, Src1);
6282
6283 if (IID == Intrinsic::amdgcn_writelane)
6284 Src2 = DAG.getBitcast(VecVT, Src2);
6285
6286 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
6287 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
6288 return DAG.getBitcast(VT, UnrolledLaneOp);
6289}
6290
6293 SelectionDAG &DAG) const {
6294 switch (N->getOpcode()) {
6296 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
6297 Results.push_back(Res);
6298 return;
6299 }
6301 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
6302 Results.push_back(Res);
6303 return;
6304 }
6306 unsigned IID = N->getConstantOperandVal(0);
6307 switch (IID) {
6308 case Intrinsic::amdgcn_make_buffer_rsrc:
6309 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
6310 return;
6311 case Intrinsic::amdgcn_cvt_pkrtz: {
6312 SDValue Src0 = N->getOperand(1);
6313 SDValue Src1 = N->getOperand(2);
6314 SDLoc SL(N);
6315 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
6316 Src0, Src1);
6317 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
6318 return;
6319 }
6320 case Intrinsic::amdgcn_cvt_pknorm_i16:
6321 case Intrinsic::amdgcn_cvt_pknorm_u16:
6322 case Intrinsic::amdgcn_cvt_pk_i16:
6323 case Intrinsic::amdgcn_cvt_pk_u16: {
6324 SDValue Src0 = N->getOperand(1);
6325 SDValue Src1 = N->getOperand(2);
6326 SDLoc SL(N);
6327 unsigned Opcode;
6328
6329 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6331 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6333 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6335 else
6337
6338 EVT VT = N->getValueType(0);
6339 if (isTypeLegal(VT))
6340 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
6341 else {
6342 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
6343 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
6344 }
6345 return;
6346 }
6347 case Intrinsic::amdgcn_s_buffer_load: {
6348 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
6349 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
6350 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
6351 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
6352 // s_buffer_load_i8.
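//
// For example (illustrative IR; exact operand types depend on the current
// intrinsic definition):
//   %b = call i8 @llvm.amdgcn.s.buffer.load.i8(ptr addrspace(8) %rsrc, i32 %off, i32 0)
//   %s = sext i8 %b to i32
// first selects s_buffer_load_u8; the sext is then folded into
// s_buffer_load_i8 by the combine mentioned above.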
6353 if (!Subtarget->hasScalarSubwordLoads())
6354 return;
6355 SDValue Op = SDValue(N, 0);
6356 SDValue Rsrc = Op.getOperand(1);
6357 SDValue Offset = Op.getOperand(2);
6358 SDValue CachePolicy = Op.getOperand(3);
6359 EVT VT = Op.getValueType();
6360 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
6361 SDLoc DL(Op);
6363 const DataLayout &DataLayout = DAG.getDataLayout();
6364 Align Alignment =
6370 VT.getStoreSize(), Alignment);
6371 SDValue LoadVal;
6372 if (!Offset->isDivergent()) {
6373 SDValue Ops[] = {Rsrc, // source register
6374 Offset, CachePolicy};
6375 SDValue BufferLoad =
6377 DAG.getVTList(MVT::i32), Ops, VT, MMO);
6378 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
6379 } else {
6380 SDValue Ops[] = {
6381 DAG.getEntryNode(), // Chain
6382 Rsrc, // rsrc
6383 DAG.getConstant(0, DL, MVT::i32), // vindex
6384 {}, // voffset
6385 {}, // soffset
6386 {}, // offset
6387 CachePolicy, // cachepolicy
6388 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
6389 };
6390 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
6391 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
6392 }
6393 Results.push_back(LoadVal);
6394 return;
6395 }
6396 }
6397 break;
6398 }
6400 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
6401 if (Res.getOpcode() == ISD::MERGE_VALUES) {
6402 // FIXME: Hacky
6403 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
6404 Results.push_back(Res.getOperand(I));
6405 }
6406 } else {
6407 Results.push_back(Res);
6408 Results.push_back(Res.getValue(1));
6409 }
6410 return;
6411 }
6412
6413 break;
6414 }
6415 case ISD::SELECT: {
6416 SDLoc SL(N);
6417 EVT VT = N->getValueType(0);
6418 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
6419 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
6420 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
6421
6422 EVT SelectVT = NewVT;
6423 if (NewVT.bitsLT(MVT::i32)) {
6424 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
6425 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
6426 SelectVT = MVT::i32;
6427 }
6428
6429 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
6430 N->getOperand(0), LHS, RHS);
6431
6432 if (NewVT != SelectVT)
6433 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
6434 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
6435 return;
6436 }
6437 case ISD::FNEG: {
6438 if (N->getValueType(0) != MVT::v2f16)
6439 break;
6440
6441 SDLoc SL(N);
6442 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6443
6444 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
6445 BC,
6446 DAG.getConstant(0x80008000, SL, MVT::i32));
6447 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6448 return;
6449 }
6450 case ISD::FABS: {
6451 if (N->getValueType(0) != MVT::v2f16)
6452 break;
6453
6454 SDLoc SL(N);
6455 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6456
6457 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
6458 BC,
6459 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
6460 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6461 return;
6462 }
6463 case ISD::FSQRT: {
6464 if (N->getValueType(0) != MVT::f16)
6465 break;
6466 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
6467 break;
6468 }
6469 default:
6471 break;
6472 }
6473}
6474
6475/// Helper function for LowerBRCOND
6476static SDNode *findUser(SDValue Value, unsigned Opcode) {
6477
6478 SDNode *Parent = Value.getNode();
6479 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
6480 I != E; ++I) {
6481
6482 if (I.getUse().get() != Value)
6483 continue;
6484
6485 if (I->getOpcode() == Opcode)
6486 return *I;
6487 }
6488 return nullptr;
6489}
6490
6491unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
6492 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
6493 switch (Intr->getConstantOperandVal(1)) {
6494 case Intrinsic::amdgcn_if:
6495 return AMDGPUISD::IF;
6496 case Intrinsic::amdgcn_else:
6497 return AMDGPUISD::ELSE;
6498 case Intrinsic::amdgcn_loop:
6499 return AMDGPUISD::LOOP;
6500 case Intrinsic::amdgcn_end_cf:
6501 llvm_unreachable("should not occur");
6502 default:
6503 return 0;
6504 }
6505 }
6506
6507 // break, if_break, else_break are all only used as inputs to loop, not
6508 // directly as branch conditions.
6509 return 0;
6510}
6511
6513 const Triple &TT = getTargetMachine().getTargetTriple();
6517}
6518
6520 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
6521 return false;
6522
6523 // FIXME: Either avoid relying on address space here or change the default
6524 // address space for functions to avoid the explicit check.
6525 return (GV->getValueType()->isFunctionTy() ||
6528}
6529
6531 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
6532}
6533
6535 if (!GV->hasExternalLinkage())
6536 return true;
6537
6538 const auto OS = getTargetMachine().getTargetTriple().getOS();
6539 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
6540}
6541
6542 /// This transforms the control flow intrinsics to get the branch destination as
6543 /// the last parameter; it also switches the branch target with BR if the need arises.
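/// For example (roughly): a BRCOND whose condition comes from llvm.amdgcn.if
/// is rebuilt as an AMDGPUISD::IF node with the branch destination appended as
/// its final operand.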
6544SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
6545 SelectionDAG &DAG) const {
6546 SDLoc DL(BRCOND);
6547
6548 SDNode *Intr = BRCOND.getOperand(1).getNode();
6549 SDValue Target = BRCOND.getOperand(2);
6550 SDNode *BR = nullptr;
6551 SDNode *SetCC = nullptr;
6552
6553 if (Intr->getOpcode() == ISD::SETCC) {
6554 // As long as we negate the condition everything is fine
6555 SetCC = Intr;
6556 Intr = SetCC->getOperand(0).getNode();
6557
6558 } else {
6559 // Get the target from BR if we don't negate the condition
6560 BR = findUser(BRCOND, ISD::BR);
6561 assert(BR && "brcond missing unconditional branch user");
6562 Target = BR->getOperand(1);
6563 }
6564
6565 unsigned CFNode = isCFIntrinsic(Intr);
6566 if (CFNode == 0) {
6567 // This is a uniform branch so we don't need to legalize.
6568 return BRCOND;
6569 }
6570
6571 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
6572 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
6573
6574 assert(!SetCC ||
6575 (SetCC->getConstantOperandVal(1) == 1 &&
6576 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
6577 ISD::SETNE));
6578
6579 // operands of the new intrinsic call
6581 if (HaveChain)
6582 Ops.push_back(BRCOND.getOperand(0));
6583
6584 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
6585 Ops.push_back(Target);
6586
6587 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
6588
6589 // build the new intrinsic call
6590 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
6591
6592 if (!HaveChain) {
6593 SDValue Ops[] = {
6594 SDValue(Result, 0),
6595 BRCOND.getOperand(0)
6596 };
6597
6598 Result = DAG.getMergeValues(Ops, DL).getNode();
6599 }
6600
6601 if (BR) {
6602 // Give the branch instruction our target
6603 SDValue Ops[] = {
6604 BR->getOperand(0),
6605 BRCOND.getOperand(2)
6606 };
6607 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
6608 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
6609 }
6610
6611 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
6612
6613 // Copy the intrinsic results to registers
6614 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
6615 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
6616 if (!CopyToReg)
6617 continue;
6618
6619 Chain = DAG.getCopyToReg(
6620 Chain, DL,
6621 CopyToReg->getOperand(1),
6622 SDValue(Result, i - 1),
6623 SDValue());
6624
6625 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
6626 }
6627
6628 // Remove the old intrinsic from the chain
6629 DAG.ReplaceAllUsesOfValueWith(
6630 SDValue(Intr, Intr->getNumValues() - 1),
6631 Intr->getOperand(0));
6632
6633 return Chain;
6634}
6635
6636SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
6637 SelectionDAG &DAG) const {
6638 MVT VT = Op.getSimpleValueType();
6639 SDLoc DL(Op);
6640 // Checking the depth
6641 if (Op.getConstantOperandVal(0) != 0)
6642 return DAG.getConstant(0, DL, VT);
6643
6646 // Check for kernel and shader functions
6647 if (Info->isEntryFunction())
6648 return DAG.getConstant(0, DL, VT);
6649
6650 MachineFrameInfo &MFI = MF.getFrameInfo();
6651 // There is a call to @llvm.returnaddress in this function
6652 MFI.setReturnAddressIsTaken(true);
6653
6655 // Get the return address reg and mark it as an implicit live-in
6656 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF), getRegClassFor(VT, Op.getNode()->isDivergent()));
6657
6658 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
6659}
6660
6661SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG,
6662 SDValue Op,
6663 const SDLoc &DL,
6664 EVT VT) const {
6665 return Op.getValueType().bitsLE(VT) ?
6666 DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
6667 DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
6668 DAG.getTargetConstant(0, DL, MVT::i32));
6669}
6670
6671SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
6672 assert(Op.getValueType() == MVT::f16 &&
6673 "Do not know how to custom lower FP_ROUND for non-f16 type");
6674
6675 SDValue Src = Op.getOperand(0);
6676 EVT SrcVT = Src.getValueType();
6677 if (SrcVT != MVT::f64)
6678 return Op;
6679
6680 // TODO: Handle strictfp
6681 if (Op.getOpcode() != ISD::FP_ROUND)
6682 return Op;
6683
6684 SDLoc DL(Op);
6685
6686 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
6687 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
6688 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
6689}
6690
6691SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
6692 SelectionDAG &DAG) const {
6693 EVT VT = Op.getValueType();
6694 const MachineFunction &MF = DAG.getMachineFunction();
6696 bool IsIEEEMode = Info->getMode().IEEE;
6697
6698 // FIXME: Assert during selection that this is only selected for
6699 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
6700 // mode functions, but this happens to be OK since it's only done in cases
6701 // where there is known no sNaN.
6702 if (IsIEEEMode)
6703 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
6704
6705 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
6706 VT == MVT::v16bf16)
6707 return splitBinaryVectorOp(Op, DAG);
6708 return Op;
6709}
6710
6711SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
6712 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
6713 EVT VT = Op.getValueType();
6714 assert(VT == MVT::f16);
6715
6716 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
6717 EVT ExpVT = Exp.getValueType();
6718 if (ExpVT == MVT::i16)
6719 return Op;
6720
6721 SDLoc DL(Op);
6722
6723 // Correct the exponent type for f16 to i16.
6724 // Clamp the range of the exponent to the instruction's range.
6725
6726 // TODO: This should be a generic narrowing legalization, and can easily be
6727 // done for GlobalISel.
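//
// For example (illustrative): ldexp(half %x, i32 100000) has its exponent
// clamped to 32767 before the truncation to i16; the clamped exponent still
// overflows any f16 result to infinity, so the observable result is unchanged.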
6728
6729 SDValue MinExp = DAG.getConstant(minIntN(16), DL, ExpVT);
6730 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
6731
6732 SDValue MaxExp = DAG.getConstant(maxIntN(16), DL, ExpVT);
6733 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
6734
6735 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
6736
6737 if (IsStrict) {
6738 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
6739 {Op.getOperand(0), Op.getOperand(1), TruncExp});
6740 }
6741
6742 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
6743}
6744
6745// Custom lowering for vector multiplications and s_mul_u64.
6746SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
6747 EVT VT = Op.getValueType();
6748
6749 // Split vector operands.
6750 if (VT.isVector())
6751 return splitBinaryVectorOp(Op, DAG);
6752
6753 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
6754
6755 // There are four ways to lower s_mul_u64:
6756 //
6757 // 1. If all the operands are uniform, then we lower it as it is.
6758 //
6759 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
6760 // multiplications because there is not a vector equivalent of s_mul_u64.
6761 //
6762 // 3. If the cost model decides that it is more efficient to use vector
6763 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
6764 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
6765 //
6766 // 4. If the cost model decides to use vector registers and both of the
6767 // operands are zero-extended/sign-extended from 32-bits, then we split the
6768 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
6769 // possible to check if the operands are zero-extended or sign-extended in
6770 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
6771 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
6772 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
6773 // If the cost model decides that we have to use vector registers, then
6774 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits s_mul_u64_u32/
6775 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
6776 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
6777 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
6778 // SIInstrInfo.cpp .
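//
// For example (illustrative): for a uniform
//   mul i64 (zext i32 %a to i64), (zext i32 %b to i64)
// both operands have at least 32 known leading zero bits, so the node is
// rewritten below to S_MUL_U64_U32_PSEUDO.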
6779
6780 if (Op->isDivergent())
6781 return SDValue();
6782
6783 SDValue Op0 = Op.getOperand(0);
6784 SDValue Op1 = Op.getOperand(1);
6785 // If all the operands are zero-extended to 32 bits, then we replace s_mul_u64
6786 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
6787 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
6788 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
6789 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
6790 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
6791 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
6792 SDLoc SL(Op);
6793 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
6794 return SDValue(
6795 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
6796 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
6797 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
6798 if (Op0SignBits >= 33 && Op1SignBits >= 33)
6799 return SDValue(
6800 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
6801 // If all the operands are uniform, then we lower s_mul_u64 as it is.
6802 return Op;
6803}
6804
6805SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
6806 EVT VT = Op.getValueType();
6807 SDLoc SL(Op);
6808 SDValue LHS = Op.getOperand(0);
6809 SDValue RHS = Op.getOperand(1);
6810 bool isSigned = Op.getOpcode() == ISD::SMULO;
6811
6812 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
6813 const APInt &C = RHSC->getAPIntValue();
6814 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
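// e.g. (illustrative): umulo(x, 8) -> { x << 3, ((x << 3) >> 3) != x }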
6815 if (C.isPowerOf2()) {
6816 // smulo(x, signed_min) is same as umulo(x, signed_min).
6817 bool UseArithShift = isSigned && !C.isMinSignedValue();
6818 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
6819 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
6820 SDValue Overflow = DAG.getSetCC(SL, MVT::i1,
6821 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL,
6822 SL, VT, Result, ShiftAmt),
6823 LHS, ISD::SETNE);
6824 return DAG.getMergeValues({ Result, Overflow }, SL);
6825 }
6826 }
6827
6828 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
6829 SDValue Top = DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU,
6830 SL, VT, LHS, RHS);
6831
6832 SDValue Sign = isSigned
6833 ? DAG.getNode(ISD::SRA, SL, VT, Result,
6834 DAG.getConstant(VT.getScalarSizeInBits() - 1, SL, MVT::i32))
6835 : DAG.getConstant(0, SL, VT);
6836 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
6837
6838 return DAG.getMergeValues({ Result, Overflow }, SL);
6839}
6840
6841SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
6842 if (Op->isDivergent()) {
6843 // Select to V_MAD_[IU]64_[IU]32.
6844 return Op;
6845 }
6846 if (Subtarget->hasSMulHi()) {
6847 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
6848 return SDValue();
6849 }
6850 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
6851 // calculate the high part, so we might as well do the whole thing with
6852 // V_MAD_[IU]64_[IU]32.
6853 return Op;
6854}
6855
6856SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
6857 if (!Subtarget->isTrapHandlerEnabled() ||
6859 return lowerTrapEndpgm(Op, DAG);
6860
6861 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) :
6862 lowerTrapHsaQueuePtr(Op, DAG);
6863}
6864
6865SDValue SITargetLowering::lowerTrapEndpgm(
6866 SDValue Op, SelectionDAG &DAG) const {
6867 SDLoc SL(Op);
6868 SDValue Chain = Op.getOperand(0);
6869 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
6870}
6871
6872SDValue SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
6873 const SDLoc &DL, Align Alignment, ImplicitParameter Param) const {
6876 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
6878 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
6881}
6882
6883SDValue SITargetLowering::lowerTrapHsaQueuePtr(
6884 SDValue Op, SelectionDAG &DAG) const {
6885 SDLoc SL(Op);
6886 SDValue Chain = Op.getOperand(0);
6887
6888 SDValue QueuePtr;
6889 // For code object version 5, QueuePtr is passed through implicit kernarg.
6890 const Module *M = DAG.getMachineFunction().getFunction().getParent();
6892 QueuePtr =
6893 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
6894 } else {
6897 Register UserSGPR = Info->getQueuePtrUserSGPR();
6898
6899 if (UserSGPR == AMDGPU::NoRegister) {
6900 // We probably are in a function incorrectly marked with
6901 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
6902 // trap, so just use a null pointer.
6903 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
6904 } else {
6905 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
6906 MVT::i64);
6907 }
6908 }
6909
6910 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
6911 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
6912 QueuePtr, SDValue());
6913
6915 SDValue Ops[] = {
6916 ToReg,
6917 DAG.getTargetConstant(TrapID, SL, MVT::i16),
6918 SGPR01,
6919 ToReg.getValue(1)
6920 };
6921 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6922}
6923
6924SDValue SITargetLowering::lowerTrapHsa(
6925 SDValue Op, SelectionDAG &DAG) const {
6926 SDLoc SL(Op);
6927 SDValue Chain = Op.getOperand(0);
6928
6929 // We need to simulate the 's_trap 2' instruction on targets that run in
6930 // PRIV=1 (where it is treated as a nop).
6931 if (Subtarget->hasPrivEnabledTrap2NopBug())
6932 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
6933
6935 SDValue Ops[] = {
6936 Chain,
6937 DAG.getTargetConstant(TrapID, SL, MVT::i16)
6938 };
6939 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6940}
6941
6942SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
6943 SDLoc SL(Op);
6944 SDValue Chain = Op.getOperand(0);
6946
6947 if (!Subtarget->isTrapHandlerEnabled() ||
6950 "debugtrap handler not supported",
6951 Op.getDebugLoc(),
6952 DS_Warning);
6953 LLVMContext &Ctx = MF.getFunction().getContext();
6954 Ctx.diagnose(NoTrap);
6955 return Chain;
6956 }
6957
6959 SDValue Ops[] = {
6960 Chain,
6961 DAG.getTargetConstant(TrapID, SL, MVT::i16)
6962 };
6963 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6964}
6965
6966SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
6967 SelectionDAG &DAG) const {
6968 if (Subtarget->hasApertureRegs()) {
6969 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
6970 ? AMDGPU::SRC_SHARED_BASE
6971 : AMDGPU::SRC_PRIVATE_BASE;
6972 // Note: this feature (register) is broken. When used as a 32-bit operand,
6973 // it returns a wrong value (all zeroes?). The real value is in the upper 32
6974 // bits.
6975 //
6976 // To work around the issue, directly emit a 64 bit mov from this register
6977 // then extract the high bits. Note that this shouldn't even result in a
6978 // shift being emitted and simply become a pair of registers (e.g.):
6979 // s_mov_b64 s[6:7], src_shared_base
6980 // v_mov_b32_e32 v1, s7
6981 //
6982 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
6983 // coalescing would kick in and it would think it's okay to use the "HI"
6984 // subregister directly (instead of extracting the HI 32 bits) which is an
6985 // artificial (unusable) register.
6986 // Register TableGen definitions would need an overhaul to get rid of the
6987 // artificial "HI" aperture registers and prevent this kind of issue from
6988 // happening.
6989 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
6990 DAG.getRegister(ApertureRegNo, MVT::i64));
6991 return DAG.getNode(
6992 ISD::TRUNCATE, DL, MVT::i32,
6993 DAG.getNode(ISD::SRL, DL, MVT::i64,
6994 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
6995 }
6996
6997 // For code object version 5, private_base and shared_base are passed through
6998 // implicit kernargs.
6999 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7003 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
7004 }
7005
7008 Register UserSGPR = Info->getQueuePtrUserSGPR();
7009 if (UserSGPR == AMDGPU::NoRegister) {
7010 // We probably are in a function incorrectly marked with
7011 // amdgpu-no-queue-ptr. This is undefined.
7012 return DAG.getUNDEF(MVT::i32);
7013 }
7014
7015 SDValue QueuePtr = CreateLiveInRegister(
7016 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
7017
7018 // Offset into amd_queue_t for group_segment_aperture_base_hi /
7019 // private_segment_aperture_base_hi.
7020 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
7021
7022 SDValue Ptr =
7023 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
7024
7025 // TODO: Use custom target PseudoSourceValue.
7026 // TODO: We should use the value from the IR intrinsic call, but it might not
7027 // be available and how do we get it?
7029 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
7030 commonAlignment(Align(64), StructOffset),
7033}
7034
7035/// Return true if the value is a known valid address, such that a null check is
7036/// not necessary.
7038 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
7039 if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
7040 isa<BasicBlockSDNode>(Val))
7041 return true;
7042
7043 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
7044 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
7045
7046 // TODO: Search through arithmetic, handle arguments and loads
7047 // marked nonnull.
7048 return false;
7049}
7050
7051SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
7052 SelectionDAG &DAG) const {
7053 SDLoc SL(Op);
7054
7055 const AMDGPUTargetMachine &TM =
7056 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
7057
7058 unsigned DestAS, SrcAS;
7059 SDValue Src;
7060 bool IsNonNull = false;
7061 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
7062 SrcAS = ASC->getSrcAddressSpace();
7063 Src = ASC->getOperand(0);
7064 DestAS = ASC->getDestAddressSpace();
7065 } else {
7066 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
7067 Op.getConstantOperandVal(0) ==
7068 Intrinsic::amdgcn_addrspacecast_nonnull);
7069 Src = Op->getOperand(1);
7070 SrcAS = Op->getConstantOperandVal(2);
7071 DestAS = Op->getConstantOperandVal(3);
7072 IsNonNull = true;
7073 }
7074
7075 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
7076
7077 // flat -> local/private
7078 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
7079 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
7080 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
7081 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7082
7083 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7084 return Ptr;
7085
7086 unsigned NullVal = TM.getNullPointerValue(DestAS);
7087 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7088 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
7089
7090 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
7091 SegmentNullPtr);
7092 }
7093 }
7094
7095 // local/private -> flat
7096 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
7097 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
7098 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
7099
7100 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
7101 SDValue CvtPtr =
7102 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
7103 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
7104
7105 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7106 return CvtPtr;
7107
7108 unsigned NullVal = TM.getNullPointerValue(SrcAS);
7109 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7110
7111 SDValue NonNull
7112 = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
7113
7114 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
7115 FlatNullPtr);
7116 }
7117 }
7118
7119 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7120 Op.getValueType() == MVT::i64) {
7121 const SIMachineFunctionInfo *Info =
7122 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
7123 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
7124 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
7125 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
7126 }
7127
7128 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7129 Src.getValueType() == MVT::i64)
7130 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7131
7132 // global <-> flat are no-ops and never emitted.
7133
7134 const MachineFunction &MF = DAG.getMachineFunction();
7135 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
7136 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
7137 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
7138
7139 return DAG.getUNDEF(Op->getValueType(0));
7140}
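// Illustrative sketch (not part of the lowering) of the local -> flat case
// above, written as pseudo-C. 'aperture' stands for the i32 returned by
// getSegmentAperture, and the segment null value comes from
// TM.getNullPointerValue (typically -1 for local/private):
//
//   uint64_t flat = ((uint64_t)aperture << 32) | (uint32_t)src;  // BUILD_VECTOR + BITCAST
//   result = (src != SEGMENT_NULL) ? flat : 0;                   // SETCC + SELECT
//
// The flat -> local/private direction is the mirror image: truncate to 32
// bits and select the segment null value when the flat pointer is null.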
7141
7142// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
7143// the small vector and inserting them into the big vector. That is better than
7144// the default expansion of doing it via a stack slot. Even though the use of
7145// the stack slot would be optimized away afterwards, the stack slot itself
7146// remains.
7147SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
7148 SelectionDAG &DAG) const {
7149 SDValue Vec = Op.getOperand(0);
7150 SDValue Ins = Op.getOperand(1);
7151 SDValue Idx = Op.getOperand(2);
7152 EVT VecVT = Vec.getValueType();
7153 EVT InsVT = Ins.getValueType();
7154 EVT EltVT = VecVT.getVectorElementType();
7155 unsigned InsNumElts = InsVT.getVectorNumElements();
7156 unsigned IdxVal = Idx->getAsZExtVal();
7157 SDLoc SL(Op);
7158
7159 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
7160 // Insert 32-bit registers at a time.
7161 assert(InsNumElts % 2 == 0 && "expect legal vector types");
7162
7163 unsigned VecNumElts = VecVT.getVectorNumElements();
7164 EVT NewVecVT =
7165 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
7166 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
7167 : EVT::getVectorVT(*DAG.getContext(),
7168 MVT::i32, InsNumElts / 2);
7169
7170 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
7171 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
7172
7173 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
7174 SDValue Elt;
7175 if (InsNumElts == 2) {
7176 Elt = Ins;
7177 } else {
7178 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
7179 DAG.getConstant(I, SL, MVT::i32));
7180 }
7181 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
7182 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
7183 }
7184
7185 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
7186 }
7187
7188 for (unsigned I = 0; I != InsNumElts; ++I) {
7189 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
7190 DAG.getConstant(I, SL, MVT::i32));
7191 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
7192 DAG.getConstant(IdxVal + I, SL, MVT::i32));
7193 }
7194 return Vec;
7195}
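// A worked example of the 16-bit fast path above (types chosen for
// illustration): inserting a v2i16 subvector into a v8i16 vector at index 4
// bitcasts both sides to 32-bit element vectors and performs one 32-bit
// element insert:
//
//   v8i16 Vec, v2i16 Ins, IdxVal = 4
//     -> v4i32 Vec' = bitcast Vec;  i32 Ins' = bitcast Ins
//     -> insert_vector_elt Vec', Ins', IdxVal / 2 = 2
//     -> bitcast back to v8i16
//
// Odd insert indices, or non-16-bit elements, fall through to the
// per-element loop at the end of the function.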
7196
7197SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
7198 SelectionDAG &DAG) const {
7199 SDValue Vec = Op.getOperand(0);
7200 SDValue InsVal = Op.getOperand(1);
7201 SDValue Idx = Op.getOperand(2);
7202 EVT VecVT = Vec.getValueType();
7203 EVT EltVT = VecVT.getVectorElementType();
7204 unsigned VecSize = VecVT.getSizeInBits();
7205 unsigned EltSize = EltVT.getSizeInBits();
7206 SDLoc SL(Op);
7207
7208 // Specially handle the case of v4i16 with static indexing.
7209 unsigned NumElts = VecVT.getVectorNumElements();
7210 auto KIdx = dyn_cast<ConstantSDNode>(Idx);
7211 if (NumElts == 4 && EltSize == 16 && KIdx) {
7212 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
7213
7214 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7215 DAG.getConstant(0, SL, MVT::i32));
7216 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7217 DAG.getConstant(1, SL, MVT::i32));
7218
7219 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
7220 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
7221
7222 unsigned Idx = KIdx->getZExtValue();
7223 bool InsertLo = Idx < 2;
7224 SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
7225 InsertLo ? LoVec : HiVec,
7226 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
7227 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
7228
7229 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
7230
7231 SDValue Concat = InsertLo ?
7232 DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
7233 DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
7234
7235 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
7236 }
7237
7238 // Static indexing does not lower to stack access, and hence there is no need
7239 // for special custom lowering to avoid stack access.
7240 if (isa<ConstantSDNode>(Idx))
7241 return SDValue();
7242
7243 // Avoid stack access for dynamic indexing by custom lowering to
7244 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
7245
7246 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
7247
7248 MVT IntVT = MVT::getIntegerVT(VecSize);
7249
7250 // Convert vector index to bit-index and get the required bit mask.
7251 assert(isPowerOf2_32(EltSize));
7252 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
7253 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7254 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7255 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
7256 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
7257
7258 // 1. Create a congruent vector with the target value in each element.
7259 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
7260 DAG.getSplatBuildVector(VecVT, SL, InsVal));
7261
7262 // 2. Mask off all other indices except the required index within (1).
7263 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
7264
7265 // 3. Mask off the required index within the target vector.
7266 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7267 SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
7268 DAG.getNOT(SL, BFM, IntVT), BCVec);
7269
7270 // 4. Get (2) and (3) ORed into the target vector.
7271 SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
7272
7273 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
7274}
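// The dynamic-index path above is equivalent to the following scalar bit
// manipulation (a sketch; names are illustrative). For a v4i16 vector viewed
// as a 64-bit integer V, inserting value X at runtime index i:
//
//   uint64_t M = 0xffffULL << (i * 16);   // BFM: mask selecting the target lane
//   uint64_t S = splat16(X);              // X broadcast into every 16-bit lane
//   uint64_t R = (M & S) | (~M & V);      // BFI: blend the new lane into V
//
// which is what the v_bfm_b32 / v_bfi_b32 pattern mentioned in the comment
// expands to for the 32-bit case.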
7275
7276SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
7277 SelectionDAG &DAG) const {
7278 SDLoc SL(Op);
7279
7280 EVT ResultVT = Op.getValueType();
7281 SDValue Vec = Op.getOperand(0);
7282 SDValue Idx = Op.getOperand(1);
7283 EVT VecVT = Vec.getValueType();
7284 unsigned VecSize = VecVT.getSizeInBits();
7285 EVT EltVT = VecVT.getVectorElementType();
7286
7287 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
7288
7289 // Make sure we do any optimizations that will make it easier to fold
7290 // source modifiers before obscuring it with bit operations.
7291
7292 // XXX - Why doesn't this get called when vector_shuffle is expanded?
7293 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
7294 return Combined;
7295
7296 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
7297 SDValue Lo, Hi;
7298 EVT LoVT, HiVT;
7299 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
7300
7301 if (VecSize == 128) {
7302 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
7303 Lo = DAG.getBitcast(LoVT,
7304 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7305 DAG.getConstant(0, SL, MVT::i32)));
7306 Hi = DAG.getBitcast(HiVT,
7307 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7308 DAG.getConstant(1, SL, MVT::i32)));
7309 } else if (VecSize == 256) {
7310 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
7311 SDValue Parts[4];
7312 for (unsigned P = 0; P < 4; ++P) {
7313 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7314 DAG.getConstant(P, SL, MVT::i32));
7315 }
7316
7317 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7318 Parts[0], Parts[1]));
7319 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7320 Parts[2], Parts[3]));
7321 } else {
7322 assert(VecSize == 512);
7323
7324 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
7325 SDValue Parts[8];
7326 for (unsigned P = 0; P < 8; ++P) {
7327 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7328 DAG.getConstant(P, SL, MVT::i32));
7329 }
7330
7331 Lo = DAG.getBitcast(LoVT,
7332 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7333 Parts[0], Parts[1], Parts[2], Parts[3]));
7334 Hi = DAG.getBitcast(HiVT,
7335 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7336 Parts[4], Parts[5], Parts[6], Parts[7]));
7337 }
7338
7339 EVT IdxVT = Idx.getValueType();
7340 unsigned NElem = VecVT.getVectorNumElements();
7341 assert(isPowerOf2_32(NElem));
7342 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
7343 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
7344 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
7345 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
7346 }
7347
7348 assert(VecSize <= 64);
7349
7350 MVT IntVT = MVT::getIntegerVT(VecSize);
7351
7352 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
7353 SDValue VecBC = peekThroughBitcasts(Vec);
7354 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
7355 SDValue Src = VecBC.getOperand(0);
7356 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
7357 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
7358 }
7359
7360 unsigned EltSize = EltVT.getSizeInBits();
7361 assert(isPowerOf2_32(EltSize));
7362
7363 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7364
7365 // Convert vector index to bit-index (* EltSize)
7366 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7367
7368 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7369 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
7370
7371 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
7372 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
7373 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
7374 }
7375
7376 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
7377}
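// For the <= 64-bit case above, the emitted DAG amounts to a shift and
// truncate (sketch with illustrative names):
//
//   uint64_t Bits = bitcast(Vec);          // vector viewed as one integer
//   unsigned  Sh  = Idx << Log2(EltSize);  // index converted to a bit index
//   Elt           = trunc_or_ext(Bits >> Sh);
//
// The 128/256/512-bit cases are first split in half, a compare on the index
// selects the Lo or Hi half, and the masked index feeds this same sequence.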
7378
7379static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
7380 assert(Elt % 2 == 0);
7381 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7382}
7383
7384SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
7385 SelectionDAG &DAG) const {
7386 SDLoc SL(Op);
7387 EVT ResultVT = Op.getValueType();
7388 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
7389
7390 EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
7391 EVT EltVT = PackVT.getVectorElementType();
7392 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
7393
7394 // vector_shuffle <0,1,6,7> lhs, rhs
7395 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
7396 //
7397 // vector_shuffle <6,7,2,3> lhs, rhs
7398 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
7399 //
7400 // vector_shuffle <6,7,0,1> lhs, rhs
7401 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
7402
7403 // Avoid scalarizing when both halves are reading from consecutive elements.
7404 SmallVector<SDValue, 16> Pieces;
7405 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
7406 if (elementPairIsContiguous(SVN->getMask(), I)) {
7407 const int Idx = SVN->getMaskElt(I);
7408 int VecIdx = Idx < SrcNumElts ? 0 : 1;
7409 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
7410 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL,
7411 PackVT, SVN->getOperand(VecIdx),
7412 DAG.getConstant(EltIdx, SL, MVT::i32));
7413 Pieces.push_back(SubVec);
7414 } else {
7415 const int Idx0 = SVN->getMaskElt(I);
7416 const int Idx1 = SVN->getMaskElt(I + 1);
7417 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
7418 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
7419 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
7420 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
7421
7422 SDValue Vec0 = SVN->getOperand(VecIdx0);
7423 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7424 Vec0, DAG.getConstant(EltIdx0, SL, MVT::i32));
7425
7426 SDValue Vec1 = SVN->getOperand(VecIdx1);
7427 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7428 Vec1, DAG.getConstant(EltIdx1, SL, MVT::i32));
7429 Pieces.push_back(DAG.getBuildVector(PackVT, SL, { Elt0, Elt1 }));
7430 }
7431 }
7432
7433 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
7434}
7435
7436SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
7437 SelectionDAG &DAG) const {
7438 SDValue SVal = Op.getOperand(0);
7439 EVT ResultVT = Op.getValueType();
7440 EVT SValVT = SVal.getValueType();
7441 SDValue UndefVal = DAG.getUNDEF(SValVT);
7442 SDLoc SL(Op);
7443
7444 SmallVector<SDValue, 8> VElts;
7445 VElts.push_back(SVal);
7446 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
7447 VElts.push_back(UndefVal);
7448
7449 return DAG.getBuildVector(ResultVT, SL, VElts);
7450}
7451
7452SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
7453 SelectionDAG &DAG) const {
7454 SDLoc SL(Op);
7455 EVT VT = Op.getValueType();
7456
7457 if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
7458 VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
7459 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
7460 VT.getVectorNumElements() / 2);
7461 MVT HalfIntVT = MVT::getIntegerVT(HalfVT.getSizeInBits());
7462
7463 // Turn into pair of packed build_vectors.
7464 // TODO: Special case for constants that can be materialized with s_mov_b64.
7465 SmallVector<SDValue, 4> LoOps, HiOps;
7466 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I != E; ++I) {
7467 LoOps.push_back(Op.getOperand(I));
7468 HiOps.push_back(Op.getOperand(I + E));
7469 }
7470 SDValue Lo = DAG.getBuildVector(HalfVT, SL, LoOps);
7471 SDValue Hi = DAG.getBuildVector(HalfVT, SL, HiOps);
7472
7473 SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Lo);
7474 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Hi);
7475
7476 SDValue Blend = DAG.getBuildVector(MVT::getVectorVT(HalfIntVT, 2), SL,
7477 { CastLo, CastHi });
7478 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7479 }
7480
7481 if (VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16) {
7482 MVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
7483 VT.getVectorNumElements() / 4);
7484 MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
7485
7486 SmallVector<SDValue, 4> Parts[4];
7487 for (unsigned I = 0, E = VT.getVectorNumElements() / 4; I != E; ++I) {
7488 for (unsigned P = 0; P < 4; ++P)
7489 Parts[P].push_back(Op.getOperand(I + P * E));
7490 }
7491 SDValue Casts[4];
7492 for (unsigned P = 0; P < 4; ++P) {
7493 SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
7494 Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
7495 }
7496
7497 SDValue Blend =
7498 DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 4), SL, Casts);
7499 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7500 }
7501
7502 if (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v32bf16) {
7503 MVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
7504 VT.getVectorNumElements() / 8);
7505 MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
7506
7507 SmallVector<SDValue, 8> Parts[8];
7508 for (unsigned I = 0, E = VT.getVectorNumElements() / 8; I != E; ++I) {
7509 for (unsigned P = 0; P < 8; ++P)
7510 Parts[P].push_back(Op.getOperand(I + P * E));
7511 }
7512 SDValue Casts[8];
7513 for (unsigned P = 0; P < 8; ++P) {
7514 SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
7515 Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
7516 }
7517
7518 SDValue Blend =
7519 DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 8), SL, Casts);
7520 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7521 }
7522
7523 assert(VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16);
7524 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
7525
7526 SDValue Lo = Op.getOperand(0);
7527 SDValue Hi = Op.getOperand(1);
7528
7529 // Avoid adding defined bits with the zero_extend.
7530 if (Hi.isUndef()) {
7531 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7532 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
7533 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
7534 }
7535
7536 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
7537 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
7538
7539 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
7540 DAG.getConstant(16, SL, MVT::i32));
7541 if (Lo.isUndef())
7542 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
7543
7544 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7545 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
7546
7547 SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
7548 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
7549}
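// A concrete example of the v2i16 packing above (values are illustrative):
// building <2 x i16> <lo = 0x1234, hi = 0xABCD> produces
//
//   i32 Packed = (zext(0xABCD) << 16) | zext(0x1234) = 0xABCD1234
//
// which is then bitcast back to the vector type. If either half is undef,
// the corresponding zero_extend / shift / or is skipped entirely.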
7550
7551bool
7552SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
7553 // OSes that use ELF REL relocations (instead of RELA) can only store a
7554 // 32-bit addend in the instruction, so it is not safe to allow offset folding
7555 // which can create arbitrary 64-bit addends. (This is only a problem for
7556 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
7557 // the high 32 bits of the addend.)
7558 //
7559 // This should be kept in sync with how HasRelocationAddend is initialized in
7560 // the constructor of ELFAMDGPUAsmBackend.
7561 if (!Subtarget->isAmdHsaOS())
7562 return false;
7563
7564 // We can fold offsets for anything that doesn't require a GOT relocation.
7565 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
7566 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
7567 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
7568 !shouldEmitGOTReloc(GA->getGlobal());
7569}
7570
7571static SDValue
7572buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
7573 const SDLoc &DL, int64_t Offset, EVT PtrVT,
7574 unsigned GAFlags = SIInstrInfo::MO_NONE) {
7575 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
7576 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
7577 // lowered to the following code sequence:
7578 //
7579 // For constant address space:
7580 // s_getpc_b64 s[0:1]
7581 // s_add_u32 s0, s0, $symbol
7582 // s_addc_u32 s1, s1, 0
7583 //
7584 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7585 // a fixup or relocation is emitted to replace $symbol with a literal
7586 // constant, which is a pc-relative offset from the encoding of the $symbol
7587 // operand to the global variable.
7588 //
7589 // For global address space:
7590 // s_getpc_b64 s[0:1]
7591 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
7592 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
7593 //
7594 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7595 // fixups or relocations are emitted to replace $symbol@*@lo and
7596 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
7597 // which is a 64-bit pc-relative offset from the encoding of the $symbol
7598 // operand to the global variable.
7599 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
7600 SDValue PtrHi;
7601 if (GAFlags == SIInstrInfo::MO_NONE)
7602 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
7603 else
7604 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
7605 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
7606}
7607
7608SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
7609 SDValue Op,
7610 SelectionDAG &DAG) const {
7611 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
7612 SDLoc DL(GSD);
7613 EVT PtrVT = Op.getValueType();
7614
7615 const GlobalValue *GV = GSD->getGlobal();
7616 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
7617 shouldUseLDSConstAddress(GV)) ||
7618 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
7619 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
7620 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
7621 GV->hasExternalLinkage()) {
7622 Type *Ty = GV->getValueType();
7623 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
7624 // zero-sized type in other languages to declare dynamic shared
7625 // memory whose size is not known at compile time. It is allocated
7626 // by the runtime and placed directly after the statically allocated
7627 // ones. All such arrays share the same offset.
7628 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
7629 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
7630 // Adjust alignment for that dynamic shared memory array.
7631 const Function &F = DAG.getMachineFunction().getFunction();
7632 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
7633 MFI->setUsesDynamicLDS(true);
7634 return SDValue(
7635 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
7636 }
7637 }
7638 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
7639 }
7640
7641 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
7642 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
7643 SIInstrInfo::MO_ABS32_LO);
7644 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
7645 }
7646
7647 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
7648 SDValue AddrLo = DAG.getTargetGlobalAddress(
7649 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
7650 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
7651
7652 SDValue AddrHi = DAG.getTargetGlobalAddress(
7653 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
7654 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
7655
7656 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
7657 }
7658
7659 if (shouldEmitFixup(GV))
7660 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
7661
7662 if (shouldEmitPCReloc(GV))
7663 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
7664 SIInstrInfo::MO_REL32);
7665
7666 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
7667 SIInstrInfo::MO_GOTPCREL32);
7668
7669 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
7670 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
7671 const DataLayout &DataLayout = DAG.getDataLayout();
7672 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
7673 MachinePointerInfo PtrInfo
7674 = MachinePointerInfo::getGOT(DAG.getMachineFunction());
7675
7676 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
7677 MachineMemOperand::MODereferenceable |
7678 MachineMemOperand::MOInvariant);
7679}
7680
7681SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
7682 const SDLoc &DL, SDValue V) const {
7683 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
7684 // the destination register.
7685 //
7686 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
7687 // so we will end up with redundant moves to m0.
7688 //
7689 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
7690
7691 // A Null SDValue creates a glue result.
7692 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
7693 V, Chain);
7694 return SDValue(M0, 0);
7695}
7696
7697SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
7698 SDValue Op,
7699 MVT VT,
7700 unsigned Offset) const {
7701 SDLoc SL(Op);
7702 SDValue Param = lowerKernargMemParameter(
7703 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
7704 // The local size values will have the hi 16-bits as zero.
7705 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
7706 DAG.getValueType(VT));
7707}
7708
7709static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
7710 EVT VT) {
7711 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
7712 "non-hsa intrinsic with hsa target",
7713 DL.getDebugLoc());
7714 DAG.getContext()->diagnose(BadIntrin);
7715 return DAG.getUNDEF(VT);
7716}
7717
7718static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
7719 EVT VT) {
7720 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
7721 "intrinsic not supported on subtarget",
7722 DL.getDebugLoc());
7723 DAG.getContext()->diagnose(BadIntrin);
7724 return DAG.getUNDEF(VT);
7725}
7726
7727static SDValue getBuildDwordsVector(SelectionDAG &DAG, const SDLoc &DL,
7728 ArrayRef<SDValue> Elts) {
7729 assert(!Elts.empty());
7730 MVT Type;
7731 unsigned NumElts = Elts.size();
7732
7733 if (NumElts <= 12) {
7734 Type = MVT::getVectorVT(MVT::f32, NumElts);
7735 } else {
7736 assert(Elts.size() <= 16);
7737 Type = MVT::v16f32;
7738 NumElts = 16;
7739 }
7740
7741 SmallVector<SDValue, 16> VecElts(NumElts);
7742 for (unsigned i = 0; i < Elts.size(); ++i) {
7743 SDValue Elt = Elts[i];
7744 if (Elt.getValueType() != MVT::f32)
7745 Elt = DAG.getBitcast(MVT::f32, Elt);
7746 VecElts[i] = Elt;
7747 }
7748 for (unsigned i = Elts.size(); i < NumElts; ++i)
7749 VecElts[i] = DAG.getUNDEF(MVT::f32);
7750
7751 if (NumElts == 1)
7752 return VecElts[0];
7753 return DAG.getBuildVector(Type, DL, VecElts);
7754}
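// Two illustrative uses of the helper above: three dword operands become a
// plain v3f32 build_vector, while thirteen operands are padded with undef up
// to the next supported size, v16f32:
//
//   {a, b, c}       -> build_vector v3f32 (a, b, c)
//   {a0, ..., a12}  -> build_vector v16f32 (a0, ..., a12, undef, undef, undef)
//
// Non-f32 operands are bitcast to f32 first, so the result is always a
// uniform dword vector suitable for MIMG address operands.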
7755
7756static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
7757 SDValue Src, int ExtraElts) {
7758 EVT SrcVT = Src.getValueType();
7759
7760 SmallVector<SDValue, 8> Elts;
7761
7762 if (SrcVT.isVector())
7763 DAG.ExtractVectorElements(Src, Elts);
7764 else
7765 Elts.push_back(Src);
7766
7767 SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
7768 while (ExtraElts--)
7769 Elts.push_back(Undef);
7770
7771 return DAG.getBuildVector(CastVT, DL, Elts);
7772}
7773
7774// Re-construct the required return value for an image load intrinsic.
7775// This is more complicated due to the optional use of TexFailCtrl, which
7776// means the required return type is an aggregate.
7777static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
7778 ArrayRef<EVT> ResultTypes, bool IsTexFail,
7779 bool Unpacked, bool IsD16, int DMaskPop,
7780 int NumVDataDwords, bool IsAtomicPacked16Bit,
7781 const SDLoc &DL) {
7782 // Determine the required return type. This is the same regardless of IsTexFail flag
7783 EVT ReqRetVT = ResultTypes[0];
7784 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
7785 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
7786 ? (ReqRetNumElts + 1) / 2
7787 : ReqRetNumElts;
7788
7789 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
7790
7791 MVT DataDwordVT = NumDataDwords == 1 ?
7792 MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
7793
7794 MVT MaskPopVT = MaskPopDwords == 1 ?
7795 MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
7796
7797 SDValue Data(Result, 0);
7798 SDValue TexFail;
7799
7800 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
7801 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
7802 if (MaskPopVT.isVector()) {
7803 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
7804 SDValue(Result, 0), ZeroIdx);
7805 } else {
7806 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
7807 SDValue(Result, 0), ZeroIdx);
7808 }
7809 }
7810
7811 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
7812 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
7813 NumDataDwords - MaskPopDwords);
7814
7815 if (IsD16)
7816 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
7817
7818 EVT LegalReqRetVT = ReqRetVT;
7819 if (!ReqRetVT.isVector()) {
7820 if (!Data.getValueType().isInteger())
7821 Data = DAG.getNode(ISD::BITCAST, DL,
7822 Data.getValueType().changeTypeToInteger(), Data);
7823 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
7824 } else {
7825 // We need to widen the return vector to a legal type
7826 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
7827 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
7828 LegalReqRetVT =
7829 EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
7830 ReqRetVT.getVectorNumElements() + 1);
7831 }
7832 }
7833 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
7834
7835 if (IsTexFail) {
7836 TexFail =
7837 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
7838 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
7839
7840 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
7841 }
7842
7843 if (Result->getNumValues() == 1)
7844 return Data;
7845
7846 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
7847}
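// A worked example of the dword accounting above (numbers are illustrative):
// for a d16 image load with dmask = 0x7 on a target with packed d16,
//   DMaskPop = 3       -> MaskPopDwords = (3 + 1) / 2 = 2
//   ReqRetNumElts = 3  -> NumDataDwords = (3 + 1) / 2 = 2
// so the raw two-dword result is converted back to the requested v3f16 and
// widened to v4f16 for legality. If TFE/LWE was requested, one extra dword
// follows the data and is returned as the separate status value.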
7848
7849static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
7850 SDValue *LWE, bool &IsTexFail) {
7851 auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
7852
7853 uint64_t Value = TexFailCtrlConst->getZExtValue();
7854 if (Value) {
7855 IsTexFail = true;
7856 }
7857
7858 SDLoc DL(TexFailCtrlConst);
7859 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
7860 Value &= ~(uint64_t)0x1;
7861 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
7862 Value &= ~(uint64_t)0x2;
7863
7864 return Value == 0;
7865}
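// The texfailctrl immediate decoded above is a two-bit field, for example:
//   texfailctrl = 0 -> TFE = 0, LWE = 0
//   texfailctrl = 1 -> TFE = 1, LWE = 0 (IsTexFail set)
//   texfailctrl = 3 -> TFE = 1, LWE = 1 (IsTexFail set)
// Any bits outside the low two make the helper return false, and the caller
// then leaves the intrinsic unlowered.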
7866
7867static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
7868 MVT PackVectorVT,
7869 SmallVectorImpl<SDValue> &PackedAddrs,
7870 unsigned DimIdx, unsigned EndIdx,
7871 unsigned NumGradients) {
7872 SDLoc DL(Op);
7873 for (unsigned I = DimIdx; I < EndIdx; I++) {
7874 SDValue Addr = Op.getOperand(I);
7875
7876 // Gradients are packed with undef for each coordinate.
7877 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
7878 // 1D: undef,dx/dh; undef,dx/dv
7879 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
7880 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
7881 if (((I + 1) >= EndIdx) ||
7882 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
7883 I == DimIdx + NumGradients - 1))) {
7884 if (Addr.getValueType() != MVT::i16)
7885 Addr = DAG.getBitcast(MVT::i16, Addr);
7886 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
7887 } else {
7888 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
7889 I++;
7890 }
7891 Addr = DAG.getBitcast(MVT::f32, Addr);
7892 PackedAddrs.push_back(Addr);
7893 }
7894}
7895
7896SDValue SITargetLowering::lowerImage(SDValue Op,
7897 const AMDGPU::ImageDimIntrinsicInfo *Intr,
7898 SelectionDAG &DAG, bool WithChain) const {
7899 SDLoc DL(Op);
7900 MachineFunction &MF = DAG.getMachineFunction();
7901 const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
7902 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
7903 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
7904 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
7905 unsigned IntrOpcode = Intr->BaseOpcode;
7906 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
7907 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
7908 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
7909
7910 SmallVector<EVT, 3> ResultTypes(Op->values());
7911 SmallVector<EVT, 3> OrigResultTypes(Op->values());
7912 bool IsD16 = false;
7913 bool IsG16 = false;
7914 bool IsA16 = false;
7915 SDValue VData;
7916 int NumVDataDwords;
7917 bool AdjustRetType = false;
7918 bool IsAtomicPacked16Bit = false;
7919
7920 // Offset of intrinsic arguments
7921 const unsigned ArgOffset = WithChain ? 2 : 1;
7922
7923 unsigned DMask;
7924 unsigned DMaskLanes = 0;
7925
7926 if (BaseOpcode->Atomic) {
7927 VData = Op.getOperand(2);
7928
7929 IsAtomicPacked16Bit =
7930 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7931 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7932
7933 bool Is64Bit = VData.getValueSizeInBits() == 64;
7934 if (BaseOpcode->AtomicX2) {
7935 SDValue VData2 = Op.getOperand(3);
7936 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
7937 {VData, VData2});
7938 if (Is64Bit)
7939 VData = DAG.getBitcast(MVT::v4i32, VData);
7940
7941 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
7942 DMask = Is64Bit ? 0xf : 0x3;
7943 NumVDataDwords = Is64Bit ? 4 : 2;
7944 } else {
7945 DMask = Is64Bit ? 0x3 : 0x1;
7946 NumVDataDwords = Is64Bit ? 2 : 1;
7947 }
7948 } else {
7949 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
7950 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
7951
7952 if (BaseOpcode->Store) {
7953 VData = Op.getOperand(2);
7954
7955 MVT StoreVT = VData.getSimpleValueType();
7956 if (StoreVT.getScalarType() == MVT::f16) {
7957 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
7958 return Op; // D16 is unsupported for this instruction
7959
7960 IsD16 = true;
7961 VData = handleD16VData(VData, DAG, true);
7962 }
7963
7964 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
7965 } else {
7966 // Work out the num dwords based on the dmask popcount and underlying type
7967 // and whether packing is supported.
7968 MVT LoadVT = ResultTypes[0].getSimpleVT();
7969 if (LoadVT.getScalarType() == MVT::f16) {
7970 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
7971 return Op; // D16 is unsupported for this instruction
7972
7973 IsD16 = true;
7974 }
7975
7976 // Confirm that the return type is large enough for the dmask specified
7977 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
7978 (!LoadVT.isVector() && DMaskLanes > 1))
7979 return Op;
7980
7981 // The sq block of gfx8 and gfx9 do not estimate register use correctly
7982 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
7983 // instructions.
7984 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
7985 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
7986 NumVDataDwords = (DMaskLanes + 1) / 2;
7987 else
7988 NumVDataDwords = DMaskLanes;
7989
7990 AdjustRetType = true;
7991 }
7992 }
7993
7994 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
7995 SmallVector<SDValue, 4> VAddrs;
7996
7997 // Check for 16 bit addresses or derivatives and pack if true.
7998 MVT VAddrVT =
7999 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
8000 MVT VAddrScalarVT = VAddrVT.getScalarType();
8001 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8002 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8003
8004 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
8005 VAddrScalarVT = VAddrVT.getScalarType();
8006 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8007 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8008
8009 // Push back extra arguments.
8010 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
8011 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
8012 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
8013 // Special handling of bias when A16 is on. Bias is of type half but
8014 // occupies full 32-bit.
8015 SDValue Bias = DAG.getBuildVector(
8016 MVT::v2f16, DL,
8017 {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
8018 VAddrs.push_back(Bias);
8019 } else {
8020 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
8021 "Bias needs to be converted to 16 bit in A16 mode");
8022 VAddrs.push_back(Op.getOperand(ArgOffset + I));
8023 }
8024 }
8025
8026 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
8027 // 16 bit gradients are supported, but are tied to the A16 control
8028 // so both gradients and addresses must be 16 bit
8029 LLVM_DEBUG(
8030 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
8031 "require 16 bit args for both gradients and addresses");
8032 return Op;
8033 }
8034
8035 if (IsA16) {
8036 if (!ST->hasA16()) {
8037 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
8038 "support 16 bit addresses\n");
8039 return Op;
8040 }
8041 }
8042
8043 // We've dealt with incorrect input, so we know that if IsA16 or IsG16
8044 // is set then we have to compress/pack operands (either addresses,
8045 // gradients, or both).
8046 // In the case where A16 and gradients are tied (no G16 support), we have
8047 // already verified that both IsA16 and IsG16 are true.
8048 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
8049 // Activate g16
8050 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
8051 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
8052 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
8053 }
8054
8055 // Add gradients (packed or unpacked)
8056 if (IsG16) {
8057 // Pack the gradients
8058 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
8059 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
8060 ArgOffset + Intr->GradientStart,
8061 ArgOffset + Intr->CoordStart, Intr->NumGradients);
8062 } else {
8063 for (unsigned I = ArgOffset + Intr->GradientStart;
8064 I < ArgOffset + Intr->CoordStart; I++)
8065 VAddrs.push_back(Op.getOperand(I));
8066 }
8067
8068 // Add addresses (packed or unpacked)
8069 if (IsA16) {
8070 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
8071 ArgOffset + Intr->CoordStart, VAddrEnd,
8072 0 /* No gradients */);
8073 } else {
8074 // Add uncompressed address
8075 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
8076 VAddrs.push_back(Op.getOperand(I));
8077 }
8078
8079 // If the register allocator cannot place the address registers contiguously
8080 // without introducing moves, then using the non-sequential address encoding
8081 // is always preferable, since it saves VALU instructions and is usually a
8082 // wash in terms of code size or even better.
8083 //
8084 // However, we currently have no way of hinting to the register allocator that
8085 // MIMG addresses should be placed contiguously when it is possible to do so,
8086 // so force non-NSA for the common 2-address case as a heuristic.
8087 //
8088 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
8089 // allocation when possible.
8090 //
8091 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
8092 // set of the remaining addresses.
8093 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
8094 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
8095 const bool UseNSA = ST->hasNSAEncoding() &&
8096 VAddrs.size() >= ST->getNSAThreshold(MF) &&
8097 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
8098 const bool UsePartialNSA =
8099 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
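  // A sketch of how the three flags above interact, assuming a hypothetical
  // subtarget with an NSA threshold of 3 and NSAMaxSize of 5:
  //   2 address dwords -> UseNSA = false (one contiguous VAddr vector)
  //   4 address dwords -> UseNSA = true, UsePartialNSA = false (pure NSA)
  //   7 address dwords -> with partial NSA support, UseNSA = true and
  //                       UsePartialNSA = true: the first NSAMaxSize - 1 = 4
  //                       operands stay separate and the remaining three are
  //                       packed into one contiguous trailing vector.
  // The actual threshold and maximum are subtarget dependent.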
8100
8101 SDValue VAddr;
8102 if (UsePartialNSA) {
8103 VAddr = getBuildDwordsVector(DAG, DL,
8104 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
8105 }
8106 else if (!UseNSA) {
8107 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
8108 }
8109
8110 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
8111 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
8112 SDValue Unorm;
8113 if (!BaseOpcode->Sampler) {
8114 Unorm = True;
8115 } else {
8116 uint64_t UnormConst =
8117 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
8118
8119 Unorm = UnormConst ? True : False;
8120 }
8121
8122 SDValue TFE;
8123 SDValue LWE;
8124 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
8125 bool IsTexFail = false;
8126 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
8127 return Op;
8128
8129 if (IsTexFail) {
8130 if (!DMaskLanes) {
8131 // Expecting to get an error flag since TFC is on - and dmask is 0
8132 // Force dmask to be at least 1 otherwise the instruction will fail
8133 DMask = 0x1;
8134 DMaskLanes = 1;
8135 NumVDataDwords = 1;
8136 }
8137 NumVDataDwords += 1;
8138 AdjustRetType = true;
8139 }
8140
8141 // Something earlier has tagged the return type as needing adjustment.
8142 // This happens if the instruction is a load or has set TexFailCtrl flags.
8143 if (AdjustRetType) {
8144 // NumVDataDwords reflects the true number of dwords required in the return type
8145 if (DMaskLanes == 0 && !BaseOpcode->Store) {
8146 // This is a no-op load. This can be eliminated
8147 SDValue Undef = DAG.getUNDEF(Op.getValueType());
8148 if (isa<MemSDNode>(Op))
8149 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
8150 return Undef;
8151 }
8152
8153 EVT NewVT = NumVDataDwords > 1 ?
8154 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumVDataDwords)
8155 : MVT::i32;
8156
8157 ResultTypes[0] = NewVT;
8158 if (ResultTypes.size() == 3) {
8159 // Original result was aggregate type used for TexFailCtrl results
8160 // The actual instruction returns as a vector type which has now been
8161 // created. Remove the aggregate result.
8162 ResultTypes.erase(&ResultTypes[1]);
8163 }
8164 }
8165
8166 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
8167 if (BaseOpcode->Atomic)
8168 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
8169 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
8170 AMDGPU::CPol::VOLATILE))
8171 return Op;
8172
8173 SmallVector<SDValue, 26> Ops;
8174 if (BaseOpcode->Store || BaseOpcode->Atomic)
8175 Ops.push_back(VData); // vdata
8176 if (UsePartialNSA) {
8177 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
8178 Ops.push_back(VAddr);
8179 }
8180 else if (UseNSA)
8181 append_range(Ops, VAddrs);
8182 else
8183 Ops.push_back(VAddr);
8184 Ops.push_back(Op.getOperand(ArgOffset + Intr->RsrcIndex));
8185 if (BaseOpcode->Sampler)
8186 Ops.push_back(Op.getOperand(ArgOffset + Intr->SampIndex));
8187 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
8188 if (IsGFX10Plus)
8189 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
8190 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8191 Ops.push_back(Unorm);
8192 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
8193 Ops.push_back(IsA16 && // r128, a16 for gfx9
8194 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
8195 if (IsGFX10Plus)
8196 Ops.push_back(IsA16 ? True : False);
8197 if (!Subtarget->hasGFX90AInsts()) {
8198 Ops.push_back(TFE); //tfe
8199 } else if (TFE->getAsZExtVal()) {
8200 report_fatal_error("TFE is not supported on this GPU");
8201 }
8202 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8203 Ops.push_back(LWE); // lwe
8204 if (!IsGFX10Plus)
8205 Ops.push_back(DimInfo->DA ? True : False);
8206 if (BaseOpcode->HasD16)
8207 Ops.push_back(IsD16 ? True : False);
8208 if (isa<MemSDNode>(Op))
8209 Ops.push_back(Op.getOperand(0)); // chain
8210
8211 int NumVAddrDwords =
8212 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
8213 int Opcode = -1;
8214
8215 if (IsGFX12Plus) {
8216 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
8217 NumVDataDwords, NumVAddrDwords);
8218 } else if (IsGFX11Plus) {
8219 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8220 UseNSA ? AMDGPU::MIMGEncGfx11NSA
8221 : AMDGPU::MIMGEncGfx11Default,
8222 NumVDataDwords, NumVAddrDwords);
8223 } else if (IsGFX10Plus) {
8224 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8225 UseNSA ? AMDGPU::MIMGEncGfx10NSA
8226 : AMDGPU::MIMGEncGfx10Default,
8227 NumVDataDwords, NumVAddrDwords);
8228 } else {
8229 if (Subtarget->hasGFX90AInsts()) {
8230 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
8231 NumVDataDwords, NumVAddrDwords);
8232 if (Opcode == -1)
8233 report_fatal_error(
8234 "requested image instruction is not supported on this GPU");
8235 }
8236 if (Opcode == -1 &&
8237 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8238 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
8239 NumVDataDwords, NumVAddrDwords);
8240 if (Opcode == -1)
8241 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
8242 NumVDataDwords, NumVAddrDwords);
8243 }
8244 if (Opcode == -1)
8245 return Op;
8246
8247 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
8248 if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
8249 MachineMemOperand *MemRef = MemOp->getMemOperand();
8250 DAG.setNodeMemRefs(NewNode, {MemRef});
8251 }
8252
8253 if (BaseOpcode->AtomicX2) {
8254 SmallVector<SDValue, 1> Elt;
8255 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
8256 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
8257 }
8258 if (BaseOpcode->Store)
8259 return SDValue(NewNode, 0);
8260 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
8261 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
8262 NumVDataDwords, IsAtomicPacked16Bit, DL);
8263}
8264
8265SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
8266 SDValue Offset, SDValue CachePolicy,
8267 SelectionDAG &DAG) const {
8268 MachineFunction &MF = DAG.getMachineFunction();
8269
8270 const DataLayout &DataLayout = DAG.getDataLayout();
8271 Align Alignment =
8272 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
8273
8274 MachineMemOperand *MMO = MF.getMachineMemOperand(
8275 MachinePointerInfo(),
8276 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
8277 MachineMemOperand::MOInvariant,
8278 VT.getStoreSize(), Alignment);
8279
8280 if (!Offset->isDivergent()) {
8281 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
8282
8283 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
8284 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
8285 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
8286 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
8287 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8288 SDValue BufferLoad =
8289 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
8290 DAG.getVTList(MVT::i32), Ops, VT, MMO);
8291 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
8292 }
8293
8294 // Widen vec3 load to vec4.
8295 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
8296 !Subtarget->hasScalarDwordx3Loads()) {
8297 EVT WidenedVT =
8298 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
8299 auto WidenedOp = DAG.getMemIntrinsicNode(
8300 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
8301 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
8302 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
8303 DAG.getVectorIdxConstant(0, DL));
8304 return Subvector;
8305 }
8306
8307 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
8308 DAG.getVTList(VT), Ops, VT, MMO);
8309 }
8310
8311 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
8312 // assume that the buffer is unswizzled.
8313 SDValue Ops[] = {
8314 DAG.getEntryNode(), // Chain
8315 Rsrc, // rsrc
8316 DAG.getConstant(0, DL, MVT::i32), // vindex
8317 {}, // voffset
8318 {}, // soffset
8319 {}, // offset
8320 CachePolicy, // cachepolicy
8321 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8322 };
8323 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8324 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
8325 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
8326 }
8327
8328 SmallVector<SDValue, 4> Loads;
8329 unsigned NumLoads = 1;
8330 MVT LoadVT = VT.getSimpleVT();
8331 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
8332 assert((LoadVT.getScalarType() == MVT::i32 ||
8333 LoadVT.getScalarType() == MVT::f32));
8334
8335 if (NumElts == 8 || NumElts == 16) {
8336 NumLoads = NumElts / 4;
8337 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
8338 }
8339
8340 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
8341
8342 // Use the alignment to ensure that the required offsets will fit into the
8343 // immediate offsets.
8344 setBufferOffsets(Offset, DAG, &Ops[3],
8345 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
8346
8347 uint64_t InstOffset = Ops[5]->getAsZExtVal();
8348 for (unsigned i = 0; i < NumLoads; ++i) {
8349 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
8350 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
8351 LoadVT, MMO, DAG));
8352 }
8353
8354 if (NumElts == 8 || NumElts == 16)
8355 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
8356
8357 return Loads[0];
8358}
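// An illustrative expansion of the divergent-offset path above (operands are
// schematic): an s.buffer.load of v8f32 with a divergent offset becomes two
// 16-byte buffer loads whose results are concatenated:
//
//   BUFFER_LOAD dwordx4 vdst0, rsrc, voffset, soffset, imm_offset + 0
//   BUFFER_LOAD dwordx4 vdst1, rsrc, voffset, soffset, imm_offset + 16
//   result = concat_vectors(vdst0, vdst1)
//
// The Align(16 * NumLoads) passed to setBufferOffsets keeps both immediate
// offsets representable.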
8359
8360SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
8361 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
8362 if (!Subtarget->hasArchitectedSGPRs())
8363 return {};
8364 SDLoc SL(Op);
8365 MVT VT = MVT::i32;
8366 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
8367 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
8368 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
8369}
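// The BFE above extracts bits [29:25] of TTMP8, i.e.
//   wave_id_in_group = (TTMP8 >> 25) & 0x1f
// so the result is in [0, 31]. On subtargets without architected SGPRs the
// empty SDValue tells the caller this intrinsic cannot be lowered here.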
8370
8371SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
8372 unsigned Dim,
8373 const ArgDescriptor &Arg) const {
8374 SDLoc SL(Op);
8376 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
8377 if (MaxID == 0)
8378 return DAG.getConstant(0, SL, MVT::i32);
8379
8380 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
8381 SDLoc(DAG.getEntryNode()), Arg);
8382
8383 // Don't bother inserting AssertZext for packed IDs since we're emitting the
8384 // masking operations anyway.
8385 //
8386 // TODO: We could assert the top bit is 0 for the source copy.
8387 if (Arg.isMasked())
8388 return Val;
8389
8390 // Preserve the known bits after expansion to a copy.
8392 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
8393 DAG.getValueType(SmallVT));
8394}
8395
8396SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
8397 SelectionDAG &DAG) const {
8398 MachineFunction &MF = DAG.getMachineFunction();
8399 auto MFI = MF.getInfo<SIMachineFunctionInfo>();
8400
8401 EVT VT = Op.getValueType();
8402 SDLoc DL(Op);
8403 unsigned IntrinsicID = Op.getConstantOperandVal(0);
8404
8405 // TODO: Should this propagate fast-math-flags?
8406
8407 switch (IntrinsicID) {
8408 case Intrinsic::amdgcn_implicit_buffer_ptr: {
8409 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
8410 return emitNonHSAIntrinsicError(DAG, DL, VT);
8411 return getPreloadedValue(DAG, *MFI, VT,
8412 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
8413 }
8414 case Intrinsic::amdgcn_dispatch_ptr:
8415 case Intrinsic::amdgcn_queue_ptr: {
8416 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
8417 DiagnosticInfoUnsupported BadIntrin(
8418 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
8419 DL.getDebugLoc());
8420 DAG.getContext()->diagnose(BadIntrin);
8421 return DAG.getUNDEF(VT);
8422 }
8423
8424 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
8425 AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
8426 return getPreloadedValue(DAG, *MFI, VT, RegID);
8427 }
8428 case Intrinsic::amdgcn_implicitarg_ptr: {
8429 if (MFI->isEntryFunction())
8430 return getImplicitArgPtr(DAG, DL);
8431 return getPreloadedValue(DAG, *MFI, VT,
8432 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
8433 }
8434 case Intrinsic::amdgcn_kernarg_segment_ptr: {
8435 if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
8436 // This only makes sense to call in a kernel, so just lower to null.
8437 return DAG.getConstant(0, DL, VT);
8438 }
8439
8440 return getPreloadedValue(DAG, *MFI, VT,
8441 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
8442 }
8443 case Intrinsic::amdgcn_dispatch_id: {
8444 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
8445 }
8446 case Intrinsic::amdgcn_rcp:
8447 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
8448 case Intrinsic::amdgcn_rsq:
8449 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8450 case Intrinsic::amdgcn_rsq_legacy:
8451 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8452 return emitRemovedIntrinsicError(DAG, DL, VT);
8453 return SDValue();
8454 case Intrinsic::amdgcn_rcp_legacy:
8455 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8456 return emitRemovedIntrinsicError(DAG, DL, VT);
8457 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
8458 case Intrinsic::amdgcn_rsq_clamp: {
8459 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8460 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
8461
8462 Type *Type = VT.getTypeForEVT(*DAG.getContext());
8463 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
8464 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
8465
8466 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8467 SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
8468 DAG.getConstantFP(Max, DL, VT));
8469 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
8470 DAG.getConstantFP(Min, DL, VT));
8471 }
8472 case Intrinsic::r600_read_ngroups_x:
8473 if (Subtarget->isAmdHsaOS())
8474 return emitNonHSAIntrinsicError(DAG, DL, VT);
8475
8476 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8477 SI::KernelInputOffsets::NGROUPS_X, Align(4),
8478 false);
8479 case Intrinsic::r600_read_ngroups_y:
8480 if (Subtarget->isAmdHsaOS())
8481 return emitNonHSAIntrinsicError(DAG, DL, VT);
8482
8483 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8484 SI::KernelInputOffsets::NGROUPS_Y, Align(4),
8485 false);
8486 case Intrinsic::r600_read_ngroups_z:
8487 if (Subtarget->isAmdHsaOS())
8488 return emitNonHSAIntrinsicError(DAG, DL, VT);
8489
8490 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8491 SI::KernelInputOffsets::NGROUPS_Z, Align(4),
8492 false);
8493 case Intrinsic::r600_read_global_size_x:
8494 if (Subtarget->isAmdHsaOS())
8495 return emitNonHSAIntrinsicError(DAG, DL, VT);
8496
8497 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8498 SI::KernelInputOffsets::GLOBAL_SIZE_X,
8499 Align(4), false);
8500 case Intrinsic::r600_read_global_size_y:
8501 if (Subtarget->isAmdHsaOS())
8502 return emitNonHSAIntrinsicError(DAG, DL, VT);
8503
8504 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8505 SI::KernelInputOffsets::GLOBAL_SIZE_Y,
8506 Align(4), false);
8507 case Intrinsic::r600_read_global_size_z:
8508 if (Subtarget->isAmdHsaOS())
8509 return emitNonHSAIntrinsicError(DAG, DL, VT);
8510
8511 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8512 SI::KernelInputOffsets::GLOBAL_SIZE_Z,
8513 Align(4), false);
8514 case Intrinsic::r600_read_local_size_x:
8515 if (Subtarget->isAmdHsaOS())
8516 return emitNonHSAIntrinsicError(DAG, DL, VT);
8517
8518 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8519 SI::KernelInputOffsets::LOCAL_SIZE_X);
8520 case Intrinsic::r600_read_local_size_y:
8521 if (Subtarget->isAmdHsaOS())
8522 return emitNonHSAIntrinsicError(DAG, DL, VT);
8523
8524 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8525 SI::KernelInputOffsets::LOCAL_SIZE_Y);
8526 case Intrinsic::r600_read_local_size_z:
8527 if (Subtarget->isAmdHsaOS())
8528 return emitNonHSAIntrinsicError(DAG, DL, VT);
8529
8530 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8531 SI::KernelInputOffsets::LOCAL_SIZE_Z);
8532 case Intrinsic::amdgcn_workgroup_id_x:
8533 return getPreloadedValue(DAG, *MFI, VT,
8534 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
8535 case Intrinsic::amdgcn_workgroup_id_y:
8536 return getPreloadedValue(DAG, *MFI, VT,
8537 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
8538 case Intrinsic::amdgcn_workgroup_id_z:
8539 return getPreloadedValue(DAG, *MFI, VT,
8540 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
8541 case Intrinsic::amdgcn_wave_id:
8542 return lowerWaveID(DAG, Op);
8543 case Intrinsic::amdgcn_lds_kernel_id: {
8544 if (MFI->isEntryFunction())
8545 return getLDSKernelId(DAG, DL);
8546 return getPreloadedValue(DAG, *MFI, VT,
8547 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
8548 }
8549 case Intrinsic::amdgcn_workitem_id_x:
8550 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
8551 case Intrinsic::amdgcn_workitem_id_y:
8552 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
8553 case Intrinsic::amdgcn_workitem_id_z:
8554 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
8555 case Intrinsic::amdgcn_wavefrontsize:
8556 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
8557 SDLoc(Op), MVT::i32);
8558 case Intrinsic::amdgcn_s_buffer_load: {
8559 unsigned CPol = Op.getConstantOperandVal(3);
8560 // s_buffer_load, because of how it's optimized, can't be volatile
8561 // so reject ones with the volatile bit set.
8562 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
8563 ? AMDGPU::CPol::ALL
8564 : AMDGPU::CPol::ALL_pregfx12))
8565 return Op;
8566 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8567 DAG);
8568 }
8569 case Intrinsic::amdgcn_fdiv_fast:
8570 return lowerFDIV_FAST(Op, DAG);
8571 case Intrinsic::amdgcn_sin:
8572 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
8573
8574 case Intrinsic::amdgcn_cos:
8575 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
8576
8577 case Intrinsic::amdgcn_mul_u24:
8578 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1), Op.getOperand(2));
8579 case Intrinsic::amdgcn_mul_i24:
8580 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1), Op.getOperand(2));
8581
8582 case Intrinsic::amdgcn_log_clamp: {
8583 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8584 return SDValue();
8585
8586 return emitRemovedIntrinsicError(DAG, DL, VT);
8587 }
8588 case Intrinsic::amdgcn_fract:
8589 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
8590
8591 case Intrinsic::amdgcn_class:
8592 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
8593 Op.getOperand(1), Op.getOperand(2));
8594 case Intrinsic::amdgcn_div_fmas:
8595 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
8596 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8597 Op.getOperand(4));
8598
8599 case Intrinsic::amdgcn_div_fixup:
8600 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
8601 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8602
8603 case Intrinsic::amdgcn_div_scale: {
8604 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
8605
8606 // Translate to the operands expected by the machine instruction. The
8607 // first parameter must be the same as the first instruction.
8608 SDValue Numerator = Op.getOperand(1);
8609 SDValue Denominator = Op.getOperand(2);
8610
8611 // Note this order is opposite of the machine instruction's operations,
8612 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
8613 // intrinsic has the numerator as the first operand to match a normal
8614 // division operation.
8615
8616 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
8617
8618 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
8619 Denominator, Numerator);
8620 }
8621 case Intrinsic::amdgcn_icmp: {
8622 // There is a Pat that handles this variant, so return it as-is.
8623 if (Op.getOperand(1).getValueType() == MVT::i1 &&
8624 Op.getConstantOperandVal(2) == 0 &&
8625 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
8626 return Op;
8627 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
8628 }
8629 case Intrinsic::amdgcn_fcmp: {
8630 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
8631 }
8632 case Intrinsic::amdgcn_ballot:
8633 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
8634 case Intrinsic::amdgcn_fmed3:
8635 return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
8636 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8637 case Intrinsic::amdgcn_fdot2:
8638 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
8639 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8640 Op.getOperand(4));
8641 case Intrinsic::amdgcn_fmul_legacy:
8642 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
8643 Op.getOperand(1), Op.getOperand(2));
8644 case Intrinsic::amdgcn_sffbh:
8645 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
8646 case Intrinsic::amdgcn_sbfe:
8647 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
8648 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8649 case Intrinsic::amdgcn_ubfe:
8650 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
8651 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8652 case Intrinsic::amdgcn_cvt_pkrtz:
8653 case Intrinsic::amdgcn_cvt_pknorm_i16:
8654 case Intrinsic::amdgcn_cvt_pknorm_u16:
8655 case Intrinsic::amdgcn_cvt_pk_i16:
8656 case Intrinsic::amdgcn_cvt_pk_u16: {
8657 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
8658 EVT VT = Op.getValueType();
8659 unsigned Opcode;
8660
8661 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8662 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
8663 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8664 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
8665 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8666 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
8667 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8668 Opcode = AMDGPUISD::CVT_PK_I16_I32;
8669 else
8670 Opcode = AMDGPUISD::CVT_PK_U16_U32;
8671
8672 if (isTypeLegal(VT))
8673 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
8674
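// If the packed v2i16/v2f16 type is not legal on this subtarget, emit the
// result as an i32 node and bitcast it back to the requested vector type.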
8675 SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
8676 Op.getOperand(1), Op.getOperand(2));
8677 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
8678 }
8679 case Intrinsic::amdgcn_fmad_ftz:
8680 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
8681 Op.getOperand(2), Op.getOperand(3));
8682
8683 case Intrinsic::amdgcn_if_break:
8684 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
8685 Op->getOperand(1), Op->getOperand(2)), 0);
8686
8687 case Intrinsic::amdgcn_groupstaticsize: {
8688 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
8689 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
8690 return Op;
8691
8692 const Module *M = MF.getFunction().getParent();
8693 const GlobalValue *GV =
8694 M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
8695 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
8696 SIInstrInfo::MO_ABS32_LO);
8697 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8698 }
8699 case Intrinsic::amdgcn_is_shared:
8700 case Intrinsic::amdgcn_is_private: {
8701 SDLoc SL(Op);
8702 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ?
8703 AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
8704 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
8705 SDValue SrcVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32,
8706 Op.getOperand(1));
8707
8708 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
8709 DAG.getConstant(1, SL, MVT::i32));
8710 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
8711 }
8712 case Intrinsic::amdgcn_perm:
8713 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
8714 Op.getOperand(2), Op.getOperand(3));
8715 case Intrinsic::amdgcn_reloc_constant: {
8716 Module *M = const_cast<Module *>(MF.getFunction().getParent());
8717 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
8718 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
8719 auto RelocSymbol = cast<GlobalVariable>(
8720 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
8721 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
8722 SIInstrInfo::MO_ABS32_LO);
8723 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8724 }
8725 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8726 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8727 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8728 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8729 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8730 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8731 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8732 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8733 if (Op.getOperand(4).getValueType() == MVT::i32)
8734 return SDValue();
8735
8736 SDLoc SL(Op);
8737 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
8738 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8739 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8740 Op.getOperand(3), IndexKeyi32);
8741 }
8742 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8743 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8744 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8745 if (Op.getOperand(6).getValueType() == MVT::i32)
8746 return SDValue();
8747
8748 SDLoc SL(Op);
8749 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
8750 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8751 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8752 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
8753 IndexKeyi32, Op.getOperand(7)});
8754 }
8755 case Intrinsic::amdgcn_addrspacecast_nonnull:
8756 return lowerADDRSPACECAST(Op, DAG);
8757 case Intrinsic::amdgcn_readlane:
8758 case Intrinsic::amdgcn_readfirstlane:
8759 case Intrinsic::amdgcn_writelane:
8760 case Intrinsic::amdgcn_permlane16:
8761 case Intrinsic::amdgcn_permlanex16:
8762 case Intrinsic::amdgcn_permlane64:
8763 return lowerLaneOp(*this, Op.getNode(), DAG);
8764 default:
8765 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8766 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
8767 return lowerImage(Op, ImageDimIntr, DAG, false);
8768
8769 return Op;
8770 }
8771}
8772
8773 // On targets that do not support a constant in the soffset field, turn a
8774 // zero soffset into SGPR_NULL to avoid generating an extra s_mov of zero.
8775static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
8776 const GCNSubtarget *Subtarget) {
8777 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
8778 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
8779 return SOffset;
8780}
8781
8782SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
8783 SelectionDAG &DAG,
8784 unsigned NewOpcode) const {
8785 SDLoc DL(Op);
8786
8787 SDValue VData = Op.getOperand(2);
8788 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8789 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8790 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8791 SDValue Ops[] = {
8792 Op.getOperand(0), // Chain
8793 VData, // vdata
8794 Rsrc, // rsrc
8795 DAG.getConstant(0, DL, MVT::i32), // vindex
8796 Offsets.first, // voffset
8797 SOffset, // soffset
8798 Offsets.second, // offset
8799 Op.getOperand(6), // cachepolicy
8800 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8801 };
8802
8803 auto *M = cast<MemSDNode>(Op);
8804
8805 EVT MemVT = VData.getValueType();
8806 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8807 M->getMemOperand());
8808}
8809
8810SDValue
8811SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
8812 unsigned NewOpcode) const {
8813 SDLoc DL(Op);
8814
8815 SDValue VData = Op.getOperand(2);
8816 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8817 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
8818 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
8819 SDValue Ops[] = {
8820 Op.getOperand(0), // Chain
8821 VData, // vdata
8822 Rsrc, // rsrc
8823 Op.getOperand(4), // vindex
8824 Offsets.first, // voffset
8825 SOffset, // soffset
8826 Offsets.second, // offset
8827 Op.getOperand(7), // cachepolicy
8828 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8829 };
8830
8831 auto *M = cast<MemSDNode>(Op);
8832
8833 EVT MemVT = VData.getValueType();
8834 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8835 M->getMemOperand());
8836}
8837
8838SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
8839 SelectionDAG &DAG) const {
8840 unsigned IntrID = Op.getConstantOperandVal(1);
8841 SDLoc DL(Op);
8842
8843 switch (IntrID) {
8844 case Intrinsic::amdgcn_ds_ordered_add:
8845 case Intrinsic::amdgcn_ds_ordered_swap: {
8846 MemSDNode *M = cast<MemSDNode>(Op);
8847 SDValue Chain = M->getOperand(0);
8848 SDValue M0 = M->getOperand(2);
8849 SDValue Value = M->getOperand(3);
8850 unsigned IndexOperand = M->getConstantOperandVal(7);
8851 unsigned WaveRelease = M->getConstantOperandVal(8);
8852 unsigned WaveDone = M->getConstantOperandVal(9);
8853
8854 unsigned OrderedCountIndex = IndexOperand & 0x3f;
8855 IndexOperand &= ~0x3f;
8856 unsigned CountDw = 0;
8857
8858 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
8859 CountDw = (IndexOperand >> 24) & 0xf;
8860 IndexOperand &= ~(0xf << 24);
8861
8862 if (CountDw < 1 || CountDw > 4) {
8863 report_fatal_error(
8864 "ds_ordered_count: dword count must be between 1 and 4");
8865 }
8866 }
8867
8868 if (IndexOperand)
8869 report_fatal_error("ds_ordered_count: bad index operand");
8870
8871 if (WaveDone && !WaveRelease)
8872 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
8873
8874 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
8875 unsigned ShaderType =
8876 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
8877 unsigned Offset0 = OrderedCountIndex << 2;
8878 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
8879
8880 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
8881 Offset1 |= (CountDw - 1) << 6;
8882
8883 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
8884 Offset1 |= ShaderType << 2;
8885
8886 unsigned Offset = Offset0 | (Offset1 << 8);
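// For example, on GFX10 an amdgcn_ds_ordered_add (Instruction = 0) with
// index 1, wave_release = 1, wave_done = 0 and a dword count of 1 gives
// Offset0 = 1 << 2 = 4 and Offset1 = 1 | ((1 - 1) << 6) = 1 (assuming a
// shader type value of 0), so Offset = 4 | (1 << 8) = 0x104.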
8887
8888 SDValue Ops[] = {
8889 Chain,
8890 Value,
8891 DAG.getTargetConstant(Offset, DL, MVT::i16),
8892 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
8893 };
8894 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
8895 M->getVTList(), Ops, M->getMemoryVT(),
8896 M->getMemOperand());
8897 }
8898 case Intrinsic::amdgcn_raw_buffer_load:
8899 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8900 case Intrinsic::amdgcn_raw_buffer_load_format:
8901 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
8902 const bool IsFormat =
8903 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
8904 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
8905
8906 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8907 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
8908 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
8909 SDValue Ops[] = {
8910 Op.getOperand(0), // Chain
8911 Rsrc, // rsrc
8912 DAG.getConstant(0, DL, MVT::i32), // vindex
8913 Offsets.first, // voffset
8914 SOffset, // soffset
8915 Offsets.second, // offset
8916 Op.getOperand(5), // cachepolicy, swizzled buffer
8917 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8918 };
8919
8920 auto *M = cast<MemSDNode>(Op);
8921 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
8922 }
8923 case Intrinsic::amdgcn_struct_buffer_load:
8924 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8925 case Intrinsic::amdgcn_struct_buffer_load_format:
8926 case Intrinsic::amdgcn_struct_ptr_buffer_load_format: {
8927 const bool IsFormat =
8928 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
8929 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
8930
8931 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8932 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8933 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8934 SDValue Ops[] = {
8935 Op.getOperand(0), // Chain
8936 Rsrc, // rsrc
8937 Op.getOperand(3), // vindex
8938 Offsets.first, // voffset
8939 SOffset, // soffset
8940 Offsets.second, // offset
8941 Op.getOperand(6), // cachepolicy, swizzled buffer
8942 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8943 };
8944
8945 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
8946 }
8947 case Intrinsic::amdgcn_raw_tbuffer_load:
8948 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
8949 MemSDNode *M = cast<MemSDNode>(Op);
8950 EVT LoadVT = Op.getValueType();
8951 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8952 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
8953 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
8954
8955 SDValue Ops[] = {
8956 Op.getOperand(0), // Chain
8957 Rsrc, // rsrc
8958 DAG.getConstant(0, DL, MVT::i32), // vindex
8959 Offsets.first, // voffset
8960 SOffset, // soffset
8961 Offsets.second, // offset
8962 Op.getOperand(5), // format
8963 Op.getOperand(6), // cachepolicy, swizzled buffer
8964 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8965 };
8966
8967 if (LoadVT.getScalarType() == MVT::f16)
8968 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8969 M, DAG, Ops);
8970 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
8971 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
8972 DAG);
8973 }
8974 case Intrinsic::amdgcn_struct_tbuffer_load:
8975 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
8976 MemSDNode *M = cast<MemSDNode>(Op);
8977 EVT LoadVT = Op.getValueType();
8978 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8979 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8980 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8981
8982 SDValue Ops[] = {
8983 Op.getOperand(0), // Chain
8984 Rsrc, // rsrc
8985 Op.getOperand(3), // vindex
8986 Offsets.first, // voffset
8987 SOffset, // soffset
8988 Offsets.second, // offset
8989 Op.getOperand(6), // format
8990 Op.getOperand(7), // cachepolicy, swizzled buffer
8991 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8992 };
8993
8994 if (LoadVT.getScalarType() == MVT::f16)
8995 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8996 M, DAG, Ops);
8997 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
8998 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
8999 DAG);
9000 }
9001 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9002 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9003 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
9004 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9005 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9006 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
9007 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9008 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9009 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
9010 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9011 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9012 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
9013 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9014 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9015 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
9016 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9017 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9018 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
9019 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9020 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9021 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
9022 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9023 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9024 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9025 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9026 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9027 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9028 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9029 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9030 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
9031 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9032 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9033 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
9034 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9035 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9036 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
9037 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9038 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9039 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
9040 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9041 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9042 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9043 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9044 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9045 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9046 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9047 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9048 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9049 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9050 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9051 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9052 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9053 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9054 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9055 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9056 return lowerRawBufferAtomicIntrin(Op, DAG,
9057 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9058 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9059 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9060 return lowerStructBufferAtomicIntrin(Op, DAG,
9061 AMDGPUISD::BUFFER_ATOMIC_SWAP);
9062 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9063 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9064 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9065 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9066 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9067 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9068 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9069 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9070 return lowerStructBufferAtomicIntrin(Op, DAG,
9071 AMDGPUISD::BUFFER_ATOMIC_SMIN);
9072 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9073 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9074 return lowerStructBufferAtomicIntrin(Op, DAG,
9075 AMDGPUISD::BUFFER_ATOMIC_UMIN);
9076 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9077 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9078 return lowerStructBufferAtomicIntrin(Op, DAG,
9079 AMDGPUISD::BUFFER_ATOMIC_SMAX);
9080 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9081 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9082 return lowerStructBufferAtomicIntrin(Op, DAG,
9083 AMDGPUISD::BUFFER_ATOMIC_UMAX);
9084 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9085 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9086 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9087 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9088 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9089 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9090 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9091 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9092 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9093 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9094 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9095 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9096 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9097 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9098 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9099 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9100 return lowerStructBufferAtomicIntrin(Op, DAG,
9101 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9102
9103 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9104 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9105 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
9106 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9107 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9108 SDValue Ops[] = {
9109 Op.getOperand(0), // Chain
9110 Op.getOperand(2), // src
9111 Op.getOperand(3), // cmp
9112 Rsrc, // rsrc
9113 DAG.getConstant(0, DL, MVT::i32), // vindex
9114 Offsets.first, // voffset
9115 SOffset, // soffset
9116 Offsets.second, // offset
9117 Op.getOperand(7), // cachepolicy
9118 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9119 };
9120 EVT VT = Op.getValueType();
9121 auto *M = cast<MemSDNode>(Op);
9122
9123 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9124 Op->getVTList(), Ops, VT, M->getMemOperand());
9125 }
9126 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9127 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9128 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
9129 auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
9130 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
9131 SDValue Ops[] = {
9132 Op.getOperand(0), // Chain
9133 Op.getOperand(2), // src
9134 Op.getOperand(3), // cmp
9135 Rsrc, // rsrc
9136 Op.getOperand(5), // vindex
9137 Offsets.first, // voffset
9138 SOffset, // soffset
9139 Offsets.second, // offset
9140 Op.getOperand(8), // cachepolicy
9141 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9142 };
9143 EVT VT = Op.getValueType();
9144 auto *M = cast<MemSDNode>(Op);
9145
9146 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9147 Op->getVTList(), Ops, VT, M->getMemOperand());
9148 }
9149 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9150 MemSDNode *M = cast<MemSDNode>(Op);
9151 SDValue NodePtr = M->getOperand(2);
9152 SDValue RayExtent = M->getOperand(3);
9153 SDValue RayOrigin = M->getOperand(4);
9154 SDValue RayDir = M->getOperand(5);
9155 SDValue RayInvDir = M->getOperand(6);
9156 SDValue TDescr = M->getOperand(7);
9157
9158 assert(NodePtr.getValueType() == MVT::i32 ||
9159 NodePtr.getValueType() == MVT::i64);
9160 assert(RayDir.getValueType() == MVT::v3f16 ||
9161 RayDir.getValueType() == MVT::v3f32);
9162
9163 if (!Subtarget->hasGFX10_AEncoding()) {
9164 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
9165 return SDValue();
9166 }
9167
9168 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
9169 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9170 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9171 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
9172 const bool Is64 = NodePtr.getValueType() == MVT::i64;
9173 const unsigned NumVDataDwords = 4;
9174 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9175 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9176 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
9177 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
9178 IsGFX12Plus;
9179 const unsigned BaseOpcodes[2][2] = {
9180 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9181 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9182 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9183 int Opcode;
9184 if (UseNSA) {
9185 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9186 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9187 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9188 : AMDGPU::MIMGEncGfx10NSA,
9189 NumVDataDwords, NumVAddrDwords);
9190 } else {
9191 assert(!IsGFX12Plus);
9192 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9193 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9194 : AMDGPU::MIMGEncGfx10Default,
9195 NumVDataDwords, NumVAddrDwords);
9196 }
9197 assert(Opcode != -1);
9198
9199 SmallVector<SDValue, 16> Ops;
9200
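// packLanes appends a 3-component ray operand as dwords: 32-bit lanes are
// pushed directly, while f16 lanes are packed in pairs, either starting a
// new dword (IsAligned) or first completing the previously pushed lane.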
9201 auto packLanes = [&DAG, &Ops, &DL] (SDValue Op, bool IsAligned) {
9202 SmallVector<SDValue, 3> Lanes;
9203 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
9204 if (Lanes[0].getValueSizeInBits() == 32) {
9205 for (unsigned I = 0; I < 3; ++I)
9206 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
9207 } else {
9208 if (IsAligned) {
9209 Ops.push_back(
9210 DAG.getBitcast(MVT::i32,
9211 DAG.getBuildVector(MVT::v2f16, DL,
9212 { Lanes[0], Lanes[1] })));
9213 Ops.push_back(Lanes[2]);
9214 } else {
9215 SDValue Elt0 = Ops.pop_back_val();
9216 Ops.push_back(
9217 DAG.getBitcast(MVT::i32,
9218 DAG.getBuildVector(MVT::v2f16, DL,
9219 { Elt0, Lanes[0] })));
9220 Ops.push_back(
9221 DAG.getBitcast(MVT::i32,
9222 DAG.getBuildVector(MVT::v2f16, DL,
9223 { Lanes[1], Lanes[2] })));
9224 }
9225 }
9226 };
9227
9228 if (UseNSA && IsGFX11Plus) {
9229 Ops.push_back(NodePtr);
9230 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9231 Ops.push_back(RayOrigin);
9232 if (IsA16) {
9233 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
9234 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
9235 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
9236 for (unsigned I = 0; I < 3; ++I) {
9237 MergedLanes.push_back(DAG.getBitcast(
9238 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
9239 {DirLanes[I], InvDirLanes[I]})));
9240 }
9241 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
9242 } else {
9243 Ops.push_back(RayDir);
9244 Ops.push_back(RayInvDir);
9245 }
9246 } else {
9247 if (Is64)
9248 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
9249 2);
9250 else
9251 Ops.push_back(NodePtr);
9252
9253 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9254 packLanes(RayOrigin, true);
9255 packLanes(RayDir, true);
9256 packLanes(RayInvDir, false);
9257 }
9258
9259 if (!UseNSA) {
9260 // Build a single vector containing all the operands so far prepared.
9261 if (NumVAddrDwords > 12) {
9262 SDValue Undef = DAG.getUNDEF(MVT::i32);
9263 Ops.append(16 - Ops.size(), Undef);
9264 }
9265 assert(Ops.size() >= 8 && Ops.size() <= 12);
9266 SDValue MergedOps = DAG.getBuildVector(
9267 MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
9268 Ops.clear();
9269 Ops.push_back(MergedOps);
9270 }
9271
9272 Ops.push_back(TDescr);
9273 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
9274 Ops.push_back(M->getChain());
9275
9276 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
9277 MachineMemOperand *MemRef = M->getMemOperand();
9278 DAG.setNodeMemRefs(NewNode, {MemRef});
9279 return SDValue(NewNode, 0);
9280 }
9281 case Intrinsic::amdgcn_global_atomic_fmin:
9282 case Intrinsic::amdgcn_global_atomic_fmax:
9283 case Intrinsic::amdgcn_global_atomic_fmin_num:
9284 case Intrinsic::amdgcn_global_atomic_fmax_num:
9285 case Intrinsic::amdgcn_flat_atomic_fmin:
9286 case Intrinsic::amdgcn_flat_atomic_fmax:
9287 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9288 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9289 MemSDNode *M = cast<MemSDNode>(Op);
9290 SDValue Ops[] = {
9291 M->getOperand(0), // Chain
9292 M->getOperand(2), // Ptr
9293 M->getOperand(3) // Value
9294 };
9295 unsigned Opcode = 0;
9296 switch (IntrID) {
9297 case Intrinsic::amdgcn_global_atomic_fmin:
9298 case Intrinsic::amdgcn_global_atomic_fmin_num:
9299 case Intrinsic::amdgcn_flat_atomic_fmin:
9300 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9301 Opcode = ISD::ATOMIC_LOAD_FMIN;
9302 break;
9303 }
9304 case Intrinsic::amdgcn_global_atomic_fmax:
9305 case Intrinsic::amdgcn_global_atomic_fmax_num:
9306 case Intrinsic::amdgcn_flat_atomic_fmax:
9307 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9308 Opcode = ISD::ATOMIC_LOAD_FMAX;
9309 break;
9310 }
9311 default:
9312 llvm_unreachable("unhandled atomic opcode");
9313 }
9314 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
9315 Ops, M->getMemOperand());
9316 }
9317 case Intrinsic::amdgcn_s_get_barrier_state: {
9318 SDValue Chain = Op->getOperand(0);
9319 SmallVector<SDValue, 2> Ops;
9320 unsigned Opc;
9321 bool IsInlinableBarID = false;
9322 int64_t BarID;
9323
9324 if (isa<ConstantSDNode>(Op->getOperand(2))) {
9325 BarID = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue();
9326 IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarID);
9327 }
9328
9329 if (IsInlinableBarID) {
9330 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9331 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
9332 Ops.push_back(K);
9333 } else {
9334 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9335 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(2));
9336 Ops.push_back(M0Val.getValue(0));
9337 }
9338
9339 auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9340 return SDValue(NewMI, 0);
9341 }
9342 default:
9343
9344 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9345 AMDGPU::getImageDimIntrinsicInfo(IntrID))
9346 return lowerImage(Op, ImageDimIntr, DAG, true);
9347
9348 return SDValue();
9349 }
9350}
9351
9352// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
9353// dwordx4 if on SI and handle TFE loads.
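// With TFE the result carries an extra status dword, so e.g. a v3f32 TFE load
// is issued as a v4i32 operation and then split back into the value, the
// status dword and the chain.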
9354SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
9355 SDVTList VTList,
9356 ArrayRef<SDValue> Ops, EVT MemVT,
9357 MachineMemOperand *MMO,
9358 SelectionDAG &DAG) const {
9359 LLVMContext &C = *DAG.getContext();
9360 MachineFunction &MF = DAG.getMachineFunction();
9361 EVT VT = VTList.VTs[0];
9362
9363 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
9364 bool IsTFE = VTList.NumVTs == 3;
9365 if (IsTFE) {
9366 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
9367 unsigned NumOpDWords = NumValueDWords + 1;
9368 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
9369 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
9370 MachineMemOperand *OpDWordsMMO =
9371 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
9372 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
9373 OpDWordsVT, OpDWordsMMO, DAG);
9374 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
9375 DAG.getVectorIdxConstant(NumValueDWords, DL));
9376 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
9377 SDValue ValueDWords =
9378 NumValueDWords == 1
9379 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
9380 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
9381 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
9382 ZeroIdx);
9383 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
9384 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
9385 }
9386
9387 if (!Subtarget->hasDwordx3LoadStores() &&
9388 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9389 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
9390 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
9391 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
9392 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
9393 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
9394 WidenedMemVT, WidenedMMO);
9395 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
9396 DAG.getVectorIdxConstant(0, DL));
9397 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
9398 }
9399
9400 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
9401}
9402
9403SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
9404 bool ImageStore) const {
9405 EVT StoreVT = VData.getValueType();
9406
9407 // No change for f16 and legal vector D16 types.
9408 if (!StoreVT.isVector())
9409 return VData;
9410
9411 SDLoc DL(VData);
9412 unsigned NumElements = StoreVT.getVectorNumElements();
9413
9414 if (Subtarget->hasUnpackedD16VMem()) {
9415 // We need to unpack the packed data to store.
9416 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9417 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9418
9419 EVT EquivStoreVT =
9420 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
9421 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
9422 return DAG.UnrollVectorOp(ZExt.getNode());
9423 }
9424
9425 // The sq block of gfx8.1 does not estimate register use correctly for d16
9426 // image store instructions. The data operand is computed as if it were not a
9427 // d16 image instruction.
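// As a workaround, e.g. a v4f16 operand is repacked into two i32 dwords (one
// per pair of halves) and then padded with undef up to the unpacked element
// count before being passed to the image store.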
9428 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
9429 // Bitcast to i16
9430 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9431 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9432
9433 // Decompose into scalars
9434 SmallVector<SDValue, 4> Elts;
9435 DAG.ExtractVectorElements(IntVData, Elts);
9436
9437 // Group pairs of i16 into v2i16 and bitcast to i32
9438 SmallVector<SDValue, 4> PackedElts;
9439 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
9440 SDValue Pair =
9441 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
9442 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9443 PackedElts.push_back(IntPair);
9444 }
9445 if ((NumElements % 2) == 1) {
9446 // Handle v3i16
9447 unsigned I = Elts.size() / 2;
9448 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
9449 {Elts[I * 2], DAG.getUNDEF(MVT::i16)});
9450 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9451 PackedElts.push_back(IntPair);
9452 }
9453
9454 // Pad using UNDEF
9455 PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32));
9456
9457 // Build final vector
9458 EVT VecVT =
9459 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
9460 return DAG.getBuildVector(VecVT, DL, PackedElts);
9461 }
9462
9463 if (NumElements == 3) {
9464 EVT IntStoreVT =
9465 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
9466 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9467
9468 EVT WidenedStoreVT = EVT::getVectorVT(
9469 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
9470 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
9471 WidenedStoreVT.getStoreSizeInBits());
9472 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
9473 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
9474 }
9475
9476 assert(isTypeLegal(StoreVT));
9477 return VData;
9478}
9479
9480SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
9481 SelectionDAG &DAG) const {
9482 SDLoc DL(Op);
9483 SDValue Chain = Op.getOperand(0);
9484 unsigned IntrinsicID = Op.getConstantOperandVal(1);
9485 MachineFunction &MF = DAG.getMachineFunction();
9486
9487 switch (IntrinsicID) {
9488 case Intrinsic::amdgcn_exp_compr: {
9489 if (!Subtarget->hasCompressedExport()) {
9490 DiagnosticInfoUnsupported BadIntrin(
9492 "intrinsic not supported on subtarget", DL.getDebugLoc());
9493 DAG.getContext()->diagnose(BadIntrin);
9494 }
9495 SDValue Src0 = Op.getOperand(4);
9496 SDValue Src1 = Op.getOperand(5);
9497 // Hack around illegal type on SI by directly selecting it.
9498 if (isTypeLegal(Src0.getValueType()))
9499 return SDValue();
9500
9501 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
9502 SDValue Undef = DAG.getUNDEF(MVT::f32);
9503 const SDValue Ops[] = {
9504 Op.getOperand(2), // tgt
9505 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
9506 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
9507 Undef, // src2
9508 Undef, // src3
9509 Op.getOperand(7), // vm
9510 DAG.getTargetConstant(1, DL, MVT::i1), // compr
9511 Op.getOperand(3), // en
9512 Op.getOperand(0) // Chain
9513 };
9514
9515 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9516 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
9517 }
9518 case Intrinsic::amdgcn_s_barrier: {
9519 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
9520 if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
9521 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
9522 if (WGSize <= ST.getWavefrontSize())
9523 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
9524 Op.getOperand(0)), 0);
9525 }
9526
9527 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
9528 if (ST.hasSplitBarriers()) {
9529 SDValue K =
9530 DAG.getTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
9531 SDValue BarSignal =
9532 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
9533 MVT::Other, K, Op.getOperand(0)),
9534 0);
9535 SDValue BarWait =
9536 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
9537 BarSignal.getValue(0)),
9538 0);
9539 return BarWait;
9540 }
9541
9542 return SDValue();
9543 };
9544
9545 case Intrinsic::amdgcn_struct_tbuffer_store:
9546 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9547 SDValue VData = Op.getOperand(2);
9548 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9549 if (IsD16)
9550 VData = handleD16VData(VData, DAG);
9551 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9552 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9553 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9554 SDValue Ops[] = {
9555 Chain,
9556 VData, // vdata
9557 Rsrc, // rsrc
9558 Op.getOperand(4), // vindex
9559 Offsets.first, // voffset
9560 SOffset, // soffset
9561 Offsets.second, // offset
9562 Op.getOperand(7), // format
9563 Op.getOperand(8), // cachepolicy, swizzled buffer
9564 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9565 };
9566 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9567 AMDGPUISD::TBUFFER_STORE_FORMAT;
9568 MemSDNode *M = cast<MemSDNode>(Op);
9569 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9570 M->getMemoryVT(), M->getMemOperand());
9571 }
9572
9573 case Intrinsic::amdgcn_raw_tbuffer_store:
9574 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9575 SDValue VData = Op.getOperand(2);
9576 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9577 if (IsD16)
9578 VData = handleD16VData(VData, DAG);
9579 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9580 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
9581 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9582 SDValue Ops[] = {
9583 Chain,
9584 VData, // vdata
9585 Rsrc, // rsrc
9586 DAG.getConstant(0, DL, MVT::i32), // vindex
9587 Offsets.first, // voffset
9588 SOffset, // soffset
9589 Offsets.second, // offset
9590 Op.getOperand(6), // format
9591 Op.getOperand(7), // cachepolicy, swizzled buffer
9592 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9593 };
9594 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9595 AMDGPUISD::TBUFFER_STORE_FORMAT;
9596 MemSDNode *M = cast<MemSDNode>(Op);
9597 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9598 M->getMemoryVT(), M->getMemOperand());
9599 }
9600
9601 case Intrinsic::amdgcn_raw_buffer_store:
9602 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9603 case Intrinsic::amdgcn_raw_buffer_store_format:
9604 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9605 const bool IsFormat =
9606 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9607 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9608
9609 SDValue VData = Op.getOperand(2);
9610 EVT VDataVT = VData.getValueType();
9611 EVT EltType = VDataVT.getScalarType();
9612 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9613 if (IsD16) {
9614 VData = handleD16VData(VData, DAG);
9615 VDataVT = VData.getValueType();
9616 }
9617
9618 if (!isTypeLegal(VDataVT)) {
9619 VData =
9620 DAG.getNode(ISD::BITCAST, DL,
9621 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9622 }
9623
9624 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9625 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
9626 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9627 SDValue Ops[] = {
9628 Chain,
9629 VData,
9630 Rsrc,
9631 DAG.getConstant(0, DL, MVT::i32), // vindex
9632 Offsets.first, // voffset
9633 SOffset, // soffset
9634 Offsets.second, // offset
9635 Op.getOperand(6), // cachepolicy, swizzled buffer
9636 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9637 };
9638 unsigned Opc =
9639 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9640 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9641 MemSDNode *M = cast<MemSDNode>(Op);
9642
9643 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9644 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9645 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
9646
9647 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9648 M->getMemoryVT(), M->getMemOperand());
9649 }
9650
9651 case Intrinsic::amdgcn_struct_buffer_store:
9652 case Intrinsic::amdgcn_struct_ptr_buffer_store:
9653 case Intrinsic::amdgcn_struct_buffer_store_format:
9654 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9655 const bool IsFormat =
9656 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9657 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9658
9659 SDValue VData = Op.getOperand(2);
9660 EVT VDataVT = VData.getValueType();
9661 EVT EltType = VDataVT.getScalarType();
9662 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9663
9664 if (IsD16) {
9665 VData = handleD16VData(VData, DAG);
9666 VDataVT = VData.getValueType();
9667 }
9668
9669 if (!isTypeLegal(VDataVT)) {
9670 VData =
9671 DAG.getNode(ISD::BITCAST, DL,
9672 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9673 }
9674
9675 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9676 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9677 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9678 SDValue Ops[] = {
9679 Chain,
9680 VData,
9681 Rsrc,
9682 Op.getOperand(4), // vindex
9683 Offsets.first, // voffset
9684 SOffset, // soffset
9685 Offsets.second, // offset
9686 Op.getOperand(7), // cachepolicy, swizzled buffer
9687 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9688 };
9689 unsigned Opc =
9690 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9691 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9692 MemSDNode *M = cast<MemSDNode>(Op);
9693
9694 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9695 EVT VDataType = VData.getValueType().getScalarType();
9696 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9697 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9698
9699 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9700 M->getMemoryVT(), M->getMemOperand());
9701 }
9702 case Intrinsic::amdgcn_raw_buffer_load_lds:
9703 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9704 case Intrinsic::amdgcn_struct_buffer_load_lds:
9705 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9706 assert(!AMDGPU::isGFX12Plus(*Subtarget));
9707 unsigned Opc;
9708 bool HasVIndex =
9709 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9710 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9711 unsigned OpOffset = HasVIndex ? 1 : 0;
9712 SDValue VOffset = Op.getOperand(5 + OpOffset);
9713 bool HasVOffset = !isNullConstant(VOffset);
9714 unsigned Size = Op->getConstantOperandVal(4);
9715
9716 switch (Size) {
9717 default:
9718 return SDValue();
9719 case 1:
9720 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9721 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9722 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9723 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9724 break;
9725 case 2:
9726 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9727 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9728 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9729 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9730 break;
9731 case 4:
9732 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9733 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9734 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9735 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
9736 break;
9737 }
9738
9739 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9740
9741 SmallVector<SDValue, 8> Ops;
9742
9743 if (HasVIndex && HasVOffset)
9744 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
9745 { Op.getOperand(5), // VIndex
9746 VOffset }));
9747 else if (HasVIndex)
9748 Ops.push_back(Op.getOperand(5));
9749 else if (HasVOffset)
9750 Ops.push_back(VOffset);
9751
9752 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9753 Ops.push_back(Rsrc);
9754 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
9755 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
9756 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
9757 Ops.push_back(
9758 DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol
9759 Ops.push_back(DAG.getTargetConstant(
9760 Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0, DL, MVT::i8)); // swz
9761 Ops.push_back(M0Val.getValue(0)); // Chain
9762 Ops.push_back(M0Val.getValue(1)); // Glue
9763
9764 auto *M = cast<MemSDNode>(Op);
9765 MachineMemOperand *LoadMMO = M->getMemOperand();
9766 // Don't set the offset value here because the pointer points to the base of
9767 // the buffer.
9768 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9769
9770 MachinePointerInfo StorePtrI = LoadPtrI;
9771 LoadPtrI.V = PoisonValue::get(
9772 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
9773 StorePtrI.V = nullptr;
9774 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
9775
9776 auto F = LoadMMO->getFlags() &
9777 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9778 LoadMMO =
9779 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
9780 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9781
9782 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9783 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
9784 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9785
9786 auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
9787 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9788
9789 return SDValue(Load, 0);
9790 }
9791 case Intrinsic::amdgcn_global_load_lds: {
9792 unsigned Opc;
9793 unsigned Size = Op->getConstantOperandVal(4);
9794 switch (Size) {
9795 default:
9796 return SDValue();
9797 case 1:
9798 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
9799 break;
9800 case 2:
9801 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
9802 break;
9803 case 4:
9804 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
9805 break;
9806 }
9807
9808 auto *M = cast<MemSDNode>(Op);
9809 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9810
9811 SmallVector<SDValue, 6> Ops;
9812
9813 SDValue Addr = Op.getOperand(2); // Global ptr
9814 SDValue VOffset;
9815 // Try to split SAddr and VOffset. Global and LDS pointers share the same
9816 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
9817 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
9818 SDValue LHS = Addr.getOperand(0);
9819 SDValue RHS = Addr.getOperand(1);
9820
9821 if (LHS->isDivergent())
9822 std::swap(LHS, RHS);
9823
9824 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
9825 RHS.getOperand(0).getValueType() == MVT::i32) {
9826 // add (i64 sgpr), (zero_extend (i32 vgpr))
9827 Addr = LHS;
9828 VOffset = RHS.getOperand(0);
9829 }
9830 }
9831
9832 Ops.push_back(Addr);
9833 if (!Addr->isDivergent()) {
9834 Opc = AMDGPU::getGlobalSaddrOp(Opc);
9835 if (!VOffset)
9836 VOffset = SDValue(
9837 DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
9838 DAG.getTargetConstant(0, DL, MVT::i32)), 0);
9839 Ops.push_back(VOffset);
9840 }
9841
9842 Ops.push_back(Op.getOperand(5)); // Offset
9843 Ops.push_back(Op.getOperand(6)); // CPol
9844 Ops.push_back(M0Val.getValue(0)); // Chain
9845 Ops.push_back(M0Val.getValue(1)); // Glue
9846
9847 MachineMemOperand *LoadMMO = M->getMemOperand();
9848 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9849 LoadPtrI.Offset = Op->getConstantOperandVal(5);
9850 MachinePointerInfo StorePtrI = LoadPtrI;
9851 LoadPtrI.V = PoisonValue::get(
9852 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
9853 StorePtrI.V = nullptr;
9854 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
9855 auto F = LoadMMO->getFlags() &
9856 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9857 LoadMMO =
9858 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
9859 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9860 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9861 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
9862 LoadMMO->getAAInfo());
9863
9864 auto Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9865 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9866
9867 return SDValue(Load, 0);
9868 }
9869 case Intrinsic::amdgcn_end_cf:
9870 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
9871 Op->getOperand(2), Chain), 0);
9872 case Intrinsic::amdgcn_s_barrier_init:
9873 case Intrinsic::amdgcn_s_barrier_join:
9874 case Intrinsic::amdgcn_s_wakeup_barrier: {
9875 SDValue Chain = Op->getOperand(0);
9876 SmallVector<SDValue, 2> Ops;
9877 SDValue BarOp = Op->getOperand(2);
9878 unsigned Opc;
9879 bool IsInlinableBarID = false;
9880 int64_t BarVal;
9881
9882 if (isa<ConstantSDNode>(BarOp)) {
9883 BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue();
9884 IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarVal);
9885 }
9886
9887 if (IsInlinableBarID) {
9888 switch (IntrinsicID) {
9889 default:
9890 return SDValue();
9891 case Intrinsic::amdgcn_s_barrier_init:
9892 Opc = AMDGPU::S_BARRIER_INIT_IMM;
9893 break;
9894 case Intrinsic::amdgcn_s_barrier_join:
9895 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
9896 break;
9897 case Intrinsic::amdgcn_s_wakeup_barrier:
9898 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
9899 break;
9900 }
9901
9902 SDValue K = DAG.getTargetConstant(BarVal, DL, MVT::i32);
9903 Ops.push_back(K);
9904 } else {
9905 switch (IntrinsicID) {
9906 default:
9907 return SDValue();
9908 case Intrinsic::amdgcn_s_barrier_init:
9909 Opc = AMDGPU::S_BARRIER_INIT_M0;
9910 break;
9911 case Intrinsic::amdgcn_s_barrier_join:
9912 Opc = AMDGPU::S_BARRIER_JOIN_M0;
9913 break;
9914 case Intrinsic::amdgcn_s_wakeup_barrier:
9915 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
9916 break;
9917 }
9918 }
9919
9920 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) {
9921 SDValue M0Val;
9922 // Member count will be read from M0[16:22]
9923 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, Op.getOperand(3),
9924 DAG.getShiftAmountConstant(16, MVT::i32, DL));
9925
9926 if (!IsInlinableBarID) {
9927 // If reference to barrier id is not an inline constant then it must be
9928 // referenced with M0[4:0]. Perform an OR with the member count to
9929 // include it in M0.
9930 M0Val = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32,
9931 Op.getOperand(2), M0Val),
9932 0);
9933 }
9934 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
9935 } else if (!IsInlinableBarID) {
9936 Ops.push_back(copyToM0(DAG, Chain, DL, BarOp).getValue(0));
9937 }
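// For s_barrier_init this leaves, e.g., a member count of 32 with a
// non-inline barrier id of 5 encoded in M0 as (32 << 16) | 5 = 0x200005.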
9938
9939 auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9940 return SDValue(NewMI, 0);
9941 }
9942 default: {
9943 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9945 return lowerImage(Op, ImageDimIntr, DAG, true);
9946
9947 return Op;
9948 }
9949 }
9950}
9951
9952// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
9953// offset (the offset that is included in bounds checking and swizzling, to be
9954// split between the instruction's voffset and immoffset fields) and soffset
9955// (the offset that is excluded from bounds checking and swizzling, to go in
9956// the instruction's soffset field). This function takes the first kind of
9957// offset and figures out how to split it between voffset and immoffset.
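// For example, assuming a 4095-byte immediate limit (the pre-GFX12 MUBUF
// encoding), a combined offset of 8196 is split into an immoffset of 4 and a
// voffset contribution of 8192.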
9958std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
9959 SDValue Offset, SelectionDAG &DAG) const {
9960 SDLoc DL(Offset);
9961 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
9962 SDValue N0 = Offset;
9963 ConstantSDNode *C1 = nullptr;
9964
9965 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
9966 N0 = SDValue();
9967 else if (DAG.isBaseWithConstantOffset(N0)) {
9968 C1 = cast<ConstantSDNode>(N0.getOperand(1));
9969 N0 = N0.getOperand(0);
9970 }
9971
9972 if (C1) {
9973 unsigned ImmOffset = C1->getZExtValue();
9974 // If the immediate value is too big for the immoffset field, put only bits
9975 // that would normally fit in the immoffset field. The remaining value that
9976 // is copied/added for the voffset field is a large power of 2, and it
9977 // stands more chance of being CSEd with the copy/add for another similar
9978 // load/store.
9979 // However, do not do that rounding down if that is a negative
9980 // number, as it appears to be illegal to have a negative offset in the
9981 // vgpr, even if adding the immediate offset makes it positive.
9982 unsigned Overflow = ImmOffset & ~MaxImm;
9983 ImmOffset -= Overflow;
9984 if ((int32_t)Overflow < 0) {
9985 Overflow += ImmOffset;
9986 ImmOffset = 0;
9987 }
9988 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
9989 if (Overflow) {
9990 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
9991 if (!N0)
9992 N0 = OverflowVal;
9993 else {
9994 SDValue Ops[] = { N0, OverflowVal };
9995 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
9996 }
9997 }
9998 }
9999 if (!N0)
10000 N0 = DAG.getConstant(0, DL, MVT::i32);
10001 if (!C1)
10002 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
10003 return {N0, SDValue(C1, 0)};
10004}
10005
10006// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
10007// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
10008// pointed to by Offsets.
10009void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
10010 SelectionDAG &DAG, SDValue *Offsets,
10011 Align Alignment) const {
10012 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10013 SDLoc DL(CombinedOffset);
10014 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
10015 uint32_t Imm = C->getZExtValue();
10016 uint32_t SOffset, ImmOffset;
10017 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10018 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
10019 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10020 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10021 return;
10022 }
10023 }
10024 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
10025 SDValue N0 = CombinedOffset.getOperand(0);
10026 SDValue N1 = CombinedOffset.getOperand(1);
10027 uint32_t SOffset, ImmOffset;
10028 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10029 if (Offset >= 0 &&
10030 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
10031 Offsets[0] = N0;
10032 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10033 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10034 return;
10035 }
10036 }
10037
10038 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
10039 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
10040 : DAG.getConstant(0, DL, MVT::i32);
10041
10042 Offsets[0] = CombinedOffset;
10043 Offsets[1] = SOffsetZero;
10044 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
10045}
10046
10047SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
10048 SelectionDAG &DAG) const {
10049 if (!MaybePointer.getValueType().isScalarInteger())
10050 return MaybePointer;
10051
10052 SDLoc DL(MaybePointer);
10053
10054 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
10055 return Rsrc;
10056}
10057
10058// Wrap a global or flat pointer into a buffer intrinsic using the flags
10059// specified in the intrinsic.
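// The resulting v4i32 descriptor is { base[31:0], base[47:32] | (stride << 16),
// NumRecords, Flags }, i.e. the stride is folded into the upper half of the
// 48-bit base address before the vector is bitcast back to i128.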
10060SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
10061 SelectionDAG &DAG) const {
10062 SDLoc Loc(Op);
10063
10064 SDValue Pointer = Op->getOperand(1);
10065 SDValue Stride = Op->getOperand(2);
10066 SDValue NumRecords = Op->getOperand(3);
10067 SDValue Flags = Op->getOperand(4);
10068
10069 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
10070 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
10071 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
10072 std::optional<uint32_t> ConstStride = std::nullopt;
10073 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
10074 ConstStride = ConstNode->getZExtValue();
10075
10076 SDValue NewHighHalf = Masked;
10077 if (!ConstStride || *ConstStride != 0) {
10078 SDValue ShiftedStride;
10079 if (ConstStride) {
10080 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
10081 } else {
10082 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
10083 ShiftedStride =
10084 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
10085 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
10086 }
10087 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
10088 }
10089
10090 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
10091 NewHighHalf, NumRecords, Flags);
10092 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
10093 return RsrcPtr;
10094}
10095
10096 // Handle 8-bit and 16-bit buffer loads
10097SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
10098 EVT LoadVT, SDLoc DL,
10099 ArrayRef<SDValue> Ops,
10100 MachineMemOperand *MMO,
10101 bool IsTFE) const {
10102 EVT IntVT = LoadVT.changeTypeToInteger();
10103
10104 if (IsTFE) {
10105 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
10106 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
10107 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
10108 MachineFunction &MF = DAG.getMachineFunction();
10109 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
10110 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
10111 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
10112 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10113 DAG.getConstant(1, DL, MVT::i32));
10114 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10115 DAG.getConstant(0, DL, MVT::i32));
10116 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
10117 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
10118 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10119 }
10120
10121 unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
10122 AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;
10123
10124 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
10125 SDValue BufferLoad =
10126 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
10127 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
10128 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
10129
10130 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
10131}
10132
10133 // Handle 8-bit and 16-bit buffer stores
10134SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
10135 EVT VDataType, SDLoc DL,
10136 SDValue Ops[],
10137 MemSDNode *M) const {
10138 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
10139 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
10140
10141 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
10142 Ops[1] = BufferStoreExt;
10143 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
10144 AMDGPUISD::BUFFER_STORE_SHORT;
10145 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
10146 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
10147 M->getMemOperand());
10148}
10149
10150static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
10151 ISD::LoadExtType ExtType, SDValue Op,
10152 const SDLoc &SL, EVT VT) {
10153 if (VT.bitsLT(Op.getValueType()))
10154 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
10155
10156 switch (ExtType) {
10157 case ISD::SEXTLOAD:
10158 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
10159 case ISD::ZEXTLOAD:
10160 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
10161 case ISD::EXTLOAD:
10162 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
10163 case ISD::NON_EXTLOAD:
10164 return Op;
10165 }
10166
10167 llvm_unreachable("invalid ext type");
10168}
10169
10170// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
10171// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
10172SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
10173 SelectionDAG &DAG = DCI.DAG;
10174 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
10175 return SDValue();
10176
10177 // FIXME: Constant loads should all be marked invariant.
10178 unsigned AS = Ld->getAddressSpace();
10179 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
10180 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
10181 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
10182 return SDValue();
10183
10184 // Don't do this early, since it may interfere with adjacent load merging for
10185 // illegal types. We can avoid losing alignment information for exotic types
10186 // pre-legalize.
10187 EVT MemVT = Ld->getMemoryVT();
10188 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
10189 MemVT.getSizeInBits() >= 32)
10190 return SDValue();
10191
10192 SDLoc SL(Ld);
10193
10194 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
10195 "unexpected vector extload");
10196
10197 // TODO: Drop only high part of range.
10198 SDValue Ptr = Ld->getBasePtr();
10199 SDValue NewLoad = DAG.getLoad(
10200 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
10201 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
10202 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
10203 nullptr); // Drop ranges
10204
10205 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
10206 if (MemVT.isFloatingPoint()) {
10208 "unexpected fp extload");
10209 TruncVT = MemVT.changeTypeToInteger();
10210 }
10211
10212 SDValue Cvt = NewLoad;
10213 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
10214 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
10215 DAG.getValueType(TruncVT));
10216 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
10217 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
10218 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
10219 } else {
10220 assert(Ld->getExtensionType() == ISD::EXTLOAD);
10221 }
10222
10223 EVT VT = Ld->getValueType(0);
10224 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
10225
10226 DCI.AddToWorklist(Cvt.getNode());
10227
10228 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
10229 // the appropriate extension from the 32-bit load.
10230 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
10231 DCI.AddToWorklist(Cvt.getNode());
10232
10233 // Handle conversion back to floating point if necessary.
10234 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
10235
10236 return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
10237}
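// Illustrative sketch (not part of the lowering above; names are hypothetical):
// what the extend-in-register nodes compute for the common i8 case, assuming
// the little-endian AMDGPU layout where the widened 32-bit load returns the
// original byte in its low 8 bits.
#include <cstdint>

static int32_t signExtendInRegI8(uint32_t Wide) {
  return static_cast<int32_t>(Wide << 24) >> 24;  // ISD::SIGN_EXTEND_INREG
}
static uint32_t zeroExtendInRegI8(uint32_t Wide) {
  return Wide & 0xffu;                            // DAG.getZeroExtendInReg
}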
10238
10239static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
10240 const SIMachineFunctionInfo &Info) {
10241 // TODO: Should check if the address can definitely not access stack.
10242 if (Info.isEntryFunction())
10243 return Info.getUserSGPRInfo().hasFlatScratchInit();
10244 return true;
10245}
10246
10247SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
10248 SDLoc DL(Op);
10249 LoadSDNode *Load = cast<LoadSDNode>(Op);
10250 ISD::LoadExtType ExtType = Load->getExtensionType();
10251 EVT MemVT = Load->getMemoryVT();
10252
10253 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
10254 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
10255 return SDValue();
10256
10257 // FIXME: Copied from PPC
10258 // First, load into 32 bits, then truncate to 1 bit.
10259
10260 SDValue Chain = Load->getChain();
10261 SDValue BasePtr = Load->getBasePtr();
10262 MachineMemOperand *MMO = Load->getMemOperand();
10263
10264 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10265
10266 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
10267 BasePtr, RealMemVT, MMO);
10268
10269 if (!MemVT.isVector()) {
10270 SDValue Ops[] = {
10271 DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
10272 NewLD.getValue(1)
10273 };
10274
10275 return DAG.getMergeValues(Ops, DL);
10276 }
10277
10278 SmallVector<SDValue, 3> Elts;
10279 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
10280 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
10281 DAG.getConstant(I, DL, MVT::i32));
10282
10283 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
10284 }
10285
10286 SDValue Ops[] = {
10287 DAG.getBuildVector(MemVT, DL, Elts),
10288 NewLD.getValue(1)
10289 };
10290
10291 return DAG.getMergeValues(Ops, DL);
10292 }
10293
10294 if (!MemVT.isVector())
10295 return SDValue();
10296
10297 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
10298 "Custom lowering for non-i32 vectors hasn't been implemented.");
10299
10300 Align Alignment = Load->getAlign();
10301 unsigned AS = Load->getAddressSpace();
10302 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
10303 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
10304 return SplitVectorLoad(Op, DAG);
10305 }
10306
10307 MachineFunction &MF = DAG.getMachineFunction();
10308 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10309 // If there is a possibility that a flat instruction may access scratch
10310 // memory, then we need to use the same legalization rules we use for private.
10311 if (AS == AMDGPUAS::FLAT_ADDRESS &&
10312 !Subtarget->hasMultiDwordFlatScratchAddressing())
10313 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI) ?
10314 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
10315
10316 unsigned NumElements = MemVT.getVectorNumElements();
10317
10318 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10319 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
10320 if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
10321 if (MemVT.isPow2VectorType() ||
10322 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10323 return SDValue();
10324 return WidenOrSplitVectorLoad(Op, DAG);
10325 }
10326 // Non-uniform loads will be selected to MUBUF instructions, so they
10327 // have the same legalization requirements as global and private
10328 // loads.
10329 //
10330 }
10331
10332 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10333 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10334 AS == AMDGPUAS::GLOBAL_ADDRESS) {
10335 if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
10336 Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) &&
10337 Alignment >= Align(4) && NumElements < 32) {
10338 if (MemVT.isPow2VectorType() ||
10339 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10340 return SDValue();
10341 return WidenOrSplitVectorLoad(Op, DAG);
10342 }
10343 // Non-uniform loads will be selected to MUBUF instructions, so they
10344 // have the same legalization requirements as global and private
10345 // loads.
10346 //
10347 }
10348 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10349 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10350 AS == AMDGPUAS::GLOBAL_ADDRESS ||
10351 AS == AMDGPUAS::FLAT_ADDRESS) {
10352 if (NumElements > 4)
10353 return SplitVectorLoad(Op, DAG);
10354 // v3 loads not supported on SI.
10355 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10356 return WidenOrSplitVectorLoad(Op, DAG);
10357
10358 // v3 and v4 loads are supported for private and global memory.
10359 return SDValue();
10360 }
10361 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10362 // Depending on the setting of the private_element_size field in the
10363 // resource descriptor, we can only make private accesses up to a certain
10364 // size.
10365 switch (Subtarget->getMaxPrivateElementSize()) {
10366 case 4: {
10367 SDValue Ops[2];
10368 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
10369 return DAG.getMergeValues(Ops, DL);
10370 }
10371 case 8:
10372 if (NumElements > 2)
10373 return SplitVectorLoad(Op, DAG);
10374 return SDValue();
10375 case 16:
10376 // Same as global/flat
10377 if (NumElements > 4)
10378 return SplitVectorLoad(Op, DAG);
10379 // v3 loads not supported on SI.
10380 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10381 return WidenOrSplitVectorLoad(Op, DAG);
10382
10383 return SDValue();
10384 default:
10385 llvm_unreachable("unsupported private_element_size");
10386 }
10387 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10388 unsigned Fast = 0;
10389 auto Flags = Load->getMemOperand()->getFlags();
10390 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
10391 Load->getAlign(), Flags, &Fast) &&
10392 Fast > 1)
10393 return SDValue();
10394
10395 if (MemVT.isVector())
10396 return SplitVectorLoad(Op, DAG);
10397 }
10398
10399 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10400 MemVT, *Load->getMemOperand())) {
10401 SDValue Ops[2];
10402 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
10403 return DAG.getMergeValues(Ops, DL);
10404 }
10405
10406 return SDValue();
10407}
10408
10409SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
10410 EVT VT = Op.getValueType();
10411 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
10412 VT.getSizeInBits() == 512)
10413 return splitTernaryVectorOp(Op, DAG);
10414
10415 assert(VT.getSizeInBits() == 64);
10416
10417 SDLoc DL(Op);
10418 SDValue Cond = Op.getOperand(0);
10419
10420 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
10421 SDValue One = DAG.getConstant(1, DL, MVT::i32);
10422
10423 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10424 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
10425
10426 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
10427 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
10428
10429 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
10430
10431 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
10432 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
10433
10434 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
10435
10436 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
10437 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
10438}
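// Illustrative sketch (not part of the lowering above; names are hypothetical):
// the 64-bit select is legal to split because selecting each 32-bit half with
// the same condition and recombining gives the same value.
#include <cstdint>

static uint64_t select64ViaHalvesSketch(bool Cond, uint64_t A, uint64_t B) {
  uint32_t Lo = Cond ? static_cast<uint32_t>(A) : static_cast<uint32_t>(B);
  uint32_t Hi = Cond ? static_cast<uint32_t>(A >> 32)
                     : static_cast<uint32_t>(B >> 32);
  return (static_cast<uint64_t>(Hi) << 32) | Lo;  // BUILD_VECTOR + BITCAST
}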
10439
10440// Catch division cases where we can use shortcuts with rcp and rsq
10441// instructions.
10442SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
10443 SelectionDAG &DAG) const {
10444 SDLoc SL(Op);
10445 SDValue LHS = Op.getOperand(0);
10446 SDValue RHS = Op.getOperand(1);
10447 EVT VT = Op.getValueType();
10448 const SDNodeFlags Flags = Op->getFlags();
10449
10450 bool AllowInaccurateRcp = Flags.hasApproximateFuncs() ||
10451 DAG.getTarget().Options.UnsafeFPMath;
10452
10453 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
10454 // Without !fpmath accuracy information, we can't do more because we don't
10455 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
10456 // f16 is always accurate enough
10457 if (!AllowInaccurateRcp && VT != MVT::f16)
10458 return SDValue();
10459
10460 if (CLHS->isExactlyValue(1.0)) {
10461 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
10462 // the CI documentation has a worst case error of 1 ulp.
10463 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
10464 // use it as long as we aren't trying to use denormals.
10465 //
10466 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
10467
10468 // 1.0 / sqrt(x) -> rsq(x)
10469
10470 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
10471 // error seems really high at 2^29 ULP.
10472 // 1.0 / x -> rcp(x)
10473 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10474 }
10475
10476 // Same as for 1.0, but expand the sign out of the constant.
10477 if (CLHS->isExactlyValue(-1.0)) {
10478 // -1.0 / x -> rcp (fneg x)
10479 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
10480 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
10481 }
10482 }
10483
10484 // For f16 require afn or arcp.
10485 // For f32 require afn.
10486 if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
10487 return SDValue();
10488
10489 // Turn into multiply by the reciprocal.
10490 // x / y -> x * (1.0 / y)
10491 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10492 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
10493}
10494
10495SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
10496 SelectionDAG &DAG) const {
10497 SDLoc SL(Op);
10498 SDValue X = Op.getOperand(0);
10499 SDValue Y = Op.getOperand(1);
10500 EVT VT = Op.getValueType();
10501 const SDNodeFlags Flags = Op->getFlags();
10502
10503 bool AllowInaccurateDiv = Flags.hasApproximateFuncs() ||
10504 DAG.getTarget().Options.UnsafeFPMath;
10505 if (!AllowInaccurateDiv)
10506 return SDValue();
10507
10508 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
10509 SDValue One = DAG.getConstantFP(1.0, SL, VT);
10510
10511 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
10512 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10513
10514 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
10515 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10516 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
10517 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
10518 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
10519 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
10520}
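// Illustrative sketch (not part of the lowering above; names are hypothetical):
// the same FMA sequence evaluated numerically, with a float-precision
// reciprocal standing in for AMDGPUISD::RCP. Two Newton-Raphson steps refine
// the reciprocal, and a final residual FMA corrects the quotient.
#include <cmath>

static double fastFDiv64Sketch(double X, double Y) {
  double R = static_cast<double>(1.0f / static_cast<float>(Y)); // crude rcp
  double NegY = -Y;
  double Tmp0 = std::fma(NegY, R, 1.0);   // 1 - Y*R
  R = std::fma(Tmp0, R, R);               // first refinement
  double Tmp1 = std::fma(NegY, R, 1.0);
  R = std::fma(Tmp1, R, R);               // second refinement
  double Ret = X * R;                     // initial quotient
  double Tmp2 = std::fma(NegY, Ret, X);   // residual X - Y*Ret
  return std::fma(Tmp2, R, Ret);          // corrected quotient
}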
10521
10522static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10523 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
10524 SDNodeFlags Flags) {
10525 if (GlueChain->getNumValues() <= 1) {
10526 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
10527 }
10528
10529 assert(GlueChain->getNumValues() == 3);
10530
10531 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10532 switch (Opcode) {
10533 default: llvm_unreachable("no chain equivalent for opcode");
10534 case ISD::FMUL:
10535 Opcode = AMDGPUISD::FMUL_W_CHAIN;
10536 break;
10537 }
10538
10539 return DAG.getNode(Opcode, SL, VTList,
10540 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
10541 Flags);
10542}
10543
10544static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10545 EVT VT, SDValue A, SDValue B, SDValue C,
10546 SDValue GlueChain, SDNodeFlags Flags) {
10547 if (GlueChain->getNumValues() <= 1) {
10548 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
10549 }
10550
10551 assert(GlueChain->getNumValues() == 3);
10552
10553 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10554 switch (Opcode) {
10555 default: llvm_unreachable("no chain equivalent for opcode");
10556 case ISD::FMA:
10557 Opcode = AMDGPUISD::FMA_W_CHAIN;
10558 break;
10559 }
10560
10561 return DAG.getNode(Opcode, SL, VTList,
10562 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
10563 Flags);
10564}
10565
10566SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
10567 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10568 return FastLowered;
10569
10570 SDLoc SL(Op);
10571 SDValue Src0 = Op.getOperand(0);
10572 SDValue Src1 = Op.getOperand(1);
10573
10574 SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
10575 SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
10576
10577 SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
10578 SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
10579
10580 SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
10581 SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
10582
10583 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
10584}
10585
10586// Faster 2.5 ULP division that does not support denormals.
10587SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
10588 SDNodeFlags Flags = Op->getFlags();
10589 SDLoc SL(Op);
10590 SDValue LHS = Op.getOperand(1);
10591 SDValue RHS = Op.getOperand(2);
10592
10593 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
10594
10595 const APFloat K0Val(0x1p+96f);
10596 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
10597
10598 const APFloat K1Val(0x1p-32f);
10599 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
10600
10601 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10602
10603 EVT SetCCVT =
10604 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
10605
10606 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
10607
10608 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
10609
10610 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
10611
10612 // rcp does not support denormals.
10613 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
10614
10615 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
10616
10617 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
10618}
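// Illustrative sketch (not part of the lowering above; names are hypothetical):
// the overflow guard in scalar form. When |RHS| is huge, 1/RHS would flush to
// zero, so the denominator is pre-scaled by 2^-32 and the same factor is
// re-applied to the quotient, using x/y == (x * (1/(y*s))) * s.
#include <cmath>

static float fdivFastSketch(float LHS, float RHS) {
  const float K0 = 0x1p+96f;               // threshold on |RHS|
  const float K1 = 0x1p-32f;               // pre/post scale factor
  float r3 = (std::fabs(RHS) > K0) ? K1 : 1.0f;
  float r0 = 1.0f / (RHS * r3);            // stand-in for AMDGPUISD::RCP
  return r3 * (LHS * r0);
}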
10619
10620// Returns immediate value for setting the F32 denorm mode when using the
10621// S_DENORM_MODE instruction.
10622static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
10623 const SIMachineFunctionInfo *Info,
10624 const GCNSubtarget *ST) {
10625 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
10626 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
10627 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
10628 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
10629}
10630
10631SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
10632 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10633 return FastLowered;
10634
10635 // The selection matcher assumes anything with a chain selects to a
10636 // mayRaiseFPException machine instruction. Since we're introducing a chain
10637 // here, we need to explicitly report nofpexcept for the regular fdiv
10638 // lowering.
10639 SDNodeFlags Flags = Op->getFlags();
10640 Flags.setNoFPExcept(true);
10641
10642 SDLoc SL(Op);
10643 SDValue LHS = Op.getOperand(0);
10644 SDValue RHS = Op.getOperand(1);
10645
10646 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10647
10648 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
10649
10650 SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
10651 {RHS, RHS, LHS}, Flags);
10652 SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
10653 {LHS, RHS, LHS}, Flags);
10654
10655 // Denominator is scaled to not be denormal, so using rcp is ok.
10656 SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
10657 DenominatorScaled, Flags);
10658 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
10659 DenominatorScaled, Flags);
10660
10661 using namespace AMDGPU::Hwreg;
10662 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
10663 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
10664
10665 const MachineFunction &MF = DAG.getMachineFunction();
10666 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
10667 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
10668
10669 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
10670 const bool HasDynamicDenormals =
10671 (DenormMode.Input == DenormalMode::Dynamic) ||
10672 (DenormMode.Output == DenormalMode::Dynamic);
10673
10674 SDValue SavedDenormMode;
10675
10676 if (!PreservesDenormals) {
10677 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
10678 // lowering. The chain dependence is insufficient, and we need glue. We do
10679 // not need the glue variants in a strictfp function.
10680
10681 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
10682
10683 SDValue Glue = DAG.getEntryNode();
10684 if (HasDynamicDenormals) {
10685 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
10686 DAG.getVTList(MVT::i32, MVT::Glue),
10687 {BitField, Glue});
10688 SavedDenormMode = SDValue(GetReg, 0);
10689
10690 Glue = DAG.getMergeValues(
10691 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
10692 }
10693
10694 SDNode *EnableDenorm;
10695 if (Subtarget->hasDenormModeInst()) {
10696 const SDValue EnableDenormValue =
10697 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
10698
10699 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
10700 EnableDenormValue)
10701 .getNode();
10702 } else {
10703 const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
10704 SL, MVT::i32);
10705 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
10706 {EnableDenormValue, BitField, Glue});
10707 }
10708
10709 SDValue Ops[3] = {
10710 NegDivScale0,
10711 SDValue(EnableDenorm, 0),
10712 SDValue(EnableDenorm, 1)
10713 };
10714
10715 NegDivScale0 = DAG.getMergeValues(Ops, SL);
10716 }
10717
10718 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
10719 ApproxRcp, One, NegDivScale0, Flags);
10720
10721 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
10722 ApproxRcp, Fma0, Flags);
10723
10724 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
10725 Fma1, Fma1, Flags);
10726
10727 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
10728 NumeratorScaled, Mul, Flags);
10729
10730 SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32,
10731 Fma2, Fma1, Mul, Fma2, Flags);
10732
10733 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
10734 NumeratorScaled, Fma3, Flags);
10735
10736 if (!PreservesDenormals) {
10737 SDNode *DisableDenorm;
10738 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
10739 const SDValue DisableDenormValue = getSPDenormModeValue(
10740 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
10741
10742 DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other,
10743 Fma4.getValue(1), DisableDenormValue,
10744 Fma4.getValue(2)).getNode();
10745 } else {
10746 assert(HasDynamicDenormals == (bool)SavedDenormMode);
10747 const SDValue DisableDenormValue =
10748 HasDynamicDenormals
10749 ? SavedDenormMode
10750 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
10751
10752 DisableDenorm = DAG.getMachineNode(
10753 AMDGPU::S_SETREG_B32, SL, MVT::Other,
10754 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
10755 }
10756
10757 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
10758 SDValue(DisableDenorm, 0), DAG.getRoot());
10759 DAG.setRoot(OutputChain);
10760 }
10761
10762 SDValue Scale = NumeratorScaled.getValue(1);
10763 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
10764 {Fma4, Fma1, Fma3, Scale}, Flags);
10765
10766 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
10767}
10768
10769SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
10770 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
10771 return FastLowered;
10772
10773 SDLoc SL(Op);
10774 SDValue X = Op.getOperand(0);
10775 SDValue Y = Op.getOperand(1);
10776
10777 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
10778
10779 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
10780
10781 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
10782
10783 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
10784
10785 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
10786
10787 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
10788
10789 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
10790
10791 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
10792
10793 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
10794
10795 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
10796 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
10797
10798 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
10799 NegDivScale0, Mul, DivScale1);
10800
10801 SDValue Scale;
10802
10803 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
10804 // Workaround a hardware bug on SI where the condition output from div_scale
10805 // is not usable.
10806
10807 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
10808
10809 // Figure out which scale to use for div_fmas.
10810 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
10811 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
10812 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
10813 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
10814
10815 SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
10816 SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
10817
10818 SDValue Scale0Hi
10819 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
10820 SDValue Scale1Hi
10821 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
10822
10823 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
10824 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
10825 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
10826 } else {
10827 Scale = DivScale1.getValue(1);
10828 }
10829
10830 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
10831 Fma4, Fma3, Mul, Scale);
10832
10833 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
10834}
10835
10836SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
10837 EVT VT = Op.getValueType();
10838
10839 if (VT == MVT::f32)
10840 return LowerFDIV32(Op, DAG);
10841
10842 if (VT == MVT::f64)
10843 return LowerFDIV64(Op, DAG);
10844
10845 if (VT == MVT::f16)
10846 return LowerFDIV16(Op, DAG);
10847
10848 llvm_unreachable("Unexpected type for fdiv");
10849}
10850
10851SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
10852 SDLoc dl(Op);
10853 SDValue Val = Op.getOperand(0);
10854 EVT VT = Val.getValueType();
10855 EVT ResultExpVT = Op->getValueType(1);
10856 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
10857
10858 SDValue Mant = DAG.getNode(
10859 ISD::INTRINSIC_WO_CHAIN, dl, VT,
10860 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
10861
10862 SDValue Exp = DAG.getNode(
10863 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
10864 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
10865
10866 if (Subtarget->hasFractBug()) {
10867 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
10868 SDValue Inf = DAG.getConstantFP(
10869 APFloat::getInf(SelectionDAG::EVTToAPFloatSemantics(VT)), dl, VT);
10870
10871 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
10872 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
10873 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
10874 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
10875 }
10876
10877 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
10878 return DAG.getMergeValues({Mant, CastExp}, dl);
10879}
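// Illustrative sketch (not part of the lowering above; names are hypothetical):
// the frexp contract the lowering targets, assuming the hardware intrinsics
// agree with std::frexp on finite inputs. The non-finite case mirrors the
// patch-up done under Subtarget->hasFractBug(): return (Val, 0).
#include <cmath>
#include <utility>

static std::pair<double, int> frexpSketch(double Val) {
  if (!std::isfinite(Val))
    return {Val, 0};
  int Exp = 0;
  double Mant = std::frexp(Val, &Exp);  // |Mant| in [0.5, 1) for nonzero Val
  return {Mant, Exp};                   // Val == Mant * 2^Exp
}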
10880
10881SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
10882 SDLoc DL(Op);
10883 StoreSDNode *Store = cast<StoreSDNode>(Op);
10884 EVT VT = Store->getMemoryVT();
10885
10886 if (VT == MVT::i1) {
10887 return DAG.getTruncStore(Store->getChain(), DL,
10888 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
10889 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
10890 }
10891
10892 assert(VT.isVector() &&
10893 Store->getValue().getValueType().getScalarType() == MVT::i32);
10894
10895 unsigned AS = Store->getAddressSpace();
10896 if (Subtarget->hasLDSMisalignedBug() &&
10897 AS == AMDGPUAS::FLAT_ADDRESS &&
10898 Store->getAlign().value() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
10899 return SplitVectorStore(Op, DAG);
10900 }
10901
10902 MachineFunction &MF = DAG.getMachineFunction();
10903 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10904 // If there is a possibility that a flat instruction may access scratch
10905 // memory, then we need to use the same legalization rules we use for private.
10906 if (AS == AMDGPUAS::FLAT_ADDRESS &&
10907 !Subtarget->hasMultiDwordFlatScratchAddressing())
10908 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI) ?
10909 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
10910
10911 unsigned NumElements = VT.getVectorNumElements();
10912 if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
10913 AS == AMDGPUAS::FLAT_ADDRESS) {
10914 if (NumElements > 4)
10915 return SplitVectorStore(Op, DAG);
10916 // v3 stores not supported on SI.
10917 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10918 return SplitVectorStore(Op, DAG);
10919
10920 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10921 VT, *Store->getMemOperand()))
10922 return expandUnalignedStore(Store, DAG);
10923
10924 return SDValue();
10925 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10926 switch (Subtarget->getMaxPrivateElementSize()) {
10927 case 4:
10928 return scalarizeVectorStore(Store, DAG);
10929 case 8:
10930 if (NumElements > 2)
10931 return SplitVectorStore(Op, DAG);
10932 return SDValue();
10933 case 16:
10934 if (NumElements > 4 ||
10935 (NumElements == 3 && !Subtarget->enableFlatScratch()))
10936 return SplitVectorStore(Op, DAG);
10937 return SDValue();
10938 default:
10939 llvm_unreachable("unsupported private_element_size");
10940 }
10941 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10942 unsigned Fast = 0;
10943 auto Flags = Store->getMemOperand()->getFlags();
10944 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
10945 Store->getAlign(), Flags, &Fast) &&
10946 Fast > 1)
10947 return SDValue();
10948
10949 if (VT.isVector())
10950 return SplitVectorStore(Op, DAG);
10951
10952 return expandUnalignedStore(Store, DAG);
10953 }
10954
10955 // Probably an invalid store. If so we'll end up emitting a selection error.
10956 return SDValue();
10957}
10958
10959// Avoid the full correct expansion for f32 sqrt when promoting from f16.
10960SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
10961 SDLoc SL(Op);
10962 assert(!Subtarget->has16BitInsts());
10963 SDNodeFlags Flags = Op->getFlags();
10964 SDValue Ext =
10965 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
10966
10967 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
10968 SDValue Sqrt =
10969 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
10970
10971 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
10972 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
10973}
10974
10975SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
10976 SDLoc DL(Op);
10977 SDNodeFlags Flags = Op->getFlags();
10978 MVT VT = Op.getValueType().getSimpleVT();
10979 const SDValue X = Op.getOperand(0);
10980
10981 if (allowApproxFunc(DAG, Flags)) {
10982 // Instruction is 1ulp but ignores denormals.
10983 return DAG.getNode(
10984 ISD::INTRINSIC_WO_CHAIN, DL, VT,
10985 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
10986 }
10987
10988 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
10989 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
10990
10991 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
10992
10993 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
10994
10995 SDValue SqrtX =
10996 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
10997
10998 SDValue SqrtS;
10999 if (needsDenormHandlingF32(DAG, X, Flags)) {
11000 SDValue SqrtID =
11001 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
11002 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
11003
11004 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
11005 SDValue SqrtSNextDownInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11006 DAG.getConstant(-1, DL, MVT::i32));
11007 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
11008
11009 SDValue NegSqrtSNextDown =
11010 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
11011
11012 SDValue SqrtVP =
11013 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
11014
11015 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11016 DAG.getConstant(1, DL, MVT::i32));
11017 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
11018
11019 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
11020 SDValue SqrtVS =
11021 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
11022
11023 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
11024 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
11025
11026 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
11027 Flags);
11028
11029 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
11030 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
11031 Flags);
11032 } else {
11033 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
11034
11035 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
11036
11037 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
11038 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
11039 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
11040
11041 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
11042 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
11043 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
11044
11045 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
11046 SDValue SqrtD =
11047 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
11048 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
11049 }
11050
11051 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
11052
11053 SDValue ScaledDown =
11054 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
11055
11056 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
11057 SDValue IsZeroOrInf =
11058 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11059 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11060
11061 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
11062}
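// Illustrative sketch (not part of the lowering above; names are hypothetical):
// just the scaling identity used above. Inputs below 2^-96 are scaled by 2^32
// so the core approximation never sees a denormal, and since
// sqrt(x * 2^32) == sqrt(x) * 2^16 the result is corrected by 2^-16.
#include <cmath>

static float sqrtScaledSketch(float X) {
  bool NeedScale = X < 0x1.0p-96f;
  float SqrtX = NeedScale ? X * 0x1.0p+32f : X;
  float S = std::sqrt(SqrtX);        // stand-in for the refined estimate
  return NeedScale ? S * 0x1.0p-16f : S;
}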
11063
11064SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
11065 // For double type, the SQRT and RSQ instructions don't have required
11066 // precision, we apply Goldschmidt's algorithm to improve the result:
11067 //
11068 // y0 = rsq(x)
11069 // g0 = x * y0
11070 // h0 = 0.5 * y0
11071 //
11072 // r0 = 0.5 - h0 * g0
11073 // g1 = g0 * r0 + g0
11074 // h1 = h0 * r0 + h0
11075 //
11076 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
11077 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
11078 // h2 = h1 * r1 + h1
11079 //
11080 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
11081 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
11082 //
11083 // sqrt(x) = g3
11084
11085 SDNodeFlags Flags = Op->getFlags();
11086
11087 SDLoc DL(Op);
11088
11089 SDValue X = Op.getOperand(0);
11090 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
11091
11092 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
11093
11094 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
11095
11096 // Scale up input if it is too small.
11097 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
11098 SDValue ScaleUp =
11099 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
11100 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
11101
11102 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
11103
11104 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
11105
11106 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
11107 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
11108
11109 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
11110 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
11111
11112 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
11113
11114 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
11115
11116 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
11117 SDValue SqrtD0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
11118
11119 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
11120
11121 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
11122 SDValue SqrtD1 =
11123 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
11124
11125 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
11126
11127 SDValue ScaleDownFactor = DAG.getConstant(-128, DL, MVT::i32);
11128 SDValue ScaleDown =
11129 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
11130 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
11131
11132 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
11133 // with finite only or nsz because rsq(+/-0) = +/-inf
11134
11135 // TODO: Check for DAZ and expand to subnormals
11136 SDValue IsZeroOrInf =
11137 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11138 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11139
11140 // If x is +INF, +0, or -0, use its original value
11141 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
11142 Flags);
11143}
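// Illustrative sketch (not part of the lowering above; names are hypothetical):
// the Goldschmidt iteration from the comment, evaluated with std::fma and a
// float-precision rsq estimate standing in for AMDGPUISD::RSQ. The FLDEXP
// based scaling for tiny inputs is omitted here.
#include <cmath>

static double goldschmidtSqrtSketch(double X) {
  double Y0 = static_cast<double>(1.0f / std::sqrt(static_cast<float>(X)));
  double G = X * Y0;                    // g0 ~= sqrt(x)
  double H = 0.5 * Y0;                  // h0 ~= 0.5/sqrt(x)

  double R = std::fma(-H, G, 0.5);      // r0 = 0.5 - h0*g0
  G = std::fma(G, R, G);                // g1
  H = std::fma(H, R, H);                // h1

  double D0 = std::fma(-G, G, X);       // d0 = x - g1*g1
  G = std::fma(D0, H, G);               // g2

  double D1 = std::fma(-G, G, X);       // d1 = x - g2*g2
  return std::fma(D1, H, G);            // g3 ~= sqrt(x)
}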
11144
11145SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
11146 SDLoc DL(Op);
11147 EVT VT = Op.getValueType();
11148 SDValue Arg = Op.getOperand(0);
11149 SDValue TrigVal;
11150
11151 // Propagate fast-math flags so that the multiply we introduce can be folded
11152 // if Arg is already the result of a multiply by constant.
11153 auto Flags = Op->getFlags();
11154
11155 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
11156
11157 if (Subtarget->hasTrigReducedRange()) {
11158 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11159 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
11160 } else {
11161 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11162 }
11163
11164 switch (Op.getOpcode()) {
11165 case ISD::FCOS:
11166 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
11167 case ISD::FSIN:
11168 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
11169 default:
11170 llvm_unreachable("Wrong trig opcode");
11171 }
11172}
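// Illustrative sketch (not part of the lowering above; names are hypothetical):
// the hardware sin/cos consume an argument measured in full turns, so the
// lowering multiplies by 1/(2*pi) and, on hasTrigReducedRange() subtargets,
// reduces to the fractional part first. SIN_HW is modelled as sin(2*pi*t).
#include <cmath>

static double sinViaTurnsSketch(double Arg) {
  const double TwoPi = 6.283185307179586476925286766559;
  double Turns = Arg * (1.0 / TwoPi);       // FMUL by OneOver2Pi
  double Frac = Turns - std::floor(Turns);  // AMDGPUISD::FRACT
  return std::sin(TwoPi * Frac);            // stand-in for SIN_HW
}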
11173
11174SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
11175 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
11176 assert(AtomicNode->isCompareAndSwap());
11177 unsigned AS = AtomicNode->getAddressSpace();
11178
11179 // No custom lowering required for local address space
11180 if (!AMDGPU::isFlatGlobalAddrSpace(AS))
11181 return Op;
11182
11183 // Non-local address space requires custom lowering for atomic compare
11184 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
11185 SDLoc DL(Op);
11186 SDValue ChainIn = Op.getOperand(0);
11187 SDValue Addr = Op.getOperand(1);
11188 SDValue Old = Op.getOperand(2);
11189 SDValue New = Op.getOperand(3);
11190 EVT VT = Op.getValueType();
11191 MVT SimpleVT = VT.getSimpleVT();
11192 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
11193
11194 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
11195 SDValue Ops[] = { ChainIn, Addr, NewOld };
11196
11197 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
11198 Ops, VT, AtomicNode->getMemOperand());
11199}
11200
11201//===----------------------------------------------------------------------===//
11202// Custom DAG optimizations
11203//===----------------------------------------------------------------------===//
11204
11205SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
11206 DAGCombinerInfo &DCI) const {
11207 EVT VT = N->getValueType(0);
11208 EVT ScalarVT = VT.getScalarType();
11209 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11210 return SDValue();
11211
11212 SelectionDAG &DAG = DCI.DAG;
11213 SDLoc DL(N);
11214
11215 SDValue Src = N->getOperand(0);
11216 EVT SrcVT = Src.getValueType();
11217
11218 // TODO: We could try to match extracting the higher bytes, which would be
11219 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
11220 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
11221 // about in practice.
11222 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11223 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
11224 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
11225 DCI.AddToWorklist(Cvt.getNode());
11226
11227 // For the f16 case, fold to a cast to f32 and then cast back to f16.
11228 if (ScalarVT != MVT::f32) {
11229 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
11230 DAG.getTargetConstant(0, DL, MVT::i32));
11231 }
11232 return Cvt;
11233 }
11234 }
11235
11236 return SDValue();
11237}
11238
11239SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
11240 DAGCombinerInfo &DCI) const {
11241 SDValue MagnitudeOp = N->getOperand(0);
11242 SDValue SignOp = N->getOperand(1);
11243 SelectionDAG &DAG = DCI.DAG;
11244 SDLoc DL(N);
11245
11246 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
11247 // lower half with a copy.
11248 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11249 if (MagnitudeOp.getValueType() == MVT::f64) {
11250 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
11251 SDValue MagLo =
11252 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11253 DAG.getConstant(0, DL, MVT::i32));
11254 SDValue MagHi =
11255 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11256 DAG.getConstant(1, DL, MVT::i32));
11257
11258 SDValue HiOp =
11259 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
11260
11261 SDValue Vector = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
11262
11263 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
11264 }
11265
11266 if (SignOp.getValueType() != MVT::f64)
11267 return SDValue();
11268
11269 // Reduce width of sign operand, we only need the highest bit.
11270 //
11271 // fcopysign f64:x, f64:y ->
11272 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
11273 // TODO: In some cases it might make sense to go all the way to f16.
11274 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
11275 SDValue SignAsF32 =
11276 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
11277 DAG.getConstant(1, DL, MVT::i32));
11278
11279 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
11280 SignAsF32);
11281}
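// Illustrative sketch (not part of the combine above; names are hypothetical):
// the f64 sign bit lives in the high 32 bits, so copying the low dword and
// doing an f32-style copysign on the high dword reproduces the f64 copysign.
#include <cstdint>
#include <cstring>

static double copysignViaHighWordSketch(double Mag, double Sign) {
  uint64_t MagBits, SignBits;
  std::memcpy(&MagBits, &Mag, 8);
  std::memcpy(&SignBits, &Sign, 8);
  uint32_t NewHi = (static_cast<uint32_t>(MagBits >> 32) & 0x7fffffffu) |
                   (static_cast<uint32_t>(SignBits >> 32) & 0x80000000u);
  uint64_t ResBits = (static_cast<uint64_t>(NewHi) << 32) |
                     static_cast<uint32_t>(MagBits);   // low dword is copied
  double Res;
  std::memcpy(&Res, &ResBits, 8);
  return Res;
}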
11282
11283// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
11284// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
11285// bits
11286
11287// This is a variant of
11288// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
11289//
11290// The normal DAG combiner will do this, but only if the add has one use since
11291// that would increase the number of instructions.
11292//
11293// This prevents us from seeing a constant offset that can be folded into a
11294// memory instruction's addressing mode. If we know the resulting add offset of
11295// a pointer can be folded into an addressing offset, we can replace the pointer
11296// operand with the add of new constant offset. This eliminates one of the uses,
11297// and may allow the remaining use to also be simplified.
11298//
11299SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
11300 unsigned AddrSpace,
11301 EVT MemVT,
11302 DAGCombinerInfo &DCI) const {
11303 SDValue N0 = N->getOperand(0);
11304 SDValue N1 = N->getOperand(1);
11305
11306 // We only do this to handle cases where it's profitable when there are
11307 // multiple uses of the add, so defer to the standard combine.
11308 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
11309 N0->hasOneUse())
11310 return SDValue();
11311
11312 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
11313 if (!CN1)
11314 return SDValue();
11315
11316 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
11317 if (!CAdd)
11318 return SDValue();
11319
11320 SelectionDAG &DAG = DCI.DAG;
11321
11322 if (N0->getOpcode() == ISD::OR &&
11323 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
11324 return SDValue();
11325
11326 // If the resulting offset is too large, we can't fold it into the
11327 // addressing mode offset.
11328 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
11329 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
11330
11331 AddrMode AM;
11332 AM.HasBaseReg = true;
11333 AM.BaseOffs = Offset.getSExtValue();
11334 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
11335 return SDValue();
11336
11337 SDLoc SL(N);
11338 EVT VT = N->getValueType(0);
11339
11340 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
11341 SDValue COffset = DAG.getConstant(Offset, SL, VT);
11342
11343 SDNodeFlags Flags;
11344 Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
11345 (N0.getOpcode() == ISD::OR ||
11346 N0->getFlags().hasNoUnsignedWrap()));
11347
11348 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
11349}
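// Illustrative sketch (not part of the combine above; names are hypothetical):
// the two identities the combine relies on. Distributing the shift always
// holds in modular arithmetic, and the OR form is equivalent to an ADD when
// the operands share no set bits (the haveNoCommonBitsSet check).
#include <cassert>
#include <cstdint>

static void shlPtrIdentitySketch(uint64_t X, uint64_t C1, unsigned C2) {
  assert(C2 < 64);
  uint64_t Distributed = (X << C2) + (C1 << C2);
  assert(((X + C1) << C2) == Distributed);    // shl of add
  if ((X & C1) == 0)
    assert(((X | C1) << C2) == Distributed);  // shl of disjoint or
}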
11350
11351/// MemSDNode::getBasePtr() does not work for intrinsics, which need to be offset
11352/// by the chain and intrinsic ID. Theoretically we would also need to check the
11353/// specific intrinsic, but they all place the pointer operand first.
11354static unsigned getBasePtrIndex(const MemSDNode *N) {
11355 switch (N->getOpcode()) {
11356 case ISD::STORE:
11357 case ISD::INTRINSIC_W_CHAIN:
11358 case ISD::INTRINSIC_VOID:
11359 return 2;
11360 default:
11361 return 1;
11362 }
11363}
11364
11365SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
11366 DAGCombinerInfo &DCI) const {
11367 SelectionDAG &DAG = DCI.DAG;
11368 SDLoc SL(N);
11369
11370 unsigned PtrIdx = getBasePtrIndex(N);
11371 SDValue Ptr = N->getOperand(PtrIdx);
11372
11373 // TODO: We could also do this for multiplies.
11374 if (Ptr.getOpcode() == ISD::SHL) {
11375 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
11376 N->getMemoryVT(), DCI);
11377 if (NewPtr) {
11378 SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
11379
11380 NewOps[PtrIdx] = NewPtr;
11381 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
11382 }
11383 }
11384
11385 return SDValue();
11386}
11387
11388static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
11389 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11390 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11391 (Opc == ISD::XOR && Val == 0);
11392}
11393
11394// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
11395// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
11396// integer combine opportunities since most 64-bit operations are decomposed
11397// this way. TODO: We won't want this for SALU especially if it is an inline
11398// immediate.
11399SDValue SITargetLowering::splitBinaryBitConstantOp(
11400 DAGCombinerInfo &DCI,
11401 const SDLoc &SL,
11402 unsigned Opc, SDValue LHS,
11403 const ConstantSDNode *CRHS) const {
11404 uint64_t Val = CRHS->getZExtValue();
11405 uint32_t ValLo = Lo_32(Val);
11406 uint32_t ValHi = Hi_32(Val);
11407 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11408
11409 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
11410 bitOpWithConstantIsReducible(Opc, ValHi)) ||
11411 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
11412 // If we need to materialize a 64-bit immediate, it will be split up later
11413 // anyway. Avoid creating the harder to understand 64-bit immediate
11414 // materialization.
11415 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
11416 }
11417
11418 return SDValue();
11419}
11420
11421static bool isBoolSGPR(SDValue V) {
11422 if (V.getValueType() != MVT::i1)
11423 return false;
11424 switch (V.getOpcode()) {
11425 default:
11426 break;
11427 case ISD::SETCC:
11428 case AMDGPUISD::FP_CLASS:
11429 return true;
11430 case ISD::AND:
11431 case ISD::OR:
11432 case ISD::XOR:
11433 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
11434 }
11435 return false;
11436}
11437
11438// If a constant has all zeroes or all ones within each byte return it.
11439// Otherwise return 0.
11440static uint32_t getConstantPermuteMask(uint32_t C) {
11441 // 0xff for any zero byte in the mask
11442 uint32_t ZeroByteMask = 0;
11443 if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
11444 if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
11445 if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
11446 if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
11447 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
11448 if ((NonZeroByteMask & C) != NonZeroByteMask)
11449 return 0; // Partial bytes selected.
11450 return C;
11451}
11452
11453// Check if a node selects whole bytes from its operand 0 starting at a byte
11454// boundary while masking the rest. Returns the select mask as used by
11455// v_perm_b32, or -1 if it did not succeed.
11456// Note byte select encoding:
11457// value 0-3 selects corresponding source byte;
11458// value 0xc selects zero;
11459// value 0xff selects 0xff.
11460static uint32_t getPermuteMask(SDValue V) {
11461 assert(V.getValueSizeInBits() == 32);
11462
11463 if (V.getNumOperands() != 2)
11464 return ~0;
11465
11466 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
11467 if (!N1)
11468 return ~0;
11469
11470 uint32_t C = N1->getZExtValue();
11471
11472 switch (V.getOpcode()) {
11473 default:
11474 break;
11475 case ISD::AND:
11476 if (uint32_t ConstMask = getConstantPermuteMask(C))
11477 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
11478 break;
11479
11480 case ISD::OR:
11481 if (uint32_t ConstMask = getConstantPermuteMask(C))
11482 return (0x03020100 & ~ConstMask) | ConstMask;
11483 break;
11484
11485 case ISD::SHL:
11486 if (C % 8)
11487 return ~0;
11488
11489 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
11490
11491 case ISD::SRL:
11492 if (C % 8)
11493 return ~0;
11494
11495 return uint32_t(0x0c0c0c0c03020100ull >> C);
11496 }
11497
11498 return ~0;
11499}
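// Illustrative sketch (not part of the code above; names are hypothetical):
// the byte-select encoding described in the comment, applied to two sources
// the way performAndCombine later uses it (selectors 0-3 address Src1, 4-7
// address Src0 after the "+4" adjustment, 0x0c yields 0x00 and 0xff yields
// 0xff). Only this documented subset of v_perm_b32 is modelled.
#include <cstdint>

static uint32_t permByteSelectSketch(uint32_t Src0, uint32_t Src1,
                                     uint32_t Sel) {
  uint64_t Combined = (static_cast<uint64_t>(Src0) << 32) | Src1;
  uint32_t Result = 0;
  for (unsigned I = 0; I < 4; ++I) {
    uint32_t S = (Sel >> (8 * I)) & 0xff;
    uint32_t Byte;
    if (S == 0x0c)
      Byte = 0x00;
    else if (S == 0xff)
      Byte = 0xff;
    else
      Byte = static_cast<uint32_t>((Combined >> (8 * (S & 7))) & 0xff);
    Result |= Byte << (8 * I);
  }
  return Result;
}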
11500
11501SDValue SITargetLowering::performAndCombine(SDNode *N,
11502 DAGCombinerInfo &DCI) const {
11503 if (DCI.isBeforeLegalize())
11504 return SDValue();
11505
11506 SelectionDAG &DAG = DCI.DAG;
11507 EVT VT = N->getValueType(0);
11508 SDValue LHS = N->getOperand(0);
11509 SDValue RHS = N->getOperand(1);
11510
11511
11512 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
11513 if (VT == MVT::i64 && CRHS) {
11514 if (SDValue Split
11515 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
11516 return Split;
11517 }
11518
11519 if (CRHS && VT == MVT::i32) {
11520 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
11521 // nb = number of trailing zeroes in mask
11522 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
11523 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
11524 uint64_t Mask = CRHS->getZExtValue();
11525 unsigned Bits = llvm::popcount(Mask);
11526 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
11527 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
11528 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
11529 unsigned Shift = CShift->getZExtValue();
11530 unsigned NB = CRHS->getAPIntValue().countr_zero();
11531 unsigned Offset = NB + Shift;
11532 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
11533 SDLoc SL(N);
11534 SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
11535 LHS->getOperand(0),
11536 DAG.getConstant(Offset, SL, MVT::i32),
11537 DAG.getConstant(Bits, SL, MVT::i32));
11538 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
11539 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
11540 DAG.getValueType(NarrowVT));
11541 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
11542 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
11543 return Shl;
11544 }
11545 }
11546 }
11547
11548 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
11549 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
11550 isa<ConstantSDNode>(LHS.getOperand(2))) {
11551 uint32_t Sel = getConstantPermuteMask(Mask);
11552 if (!Sel)
11553 return SDValue();
11554
11555 // Select 0xc for all zero bytes
11556 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
11557 SDLoc DL(N);
11558 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
11559 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
11560 }
11561 }
11562
11563 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
11564 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
11565 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
11566 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11567 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
11568
11569 SDValue X = LHS.getOperand(0);
11570 SDValue Y = RHS.getOperand(0);
11571 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
11572 !isTypeLegal(X.getValueType()))
11573 return SDValue();
11574
11575 if (LCC == ISD::SETO) {
11576 if (X != LHS.getOperand(1))
11577 return SDValue();
11578
11579 if (RCC == ISD::SETUNE) {
11580 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
11581 if (!C1 || !C1->isInfinity() || C1->isNegative())
11582 return SDValue();
11583
11584 const uint32_t Mask = SIInstrFlags::N_NORMAL |
11585 SIInstrFlags::N_SUBNORMAL |
11586 SIInstrFlags::N_ZERO |
11587 SIInstrFlags::P_ZERO |
11588 SIInstrFlags::P_SUBNORMAL |
11589 SIInstrFlags::P_NORMAL;
11590
11591 static_assert(((~(SIInstrFlags::S_NAN |
11592 SIInstrFlags::Q_NAN |
11593 SIInstrFlags::N_INFINITY |
11594 SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
11595 "mask not equal");
11596
11597 SDLoc DL(N);
11598 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
11599 X, DAG.getConstant(Mask, DL, MVT::i32));
11600 }
11601 }
11602 }
11603
11604 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
11605 std::swap(LHS, RHS);
11606
11607 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
11608 RHS.hasOneUse()) {
11609 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11610 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
11611 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
11612 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
11613 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
11614 (RHS.getOperand(0) == LHS.getOperand(0) &&
11615 LHS.getOperand(0) == LHS.getOperand(1))) {
11616 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
11617 unsigned NewMask = LCC == ISD::SETO ?
11618 Mask->getZExtValue() & ~OrdMask :
11619 Mask->getZExtValue() & OrdMask;
11620
11621 SDLoc DL(N);
11622 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
11623 DAG.getConstant(NewMask, DL, MVT::i32));
11624 }
11625 }
11626
11627 if (VT == MVT::i32 &&
11628 (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
11629 // and x, (sext cc from i1) => select cc, x, 0
11630 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
11631 std::swap(LHS, RHS);
11632 if (isBoolSGPR(RHS.getOperand(0)))
11633 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
11634 LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
11635 }
11636
11637 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
11638 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11639 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
11640 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
11641 uint32_t LHSMask = getPermuteMask(LHS);
11642 uint32_t RHSMask = getPermuteMask(RHS);
11643 if (LHSMask != ~0u && RHSMask != ~0u) {
11644 // Canonicalize the expression in an attempt to have fewer unique masks
11645 // and therefore fewer registers used to hold the masks.
11646 if (LHSMask > RHSMask) {
11647 std::swap(LHSMask, RHSMask);
11648 std::swap(LHS, RHS);
11649 }
11650
11651 // Select 0xc for each lane used from source operand. Zero has 0xc mask
11652 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
11653 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11654 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11655
11656 // Check if we need to combine values from two sources within a byte.
11657 if (!(LHSUsedLanes & RHSUsedLanes) &&
11658 // If we select high and lower word keep it for SDWA.
11659 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
11660 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
11661 // Each byte in each mask is either selector mask 0-3, or has higher
11662 // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
11663 // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
11664 // mask which is not 0xff wins. By anding both masks we have a correct
11665 // result except that 0x0c shall be corrected to give 0x0c only.
11666 uint32_t Mask = LHSMask & RHSMask;
11667 for (unsigned I = 0; I < 32; I += 8) {
11668 uint32_t ByteSel = 0xff << I;
11669 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
11670 Mask &= (0x0c << I) & 0xffffffff;
11671 }
11672
11673 // Add 4 to each active LHS lane. It will not affect any existing 0xff
11674 // or 0x0c.
11675 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
11676 SDLoc DL(N);
11677
11678 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
11679 LHS.getOperand(0), RHS.getOperand(0),
11680 DAG.getConstant(Sel, DL, MVT::i32));
11681 }
11682 }
11683 }
11684
11685 return SDValue();
11686}
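// Illustrative, standalone sketch (not part of the original lowering code;
// the helper name is ours) of the fp_class mask arithmetic used in
// performAndCombine above, assuming the v_cmp_class bit layout where bits 0
// and 1 are the signaling and quiet NaN classes (S_NAN | Q_NAN == 0x3).
static uint32_t sketchFoldClassMask(uint32_t ClassMask, bool Ordered) {
  const uint32_t NanBits = 0x3; // assumed S_NAN | Q_NAN
  // and (fcmp seto x, x), (fp_class x, m) keeps only the non-NaN classes;
  // and (fcmp setuo x, x), (fp_class x, m) keeps only the NaN classes.
  return Ordered ? ClassMask & ~NanBits : ClassMask & NanBits;
}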
11687
11688 // A key component of v_perm is a mapping between the byte positions of the src
11689 // operands and the byte positions of the dest. To provide such a mapping, we
11690 // need: 1. the node that provides byte x of the dest of the OR, and 2. the byte
11691 // of that node used to provide byte x. calculateByteProvider finds which node
11692 // provides a certain byte of the dest of the OR, and calculateSrcByte takes
11693 // that node and finds the ultimate src and byte position. For example, the
11694 // supported LoadCombine pattern for vector loads is as follows:
11695// t1
11696// or
11697// / \
11698// t2 t3
11699// zext shl
11700// | | \
11701// t4 t5 16
11702// or anyext
11703// / \ |
11704// t6 t7 t8
11705// srl shl or
11706// / | / \ / \
11707// t9 t10 t11 t12 t13 t14
11708// trunc* 8 trunc* 8 and and
11709// | | / | | \
11710// t15 t16 t17 t18 t19 t20
11711// trunc* 255 srl -256
11712// | / \
11713// t15 t15 16
11714//
11715// *In this example, the truncs are from i32->i16
11716//
11717// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
11718// respectively. calculateSrcByte would find (given node) -> ultimate src &
11719 // byte position: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
11720// After finding the mapping, we can combine the tree into vperm t15, t16,
11721// 0x05000407
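// Illustrative, standalone sketch (not part of the original lowering code;
// the helper name is ours) of the byte-to-source mapping described above, for
// the simplest instance of the pattern, (zext i16 a) | (b << 16): result
// bytes 0-1 come from a and bytes 2-3 from b, which is exactly the
// information a v_perm selector has to encode.
static uint8_t sketchByteOfOr(uint16_t A, uint16_t B, unsigned I) {
  uint32_t Or = uint32_t(A) | (uint32_t(B) << 16);
  return uint8_t(Or >> (8 * I)); // I in [0, 3]
}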
11722
11723// Find the source and byte position from a node.
11724// \p DestByte is the byte position of the dest of the or that the src
11725// ultimately provides. \p SrcIndex is the byte of the src that maps to this
11726 // byte of the dest of the or. \p Depth tracks how many recursive iterations
11727 // we have performed.
11728static const std::optional<ByteProvider<SDValue>>
11729calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
11730 unsigned Depth = 0) {
11731 // We may need to recursively traverse a series of SRLs
11732 if (Depth >= 6)
11733 return std::nullopt;
11734
11735 if (Op.getValueSizeInBits() < 8)
11736 return std::nullopt;
11737
11738 if (Op.getValueType().isVector())
11739 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11740
11741 switch (Op->getOpcode()) {
11742 case ISD::TRUNCATE: {
11743 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11744 }
11745
11746 case ISD::SIGN_EXTEND:
11747 case ISD::ZERO_EXTEND:
11748 case ISD::SIGN_EXTEND_INREG: {
11749 SDValue NarrowOp = Op->getOperand(0);
11750 auto NarrowVT = NarrowOp.getValueType();
11751 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
11752 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11753 NarrowVT = VTSign->getVT();
11754 }
11755 if (!NarrowVT.isByteSized())
11756 return std::nullopt;
11757 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
11758
11759 if (SrcIndex >= NarrowByteWidth)
11760 return std::nullopt;
11761 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11762 }
11763
11764 case ISD::SRA:
11765 case ISD::SRL: {
11766 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11767 if (!ShiftOp)
11768 return std::nullopt;
11769
11770 uint64_t BitShift = ShiftOp->getZExtValue();
11771
11772 if (BitShift % 8 != 0)
11773 return std::nullopt;
11774
11775 SrcIndex += BitShift / 8;
11776
11777 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11778 }
11779
11780 default: {
11781 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11782 }
11783 }
11784 llvm_unreachable("fully handled switch");
11785}
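// Illustrative, standalone sketch (not part of the original lowering code;
// the helper name is ours) of the fact the SRA/SRL case above relies on: byte
// I of (X >> 8*K) is byte (I + K) of X, so the recursion just advances
// SrcIndex by the byte shift amount.
static uint8_t sketchSrlByte(uint32_t X, unsigned K, unsigned I) {
  return uint8_t((X >> (8 * K)) >> (8 * I)); // == byte (I + K) of X for I + K <= 3
}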
11786
11787// For a byte position in the result of an Or, traverse the tree and find the
11788// node (and the byte of the node) which ultimately provides this {Or,
11789// BytePosition}. \p Op is the operand we are currently examining. \p Index is
11790// the byte position of the Op that corresponds with the originally requested
11791 // byte of the Or. \p Depth tracks how many recursive iterations we have
11792 // performed. \p StartingIndex is the originally requested byte of the Or.
11793static const std::optional<ByteProvider<SDValue>>
11794calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
11795 unsigned StartingIndex = 0) {
11796 // Finding Src tree of RHS of or typically requires at least 1 additional
11797 // depth
11798 if (Depth > 6)
11799 return std::nullopt;
11800
11801 unsigned BitWidth = Op.getScalarValueSizeInBits();
11802 if (BitWidth % 8 != 0)
11803 return std::nullopt;
11804 if (Index > BitWidth / 8 - 1)
11805 return std::nullopt;
11806
11807 bool IsVec = Op.getValueType().isVector();
11808 switch (Op.getOpcode()) {
11809 case ISD::OR: {
11810 if (IsVec)
11811 return std::nullopt;
11812
11813 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
11814 StartingIndex);
11815 if (!RHS)
11816 return std::nullopt;
11817 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
11818 StartingIndex);
11819 if (!LHS)
11820 return std::nullopt;
11821 // A well formed Or will have two ByteProviders for each byte, one of which
11822 // is constant zero
11823 if (!LHS->isConstantZero() && !RHS->isConstantZero())
11824 return std::nullopt;
11825 if (!LHS || LHS->isConstantZero())
11826 return RHS;
11827 if (!RHS || RHS->isConstantZero())
11828 return LHS;
11829 return std::nullopt;
11830 }
11831
11832 case ISD::AND: {
11833 if (IsVec)
11834 return std::nullopt;
11835
11836 auto BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11837 if (!BitMaskOp)
11838 return std::nullopt;
11839
11840 uint32_t BitMask = BitMaskOp->getZExtValue();
11841 // Bits we expect for our StartingIndex
11842 uint32_t IndexMask = 0xFF << (Index * 8);
11843
11844 if ((IndexMask & BitMask) != IndexMask) {
11845 // If the result of the and partially provides the byte, then it
11846 // is not well formatted
11847 if (IndexMask & BitMask)
11848 return std::nullopt;
11849 return ByteProvider<SDValue>::getConstantZero();
11850 }
11851
11852 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
11853 }
11854
11855 case ISD::FSHR: {
11856 if (IsVec)
11857 return std::nullopt;
11858
11859 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
11860 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
11861 if (!ShiftOp || Op.getValueType().isVector())
11862 return std::nullopt;
11863
11864 uint64_t BitsProvided = Op.getValueSizeInBits();
11865 if (BitsProvided % 8 != 0)
11866 return std::nullopt;
11867
11868 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
11869 if (BitShift % 8)
11870 return std::nullopt;
11871
11872 uint64_t ConcatSizeInBytes = BitsProvided / 4;
11873 uint64_t ByteShift = BitShift / 8;
11874
11875 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
11876 uint64_t BytesProvided = BitsProvided / 8;
11877 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
11878 NewIndex %= BytesProvided;
11879 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
11880 }
11881
11882 case ISD::SRA:
11883 case ISD::SRL: {
11884 if (IsVec)
11885 return std::nullopt;
11886
11887 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11888 if (!ShiftOp)
11889 return std::nullopt;
11890
11891 uint64_t BitShift = ShiftOp->getZExtValue();
11892 if (BitShift % 8)
11893 return std::nullopt;
11894
11895 auto BitsProvided = Op.getScalarValueSizeInBits();
11896 if (BitsProvided % 8 != 0)
11897 return std::nullopt;
11898
11899 uint64_t BytesProvided = BitsProvided / 8;
11900 uint64_t ByteShift = BitShift / 8;
11901 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
11902 // If the byte we are trying to provide (as tracked by index) falls in this
11903 // range, then the SRL provides the byte. The byte of interest of the src of
11904 // the SRL is Index + ByteShift
11905 return BytesProvided - ByteShift > Index
11906 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
11907 Index + ByteShift)
11908 : std::nullopt;
11909 }
11910
11911 case ISD::SHL: {
11912 if (IsVec)
11913 return std::nullopt;
11914
11915 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11916 if (!ShiftOp)
11917 return std::nullopt;
11918
11919 uint64_t BitShift = ShiftOp->getZExtValue();
11920 if (BitShift % 8 != 0)
11921 return std::nullopt;
11922 uint64_t ByteShift = BitShift / 8;
11923
11924 // If we are shifting by an amount greater than (or equal to)
11925 // the index we are trying to provide, then it provides 0s. If not,
11926 // then these bytes are not definitively 0s, and the corresponding byte
11927 // of interest is Index - ByteShift of the src
11928 return Index < ByteShift
11929 ? ByteProvider<SDValue>::getConstantZero()
11930 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
11931 Depth + 1, StartingIndex);
11932 }
11933 case ISD::ANY_EXTEND:
11934 case ISD::SIGN_EXTEND:
11935 case ISD::ZERO_EXTEND:
11936 case ISD::SIGN_EXTEND_INREG:
11937 case ISD::AssertZext:
11938 case ISD::AssertSext: {
11939 if (IsVec)
11940 return std::nullopt;
11941
11942 SDValue NarrowOp = Op->getOperand(0);
11943 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
11944 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
11945 Op->getOpcode() == ISD::AssertZext ||
11946 Op->getOpcode() == ISD::AssertSext) {
11947 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11948 NarrowBitWidth = VTSign->getVT().getSizeInBits();
11949 }
11950 if (NarrowBitWidth % 8 != 0)
11951 return std::nullopt;
11952 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
11953
11954 if (Index >= NarrowByteWidth)
11955 return Op.getOpcode() == ISD::ZERO_EXTEND
11956 ? std::optional<ByteProvider<SDValue>>(
11957 ByteProvider<SDValue>::getConstantZero())
11958 : std::nullopt;
11959 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
11960 }
11961
11962 case ISD::TRUNCATE: {
11963 if (IsVec)
11964 return std::nullopt;
11965
11966 uint64_t NarrowByteWidth = BitWidth / 8;
11967
11968 if (NarrowByteWidth >= Index) {
11969 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
11970 StartingIndex);
11971 }
11972
11973 return std::nullopt;
11974 }
11975
11976 case ISD::CopyFromReg: {
11977 if (BitWidth / 8 > Index)
11978 return calculateSrcByte(Op, StartingIndex, Index);
11979
11980 return std::nullopt;
11981 }
11982
11983 case ISD::LOAD: {
11984 auto L = cast<LoadSDNode>(Op.getNode());
11985
11986 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
11987 if (NarrowBitWidth % 8 != 0)
11988 return std::nullopt;
11989 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
11990
11991 // If the width of the load does not reach the byte we are trying to provide
11992 // for and it is not a ZEXTLOAD, then the load does not provide for the byte in
11993 // question
11994 if (Index >= NarrowByteWidth) {
11995 return L->getExtensionType() == ISD::ZEXTLOAD
11996 ? std::optional<ByteProvider<SDValue>>(
11997 ByteProvider<SDValue>::getConstantZero())
11998 : std::nullopt;
11999 }
12000
12001 if (NarrowByteWidth > Index) {
12002 return calculateSrcByte(Op, StartingIndex, Index);
12003 }
12004
12005 return std::nullopt;
12006 }
12007
12008 case ISD::BSWAP: {
12009 if (IsVec)
12010 return std::nullopt;
12011
12012 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
12013 Depth + 1, StartingIndex);
12014 }
12015
12016 case ISD::EXTRACT_VECTOR_ELT: {
12017 auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12018 if (!IdxOp)
12019 return std::nullopt;
12020 auto VecIdx = IdxOp->getZExtValue();
12021 auto ScalarSize = Op.getScalarValueSizeInBits();
12022 if (ScalarSize < 32)
12023 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
12024 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
12025 StartingIndex, Index);
12026 }
12027
12028 case AMDGPUISD::PERM: {
12029 if (IsVec)
12030 return std::nullopt;
12031
12032 auto PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12033 if (!PermMask)
12034 return std::nullopt;
12035
12036 auto IdxMask =
12037 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
12038 if (IdxMask > 0x07 && IdxMask != 0x0c)
12039 return std::nullopt;
12040
12041 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
12042 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12043
12044 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
12045 : ByteProvider<SDValue>(
12046 ByteProvider<SDValue>::getConstantZero());
12047 }
12048
12049 default: {
12050 return std::nullopt;
12051 }
12052 }
12053
12054 llvm_unreachable("fully handled switch");
12055}
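// Illustrative, standalone sketch (not part of the original lowering code;
// the helper name is ours) of the fact the SHL case above relies on: byte I
// of (X << 8*K) is constant zero when I < K and is byte (I - K) of X
// otherwise, which is why it returns a constant-zero provider or recurses
// with Index - ByteShift.
static uint8_t sketchShlByte(uint32_t X, unsigned K, unsigned I) {
  return I < K ? uint8_t(0) : uint8_t(X >> (8 * (I - K))); // I, K in [0, 3]
}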
12056
12057// Returns true if the Operand is a scalar and is 16 bits
12058static bool isExtendedFrom16Bits(SDValue &Operand) {
12059
12060 switch (Operand.getOpcode()) {
12061 case ISD::ANY_EXTEND:
12062 case ISD::SIGN_EXTEND:
12063 case ISD::ZERO_EXTEND: {
12064 auto OpVT = Operand.getOperand(0).getValueType();
12065 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
12066 }
12067 case ISD::LOAD: {
12068 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
12069 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12070 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
12071 ExtType == ISD::EXTLOAD) {
12072 auto MemVT = L->getMemoryVT();
12073 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
12074 }
12075 return L->getMemoryVT().getSizeInBits() == 16;
12076 }
12077 default:
12078 return false;
12079 }
12080}
12081
12082 // Returns true if the mask matches consecutive bytes, and the first byte
12083 // begins at an even byte offset, so the pair is addressable as a 16-bit value
12084static bool addresses16Bits(int Mask) {
12085 int Low8 = Mask & 0xff;
12086 int Hi8 = (Mask & 0xff00) >> 8;
12087
12088 assert(Low8 < 8 && Hi8 < 8);
12089 // Are the bytes contiguous in the order of increasing addresses.
12090 bool IsConsecutive = (Hi8 - Low8 == 1);
12091 // Is the first byte at a location that is aligned for 16-bit instructions?
12092 // A counterexample is taking 2 consecutive bytes starting at the 8th bit.
12093 // In this case, we still need code to extract the 16 bit operand, so it
12094 // is better to use i8 v_perm
12095 bool Is16Aligned = !(Low8 % 2);
12096
12097 return IsConsecutive && Is16Aligned;
12098}
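// Illustrative, standalone sketch (not part of the original lowering code;
// the helper name is ours) of the check above: two selector bytes address a
// clean 16-bit half only if they are consecutive and start on an even byte
// boundary, e.g. (0, 1) or (6, 7) qualify while (1, 2) does not.
static bool sketchCleanly16Bit(int Low8, int Hi8) {
  return (Hi8 - Low8 == 1) && (Low8 % 2 == 0);
}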
12099
12100// Do not lower into v_perm if the operands are actually 16 bit
12101// and the selected bits (based on PermMask) correspond with two
12102// easily addressable 16 bit operands.
12103 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
12104 SDValue &OtherOp) {
12105 int Low16 = PermMask & 0xffff;
12106 int Hi16 = (PermMask & 0xffff0000) >> 16;
12107
12108 auto TempOp = peekThroughBitcasts(Op);
12109 auto TempOtherOp = peekThroughBitcasts(OtherOp);
12110
12111 auto OpIs16Bit =
12112 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
12113 if (!OpIs16Bit)
12114 return true;
12115
12116 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12117 isExtendedFrom16Bits(TempOtherOp);
12118 if (!OtherOpIs16Bit)
12119 return true;
12120
12121 // Do we cleanly address both
12122 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
12123}
12124
12125 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
12126 unsigned DWordOffset) {
12127 SDValue Ret;
12128
12129 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12130 // ByteProvider must be at least 8 bits
12131 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12132
12133 if (TypeSize <= 32)
12134 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
12135
12136 if (Src.getValueType().isVector()) {
12137 auto ScalarTySize = Src.getScalarValueSizeInBits();
12138 auto ScalarTy = Src.getValueType().getScalarType();
12139 if (ScalarTySize == 32) {
12140 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
12141 DAG.getConstant(DWordOffset, SL, MVT::i32));
12142 }
12143 if (ScalarTySize > 32) {
12144 Ret = DAG.getNode(
12145 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
12146 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12147 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12148 if (ShiftVal)
12149 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
12150 DAG.getConstant(ShiftVal, SL, MVT::i32));
12151 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12152 }
12153
12154 assert(ScalarTySize < 32);
12155 auto NumElements = TypeSize / ScalarTySize;
12156 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12157 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12158 auto NumElementsIn32 = 32 / ScalarTySize;
12159 auto NumAvailElements = DWordOffset < Trunc32Elements
12160 ? NumElementsIn32
12161 : NumElements - NormalizedTrunc;
12162
12163 SmallVector<SDValue, 4> VecSrcs;
12164 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
12165 NumAvailElements);
12166
12167 Ret = DAG.getBuildVector(
12168 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
12169 VecSrcs);
12170 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12171 }
12172
12173 /// Scalar Type
12174 auto ShiftVal = 32 * DWordOffset;
12175 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
12176 DAG.getConstant(ShiftVal, SL, MVT::i32));
12177 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12178}
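// Illustrative, standalone sketch (not part of the original lowering code;
// the helper name is ours) of the scalar path of getDWordFromOffset above:
// dword D of a wide integer is the low 32 bits of (Src >> (32 * D)).
static uint32_t sketchDWordOf(uint64_t Src, unsigned D) {
  return uint32_t(Src >> (32 * D)); // sketchDWordOf(0x1122334455667788ULL, 1) == 0x11223344
}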
12179
12180 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
12181 SelectionDAG &DAG = DCI.DAG;
12182 [[maybe_unused]] EVT VT = N->getValueType(0);
12183 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
12184
12185 // VT is known to be MVT::i32, so we need to provide 4 bytes.
12186 assert(VT == MVT::i32);
12187 for (int i = 0; i < 4; i++) {
12188 // Find the ByteProvider that provides the ith byte of the result of OR
12189 std::optional<ByteProvider<SDValue>> P =
12190 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
12191 // TODO support constantZero
12192 if (!P || P->isConstantZero())
12193 return SDValue();
12194
12195 PermNodes.push_back(*P);
12196 }
12197 if (PermNodes.size() != 4)
12198 return SDValue();
12199
12200 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12201 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12202 uint64_t PermMask = 0x00000000;
12203 for (size_t i = 0; i < PermNodes.size(); i++) {
12204 auto PermOp = PermNodes[i];
12205 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
12206 // by sizeof(Src2) = 4
12207 int SrcByteAdjust = 4;
12208
12209 // If the Src uses a byte from a different DWORD, then it corresponds
12210 // with a different source
12211 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
12212 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12213 if (SecondSrc)
12214 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12215 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12216 return SDValue();
12217
12218 // Set the index of the second distinct Src node
12219 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12220 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12221 SrcByteAdjust = 0;
12222 }
12223 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12225 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12226 }
12227 SDLoc DL(N);
12228 SDValue Op = *PermNodes[FirstSrc.first].Src;
12229 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
12230 assert(Op.getValueSizeInBits() == 32);
12231
12232 // Check that we are not just extracting the bytes in order from an op
12233 if (!SecondSrc) {
12234 int Low16 = PermMask & 0xffff;
12235 int Hi16 = (PermMask & 0xffff0000) >> 16;
12236
12237 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
12238 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
12239
12240 // The perm op would really just produce Op. So combine into Op
12241 if (WellFormedLow && WellFormedHi)
12242 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
12243 }
12244
12245 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
12246
12247 if (SecondSrc) {
12248 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
12249 assert(OtherOp.getValueSizeInBits() == 32);
12250 }
12251
12252 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
12253
12254 assert(Op.getValueType().isByteSized() &&
12255 OtherOp.getValueType().isByteSized());
12256
12257 // If the ultimate src is less than 32 bits, then we will only be
12258 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
12259 // CalculateByteProvider would not have returned Op as source if we
12260 // used a byte that is outside its ValueType. Thus, we are free to
12261 // ANY_EXTEND as the extended bits are don't-cares.
12262 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
12263 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
12264
12265 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
12266 DAG.getConstant(PermMask, DL, MVT::i32));
12267 }
12268 return SDValue();
12269}
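// Illustrative, standalone sketch (not part of the original lowering code;
// the helper name is ours) of how matchPERM above assembles the selector:
// result byte I takes (SrcOffset % 4), plus 4 when the byte comes from the
// first PERM operand (the "Src1" that is offset by sizeof(Src2) == 4 above).
static uint32_t sketchBuildPermMask(const unsigned SrcOffset[4],
                                    const bool FromFirstSrc[4]) {
  uint32_t Mask = 0;
  for (unsigned I = 0; I < 4; ++I)
    Mask |= ((SrcOffset[I] % 4) + (FromFirstSrc[I] ? 4u : 0u)) << (I * 8);
  return Mask;
}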
12270
12271SDValue SITargetLowering::performOrCombine(SDNode *N,
12272 DAGCombinerInfo &DCI) const {
12273 SelectionDAG &DAG = DCI.DAG;
12274 SDValue LHS = N->getOperand(0);
12275 SDValue RHS = N->getOperand(1);
12276
12277 EVT VT = N->getValueType(0);
12278 if (VT == MVT::i1) {
12279 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
12280 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12281 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
12282 SDValue Src = LHS.getOperand(0);
12283 if (Src != RHS.getOperand(0))
12284 return SDValue();
12285
12286 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
12287 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
12288 if (!CLHS || !CRHS)
12289 return SDValue();
12290
12291 // Only 10 bits are used.
12292 static const uint32_t MaxMask = 0x3ff;
12293
12294 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
12295 SDLoc DL(N);
12296 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
12297 Src, DAG.getConstant(NewMask, DL, MVT::i32));
12298 }
12299
12300 return SDValue();
12301 }
12302
12303 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12304 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
12305 LHS.getOpcode() == AMDGPUISD::PERM &&
12306 isa<ConstantSDNode>(LHS.getOperand(2))) {
12307 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
12308 if (!Sel)
12309 return SDValue();
12310
12311 Sel |= LHS.getConstantOperandVal(2);
12312 SDLoc DL(N);
12313 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12314 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
12315 }
12316
12317 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
12318 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12319 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
12320 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12321
12322 // If all the uses of an or need to extract the individual elements, do not
12323 // attempt to lower into v_perm
12324 auto usesCombinedOperand = [](SDNode *OrUse) {
12325 // If we have any non-vectorized use, then it is a candidate for v_perm
12326 if (OrUse->getOpcode() != ISD::BITCAST ||
12327 !OrUse->getValueType(0).isVector())
12328 return true;
12329
12330 // If we have any non-vectorized use, then it is a candidate for v_perm
12331 for (auto VUse : OrUse->uses()) {
12332 if (!VUse->getValueType(0).isVector())
12333 return true;
12334
12335 // If the use of a vector is a store, then combining via a v_perm
12336 // is beneficial.
12337 // TODO -- whitelist more uses
12338 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
12339 if (VUse->getOpcode() == VectorwiseOp)
12340 return true;
12341 }
12342 return false;
12343 };
12344
12345 if (!any_of(N->uses(), usesCombinedOperand))
12346 return SDValue();
12347
12348 uint32_t LHSMask = getPermuteMask(LHS);
12349 uint32_t RHSMask = getPermuteMask(RHS);
12350
12351 if (LHSMask != ~0u && RHSMask != ~0u) {
12352 // Canonicalize the expression in an attempt to have fewer unique masks
12353 // and therefore fewer registers used to hold the masks.
12354 if (LHSMask > RHSMask) {
12355 std::swap(LHSMask, RHSMask);
12356 std::swap(LHS, RHS);
12357 }
12358
12359 // Select 0xc for each lane used from a source operand. A zero byte has the
12360 // 0xc mask set, an all-ones byte has 0xff in the mask, and actual lanes are in the 0-3 range.
12361 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12362 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12363
12364 // Check if we need to combine values from two sources within a byte.
12365 if (!(LHSUsedLanes & RHSUsedLanes) &&
12366 // If we select high and lower word keep it for SDWA.
12367 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
12368 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12369 // Kill zero bytes selected by other mask. Zero value is 0xc.
12370 LHSMask &= ~RHSUsedLanes;
12371 RHSMask &= ~LHSUsedLanes;
12372 // Add 4 to each active LHS lane
12373 LHSMask |= LHSUsedLanes & 0x04040404;
12374 // Combine masks
12375 uint32_t Sel = LHSMask | RHSMask;
12376 SDLoc DL(N);
12377
12378 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
12379 LHS.getOperand(0), RHS.getOperand(0),
12380 DAG.getConstant(Sel, DL, MVT::i32));
12381 }
12382 }
12383 if (LHSMask == ~0u || RHSMask == ~0u) {
12384 if (SDValue Perm = matchPERM(N, DCI))
12385 return Perm;
12386 }
12387 }
12388
12389 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12390 return SDValue();
12391
12392 // TODO: This could be a generic combine with a predicate for extracting the
12393 // high half of an integer being free.
12394
12395 // (or i64:x, (zero_extend i32:y)) ->
12396 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
12397 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
12398 RHS.getOpcode() != ISD::ZERO_EXTEND)
12399 std::swap(LHS, RHS);
12400
12401 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
12402 SDValue ExtSrc = RHS.getOperand(0);
12403 EVT SrcVT = ExtSrc.getValueType();
12404 if (SrcVT == MVT::i32) {
12405 SDLoc SL(N);
12406 SDValue LowLHS, HiBits;
12407 std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
12408 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
12409
12410 DCI.AddToWorklist(LowOr.getNode());
12411 DCI.AddToWorklist(HiBits.getNode());
12412
12413 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
12414 LowOr, HiBits);
12415 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
12416 }
12417 }
12418
12419 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
12420 if (CRHS) {
12421 if (SDValue Split
12422 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
12423 N->getOperand(0), CRHS))
12424 return Split;
12425 }
12426
12427 return SDValue();
12428}
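// Illustrative, standalone sketch (not part of the original lowering code;
// the helper name is ours) of why the (or i64:x, (zero_extend i32:y)) rewrite
// above is sound: only the low half of x can change, the high half is passed
// through unmodified.
static uint64_t sketchOrWithZext(uint64_t X, uint32_t Y) {
  uint32_t Lo = uint32_t(X) | Y;    // the 32-bit OR that is actually emitted
  uint32_t Hi = uint32_t(X >> 32);  // hi_32(x), untouched
  return (uint64_t(Hi) << 32) | Lo; // equals X | uint64_t(Y)
}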
12429
12430SDValue SITargetLowering::performXorCombine(SDNode *N,
12431 DAGCombinerInfo &DCI) const {
12432 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
12433 return RV;
12434
12435 SDValue LHS = N->getOperand(0);
12436 SDValue RHS = N->getOperand(1);
12437
12438 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
12439 SelectionDAG &DAG = DCI.DAG;
12440
12441 EVT VT = N->getValueType(0);
12442 if (CRHS && VT == MVT::i64) {
12443 if (SDValue Split
12444 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
12445 return Split;
12446 }
12447
12448 // Make sure to apply the 64-bit constant splitting fold before trying to fold
12449 // fneg-like xors into 64-bit select.
12450 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
12451 // This looks like an fneg, try to fold as a source modifier.
12452 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
12453 shouldFoldFNegIntoSrc(N, LHS)) {
12454 // xor (select c, a, b), 0x80000000 ->
12455 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
12456 SDLoc DL(N);
12457 SDValue CastLHS =
12458 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
12459 SDValue CastRHS =
12460 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
12461 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
12462 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
12463 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
12464 LHS->getOperand(0), FNegLHS, FNegRHS);
12465 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
12466 }
12467 }
12468
12469 return SDValue();
12470}
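// Illustrative, standalone sketch (not part of the original lowering code;
// the helper name is ours) of the bit identity behind the xor combine above:
// xor with 0x80000000 flips exactly the f32 sign bit, i.e. it is fneg
// expressed on the integer bits (this also holds for zeros and NaN payloads).
static uint32_t sketchFNegBits(uint32_t F32Bits) {
  return F32Bits ^ 0x80000000u;
}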
12471
12472SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
12473 DAGCombinerInfo &DCI) const {
12474 if (!Subtarget->has16BitInsts() ||
12475 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
12476 return SDValue();
12477
12478 EVT VT = N->getValueType(0);
12479 if (VT != MVT::i32)
12480 return SDValue();
12481
12482 SDValue Src = N->getOperand(0);
12483 if (Src.getValueType() != MVT::i16)
12484 return SDValue();
12485
12486 return SDValue();
12487}
12488
12489SDValue
12490SITargetLowering::performSignExtendInRegCombine(SDNode *N,
12491 DAGCombinerInfo &DCI) const {
12492 SDValue Src = N->getOperand(0);
12493 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
12494
12495 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
12496 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
12497 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
12498 VTSign->getVT() == MVT::i8) ||
12499 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
12500 VTSign->getVT() == MVT::i16))) {
12501 assert(Subtarget->hasScalarSubwordLoads() &&
12502 "s_buffer_load_{u8, i8} are supported "
12503 "in GFX12 (or newer) architectures.");
12504 EVT VT = Src.getValueType();
12505 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
12506 ? AMDGPUISD::SBUFFER_LOAD_BYTE
12507 : AMDGPUISD::SBUFFER_LOAD_SHORT;
12508 SDLoc DL(N);
12509 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12510 SDValue Ops[] = {
12511 Src.getOperand(0), // source register
12512 Src.getOperand(1), // offset
12513 Src.getOperand(2) // cachePolicy
12514 };
12515 auto *M = cast<MemSDNode>(Src);
12516 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12517 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12518 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
12519 return LoadVal;
12520 } else if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
12521 VTSign->getVT() == MVT::i8) ||
12522 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
12523 VTSign->getVT() == MVT::i16)) &&
12524 Src.hasOneUse()) {
12525 auto *M = cast<MemSDNode>(Src);
12526 SDValue Ops[] = {
12527 Src.getOperand(0), // Chain
12528 Src.getOperand(1), // rsrc
12529 Src.getOperand(2), // vindex
12530 Src.getOperand(3), // voffset
12531 Src.getOperand(4), // soffset
12532 Src.getOperand(5), // offset
12533 Src.getOperand(6),
12534 Src.getOperand(7)
12535 };
12536 // replace with BUFFER_LOAD_BYTE/SHORT
12537 SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
12538 Src.getOperand(0).getValueType());
12539 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ?
12540 AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT;
12541 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N),
12542 ResList,
12543 Ops, M->getMemoryVT(),
12544 M->getMemOperand());
12545 return DCI.DAG.getMergeValues({BufferLoadSignExt,
12546 BufferLoadSignExt.getValue(1)}, SDLoc(N));
12547 }
12548 return SDValue();
12549}
12550
12551SDValue SITargetLowering::performClassCombine(SDNode *N,
12552 DAGCombinerInfo &DCI) const {
12553 SelectionDAG &DAG = DCI.DAG;
12554 SDValue Mask = N->getOperand(1);
12555
12556 // fp_class x, 0 -> false
12557 if (isNullConstant(Mask))
12558 return DAG.getConstant(0, SDLoc(N), MVT::i1);
12559
12560 if (N->getOperand(0).isUndef())
12561 return DAG.getUNDEF(MVT::i1);
12562
12563 return SDValue();
12564}
12565
12566SDValue SITargetLowering::performRcpCombine(SDNode *N,
12567 DAGCombinerInfo &DCI) const {
12568 EVT VT = N->getValueType(0);
12569 SDValue N0 = N->getOperand(0);
12570
12571 if (N0.isUndef()) {
12572 return DCI.DAG.getConstantFP(
12573 APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT)), SDLoc(N),
12574 VT);
12575 }
12576
12577 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
12578 N0.getOpcode() == ISD::SINT_TO_FP)) {
12579 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
12580 N->getFlags());
12581 }
12582
12583 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
12584 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
12585 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
12586 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT,
12587 N0.getOperand(0), N->getFlags());
12588 }
12589
12589
12590 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
12591}
12592
12593 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
12594 unsigned MaxDepth) const {
12595 unsigned Opcode = Op.getOpcode();
12596 if (Opcode == ISD::FCANONICALIZE)
12597 return true;
12598
12599 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
12600 const auto &F = CFP->getValueAPF();
12601 if (F.isNaN() && F.isSignaling())
12602 return false;
12603 if (!F.isDenormal())
12604 return true;
12605
12606 DenormalMode Mode =
12607 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
12608 return Mode == DenormalMode::getIEEE();
12609 }
12610
12611 // If source is a result of another standard FP operation it is already in
12612 // canonical form.
12613 if (MaxDepth == 0)
12614 return false;
12615
12616 switch (Opcode) {
12617 // These will flush denorms if required.
12618 case ISD::FADD:
12619 case ISD::FSUB:
12620 case ISD::FMUL:
12621 case ISD::FCEIL:
12622 case ISD::FFLOOR:
12623 case ISD::FMA:
12624 case ISD::FMAD:
12625 case ISD::FSQRT:
12626 case ISD::FDIV:
12627 case ISD::FREM:
12628 case ISD::FP_ROUND:
12629 case ISD::FP_EXTEND:
12630 case ISD::FP16_TO_FP:
12631 case ISD::FP_TO_FP16:
12632 case ISD::BF16_TO_FP:
12633 case ISD::FP_TO_BF16:
12634 case ISD::FLDEXP:
12637 case AMDGPUISD::RCP:
12638 case AMDGPUISD::RSQ:
12642 case AMDGPUISD::LOG:
12643 case AMDGPUISD::EXP:
12647 case AMDGPUISD::FRACT:
12654 case AMDGPUISD::SIN_HW:
12655 case AMDGPUISD::COS_HW:
12656 return true;
12657
12658 // It can/will be lowered or combined as a bit operation.
12659 // Need to check their input recursively to handle.
12660 case ISD::FNEG:
12661 case ISD::FABS:
12662 case ISD::FCOPYSIGN:
12663 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12664
12665 case ISD::AND:
12666 if (Op.getValueType() == MVT::i32) {
12667 // Be careful as we only know it is a bitcast floating point type. It
12668 // could be f32, v2f16, we have no way of knowing. Luckily the constant
12669 // value that we optimize for, which comes up in fp32 to bf16 conversions,
12670 // is valid to optimize for all types.
12671 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
12672 if (RHS->getZExtValue() == 0xffff0000) {
12673 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12674 }
12675 }
12676 }
12677 break;
12678
12679 case ISD::FSIN:
12680 case ISD::FCOS:
12681 case ISD::FSINCOS:
12682 return Op.getValueType().getScalarType() != MVT::f16;
12683
12684 case ISD::FMINNUM:
12685 case ISD::FMAXNUM:
12686 case ISD::FMINNUM_IEEE:
12687 case ISD::FMAXNUM_IEEE:
12688 case ISD::FMINIMUM:
12689 case ISD::FMAXIMUM:
12690 case AMDGPUISD::CLAMP:
12691 case AMDGPUISD::FMED3:
12692 case AMDGPUISD::FMAX3:
12693 case AMDGPUISD::FMIN3:
12695 case AMDGPUISD::FMINIMUM3: {
12696 // FIXME: Shouldn't treat the generic operations differently based on these.
12697 // However, we aren't really required to flush the result from
12698 // minnum/maxnum.
12699
12700 // snans will be quieted, so we only need to worry about denormals.
12701 if (Subtarget->supportsMinMaxDenormModes() ||
12702 // FIXME: denormalsEnabledForType is broken for dynamic
12703 denormalsEnabledForType(DAG, Op.getValueType()))
12704 return true;
12705
12706 // Flushing may be required.
12707 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
12708 // targets need to check their input recursively.
12709
12710 // FIXME: Does this apply with clamp? It's implemented with max.
12711 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
12712 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
12713 return false;
12714 }
12715
12716 return true;
12717 }
12718 case ISD::SELECT: {
12719 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
12720 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
12721 }
12722 case ISD::BUILD_VECTOR: {
12723 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
12724 SDValue SrcOp = Op.getOperand(i);
12725 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
12726 return false;
12727 }
12728
12729 return true;
12730 }
12731 case ISD::EXTRACT_VECTOR_ELT:
12732 case ISD::EXTRACT_SUBVECTOR: {
12733 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12734 }
12735 case ISD::INSERT_VECTOR_ELT: {
12736 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
12737 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
12738 }
12739 case ISD::UNDEF:
12740 // Could be anything.
12741 return false;
12742
12743 case ISD::BITCAST:
12744 // TODO: This is incorrect as it loses track of the operand's type. We may
12745 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
12746 // same bits that are canonicalized in one type need not be in the other.
12747 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12748 case ISD::TRUNCATE: {
12749 // Hack round the mess we make when legalizing extract_vector_elt
12750 if (Op.getValueType() == MVT::i16) {
12751 SDValue TruncSrc = Op.getOperand(0);
12752 if (TruncSrc.getValueType() == MVT::i32 &&
12753 TruncSrc.getOpcode() == ISD::BITCAST &&
12754 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
12755 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
12756 }
12757 }
12758 return false;
12759 }
12760 case ISD::INTRINSIC_WO_CHAIN: {
12761 unsigned IntrinsicID = Op.getConstantOperandVal(0);
12762 // TODO: Handle more intrinsics
12763 switch (IntrinsicID) {
12764 case Intrinsic::amdgcn_cvt_pkrtz:
12765 case Intrinsic::amdgcn_cubeid:
12766 case Intrinsic::amdgcn_frexp_mant:
12767 case Intrinsic::amdgcn_fdot2:
12768 case Intrinsic::amdgcn_rcp:
12769 case Intrinsic::amdgcn_rsq:
12770 case Intrinsic::amdgcn_rsq_clamp:
12771 case Intrinsic::amdgcn_rcp_legacy:
12772 case Intrinsic::amdgcn_rsq_legacy:
12773 case Intrinsic::amdgcn_trig_preop:
12774 case Intrinsic::amdgcn_log:
12775 case Intrinsic::amdgcn_exp2:
12776 case Intrinsic::amdgcn_sqrt:
12777 return true;
12778 default:
12779 break;
12780 }
12781
12782 break;
12783 }
12784 default:
12785 break;
12786 }
12787
12788 // FIXME: denormalsEnabledForType is broken for dynamic
12789 return denormalsEnabledForType(DAG, Op.getValueType()) &&
12790 DAG.isKnownNeverSNaN(Op);
12791}
12792
12793 bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
12794 unsigned MaxDepth) const {
12795 const MachineRegisterInfo &MRI = MF.getRegInfo();
12796 MachineInstr *MI = MRI.getVRegDef(Reg);
12797 unsigned Opcode = MI->getOpcode();
12798
12799 if (Opcode == AMDGPU::G_FCANONICALIZE)
12800 return true;
12801
12802 std::optional<FPValueAndVReg> FCR;
12803 // Constant splat (can be padded with undef) or scalar constant.
12804 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
12805 if (FCR->Value.isSignaling())
12806 return false;
12807 if (!FCR->Value.isDenormal())
12808 return true;
12809
12810 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
12811 return Mode == DenormalMode::getIEEE();
12812 }
12813
12814 if (MaxDepth == 0)
12815 return false;
12816
12817 switch (Opcode) {
12818 case AMDGPU::G_FADD:
12819 case AMDGPU::G_FSUB:
12820 case AMDGPU::G_FMUL:
12821 case AMDGPU::G_FCEIL:
12822 case AMDGPU::G_FFLOOR:
12823 case AMDGPU::G_FRINT:
12824 case AMDGPU::G_FNEARBYINT:
12825 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
12826 case AMDGPU::G_INTRINSIC_TRUNC:
12827 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
12828 case AMDGPU::G_FMA:
12829 case AMDGPU::G_FMAD:
12830 case AMDGPU::G_FSQRT:
12831 case AMDGPU::G_FDIV:
12832 case AMDGPU::G_FREM:
12833 case AMDGPU::G_FPOW:
12834 case AMDGPU::G_FPEXT:
12835 case AMDGPU::G_FLOG:
12836 case AMDGPU::G_FLOG2:
12837 case AMDGPU::G_FLOG10:
12838 case AMDGPU::G_FPTRUNC:
12839 case AMDGPU::G_AMDGPU_RCP_IFLAG:
12840 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
12841 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
12842 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
12843 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
12844 return true;
12845 case AMDGPU::G_FNEG:
12846 case AMDGPU::G_FABS:
12847 case AMDGPU::G_FCOPYSIGN:
12848 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
12849 case AMDGPU::G_FMINNUM:
12850 case AMDGPU::G_FMAXNUM:
12851 case AMDGPU::G_FMINNUM_IEEE:
12852 case AMDGPU::G_FMAXNUM_IEEE:
12853 case AMDGPU::G_FMINIMUM:
12854 case AMDGPU::G_FMAXIMUM: {
12855 if (Subtarget->supportsMinMaxDenormModes() ||
12856 // FIXME: denormalsEnabledForType is broken for dynamic
12857 denormalsEnabledForType(MRI.getType(Reg), MF))
12858 return true;
12859
12860 [[fallthrough]];
12861 }
12862 case AMDGPU::G_BUILD_VECTOR:
12863 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
12864 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
12865 return false;
12866 return true;
12867 case AMDGPU::G_INTRINSIC:
12868 case AMDGPU::G_INTRINSIC_CONVERGENT:
12869 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
12870 case Intrinsic::amdgcn_fmul_legacy:
12871 case Intrinsic::amdgcn_fmad_ftz:
12872 case Intrinsic::amdgcn_sqrt:
12873 case Intrinsic::amdgcn_fmed3:
12874 case Intrinsic::amdgcn_sin:
12875 case Intrinsic::amdgcn_cos:
12876 case Intrinsic::amdgcn_log:
12877 case Intrinsic::amdgcn_exp2:
12878 case Intrinsic::amdgcn_log_clamp:
12879 case Intrinsic::amdgcn_rcp:
12880 case Intrinsic::amdgcn_rcp_legacy:
12881 case Intrinsic::amdgcn_rsq:
12882 case Intrinsic::amdgcn_rsq_clamp:
12883 case Intrinsic::amdgcn_rsq_legacy:
12884 case Intrinsic::amdgcn_div_scale:
12885 case Intrinsic::amdgcn_div_fmas:
12886 case Intrinsic::amdgcn_div_fixup:
12887 case Intrinsic::amdgcn_fract:
12888 case Intrinsic::amdgcn_cvt_pkrtz:
12889 case Intrinsic::amdgcn_cubeid:
12890 case Intrinsic::amdgcn_cubema:
12891 case Intrinsic::amdgcn_cubesc:
12892 case Intrinsic::amdgcn_cubetc:
12893 case Intrinsic::amdgcn_frexp_mant:
12894 case Intrinsic::amdgcn_fdot2:
12895 case Intrinsic::amdgcn_trig_preop:
12896 return true;
12897 default:
12898 break;
12899 }
12900
12901 [[fallthrough]];
12902 default:
12903 return false;
12904 }
12905
12906 llvm_unreachable("invalid operation");
12907}
12908
12909// Constant fold canonicalize.
12910SDValue SITargetLowering::getCanonicalConstantFP(
12911 SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
12912 // Flush denormals to 0 if not enabled.
12913 if (C.isDenormal()) {
12914 DenormalMode Mode =
12915 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
12916 if (Mode == DenormalMode::getPreserveSign()) {
12917 return DAG.getConstantFP(
12918 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
12919 }
12920
12921 if (Mode != DenormalMode::getIEEE())
12922 return SDValue();
12923 }
12924
12925 if (C.isNaN()) {
12926 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
12927 if (C.isSignaling()) {
12928 // Quiet a signaling NaN.
12929 // FIXME: Is this supposed to preserve payload bits?
12930 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
12931 }
12932
12933 // Make sure it is the canonical NaN bitpattern.
12934 //
12935 // TODO: Can we use -1 as the canonical NaN value since it's an inline
12936 // immediate?
12937 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
12938 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
12939 }
12940
12941 // Already canonical.
12942 return DAG.getConstantFP(C, SL, VT);
12943}
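// Illustrative, standalone sketch (not part of the original lowering code;
// the helper name is ours) of the constant canonicalization above, written
// out on raw f32 bits for the preserve-sign denormal mode: denormals flush to
// a same-signed zero and any NaN becomes the canonical quiet-NaN pattern.
static uint32_t sketchCanonicalF32Bits(uint32_t Bits) {
  uint32_t Exp = (Bits >> 23) & 0xff;
  uint32_t Mant = Bits & 0x7fffff;
  if (Exp == 0 && Mant != 0)    // denormal -> +/-0.0
    return Bits & 0x80000000u;
  if (Exp == 0xff && Mant != 0) // any NaN -> canonical quiet NaN
    return 0x7fc00000u;
  return Bits;                  // already canonical
}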
12944
12945 static bool vectorEltWillFoldAway(SDValue Op) {
12946 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
12947}
12948
12949SDValue SITargetLowering::performFCanonicalizeCombine(
12950 SDNode *N,
12951 DAGCombinerInfo &DCI) const {
12952 SelectionDAG &DAG = DCI.DAG;
12953 SDValue N0 = N->getOperand(0);
12954 EVT VT = N->getValueType(0);
12955
12956 // fcanonicalize undef -> qnan
12957 if (N0.isUndef()) {
12958 APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
12959 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
12960 }
12961
12962 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
12963 EVT VT = N->getValueType(0);
12964 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
12965 }
12966
12967 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
12968 // (fcanonicalize k)
12969 //
12970 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
12971
12972 // TODO: This could be better with wider vectors that will be split to v2f16,
12973 // and to consider uses since there aren't that many packed operations.
12974 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
12975 isTypeLegal(MVT::v2f16)) {
12976 SDLoc SL(N);
12977 SDValue NewElts[2];
12978 SDValue Lo = N0.getOperand(0);
12979 SDValue Hi = N0.getOperand(1);
12980 EVT EltVT = Lo.getValueType();
12981
12983 for (unsigned I = 0; I != 2; ++I) {
12984 SDValue Op = N0.getOperand(I);
12985 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
12986 NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
12987 CFP->getValueAPF());
12988 } else if (Op.isUndef()) {
12989 // Handled below based on what the other operand is.
12990 NewElts[I] = Op;
12991 } else {
12992 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
12993 }
12994 }
12995
12996 // If one half is undef, and one is constant, prefer a splat vector rather
12997 // than the normal qNaN. If it's a register, prefer 0.0 since that's
12998 // cheaper to use and may be free with a packed operation.
12999 if (NewElts[0].isUndef()) {
13001 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
13002 NewElts[1] : DAG.getConstantFP(0.0f, SL, EltVT);
13003 }
13004
13005 if (NewElts[1].isUndef()) {
13006 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
13007 NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
13008 }
13009
13010 return DAG.getBuildVector(VT, SL, NewElts);
13011 }
13012 }
13013
13014 return SDValue();
13015}
13016
13017static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
13018 switch (Opc) {
13019 case ISD::FMAXNUM:
13020 case ISD::FMAXNUM_IEEE:
13021 return AMDGPUISD::FMAX3;
13022 case ISD::FMAXIMUM:
13023 return AMDGPUISD::FMAXIMUM3;
13024 case ISD::SMAX:
13025 return AMDGPUISD::SMAX3;
13026 case ISD::UMAX:
13027 return AMDGPUISD::UMAX3;
13028 case ISD::FMINNUM:
13029 case ISD::FMINNUM_IEEE:
13030 return AMDGPUISD::FMIN3;
13031 case ISD::FMINIMUM:
13032 return AMDGPUISD::FMINIMUM3;
13033 case ISD::SMIN:
13034 return AMDGPUISD::SMIN3;
13035 case ISD::UMIN:
13036 return AMDGPUISD::UMIN3;
13037 default:
13038 llvm_unreachable("Not a min/max opcode");
13039 }
13040}
13041
13042SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
13043 const SDLoc &SL, SDValue Src,
13044 SDValue MinVal,
13045 SDValue MaxVal,
13046 bool Signed) const {
13047
13048 // med3 comes from
13049 // min(max(x, K0), K1), K0 < K1
13050 // max(min(x, K0), K1), K1 < K0
13051 //
13052 // "MinVal" and "MaxVal" respectively refer to the rhs of the
13053 // min/max op.
13054 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
13055 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
13056
13057 if (!MinK || !MaxK)
13058 return SDValue();
13059
13060 if (Signed) {
13061 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
13062 return SDValue();
13063 } else {
13064 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
13065 return SDValue();
13066 }
13067
13068 EVT VT = MinK->getValueType(0);
13069 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
13070 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
13071 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
13072
13073 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
13074 // not available, but this is unlikely to be profitable as constants
13075 // will often need to be materialized & extended, especially on
13076 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
13077 return SDValue();
13078}
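// Illustrative, standalone sketch (not part of the original lowering code;
// the helper name is ours) of the pattern performIntMed3ImmCombine above
// matches: for K0 < K1, min(max(x, K0), K1) picks the median of {x, K0, K1},
// which is exactly what v_med3 computes.
static int32_t sketchSMed3(int32_t X, int32_t K0, int32_t K1) {
  int32_t Lo = X > K0 ? X : K0; // smax(x, K0)
  return Lo < K1 ? Lo : K1;     // smin(smax(x, K0), K1); requires K0 < K1
}
// e.g. sketchSMed3(-5, 0, 10) == 0, sketchSMed3(3, 0, 10) == 3,
//      sketchSMed3(42, 0, 10) == 10.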
13079
13081 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
13082 return C;
13083
13084 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
13085 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
13086 return C;
13087 }
13088
13089 return nullptr;
13090}
13091
13092SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
13093 const SDLoc &SL,
13094 SDValue Op0,
13095 SDValue Op1) const {
13096 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
13097 if (!K1)
13098 return SDValue();
13099
13100 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
13101 if (!K0)
13102 return SDValue();
13103
13104 // Ordered >= (although NaN inputs should have folded away by now).
13105 if (K0->getValueAPF() > K1->getValueAPF())
13106 return SDValue();
13107
13108 const MachineFunction &MF = DAG.getMachineFunction();
13109 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13110
13111 // TODO: Check IEEE bit enabled?
13112 EVT VT = Op0.getValueType();
13113 if (Info->getMode().DX10Clamp) {
13114 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
13115 // hardware fmed3 behavior converting to a min.
13116 // FIXME: Should this be allowing -0.0?
13117 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
13118 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
13119 }
13120
13121 // med3 for f16 is only available on gfx9+, and not available for v2f16.
13122 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
13123 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
13124 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
13125 // then give the other result, which is different from med3 with a NaN
13126 // input.
13127 SDValue Var = Op0.getOperand(0);
13128 if (!DAG.isKnownNeverSNaN(Var))
13129 return SDValue();
13130
13131 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13132
13133 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
13134 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
13135 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
13136 Var, SDValue(K0, 0), SDValue(K1, 0));
13137 }
13138 }
13139
13140 return SDValue();
13141}
13142
13143/// \return true if the subtarget supports minimum3 and maximum3 with the given
13144/// base min/max opcode \p Opc for type \p VT.
13145static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
13146 EVT VT) {
13147 switch (Opc) {
13148 case ISD::FMINNUM:
13149 case ISD::FMAXNUM:
13150 case ISD::FMINNUM_IEEE:
13151 case ISD::FMAXNUM_IEEE:
13154 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
13155 case ISD::FMINIMUM:
13156 case ISD::FMAXIMUM:
13157 return (VT == MVT::f32 || VT == MVT::f16) && Subtarget.hasIEEEMinMax3();
13158 case ISD::SMAX:
13159 case ISD::SMIN:
13160 case ISD::UMAX:
13161 case ISD::UMIN:
13162 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
13163 default:
13164 return false;
13165 }
13166
13167 llvm_unreachable("not a min/max opcode");
13168}
13169
13170SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
13171 DAGCombinerInfo &DCI) const {
13172 SelectionDAG &DAG = DCI.DAG;
13173
13174 EVT VT = N->getValueType(0);
13175 unsigned Opc = N->getOpcode();
13176 SDValue Op0 = N->getOperand(0);
13177 SDValue Op1 = N->getOperand(1);
13178
13179 // Only do this if the inner op has one use since this will just increase
13180 // register pressure for no benefit.
13181
13182 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
13183 // max(max(a, b), c) -> max3(a, b, c)
13184 // min(min(a, b), c) -> min3(a, b, c)
13185 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
13186 SDLoc DL(N);
13187 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
13188 DL,
13189 N->getValueType(0),
13190 Op0.getOperand(0),
13191 Op0.getOperand(1),
13192 Op1);
13193 }
13194
13195 // Try commuted.
13196 // max(a, max(b, c)) -> max3(a, b, c)
13197 // min(a, min(b, c)) -> min3(a, b, c)
13198 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
13199 SDLoc DL(N);
13200 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
13201 DL,
13202 N->getValueType(0),
13203 Op0,
13204 Op1.getOperand(0),
13205 Op1.getOperand(1));
13206 }
13207 }
13208
13209 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
13210 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
13211 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
13212 if (SDValue Med3 = performIntMed3ImmCombine(
13213 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
13214 return Med3;
13215 }
13216 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
13217 if (SDValue Med3 = performIntMed3ImmCombine(
13218 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
13219 return Med3;
13220 }
13221
13222 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
13223 if (SDValue Med3 = performIntMed3ImmCombine(
13224 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
13225 return Med3;
13226 }
13227 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
13228 if (SDValue Med3 = performIntMed3ImmCombine(
13229 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
13230 return Med3;
13231 }
13232
13233 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
13234 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
13235 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
13236 (Opc == AMDGPUISD::FMIN_LEGACY &&
13237 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
13238 (VT == MVT::f32 || VT == MVT::f64 ||
13239 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
13240 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
13241 Op0.hasOneUse()) {
13242 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
13243 return Res;
13244 }
13245
13246 return SDValue();
13247}
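// Illustrative, standalone sketch (not part of the original lowering code;
// the helper name is ours) of the reassociation performMinMaxCombine above
// exploits: max(max(a, b), c) and max(a, max(b, c)) both compute the same
// three-way maximum, so either shape can become v_max3 (and likewise for min).
static int32_t sketchMax3(int32_t A, int32_t B, int32_t C) {
  int32_t AB = A > B ? A : B;
  return AB > C ? AB : C;
}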
13248
13249 static bool isClampZeroToOne(SDValue A, SDValue B) {
13250 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
13251 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
13252 // FIXME: Should this be allowing -0.0?
13253 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13254 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
13255 }
13256 }
13257
13258 return false;
13259}
13260
13261// FIXME: Should only worry about snans for version with chain.
13262SDValue SITargetLowering::performFMed3Combine(SDNode *N,
13263 DAGCombinerInfo &DCI) const {
13264 EVT VT = N->getValueType(0);
13265 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
13266 // NaNs. With a NaN input, the order of the operands may change the result.
13267
13268 SelectionDAG &DAG = DCI.DAG;
13269 SDLoc SL(N);
13270
13271 SDValue Src0 = N->getOperand(0);
13272 SDValue Src1 = N->getOperand(1);
13273 SDValue Src2 = N->getOperand(2);
13274
13275 if (isClampZeroToOne(Src0, Src1)) {
13276 // const_a, const_b, x -> clamp is safe in all cases including signaling
13277 // nans.
13278 // FIXME: Should this be allowing -0.0?
13279 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
13280 }
13281
13282 const MachineFunction &MF = DAG.getMachineFunction();
13283 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13284
13285 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
13286 // handling no dx10-clamp?
13287 if (Info->getMode().DX10Clamp) {
13289 // If NaNs are clamped to 0, we are free to reorder the inputs.
13289
13290 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13291 std::swap(Src0, Src1);
13292
13293 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
13294 std::swap(Src1, Src2);
13295
13296 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13297 std::swap(Src0, Src1);
13298
13299 if (isClampZeroToOne(Src1, Src2))
13300 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
13301 }
13302
13303 return SDValue();
13304}
13305
13306SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
13307 DAGCombinerInfo &DCI) const {
13308 SDValue Src0 = N->getOperand(0);
13309 SDValue Src1 = N->getOperand(1);
13310 if (Src0.isUndef() && Src1.isUndef())
13311 return DCI.DAG.getUNDEF(N->getValueType(0));
13312 return SDValue();
13313}
13314
13315// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
13316// expanded into a set of cmp/select instructions.
13317 static bool shouldExpandVectorDynExt(unsigned EltSize,
13318 unsigned NumElem,
13319 bool IsDivergentIdx,
13320 const GCNSubtarget *Subtarget) {
13321 if (UseDivergentRegisterIndexing)
13322 return false;
13323
13324 unsigned VecSize = EltSize * NumElem;
13325
13326 // Sub-dword vectors of 2 dwords or less have a better implementation.
13327 if (VecSize <= 64 && EltSize < 32)
13328 return false;
13329
13330 // Always expand the rest of the sub-dword instructions, otherwise they will
13331 // be lowered via memory.
13332 if (EltSize < 32)
13333 return true;
13334
13335 // Always do this if var-idx is divergent, otherwise it will become a loop.
13336 if (IsDivergentIdx)
13337 return true;
13338
13339 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
13340 unsigned NumInsts = NumElem /* Number of compares */ +
13341 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
13342
13343 // On some architectures (GFX9) movrel is not available and it's better
13344 // to expand.
13345 if (!Subtarget->hasMovrel())
13346 return NumInsts <= 16;
13347
13348 // If movrel is available, use it instead of expanding for vector of 8
13349 // elements.
13350 return NumInsts <= 15;
13351}
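// Illustrative, standalone sketch (not part of the original lowering code;
// the helper name is ours) of the cost estimate above: expanding a
// variable-index access of an <N x iM> vector needs N compares plus
// ceil(M / 32) * N v_cndmask_b32 selects.
static unsigned sketchExpandCost(unsigned EltSize, unsigned NumElem) {
  return NumElem + ((EltSize + 31) / 32) * NumElem;
}
// e.g. sketchExpandCost(32, 8) == 16, right at the threshold used when movrel
// is not available.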
13352
13353 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
13354 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
13355 if (isa<ConstantSDNode>(Idx))
13356 return false;
13357
13358 SDValue Vec = N->getOperand(0);
13359 EVT VecVT = Vec.getValueType();
13360 EVT EltVT = VecVT.getVectorElementType();
13361 unsigned EltSize = EltVT.getSizeInBits();
13362 unsigned NumElem = VecVT.getVectorNumElements();
13363
13364 return ::shouldExpandVectorDynExt(
13365 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
13366}
13367
13368SDValue SITargetLowering::performExtractVectorEltCombine(
13369 SDNode *N, DAGCombinerInfo &DCI) const {
13370 SDValue Vec = N->getOperand(0);
13371 SelectionDAG &DAG = DCI.DAG;
13372
13373 EVT VecVT = Vec.getValueType();
13374 EVT VecEltVT = VecVT.getVectorElementType();
13375 EVT ResVT = N->getValueType(0);
13376
13377 unsigned VecSize = VecVT.getSizeInBits();
13378 unsigned VecEltSize = VecEltVT.getSizeInBits();
13379
13380 if ((Vec.getOpcode() == ISD::FNEG ||
13381 Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
13382 SDLoc SL(N);
13383 SDValue Idx = N->getOperand(1);
13384 SDValue Elt =
13385 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
13386 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
13387 }
13388
13389 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
13390 // =>
13391 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
13392 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
13393 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
13394 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13395 SDLoc SL(N);
13396 SDValue Idx = N->getOperand(1);
13397 unsigned Opc = Vec.getOpcode();
13398
13399 switch(Opc) {
13400 default:
13401 break;
13402 // TODO: Support other binary operations.
13403 case ISD::FADD:
13404 case ISD::FSUB:
13405 case ISD::FMUL:
13406 case ISD::ADD:
13407 case ISD::UMIN:
13408 case ISD::UMAX:
13409 case ISD::SMIN:
13410 case ISD::SMAX:
13411 case ISD::FMAXNUM:
13412 case ISD::FMINNUM:
13413 case ISD::FMAXNUM_IEEE:
13414 case ISD::FMINNUM_IEEE:
13415 case ISD::FMAXIMUM:
13416 case ISD::FMINIMUM: {
13417 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13418 Vec.getOperand(0), Idx);
13419 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13420 Vec.getOperand(1), Idx);
13421
13422 DCI.AddToWorklist(Elt0.getNode());
13423 DCI.AddToWorklist(Elt1.getNode());
13424 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
13425 }
13426 }
13427 }
13428
13429 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
13430 if (shouldExpandVectorDynExt(N)) {
13431 SDLoc SL(N);
13432 SDValue Idx = N->getOperand(1);
13433 SDValue V;
13434 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13435 SDValue IC = DAG.getVectorIdxConstant(I, SL);
13436 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
13437 if (I == 0)
13438 V = Elt;
13439 else
13440 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
13441 }
13442 return V;
13443 }
13444
13445 if (!DCI.isBeforeLegalize())
13446 return SDValue();
13447
13448 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
13449 // elements. This exposes more load reduction opportunities by replacing
13450 // multiple small extract_vector_elements with a single 32-bit extract.
13451 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
13452 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
13453 VecSize > 32 && VecSize % 32 == 0 && Idx) {
13454 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
13455
13456 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
13457 unsigned EltIdx = BitIndex / 32;
13458 unsigned LeftoverBitIdx = BitIndex % 32;
13459 SDLoc SL(N);
13460
13461 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
13462 DCI.AddToWorklist(Cast.getNode());
13463
13464 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
13465 DAG.getConstant(EltIdx, SL, MVT::i32));
13466 DCI.AddToWorklist(Elt.getNode());
13467 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
13468 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
13469 DCI.AddToWorklist(Srl.getNode());
13470
13471 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
13472 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
13473 DCI.AddToWorklist(Trunc.getNode());
13474
13475 if (VecEltVT == ResVT) {
13476 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
13477 }
13478
13479 assert(ResVT.isScalarInteger());
13480 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
13481 }
13482
13483 return SDValue();
13484}
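// For illustration (hypothetical types): extracting element 5 of a loaded
// v8i8 gives BitIndex = 40, so EltIdx = 1 and LeftoverBitIdx = 8. The vector
// is bitcast to v2i32, element 1 is extracted as i32, shifted right by 8 and
// truncated to i8, letting several byte extracts share one 32-bit extract of
// the same load.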
13485
13486SDValue
13487SITargetLowering::performInsertVectorEltCombine(SDNode *N,
13488 DAGCombinerInfo &DCI) const {
13489 SDValue Vec = N->getOperand(0);
13490 SDValue Idx = N->getOperand(2);
13491 EVT VecVT = Vec.getValueType();
13492 EVT EltVT = VecVT.getVectorElementType();
13493
13494 // INSERT_VECTOR_ELT (<n x e>, var-idx)
13495 // => BUILD_VECTOR n x select (e, const-idx)
13496 if (!shouldExpandVectorDynExt(N))
13497 return SDValue();
13498
13499 SelectionDAG &DAG = DCI.DAG;
13500 SDLoc SL(N);
13501 SDValue Ins = N->getOperand(1);
13502 EVT IdxVT = Idx.getValueType();
13503
13504 SmallVector<SDValue, 16> Ops;
13505 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13506 SDValue IC = DAG.getConstant(I, SL, IdxVT);
13507 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
13508 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
13509 Ops.push_back(V);
13510 }
13511
13512 return DAG.getBuildVector(VecVT, SL, Ops);
13513}
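// For illustration (hypothetical v4i32 insert with variable index Idx), the
// expansion above produces
//   Ops[I] = select (setcc Idx, I, eq), Ins, (extract_vector_elt Vec, I)
// for I = 0..3 and then build_vector Ops[0..3], so the variable insert
// becomes four compare/select pairs instead of going through memory.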
13514
13515/// Return the source of an fp_extend from f16 to f32, or a converted FP
13516/// constant.
13517 static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
13518 if (Src.getOpcode() == ISD::FP_EXTEND &&
13519 Src.getOperand(0).getValueType() == MVT::f16) {
13520 return Src.getOperand(0);
13521 }
13522
13523 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13524 APFloat Val = CFP->getValueAPF();
13525 bool LosesInfo = true;
13526 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
13527 if (!LosesInfo)
13528 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
13529 }
13530
13531 return SDValue();
13532}
13533
13534SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
13535 DAGCombinerInfo &DCI) const {
13536 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
13537 "combine only useful on gfx8");
13538
13539 SDValue TruncSrc = N->getOperand(0);
13540 EVT VT = N->getValueType(0);
13541 if (VT != MVT::f16)
13542 return SDValue();
13543
13544 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
13545 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
13546 return SDValue();
13547
13548 SelectionDAG &DAG = DCI.DAG;
13549 SDLoc SL(N);
13550
13551 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
13552 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
13553 // casting back.
13554
13555 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
13556 // fmin(fmax(a, b), fmax(fmin(a, b), c))
13557 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
13558 if (!A)
13559 return SDValue();
13560
13561 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
13562 if (!B)
13563 return SDValue();
13564
13565 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
13566 if (!C)
13567 return SDValue();
13568
13569 // This changes signaling nan behavior. If an input is a signaling nan, it
13570 // would have been quieted by the fpext originally. We don't care because
13571 // these are unconstrained ops. If we needed to insert quieting canonicalizes
13572 // we would be worse off than just doing the promotion.
13573 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
13574 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
13575 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
13576 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
13577}
13578
13579unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
13580 const SDNode *N0,
13581 const SDNode *N1) const {
13582 EVT VT = N0->getValueType(0);
13583
13584 // Only do this if we are not trying to support denormals. v_mad_f32 does not
13585 // support denormals ever.
13586 if (((VT == MVT::f32 &&
13587 denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
13588 (VT == MVT::f16 && Subtarget->hasMadF16() &&
13589 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
13590 isOperationLegal(ISD::FMAD, VT))
13591 return ISD::FMAD;
13592
13593 const TargetOptions &Options = DAG.getTarget().Options;
13594 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
13595 (N0->getFlags().hasAllowContract() &&
13596 N1->getFlags().hasAllowContract())) &&
13597 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
13598 return ISD::FMA;
13599 }
13600
13601 return 0;
13602}
13603
13604// For a reassociatable opcode perform:
13605// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
13606SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
13607 SelectionDAG &DAG) const {
13608 EVT VT = N->getValueType(0);
13609 if (VT != MVT::i32 && VT != MVT::i64)
13610 return SDValue();
13611
13612 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
13613 return SDValue();
13614
13615 unsigned Opc = N->getOpcode();
13616 SDValue Op0 = N->getOperand(0);
13617 SDValue Op1 = N->getOperand(1);
13618
13619 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
13620 return SDValue();
13621
13622 if (Op0->isDivergent())
13623 std::swap(Op0, Op1);
13624
13625 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
13626 return SDValue();
13627
13628 SDValue Op2 = Op1.getOperand(1);
13629 Op1 = Op1.getOperand(0);
13630 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
13631 return SDValue();
13632
13633 if (Op1->isDivergent())
13634 std::swap(Op1, Op2);
13635
13636 SDLoc SL(N);
13637 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
13638 return DAG.getNode(Opc, SL, VT, Add1, Op2);
13639}
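// For illustration: with x and z uniform and y divergent,
//   add x, (add y, z)  -->  add (add x, z), y
// lets the inner add of the two uniform values be selected as a scalar op
// (e.g. s_add_u32), leaving only one VALU add for the divergent operand.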
13640
13641static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
13642 EVT VT,
13643 SDValue N0, SDValue N1, SDValue N2,
13644 bool Signed) {
13645 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
13646 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
13647 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
13648 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
13649}
13650
13651// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
13652// multiplies, if any.
13653//
13654// Full 64-bit multiplies that feed into an addition are lowered here instead
13655// of using the generic expansion. The generic expansion ends up with
13656// a tree of ADD nodes that prevents us from using the "add" part of the
13657// MAD instruction. The expansion produced here results in a chain of ADDs
13658// instead of a tree.
13659SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
13660 DAGCombinerInfo &DCI) const {
13661 assert(N->getOpcode() == ISD::ADD);
13662
13663 SelectionDAG &DAG = DCI.DAG;
13664 EVT VT = N->getValueType(0);
13665 SDLoc SL(N);
13666 SDValue LHS = N->getOperand(0);
13667 SDValue RHS = N->getOperand(1);
13668
13669 if (VT.isVector())
13670 return SDValue();
13671
13672 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
13673 // result in scalar registers for uniform values.
13674 if (!N->isDivergent() && Subtarget->hasSMulHi())
13675 return SDValue();
13676
13677 unsigned NumBits = VT.getScalarSizeInBits();
13678 if (NumBits <= 32 || NumBits > 64)
13679 return SDValue();
13680
13681 if (LHS.getOpcode() != ISD::MUL) {
13682 assert(RHS.getOpcode() == ISD::MUL);
13683 std::swap(LHS, RHS);
13684 }
13685
13686 // Avoid the fold if it would unduly increase the number of multiplies due to
13687 // multiple uses, except on hardware with full-rate multiply-add (which is
13688 // part of full-rate 64-bit ops).
13689 if (!Subtarget->hasFullRate64Ops()) {
13690 unsigned NumUsers = 0;
13691 for (SDNode *Use : LHS->uses()) {
13692 // There is a use that does not feed into addition, so the multiply can't
13693 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
13694 if (Use->getOpcode() != ISD::ADD)
13695 return SDValue();
13696
13697 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
13698 // MUL + 3xADD + 3xADDC over 3xMAD.
13699 ++NumUsers;
13700 if (NumUsers >= 3)
13701 return SDValue();
13702 }
13703 }
13704
13705 SDValue MulLHS = LHS.getOperand(0);
13706 SDValue MulRHS = LHS.getOperand(1);
13707 SDValue AddRHS = RHS;
13708
13709 // Always check whether operands are small unsigned values, since that
13710 // knowledge is useful in more cases. Check for small signed values only if
13711 // doing so can unlock a shorter code sequence.
13712 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
13713 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
13714
13715 bool MulSignedLo = false;
13716 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
13717 MulSignedLo = numBitsSigned(MulLHS, DAG) <= 32 &&
13718 numBitsSigned(MulRHS, DAG) <= 32;
13719 }
13720
13721 // The operands and final result all have the same number of bits. If
13722 // operands need to be extended, they can be extended with garbage. The
13723 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
13724 // truncated away in the end.
13725 if (VT != MVT::i64) {
13726 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
13727 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
13728 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
13729 }
13730
13731 // The basic code generated is conceptually straightforward. Pseudo code:
13732 //
13733 // accum = mad_64_32 lhs.lo, rhs.lo, accum
13734 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
13735 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
13736 //
13737 // The second and third lines are optional, depending on whether the factors
13738 // are {sign,zero}-extended or not.
13739 //
13740 // The actual DAG is noisier than the pseudo code, but only due to
13741 // instructions that disassemble values into low and high parts, and
13742 // assemble the final result.
13743 SDValue One = DAG.getConstant(1, SL, MVT::i32);
13744
13745 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
13746 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
13747 SDValue Accum =
13748 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
13749
13750 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
13751 SDValue AccumLo, AccumHi;
13752 std::tie(AccumLo, AccumHi) = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
13753
13754 if (!MulLHSUnsigned32) {
13755 auto MulLHSHi =
13756 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
13757 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
13758 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13759 }
13760
13761 if (!MulRHSUnsigned32) {
13762 auto MulRHSHi =
13763 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
13764 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
13765 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13766 }
13767
13768 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
13769 Accum = DAG.getBitcast(MVT::i64, Accum);
13770 }
13771
13772 if (VT != MVT::i64)
13773 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
13774 return Accum;
13775}
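// For illustration (hypothetical i48 case): add (mul i48 a, b), c any-extends
// the operands to i64, feeds the low 32-bit halves through mad_u64_u32 (or
// mad_i64_i32), adds the optional high-part multiplies into accum.hi as in
// the pseudo code above, and truncates back to i48; the garbage in the
// extended high bits never survives the truncation.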
13776
13777 // Collect the ultimate src of each of the mul node's operands, and confirm
13778 // each operand is at most 8 bits.
13779static std::optional<ByteProvider<SDValue>>
13780handleMulOperand(const SDValue &MulOperand) {
13781 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
13782 if (!Byte0 || Byte0->isConstantZero()) {
13783 return std::nullopt;
13784 }
13785 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
13786 if (Byte1 && !Byte1->isConstantZero()) {
13787 return std::nullopt;
13788 }
13789 return Byte0;
13790}
13791
13792static unsigned addPermMasks(unsigned First, unsigned Second) {
13793 unsigned FirstCs = First & 0x0c0c0c0c;
13794 unsigned SecondCs = Second & 0x0c0c0c0c;
13795 unsigned FirstNoCs = First & ~0x0c0c0c0c;
13796 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
13797
13798 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
13799 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
13800 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
13801 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
13802
13803 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
13804}
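// Worked example for illustration (hypothetical masks): First = 0x0c0c0c02
// routes source byte 2 to result byte 0 and zeros the rest, while Second =
// 0x0c0c050c routes byte 5 to result byte 1. Here FirstNoCs | SecondNoCs ==
// 0x00000102 and FirstCs & SecondCs == 0x0c0c0400, so the merged mask is
// 0x0c0c0502: both selectors are kept and result bytes 2 and 3 stay zero.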
13805
13806 struct DotSrc {
13807 SDValue SrcOp;
13808 int64_t PermMask;
13809 int64_t DWordOffset;
13810 };
13811
13812 static void placeSources(ByteProvider<SDValue> &Src0,
13813 ByteProvider<SDValue> &Src1,
13814 SmallVectorImpl<DotSrc> &Src0s,
13815 SmallVectorImpl<DotSrc> &Src1s, int Step) {
13816
13817 assert(Src0.Src.has_value() && Src1.Src.has_value());
13818 // Src0s and Src1s are empty, just place arbitrarily.
13819 if (Step == 0) {
13820 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
13821 Src0.SrcOffset / 4});
13822 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
13823 Src1.SrcOffset / 4});
13824 return;
13825 }
13826
13827 for (int BPI = 0; BPI < 2; BPI++) {
13828 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
13829 if (BPI == 1) {
13830 BPP = {Src1, Src0};
13831 }
13832 unsigned ZeroMask = 0x0c0c0c0c;
13833 unsigned FMask = 0xFF << (8 * (3 - Step));
13834
13835 unsigned FirstMask =
13836 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13837 unsigned SecondMask =
13838 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13839 // Attempt to find Src vector which contains our SDValue, if so, add our
13840 // perm mask to the existing one. If we are unable to find a match for the
13841 // first SDValue, attempt to find a match for the second.
13842 int FirstGroup = -1;
13843 for (int I = 0; I < 2; I++) {
13844 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
13845 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
13846 return IterElt.SrcOp == *BPP.first.Src &&
13847 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
13848 };
13849
13850 auto Match = llvm::find_if(Srcs, MatchesFirst);
13851 if (Match != Srcs.end()) {
13852 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
13853 FirstGroup = I;
13854 break;
13855 }
13856 }
13857 if (FirstGroup != -1) {
13858 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
13859 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
13860 return IterElt.SrcOp == *BPP.second.Src &&
13861 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
13862 };
13863 auto Match = llvm::find_if(Srcs, MatchesSecond);
13864 if (Match != Srcs.end()) {
13865 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
13866 } else
13867 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
13868 return;
13869 }
13870 }
13871
13872 // If we have made it here, then we could not find a match in Src0s or Src1s
13873 // for either Src0 or Src1, so just place them arbitrarily.
13874
13875 unsigned ZeroMask = 0x0c0c0c0c;
13876 unsigned FMask = 0xFF << (8 * (3 - Step));
13877
13878 Src0s.push_back(
13879 {*Src0.Src,
13880 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13881 Src0.SrcOffset / 4});
13882 Src1s.push_back(
13883 {*Src1.Src,
13884 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13885 Src1.SrcOffset / 4});
13886
13887 return;
13888}
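// For illustration (hypothetical offsets): at Step == 1 the selected byte
// lands in result byte 2, so SrcOffset == 6 yields the mask
// (6 % 4) << 16 | 0x0c000c0c == 0x0c020c0c, i.e. byte 2 of the source dword
// (offset 6 is dword 1, byte 2) feeds result byte 2 while the other result
// bytes read as constant zero.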
13889
13890 static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
13891 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
13892 bool IsAny) {
13893
13894 // If we have just one source, permute it accordingly.
13895 if (Srcs.size() == 1) {
13896 auto Elt = Srcs.begin();
13897 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
13898
13899 // v_perm will produce the original value
13900 if (Elt->PermMask == 0x3020100)
13901 return EltOp;
13902
13903 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
13904 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
13905 }
13906
13907 auto FirstElt = Srcs.begin();
13908 auto SecondElt = std::next(FirstElt);
13909
13910 SmallVector<SDValue, 2> Perms;
13911
13912 // If we have multiple sources in the chain, combine them via perms (using
13913 // calculated perm mask) and Ors.
13914 while (true) {
13915 auto FirstMask = FirstElt->PermMask;
13916 auto SecondMask = SecondElt->PermMask;
13917
13918 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
13919 unsigned FirstPlusFour = FirstMask | 0x04040404;
13920 // 0x0c + 0x04 = 0x10, so ANDing with 0x0F will produce 0x00 for any
13921 // original 0x0c.
13922 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
13923
13924 auto PermMask = addPermMasks(FirstMask, SecondMask);
13925 auto FirstVal =
13926 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
13927 auto SecondVal =
13928 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
13929
13930 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
13931 SecondVal,
13932 DAG.getConstant(PermMask, SL, MVT::i32)));
13933
13934 FirstElt = std::next(SecondElt);
13935 if (FirstElt == Srcs.end())
13936 break;
13937
13938 SecondElt = std::next(FirstElt);
13939 // If we only have a FirstElt, then just combine that into the cumulative
13940 // source node.
13941 if (SecondElt == Srcs.end()) {
13942 auto EltOp =
13943 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
13944
13945 Perms.push_back(
13946 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
13947 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
13948 break;
13949 }
13950 }
13951
13952 assert(Perms.size() == 1 || Perms.size() == 2);
13953 return Perms.size() == 2
13954 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
13955 : Perms[0];
13956}
13957
13958static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
13959 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
13960 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
13961 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
13962 EntryMask += ZeroMask;
13963 }
13964}
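// Worked example for illustration (hypothetical mask): with ChainLength == 2,
// a mask built for a length-4 chain such as 0x01000c0c is shifted right by
// 16 to 0x00000100 and then 0x0c0c0000 is added, giving 0x0c0c0100: the two
// real selectors move into the low bytes and the now-unused high bytes
// become the constant-zero selector 0x0c.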
13965
13966static bool isMul(const SDValue Op) {
13967 auto Opcode = Op.getOpcode();
13968
13969 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
13970 Opcode == AMDGPUISD::MUL_I24);
13971}
13972
13973static std::optional<bool>
13974 checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
13975 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
13976 const SDValue &S1Op, const SelectionDAG &DAG) {
13977 // If both ops are i8s (pre legalize-dag), then the signedness semantics
13978 // of the dot4 are irrelevant.
13979 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
13980 return false;
13981
13982 auto Known0 = DAG.computeKnownBits(S0Op, 0);
13983 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
13984 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
13985 auto Known1 = DAG.computeKnownBits(S1Op, 0);
13986 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
13987 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
13988
13989 assert(!(S0IsUnsigned && S0IsSigned));
13990 assert(!(S1IsUnsigned && S1IsSigned));
13991
13992 // There are 9 possible permutations of
13993 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
13994
13995 // In two permutations, the sign bits are known to be the same for both Ops,
13996 // so simply return Signed / Unsigned corresponding to the MSB
13997
13998 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
13999 return S0IsSigned;
14000
14001 // In another two permutations, the sign bits are known to be opposite. In
14002 // this case return std::nullopt to indicate a bad match.
14003
14004 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14005 return std::nullopt;
14006
14007 // In the remaining five permutations, we don't know the value of the sign
14008 // bit for at least one Op. Since we have a valid ByteProvider, we know that
14009 // the upper bits must be extension bits. Thus, the only way for the sign
14010 // bit to be unknown is if it was sign extended from an unknown value, or if
14011 // it was any extended. In either case, it is correct to use the signed
14012 // version of the signedness semantics of dot4.
14013
14014 // In two such permutations, we know the sign bit is set for
14015 // one op, and unknown for the other. It is okay to use the signed version
14016 // of dot4.
14017 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14018 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14019 return true;
14020
14021 // In one such permutation, we don't know either of the sign bits. It is okay
14022 // to use the signed version of dot4.
14023 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14024 return true;
14025
14026 // In two such permutations, we know the sign bit is unset for
14027 // one op, and unknown for the other. Return std::nullopt to indicate a
14028 // bad match.
14029 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14030 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14031 return std::nullopt;
14032
14033 llvm_unreachable("Fully covered condition");
14034}
14035
14036SDValue SITargetLowering::performAddCombine(SDNode *N,
14037 DAGCombinerInfo &DCI) const {
14038 SelectionDAG &DAG = DCI.DAG;
14039 EVT VT = N->getValueType(0);
14040 SDLoc SL(N);
14041 SDValue LHS = N->getOperand(0);
14042 SDValue RHS = N->getOperand(1);
14043
14044 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
14045 if (Subtarget->hasMad64_32()) {
14046 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14047 return Folded;
14048 }
14049 }
14050
14051 if (SDValue V = reassociateScalarOps(N, DAG)) {
14052 return V;
14053 }
14054
14055 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
14056 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
14057 SDValue TempNode(N, 0);
14058 std::optional<bool> IsSigned;
14059 SmallVector<DotSrc, 4> Src0s;
14060 SmallVector<DotSrc, 4> Src1s;
14061 SmallVector<SDValue, 4> Src2s;
14062
14063 // Match the v_dot4 tree, while collecting src nodes.
14064 int ChainLength = 0;
14065 for (int I = 0; I < 4; I++) {
14066 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
14067 if (MulIdx == -1)
14068 break;
14069 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14070 if (!Src0)
14071 break;
14072 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14073 if (!Src1)
14074 break;
14075
14076 auto IterIsSigned = checkDot4MulSignedness(
14077 TempNode->getOperand(MulIdx), *Src0, *Src1,
14078 TempNode->getOperand(MulIdx)->getOperand(0),
14079 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14080 if (!IterIsSigned)
14081 break;
14082 if (!IsSigned)
14083 IsSigned = *IterIsSigned;
14084 if (*IterIsSigned != *IsSigned)
14085 break;
14086 placeSources(*Src0, *Src1, Src0s, Src1s, I);
14087 auto AddIdx = 1 - MulIdx;
14088 // Allow the special case where add (add (mul24, 0), mul24) became
14089 // add (mul24, mul24).
14090 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14091 Src2s.push_back(TempNode->getOperand(AddIdx));
14092 auto Src0 =
14093 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
14094 if (!Src0)
14095 break;
14096 auto Src1 =
14097 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
14098 if (!Src1)
14099 break;
14100 auto IterIsSigned = checkDot4MulSignedness(
14101 TempNode->getOperand(AddIdx), *Src0, *Src1,
14102 TempNode->getOperand(AddIdx)->getOperand(0),
14103 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14104 if (!IterIsSigned)
14105 break;
14106 assert(IsSigned);
14107 if (*IterIsSigned != *IsSigned)
14108 break;
14109 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
14110 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
14111 ChainLength = I + 2;
14112 break;
14113 }
14114
14115 TempNode = TempNode->getOperand(AddIdx);
14116 Src2s.push_back(TempNode);
14117 ChainLength = I + 1;
14118 if (TempNode->getNumOperands() < 2)
14119 break;
14120 LHS = TempNode->getOperand(0);
14121 RHS = TempNode->getOperand(1);
14122 }
14123
14124 if (ChainLength < 2)
14125 return SDValue();
14126
14127 // Masks were constructed with the assumption that we would find a chain of
14128 // length 4. If not, then we need to zero out the unused high bytes (via a
14129 // perm mask byte of 0x0c) so they do not affect the dot calculation.
14130 if (ChainLength < 4) {
14131 fixMasks(Src0s, ChainLength);
14132 fixMasks(Src1s, ChainLength);
14133 }
14134
14135 SDValue Src0, Src1;
14136
14137 // If we are just using a single source for both, and have permuted the
14138 // bytes consistently, we can just use the sources without permuting
14139 // (commutation).
14140 bool UseOriginalSrc = false;
14141 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
14142 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14143 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14144 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
14145 SmallVector<unsigned, 4> SrcBytes;
14146 auto Src0Mask = Src0s.begin()->PermMask;
14147 SrcBytes.push_back(Src0Mask & 0xFF000000);
14148 bool UniqueEntries = true;
14149 for (auto I = 1; I < 4; I++) {
14150 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14151
14152 if (is_contained(SrcBytes, NextByte)) {
14153 UniqueEntries = false;
14154 break;
14155 }
14156 SrcBytes.push_back(NextByte);
14157 }
14158
14159 if (UniqueEntries) {
14160 UseOriginalSrc = true;
14161
14162 auto FirstElt = Src0s.begin();
14163 auto FirstEltOp =
14164 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14165
14166 auto SecondElt = Src1s.begin();
14167 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
14168 SecondElt->DWordOffset);
14169
14170 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
14171 MVT::getIntegerVT(32));
14172 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
14173 MVT::getIntegerVT(32));
14174 }
14175 }
14176
14177 if (!UseOriginalSrc) {
14178 Src0 = resolveSources(DAG, SL, Src0s, false, true);
14179 Src1 = resolveSources(DAG, SL, Src1s, false, true);
14180 }
14181
14182 assert(IsSigned);
14183 SDValue Src2 =
14184 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14185
14186 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
14187 : Intrinsic::amdgcn_udot4,
14188 SL, MVT::i64);
14189
14190 assert(!VT.isVector());
14191 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
14192 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
14193
14194 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
14195 }
14196
14197 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14198 return SDValue();
14199
14200 // add x, zext (setcc) => uaddo_carry x, 0, setcc
14201 // add x, sext (setcc) => usubo_carry x, 0, setcc
14202 unsigned Opc = LHS.getOpcode();
14203 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
14204 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
14205 std::swap(RHS, LHS);
14206
14207 Opc = RHS.getOpcode();
14208 switch (Opc) {
14209 default: break;
14210 case ISD::ZERO_EXTEND:
14211 case ISD::SIGN_EXTEND:
14212 case ISD::ANY_EXTEND: {
14213 auto Cond = RHS.getOperand(0);
14214 // If this won't be a real VOPC output, we would still need to insert an
14215 // extra instruction anyway.
14216 if (!isBoolSGPR(Cond))
14217 break;
14218 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14219 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
14220 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
14221 return DAG.getNode(Opc, SL, VTList, Args);
14222 }
14223 case ISD::UADDO_CARRY: {
14224 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
14225 if (!isNullConstant(RHS.getOperand(1)))
14226 break;
14227 SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
14228 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
14229 }
14230 }
14231 return SDValue();
14232}
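// For illustration: with i32 operands,
//   add x, (zext (setcc a, b, cc))  -->  uaddo_carry x, 0, (setcc a, b, cc)
// keeps the boolean in a condition register, so the addition can be selected
// as the carry-in form of the VALU add instead of first materializing 0 or 1
// in a VGPR.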
14233
14234SDValue SITargetLowering::performSubCombine(SDNode *N,
14235 DAGCombinerInfo &DCI) const {
14236 SelectionDAG &DAG = DCI.DAG;
14237 EVT VT = N->getValueType(0);
14238
14239 if (VT != MVT::i32)
14240 return SDValue();
14241
14242 SDLoc SL(N);
14243 SDValue LHS = N->getOperand(0);
14244 SDValue RHS = N->getOperand(1);
14245
14246 // sub x, zext (setcc) => usubo_carry x, 0, setcc
14247 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
14248 unsigned Opc = RHS.getOpcode();
14249 switch (Opc) {
14250 default: break;
14251 case ISD::ZERO_EXTEND:
14252 case ISD::SIGN_EXTEND:
14253 case ISD::ANY_EXTEND: {
14254 auto Cond = RHS.getOperand(0);
14255 // If this won't be a real VOPC output, we would still need to insert an
14256 // extra instruction anyway.
14257 if (!isBoolSGPR(Cond))
14258 break;
14259 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14260 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
14261 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
14262 return DAG.getNode(Opc, SL, VTList, Args);
14263 }
14264 }
14265
14266 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
14267 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
14268 if (!isNullConstant(LHS.getOperand(1)))
14269 return SDValue();
14270 SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
14271 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
14272 }
14273 return SDValue();
14274}
14275
14276SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
14277 DAGCombinerInfo &DCI) const {
14278
14279 if (N->getValueType(0) != MVT::i32)
14280 return SDValue();
14281
14282 if (!isNullConstant(N->getOperand(1)))
14283 return SDValue();
14284
14285 SelectionDAG &DAG = DCI.DAG;
14286 SDValue LHS = N->getOperand(0);
14287
14288 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
14289 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
14290 unsigned LHSOpc = LHS.getOpcode();
14291 unsigned Opc = N->getOpcode();
14292 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
14293 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
14294 SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
14295 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
14296 }
14297 return SDValue();
14298}
14299
14300SDValue SITargetLowering::performFAddCombine(SDNode *N,
14301 DAGCombinerInfo &DCI) const {
14302 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14303 return SDValue();
14304
14305 SelectionDAG &DAG = DCI.DAG;
14306 EVT VT = N->getValueType(0);
14307
14308 SDLoc SL(N);
14309 SDValue LHS = N->getOperand(0);
14310 SDValue RHS = N->getOperand(1);
14311
14312 // These should really be instruction patterns, but writing patterns with
14313 // source modifiers is a pain.
14314
14315 // fadd (fadd (a, a), b) -> mad 2.0, a, b
14316 if (LHS.getOpcode() == ISD::FADD) {
14317 SDValue A = LHS.getOperand(0);
14318 if (A == LHS.getOperand(1)) {
14319 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14320 if (FusedOp != 0) {
14321 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14322 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
14323 }
14324 }
14325 }
14326
14327 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
14328 if (RHS.getOpcode() == ISD::FADD) {
14329 SDValue A = RHS.getOperand(0);
14330 if (A == RHS.getOperand(1)) {
14331 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14332 if (FusedOp != 0) {
14333 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14334 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
14335 }
14336 }
14337 }
14338
14339 return SDValue();
14340}
14341
14342SDValue SITargetLowering::performFSubCombine(SDNode *N,
14343 DAGCombinerInfo &DCI) const {
14344 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14345 return SDValue();
14346
14347 SelectionDAG &DAG = DCI.DAG;
14348 SDLoc SL(N);
14349 EVT VT = N->getValueType(0);
14350 assert(!VT.isVector());
14351
14352 // Try to get the fneg to fold into the source modifier. This undoes generic
14353 // DAG combines and folds them into the mad.
14354 //
14355 // Only do this if we are not trying to support denormals. v_mad_f32 does
14356 // not support denormals ever.
14357 SDValue LHS = N->getOperand(0);
14358 SDValue RHS = N->getOperand(1);
14359 if (LHS.getOpcode() == ISD::FADD) {
14360 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
14361 SDValue A = LHS.getOperand(0);
14362 if (A == LHS.getOperand(1)) {
14363 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14364 if (FusedOp != 0){
14365 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14366 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
14367
14368 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
14369 }
14370 }
14371 }
14372
14373 if (RHS.getOpcode() == ISD::FADD) {
14374 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
14375
14376 SDValue A = RHS.getOperand(0);
14377 if (A == RHS.getOperand(1)) {
14378 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14379 if (FusedOp != 0){
14380 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
14381 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
14382 }
14383 }
14384 }
14385
14386 return SDValue();
14387}
14388
14389SDValue SITargetLowering::performFDivCombine(SDNode *N,
14390 DAGCombinerInfo &DCI) const {
14391 SelectionDAG &DAG = DCI.DAG;
14392 SDLoc SL(N);
14393 EVT VT = N->getValueType(0);
14394 if (VT != MVT::f16 || !Subtarget->has16BitInsts())
14395 return SDValue();
14396
14397 SDValue LHS = N->getOperand(0);
14398 SDValue RHS = N->getOperand(1);
14399
14400 SDNodeFlags Flags = N->getFlags();
14401 SDNodeFlags RHSFlags = RHS->getFlags();
14402 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
14403 !RHS->hasOneUse())
14404 return SDValue();
14405
14406 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
14407 bool IsNegative = false;
14408 if (CLHS->isExactlyValue(1.0) ||
14409 (IsNegative = CLHS->isExactlyValue(-1.0))) {
14410 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
14411 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
14412 if (RHS.getOpcode() == ISD::FSQRT) {
14413 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
14414 SDValue Rsq =
14415 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
14416 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
14417 }
14418 }
14419 }
14420
14421 return SDValue();
14422}
14423
14424SDValue SITargetLowering::performFMACombine(SDNode *N,
14425 DAGCombinerInfo &DCI) const {
14426 SelectionDAG &DAG = DCI.DAG;
14427 EVT VT = N->getValueType(0);
14428 SDLoc SL(N);
14429
14430 if (!Subtarget->hasDot7Insts() || VT != MVT::f32)
14431 return SDValue();
14432
14433 // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
14434 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z)
14435 SDValue Op1 = N->getOperand(0);
14436 SDValue Op2 = N->getOperand(1);
14437 SDValue FMA = N->getOperand(2);
14438
14439 if (FMA.getOpcode() != ISD::FMA ||
14440 Op1.getOpcode() != ISD::FP_EXTEND ||
14441 Op2.getOpcode() != ISD::FP_EXTEND)
14442 return SDValue();
14443
14444 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
14445 // regardless of the denorm mode setting. Therefore,
14446 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
14447 const TargetOptions &Options = DAG.getTarget().Options;
14448 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
14449 (N->getFlags().hasAllowContract() &&
14450 FMA->getFlags().hasAllowContract())) {
14451 Op1 = Op1.getOperand(0);
14452 Op2 = Op2.getOperand(0);
14453 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14454 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14455 return SDValue();
14456
14457 SDValue Vec1 = Op1.getOperand(0);
14458 SDValue Idx1 = Op1.getOperand(1);
14459 SDValue Vec2 = Op2.getOperand(0);
14460
14461 SDValue FMAOp1 = FMA.getOperand(0);
14462 SDValue FMAOp2 = FMA.getOperand(1);
14463 SDValue FMAAcc = FMA.getOperand(2);
14464
14465 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
14466 FMAOp2.getOpcode() != ISD::FP_EXTEND)
14467 return SDValue();
14468
14469 FMAOp1 = FMAOp1.getOperand(0);
14470 FMAOp2 = FMAOp2.getOperand(0);
14471 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14472 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14473 return SDValue();
14474
14475 SDValue Vec3 = FMAOp1.getOperand(0);
14476 SDValue Vec4 = FMAOp2.getOperand(0);
14477 SDValue Idx2 = FMAOp1.getOperand(1);
14478
14479 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
14480 // Idx1 and Idx2 cannot be the same.
14481 Idx1 == Idx2)
14482 return SDValue();
14483
14484 if (Vec1 == Vec2 || Vec3 == Vec4)
14485 return SDValue();
14486
14487 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
14488 return SDValue();
14489
14490 if ((Vec1 == Vec3 && Vec2 == Vec4) ||
14491 (Vec1 == Vec4 && Vec2 == Vec3)) {
14492 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
14493 DAG.getTargetConstant(0, SL, MVT::i1));
14494 }
14495 }
14496 return SDValue();
14497}
14498
14499SDValue SITargetLowering::performSetCCCombine(SDNode *N,
14500 DAGCombinerInfo &DCI) const {
14501 SelectionDAG &DAG = DCI.DAG;
14502 SDLoc SL(N);
14503
14504 SDValue LHS = N->getOperand(0);
14505 SDValue RHS = N->getOperand(1);
14506 EVT VT = LHS.getValueType();
14507 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
14508
14509 auto CRHS = dyn_cast<ConstantSDNode>(RHS);
14510 if (!CRHS) {
14511 CRHS = dyn_cast<ConstantSDNode>(LHS);
14512 if (CRHS) {
14513 std::swap(LHS, RHS);
14514 CC = getSetCCSwappedOperands(CC);
14515 }
14516 }
14517
14518 if (CRHS) {
14519 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
14520 isBoolSGPR(LHS.getOperand(0))) {
14521 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
14522 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
14523 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
14524 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
14525 if ((CRHS->isAllOnes() &&
14526 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
14527 (CRHS->isZero() &&
14528 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
14529 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14530 DAG.getConstant(-1, SL, MVT::i1));
14531 if ((CRHS->isAllOnes() &&
14532 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
14533 (CRHS->isZero() &&
14534 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
14535 return LHS.getOperand(0);
14536 }
14537
14538 const APInt &CRHSVal = CRHS->getAPIntValue();
14539 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
14540 LHS.getOpcode() == ISD::SELECT &&
14541 isa<ConstantSDNode>(LHS.getOperand(1)) &&
14542 isa<ConstantSDNode>(LHS.getOperand(2)) &&
14543 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
14544 isBoolSGPR(LHS.getOperand(0))) {
14545 // Given CT != FT:
14546 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
14547 // setcc (select cc, CT, CF), CF, ne => cc
14548 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
14549 // setcc (select cc, CT, CF), CT, eq => cc
14550 const APInt &CT = LHS.getConstantOperandAPInt(1);
14551 const APInt &CF = LHS.getConstantOperandAPInt(2);
14552
14553 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
14554 (CT == CRHSVal && CC == ISD::SETNE))
14555 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14556 DAG.getConstant(-1, SL, MVT::i1));
14557 if ((CF == CRHSVal && CC == ISD::SETNE) ||
14558 (CT == CRHSVal && CC == ISD::SETEQ))
14559 return LHS.getOperand(0);
14560 }
14561 }
14562
14563 if (VT != MVT::f32 && VT != MVT::f64 &&
14564 (!Subtarget->has16BitInsts() || VT != MVT::f16))
14565 return SDValue();
14566
14567 // Match isinf/isfinite pattern
14568 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
14569 // (fcmp one (fabs x), inf) -> (fp_class x,
14570 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
14571 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
14572 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
14573 if (!CRHS)
14574 return SDValue();
14575
14576 const APFloat &APF = CRHS->getValueAPF();
14577 if (APF.isInfinity() && !APF.isNegative()) {
14578 const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
14579 SIInstrFlags::N_INFINITY;
14580 const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
14581 SIInstrFlags::P_ZERO |
14582 SIInstrFlags::N_NORMAL |
14583 SIInstrFlags::P_NORMAL |
14584 SIInstrFlags::N_SUBNORMAL |
14585 SIInstrFlags::P_SUBNORMAL;
14586 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
14587 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
14588 DAG.getConstant(Mask, SL, MVT::i32));
14589 }
14590 }
14591
14592 return SDValue();
14593}
14594
14595SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
14596 DAGCombinerInfo &DCI) const {
14597 SelectionDAG &DAG = DCI.DAG;
14598 SDLoc SL(N);
14599 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
14600
14601 SDValue Src = N->getOperand(0);
14602 SDValue Shift = N->getOperand(0);
14603
14604 // TODO: Extend type shouldn't matter (assuming legal types).
14605 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
14606 Shift = Shift.getOperand(0);
14607
14608 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
14609 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
14610 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
14611 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
14612 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
14613 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
14614 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
14615 SDValue Shifted = DAG.getZExtOrTrunc(Shift.getOperand(0),
14616 SDLoc(Shift.getOperand(0)), MVT::i32);
14617
14618 unsigned ShiftOffset = 8 * Offset;
14619 if (Shift.getOpcode() == ISD::SHL)
14620 ShiftOffset -= C->getZExtValue();
14621 else
14622 ShiftOffset += C->getZExtValue();
14623
14624 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
14625 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
14626 MVT::f32, Shifted);
14627 }
14628 }
14629 }
14630
14631 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14632 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
14633 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
14634 // We simplified Src. If this node is not dead, visit it again so it is
14635 // folded properly.
14636 if (N->getOpcode() != ISD::DELETED_NODE)
14637 DCI.AddToWorklist(N);
14638 return SDValue(N, 0);
14639 }
14640
14641 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
14642 if (SDValue DemandedSrc =
14643 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
14644 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
14645
14646 return SDValue();
14647}
14648
14649SDValue SITargetLowering::performClampCombine(SDNode *N,
14650 DAGCombinerInfo &DCI) const {
14651 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
14652 if (!CSrc)
14653 return SDValue();
14654
14655 const MachineFunction &MF = DCI.DAG.getMachineFunction();
14656 const APFloat &F = CSrc->getValueAPF();
14657 APFloat Zero = APFloat::getZero(F.getSemantics());
14658 if (F < Zero ||
14659 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
14660 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
14661 }
14662
14663 APFloat One(F.getSemantics(), "1.0");
14664 if (F > One)
14665 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
14666
14667 return SDValue(CSrc, 0);
14668}
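// For illustration (hypothetical constants): clamp(-0.5) folds to 0.0, and
// so does clamp(NaN) when DX10Clamp is set; clamp(2.0) folds to 1.0; a
// constant already inside [0.0, 1.0] is returned unchanged.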
14669
14670
14671 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
14672 DAGCombinerInfo &DCI) const {
14673 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
14674 return SDValue();
14675 switch (N->getOpcode()) {
14676 case ISD::ADD:
14677 return performAddCombine(N, DCI);
14678 case ISD::SUB:
14679 return performSubCombine(N, DCI);
14680 case ISD::UADDO_CARRY:
14681 case ISD::USUBO_CARRY:
14682 return performAddCarrySubCarryCombine(N, DCI);
14683 case ISD::FADD:
14684 return performFAddCombine(N, DCI);
14685 case ISD::FSUB:
14686 return performFSubCombine(N, DCI);
14687 case ISD::FDIV:
14688 return performFDivCombine(N, DCI);
14689 case ISD::SETCC:
14690 return performSetCCCombine(N, DCI);
14691 case ISD::FMAXNUM:
14692 case ISD::FMINNUM:
14693 case ISD::FMAXNUM_IEEE:
14694 case ISD::FMINNUM_IEEE:
14695 case ISD::FMAXIMUM:
14696 case ISD::FMINIMUM:
14697 case ISD::SMAX:
14698 case ISD::SMIN:
14699 case ISD::UMAX:
14700 case ISD::UMIN:
14701 case AMDGPUISD::FMIN_LEGACY:
14702 case AMDGPUISD::FMAX_LEGACY:
14703 return performMinMaxCombine(N, DCI);
14704 case ISD::FMA:
14705 return performFMACombine(N, DCI);
14706 case ISD::AND:
14707 return performAndCombine(N, DCI);
14708 case ISD::OR:
14709 return performOrCombine(N, DCI);
14710 case ISD::FSHR: {
14711 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14712 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
14713 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14714 return matchPERM(N, DCI);
14715 }
14716 break;
14717 }
14718 case ISD::XOR:
14719 return performXorCombine(N, DCI);
14720 case ISD::ZERO_EXTEND:
14721 return performZeroExtendCombine(N, DCI);
14722 case ISD::SIGN_EXTEND_INREG:
14723 return performSignExtendInRegCombine(N, DCI);
14724 case AMDGPUISD::FP_CLASS:
14725 return performClassCombine(N, DCI);
14726 case ISD::FCANONICALIZE:
14727 return performFCanonicalizeCombine(N, DCI);
14728 case AMDGPUISD::RCP:
14729 return performRcpCombine(N, DCI);
14730 case ISD::FLDEXP:
14731 case AMDGPUISD::FRACT:
14732 case AMDGPUISD::RSQ:
14733 case AMDGPUISD::RCP_LEGACY:
14734 case AMDGPUISD::RCP_IFLAG:
14735 case AMDGPUISD::RSQ_CLAMP: {
14736 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
14737 SDValue Src = N->getOperand(0);
14738 if (Src.isUndef())
14739 return Src;
14740 break;
14741 }
14742 case ISD::SINT_TO_FP:
14743 case ISD::UINT_TO_FP:
14744 return performUCharToFloatCombine(N, DCI);
14745 case ISD::FCOPYSIGN:
14746 return performFCopySignCombine(N, DCI);
14747 case AMDGPUISD::CVT_F32_UBYTE0:
14748 case AMDGPUISD::CVT_F32_UBYTE1:
14749 case AMDGPUISD::CVT_F32_UBYTE2:
14750 case AMDGPUISD::CVT_F32_UBYTE3:
14751 return performCvtF32UByteNCombine(N, DCI);
14752 case AMDGPUISD::FMED3:
14753 return performFMed3Combine(N, DCI);
14754 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14755 return performCvtPkRTZCombine(N, DCI);
14756 case AMDGPUISD::CLAMP:
14757 return performClampCombine(N, DCI);
14758 case ISD::SCALAR_TO_VECTOR: {
14759 SelectionDAG &DAG = DCI.DAG;
14760 EVT VT = N->getValueType(0);
14761
14762 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
14763 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
14764 SDLoc SL(N);
14765 SDValue Src = N->getOperand(0);
14766 EVT EltVT = Src.getValueType();
14767 if (EltVT != MVT::i16)
14768 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
14769
14770 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
14771 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
14772 }
14773
14774 break;
14775 }
14776 case ISD::EXTRACT_VECTOR_ELT:
14777 return performExtractVectorEltCombine(N, DCI);
14778 case ISD::INSERT_VECTOR_ELT:
14779 return performInsertVectorEltCombine(N, DCI);
14780 case ISD::FP_ROUND:
14781 return performFPRoundCombine(N, DCI);
14782 case ISD::LOAD: {
14783 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
14784 return Widened;
14785 [[fallthrough]];
14786 }
14787 default: {
14788 if (!DCI.isBeforeLegalize()) {
14789 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
14790 return performMemSDNodeCombine(MemNode, DCI);
14791 }
14792
14793 break;
14794 }
14795 }
14796
14797 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
14798}
14799
14800/// Helper function for adjustWritemask
14801static unsigned SubIdx2Lane(unsigned Idx) {
14802 switch (Idx) {
14803 default: return ~0u;
14804 case AMDGPU::sub0: return 0;
14805 case AMDGPU::sub1: return 1;
14806 case AMDGPU::sub2: return 2;
14807 case AMDGPU::sub3: return 3;
14808 case AMDGPU::sub4: return 4; // Possible with TFE/LWE
14809 }
14810}
14811
14812/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
14813SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
14814 SelectionDAG &DAG) const {
14815 unsigned Opcode = Node->getMachineOpcode();
14816
14817 // Subtract 1 because the vdata output is not a MachineSDNode operand.
14818 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
14819 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
14820 return Node; // not implemented for D16
14821
14822 SDNode *Users[5] = { nullptr };
14823 unsigned Lane = 0;
14824 unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
14825 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
14826 unsigned NewDmask = 0;
14827 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
14828 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
14829 bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
14830 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
14831 ? true
14832 : false;
14833 unsigned TFCLane = 0;
14834 bool HasChain = Node->getNumValues() > 1;
14835
14836 if (OldDmask == 0) {
14837 // These are folded out, but on the off chance it happens, don't assert.
14838 return Node;
14839 }
14840
14841 unsigned OldBitsSet = llvm::popcount(OldDmask);
14842 // Work out which is the TFE/LWE lane if that is enabled.
14843 if (UsesTFC) {
14844 TFCLane = OldBitsSet;
14845 }
14846
14847 // Try to figure out the used register components
14848 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
14849 I != E; ++I) {
14850
14851 // Don't look at users of the chain.
14852 if (I.getUse().getResNo() != 0)
14853 continue;
14854
14855 // Abort if we can't understand the usage
14856 if (!I->isMachineOpcode() ||
14857 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
14858 return Node;
14859
14860 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
14861 // Note that subregs are packed, i.e. Lane==0 is the first bit set
14862 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
14863 // set, etc.
14864 Lane = SubIdx2Lane(I->getConstantOperandVal(1));
14865 if (Lane == ~0u)
14866 return Node;
14867
14868 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
14869 if (UsesTFC && Lane == TFCLane) {
14870 Users[Lane] = *I;
14871 } else {
14872 // Set which texture component corresponds to the lane.
14873 unsigned Comp;
14874 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
14875 Comp = llvm::countr_zero(Dmask);
14876 Dmask &= ~(1 << Comp);
14877 }
14878
14879 // Abort if we have more than one user per component.
14880 if (Users[Lane])
14881 return Node;
14882
14883 Users[Lane] = *I;
14884 NewDmask |= 1 << Comp;
14885 }
14886 }
14887
14888 // Don't allow 0 dmask, as hardware assumes one channel enabled.
14889 bool NoChannels = !NewDmask;
14890 if (NoChannels) {
14891 if (!UsesTFC) {
14892 // No uses of the result and not using TFC. Then do nothing.
14893 return Node;
14894 }
14895 // If the original dmask has one channel, then there is nothing to do.
14896 if (OldBitsSet == 1)
14897 return Node;
14898 // Use an arbitrary dmask - required for the instruction to work
14899 NewDmask = 1;
14900 }
14901 // Abort if there's no change
14902 if (NewDmask == OldDmask)
14903 return Node;
14904
14905 unsigned BitsSet = llvm::popcount(NewDmask);
14906
14907 // Check for TFE or LWE - increase the number of channels by one to account
14908 // for the extra return value
14909 // This will need adjustment for D16 if this is also included in
14910 // adjustWritemask (this function), but at present D16 is excluded.
14911 unsigned NewChannels = BitsSet + UsesTFC;
14912
14913 int NewOpcode =
14914 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
14915 assert(NewOpcode != -1 &&
14916 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
14917 "failed to find equivalent MIMG op");
14918
14919 // Adjust the writemask in the node
14920 SmallVector<SDValue, 12> Ops;
14921 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
14922 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
14923 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
14924
14925 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
14926
14927 MVT ResultVT = NewChannels == 1 ?
14928 SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
14929 NewChannels == 5 ? 8 : NewChannels);
14930 SDVTList NewVTList = HasChain ?
14931 DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
14932
14933
14934 MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
14935 NewVTList, Ops);
14936
14937 if (HasChain) {
14938 // Update chain.
14939 DAG.setNodeMemRefs(NewNode, Node->memoperands());
14940 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
14941 }
14942
14943 if (NewChannels == 1) {
14944 assert(Node->hasNUsesOfValue(1, 0));
14945 SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
14946 SDLoc(Node), Users[Lane]->getValueType(0),
14947 SDValue(NewNode, 0));
14948 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
14949 return nullptr;
14950 }
14951
14952 // Update the users of the node with the new indices
14953 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
14954 SDNode *User = Users[i];
14955 if (!User) {
14956 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
14957 // Users[0] is still nullptr because channel 0 doesn't really have a use.
14958 if (i || !NoChannels)
14959 continue;
14960 } else {
14961 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
14962 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
14963 if (NewUser != User) {
14964 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
14965 DAG.RemoveDeadNode(User);
14966 }
14967 }
14968
14969 switch (Idx) {
14970 default: break;
14971 case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
14972 case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
14973 case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
14974 case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
14975 }
14976 }
14977
14978 DAG.RemoveDeadNode(Node);
14979 return nullptr;
14980}
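// Worked example for illustration (hypothetical image load): if the node was
// selected with dmask = 0xf but only the .x and .z results are extracted,
// the loop above computes NewDmask = 0x5, a two-channel variant of the
// opcode is chosen, and the two EXTRACT_SUBREG users are retargeted to sub0
// and sub1 of the narrower result register.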
14981
14982 static bool isFrameIndexOp(SDValue Op) {
14983 if (Op.getOpcode() == ISD::AssertZext)
14984 Op = Op.getOperand(0);
14985
14986 return isa<FrameIndexSDNode>(Op);
14987}
14988
14989/// Legalize target independent instructions (e.g. INSERT_SUBREG)
14990/// with frame index operands.
14991 /// LLVM assumes that inputs to these instructions are registers.
14992 SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
14993 SelectionDAG &DAG) const {
14994 if (Node->getOpcode() == ISD::CopyToReg) {
14995 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
14996 SDValue SrcVal = Node->getOperand(2);
14997
14998 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
14999 // to try understanding copies to physical registers.
15000 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
15001 SDLoc SL(Node);
15002 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
15003 SDValue VReg = DAG.getRegister(
15004 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
15005
15006 SDNode *Glued = Node->getGluedNode();
15007 SDValue ToVReg
15008 = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
15009 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
15010 SDValue ToResultReg
15011 = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
15012 VReg, ToVReg.getValue(1));
15013 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
15014 DAG.RemoveDeadNode(Node);
15015 return ToResultReg.getNode();
15016 }
15017 }
15018
15019 SmallVector<SDValue, 8> Ops;
15020 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
15021 if (!isFrameIndexOp(Node->getOperand(i))) {
15022 Ops.push_back(Node->getOperand(i));
15023 continue;
15024 }
15025
15026 SDLoc DL(Node);
15027 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
15028 Node->getOperand(i).getValueType(),
15029 Node->getOperand(i)), 0));
15030 }
15031
15032 return DAG.UpdateNodeOperands(Node, Ops);
15033}
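// Minimal sketch of the effect (assumed operands, not from the source): a
// REG_SEQUENCE that directly consumes a TargetFrameIndex, e.g.
//   REG_SEQUENCE %stack.0, %subreg.sub0, %other, %subreg.sub1
// is rewritten so the frame index is first materialized with
//   %fi.copy = S_MOV_B32 %stack.0
// and the REG_SEQUENCE then uses %fi.copy, satisfying the assumption that
// inputs to these target-independent pseudos are registers.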
15034
15035/// Fold the instructions after selecting them.
15036/// Returns null if users were already updated.
15037 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
15038 SelectionDAG &DAG) const {
15039 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15040 unsigned Opcode = Node->getMachineOpcode();
15041
15042 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
15043 !TII->isGather4(Opcode) &&
15044 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
15045 return adjustWritemask(Node, DAG);
15046 }
15047
15048 if (Opcode == AMDGPU::INSERT_SUBREG ||
15049 Opcode == AMDGPU::REG_SEQUENCE) {
15050 legalizeTargetIndependentNode(Node, DAG);
15051 return Node;
15052 }
15053
15054 switch (Opcode) {
15055 case AMDGPU::V_DIV_SCALE_F32_e64:
15056 case AMDGPU::V_DIV_SCALE_F64_e64: {
15057 // Satisfy the operand register constraint when one of the inputs is
15058 // undefined. Ordinarily each undef value will have its own implicit_def of
15059 // a vreg, so force these to use a single register.
15060 SDValue Src0 = Node->getOperand(1);
15061 SDValue Src1 = Node->getOperand(3);
15062 SDValue Src2 = Node->getOperand(5);
15063
15064 if ((Src0.isMachineOpcode() &&
15065 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
15066 (Src0 == Src1 || Src0 == Src2))
15067 break;
15068
15069 MVT VT = Src0.getValueType().getSimpleVT();
15070 const TargetRegisterClass *RC =
15071 getRegClassFor(VT, Src0.getNode()->isDivergent());
15072
15073 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
15074 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
15075
15076 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
15077 UndefReg, Src0, SDValue());
15078
15079 // src0 must be the same register as src1 or src2, even if the value is
15080 // undefined, so make sure we don't violate this constraint.
15081 if (Src0.isMachineOpcode() &&
15082 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
15083 if (Src1.isMachineOpcode() &&
15084 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15085 Src0 = Src1;
15086 else if (Src2.isMachineOpcode() &&
15087 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15088 Src0 = Src2;
15089 else {
15090 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
15091 Src0 = UndefReg;
15092 Src1 = UndefReg;
15093 }
15094 } else
15095 break;
15096
15097 SmallVector<SDValue, 9> Ops(Node->op_begin(), Node->op_end());
15098 Ops[1] = Src0;
15099 Ops[3] = Src1;
15100 Ops[5] = Src2;
15101 Ops.push_back(ImpDef.getValue(1));
15102 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
15103 }
15104 default:
15105 break;
15106 }
15107
15108 return Node;
15109}
15110
15111// Any MIMG instructions that use tfe or lwe require an initialization of the
15112// result register that will be written in the case of a memory access failure.
15113// The required code is also added to tie this init code to the result of the
15114// img instruction.
15115 void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
15116 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15117 const SIRegisterInfo &TRI = TII->getRegisterInfo();
15118 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
15119 MachineBasicBlock &MBB = *MI.getParent();
15120
15121 int DstIdx =
15122 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
15123 unsigned InitIdx = 0;
15124
15125 if (TII->isImage(MI)) {
15126 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
15127 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
15128 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
15129
15130 if (!TFE && !LWE) // intersect_ray
15131 return;
15132
15133 unsigned TFEVal = TFE ? TFE->getImm() : 0;
15134 unsigned LWEVal = LWE ? LWE->getImm() : 0;
15135 unsigned D16Val = D16 ? D16->getImm() : 0;
15136
15137 if (!TFEVal && !LWEVal)
15138 return;
15139
15140 // At least one of TFE or LWE is non-zero.
15141 // We have to insert a suitable initialization of the result value and
15142 // tie this to the dest of the image instruction.
15143
15144 // Calculate which dword we have to initialize to 0.
15145 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
15146
15147 // check that dmask operand is found.
15148 assert(MO_Dmask && "Expected dmask operand in instruction");
15149
15150 unsigned dmask = MO_Dmask->getImm();
15151 // Determine the number of active lanes taking into account the
15152 // Gather4 special case
15153 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
15154
15155 bool Packed = !Subtarget->hasUnpackedD16VMem();
15156
15157 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
15158
15159 // Abandon attempt if the dst size isn't large enough
15160 // - this is in fact an error but this is picked up elsewhere and
15161 // reported correctly.
15162 uint32_t DstSize =
15163 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15164 if (DstSize < InitIdx)
15165 return;
15166 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
15167 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15168 } else {
15169 return;
15170 }
15171
15172 const DebugLoc &DL = MI.getDebugLoc();
15173
15174 // Create a register for the initialization value.
15175 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
15176 unsigned NewDst = 0; // Final initialized value will be in here
15177
15178 // If PRTStrictNull feature is enabled (the default) then initialize
15179 // all the result registers to 0, otherwise just the error indication
15180 // register (VGPRn+1)
15181 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
15182 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
15183
15184 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
15185 for (; SizeLeft; SizeLeft--, CurrIdx++) {
15186 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
15187 // Initialize dword
15188 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
15189 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
15190 .addImm(0);
15191 // Insert into the super-reg
15192 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
15193 .addReg(PrevDst)
15194 .addReg(SubReg)
15195 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
15196
15197 PrevDst = NewDst;
15198 }
15199
15200 // Add as an implicit operand
15201 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
15202
15203 // Tie the just added implicit operand to the dst
15204 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
15205}
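// Illustrative example (values assumed): an image load with dmask = 0b0111
// and tfe = 1 writes 3 data dwords plus 1 TFE status dword, so InitIdx = 4.
// With PRTStrictNull (the default) all 4 dwords of the destination tuple are
// zeroed with V_MOV_B32/INSERT_SUBREG before the load; otherwise only the
// final error dword is cleared. The initialized value is then added as an
// implicit operand and tied to vdata so both live in the same register tuple.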
15206
15207/// Assign the register class depending on the number of
15208/// bits set in the writemask
15209 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
15210 SDNode *Node) const {
15211 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15212
15213 MachineFunction *MF = MI.getParent()->getParent();
15214 MachineRegisterInfo &MRI = MF->getRegInfo();
15215 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
15216
15217 if (TII->isVOP3(MI.getOpcode())) {
15218 // Make sure constant bus requirements are respected.
15219 TII->legalizeOperandsVOP3(MRI, MI);
15220
15221 // Prefer VGPRs over AGPRs in mAI instructions where possible.
15222 // This saves a chain-copy of registers and better balances register
15223 // use between vgprs and agprs, as agpr tuples tend to be big.
15224 if (!MI.getDesc().operands().empty()) {
15225 unsigned Opc = MI.getOpcode();
15226 bool HasAGPRs = Info->mayNeedAGPRs();
15227 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15228 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
15229 for (auto I :
15230 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
15231 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
15232 if (I == -1)
15233 break;
15234 if ((I == Src2Idx) && (HasAGPRs))
15235 break;
15236 MachineOperand &Op = MI.getOperand(I);
15237 if (!Op.isReg() || !Op.getReg().isVirtual())
15238 continue;
15239 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
15240 if (!TRI->hasAGPRs(RC))
15241 continue;
15242 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
15243 if (!Src || !Src->isCopy() ||
15244 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
15245 continue;
15246 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
15247 // All uses of agpr64 and agpr32 can also accept vgpr except for
15248 // v_accvgpr_read, but we do not produce agpr reads during selection,
15249 // so no use checks are needed.
15250 MRI.setRegClass(Op.getReg(), NewRC);
15251 }
15252
15253 if (!HasAGPRs)
15254 return;
15255
15256 // Resolve the rest of AV operands to AGPRs.
15257 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
15258 if (Src2->isReg() && Src2->getReg().isVirtual()) {
15259 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
15260 if (TRI->isVectorSuperClass(RC)) {
15261 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
15262 MRI.setRegClass(Src2->getReg(), NewRC);
15263 if (Src2->isTied())
15264 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
15265 }
15266 }
15267 }
15268 }
15269
15270 return;
15271 }
15272
15273 if (TII->isImage(MI))
15274 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
15275}
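// A sketch of the VOP3/MAI handling above (hypothetical MIR, names assumed):
// for an accumulator instruction such as
//   %d:av_512 = V_MFMA_F32_32X32X8F16 %a:av_64, %b:av_64, %c:av_512
// src0/src1 operands whose only definition is a copy from an SGPR are
// re-classed to plain VGPR classes, while src2 (and the result, when tied to
// it) is resolved to the equivalent AGPR class if the function may need
// AGPRs, since accumulator tuples are large and would otherwise tie up VGPRs.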
15276
15277 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
15278 uint64_t Val) {
15279 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
15280 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
15281}
15282
15283 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
15284 const SDLoc &DL,
15285 SDValue Ptr) const {
15286 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15287
15288 // Build the half of the subregister with the constants before building the
15289 // full 128-bit register. If we are building multiple resource descriptors,
15290 // this will allow CSEing of the 2-component register.
15291 const SDValue Ops0[] = {
15292 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
15293 buildSMovImm32(DAG, DL, 0),
15294 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15295 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
15296 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
15297 };
15298
15299 SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
15300 MVT::v2i32, Ops0), 0);
15301
15302 // Combine the constants and the pointer.
15303 const SDValue Ops1[] = {
15304 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15305 Ptr,
15306 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
15307 SubRegHi,
15308 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
15309 };
15310
15311 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
15312}
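// Resulting descriptor layout (sketch): dwords 0-1 hold the 64-bit base
// pointer, dword 2 is 0, and dword 3 holds the high half of
// getDefaultRsrcDataFormat(). Building the constant half as its own
// REG_SEQUENCE first lets descriptors that differ only in the pointer CSE
// that 64-bit constant pair, as the comment above notes.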
15313
15314/// Return a resource descriptor with the 'Add TID' bit enabled
15315/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
15316/// of the resource descriptor) to create an offset, which is added to
15317/// the resource pointer.
15318 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
15319 SDValue Ptr, uint32_t RsrcDword1,
15320 uint64_t RsrcDword2And3) const {
15321 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
15322 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
15323 if (RsrcDword1) {
15324 PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
15325 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
15326 0);
15327 }
15328
15329 SDValue DataLo = buildSMovImm32(DAG, DL,
15330 RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
15331 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
15332
15333 const SDValue Ops[] = {
15334 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15335 PtrLo,
15336 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15337 PtrHi,
15338 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
15339 DataLo,
15340 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
15341 DataHi,
15342 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
15343 };
15344
15345 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
15346}
15347
15348//===----------------------------------------------------------------------===//
15349// SI Inline Assembly Support
15350//===----------------------------------------------------------------------===//
15351
15352std::pair<unsigned, const TargetRegisterClass *>
15353 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
15354 StringRef Constraint,
15355 MVT VT) const {
15356 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
15357
15358 const TargetRegisterClass *RC = nullptr;
15359 if (Constraint.size() == 1) {
15360 const unsigned BitWidth = VT.getSizeInBits();
15361 switch (Constraint[0]) {
15362 default:
15363 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15364 case 's':
15365 case 'r':
15366 switch (BitWidth) {
15367 case 16:
15368 RC = &AMDGPU::SReg_32RegClass;
15369 break;
15370 case 64:
15371 RC = &AMDGPU::SGPR_64RegClass;
15372 break;
15373 default:
15374 RC = TRI->getSGPRClassForBitWidth(BitWidth);
15375 if (!RC)
15376 return std::pair(0U, nullptr);
15377 break;
15378 }
15379 break;
15380 case 'v':
15381 switch (BitWidth) {
15382 case 16:
15383 RC = &AMDGPU::VGPR_32RegClass;
15384 break;
15385 default:
15386 RC = TRI->getVGPRClassForBitWidth(BitWidth);
15387 if (!RC)
15388 return std::pair(0U, nullptr);
15389 break;
15390 }
15391 break;
15392 case 'a':
15393 if (!Subtarget->hasMAIInsts())
15394 break;
15395 switch (BitWidth) {
15396 case 16:
15397 RC = &AMDGPU::AGPR_32RegClass;
15398 break;
15399 default:
15400 RC = TRI->getAGPRClassForBitWidth(BitWidth);
15401 if (!RC)
15402 return std::pair(0U, nullptr);
15403 break;
15404 }
15405 break;
15406 }
15407 // We actually support i128, i16 and f16 as inline parameters
15408 // even if they are not reported as legal
15409 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
15410 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
15411 return std::pair(0U, RC);
15412 }
15413
15414 if (Constraint.starts_with("{") && Constraint.ends_with("}")) {
15415 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
15416 if (RegName.consume_front("v")) {
15417 RC = &AMDGPU::VGPR_32RegClass;
15418 } else if (RegName.consume_front("s")) {
15419 RC = &AMDGPU::SGPR_32RegClass;
15420 } else if (RegName.consume_front("a")) {
15421 RC = &AMDGPU::AGPR_32RegClass;
15422 }
15423
15424 if (RC) {
15425 uint32_t Idx;
15426 if (RegName.consume_front("[")) {
15427 uint32_t End;
15428 bool Failed = RegName.consumeInteger(10, Idx);
15429 Failed |= !RegName.consume_front(":");
15430 Failed |= RegName.consumeInteger(10, End);
15431 Failed |= !RegName.consume_back("]");
15432 if (!Failed) {
15433 uint32_t Width = (End - Idx + 1) * 32;
15434 MCRegister Reg = RC->getRegister(Idx);
15435 if (SIRegisterInfo::isVGPRClass(RC))
15436 RC = TRI->getVGPRClassForBitWidth(Width);
15437 else if (SIRegisterInfo::isSGPRClass(RC))
15438 RC = TRI->getSGPRClassForBitWidth(Width);
15439 else if (SIRegisterInfo::isAGPRClass(RC))
15440 RC = TRI->getAGPRClassForBitWidth(Width);
15441 if (RC) {
15442 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15443 return std::pair(Reg, RC);
15444 }
15445 }
15446 } else {
15447 bool Failed = RegName.getAsInteger(10, Idx);
15448 if (!Failed && Idx < RC->getNumRegs())
15449 return std::pair(RC->getRegister(Idx), RC);
15450 }
15451 }
15452 }
15453
15454 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15455 if (Ret.first)
15456 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
15457
15458 return Ret;
15459}
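// Usage sketch (hypothetical kernel code, not part of this file): the single
// letter constraints map to register classes sized by the operand type, and
// brace constraints name explicit registers or ranges, e.g.
//   asm volatile("v_mov_b32 %0, 0" : "=v"(f));   // any VGPR_32
//   asm volatile("s_mov_b32 %0, 0" : "=s"(u));   // any SGPR_32
//   asm volatile("" : "={v[4:7]}"(v4));          // the VReg_128 tuple v[4:7]
// A 16-bit operand still receives a full 32-bit class, and 'a' constraints
// are only honoured on subtargets with MAI instructions.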
15460
15461static bool isImmConstraint(StringRef Constraint) {
15462 if (Constraint.size() == 1) {
15463 switch (Constraint[0]) {
15464 default: break;
15465 case 'I':
15466 case 'J':
15467 case 'A':
15468 case 'B':
15469 case 'C':
15470 return true;
15471 }
15472 } else if (Constraint == "DA" ||
15473 Constraint == "DB") {
15474 return true;
15475 }
15476 return false;
15477}
15478
15479 SITargetLowering::ConstraintType
15480 SITargetLowering::getConstraintType(StringRef Constraint) const {
15481 if (Constraint.size() == 1) {
15482 switch (Constraint[0]) {
15483 default: break;
15484 case 's':
15485 case 'v':
15486 case 'a':
15487 return C_RegisterClass;
15488 }
15489 }
15490 if (isImmConstraint(Constraint)) {
15491 return C_Other;
15492 }
15493 return TargetLowering::getConstraintType(Constraint);
15494}
15495
15496static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
15497 if (!AMDGPU::isInlinableIntLiteral(Val)) {
15498 Val = Val & maskTrailingOnes<uint64_t>(Size);
15499 }
15500 return Val;
15501}
15502
15503 void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
15504 StringRef Constraint,
15505 std::vector<SDValue> &Ops,
15506 SelectionDAG &DAG) const {
15507 if (isImmConstraint(Constraint)) {
15508 uint64_t Val;
15509 if (getAsmOperandConstVal(Op, Val) &&
15510 checkAsmConstraintVal(Op, Constraint, Val)) {
15511 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
15512 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
15513 }
15514 } else {
15515 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
15516 }
15517}
15518
15519 bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
15520 unsigned Size = Op.getScalarValueSizeInBits();
15521 if (Size > 64)
15522 return false;
15523
15524 if (Size == 16 && !Subtarget->has16BitInsts())
15525 return false;
15526
15527 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
15528 Val = C->getSExtValue();
15529 return true;
15530 }
15531 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
15532 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15533 return true;
15534 }
15535 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
15536 if (Size != 16 || Op.getNumOperands() != 2)
15537 return false;
15538 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
15539 return false;
15540 if (ConstantSDNode *C = V->getConstantSplatNode()) {
15541 Val = C->getSExtValue();
15542 return true;
15543 }
15544 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
15545 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15546 return true;
15547 }
15548 }
15549
15550 return false;
15551}
15552
15553 bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
15554 uint64_t Val) const {
15555 if (Constraint.size() == 1) {
15556 switch (Constraint[0]) {
15557 case 'I':
15558 return AMDGPU::isInlinableIntLiteral(Val);
15559 case 'J':
15560 return isInt<16>(Val);
15561 case 'A':
15562 return checkAsmConstraintValA(Op, Val);
15563 case 'B':
15564 return isInt<32>(Val);
15565 case 'C':
15566 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
15567 AMDGPU::isInlinableIntLiteral(Val);
15568 default:
15569 break;
15570 }
15571 } else if (Constraint.size() == 2) {
15572 if (Constraint == "DA") {
15573 int64_t HiBits = static_cast<int32_t>(Val >> 32);
15574 int64_t LoBits = static_cast<int32_t>(Val);
15575 return checkAsmConstraintValA(Op, HiBits, 32) &&
15576 checkAsmConstraintValA(Op, LoBits, 32);
15577 }
15578 if (Constraint == "DB") {
15579 return true;
15580 }
15581 }
15582 llvm_unreachable("Invalid asm constraint");
15583}
15584
15585 bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
15586 unsigned MaxSize) const {
15587 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
15588 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
15589 if (Size == 16) {
15590 MVT VT = Op.getSimpleValueType();
15591 switch (VT.SimpleTy) {
15592 default:
15593 return false;
15594 case MVT::i16:
15595 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
15596 case MVT::f16:
15597 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
15598 case MVT::bf16:
15599 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
15600 case MVT::v2i16:
15601 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
15602 case MVT::v2f16:
15603 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
15604 case MVT::v2bf16:
15605 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
15606 }
15607 }
15608 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
15609 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
15610 return true;
15611 return false;
15612}
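// Examples of the immediate constraints above (illustrative values, not an
// exhaustive specification):
//   'I' accepts inline integer literals (-16..64): 63 passes, 70 does not.
//   'J' accepts any signed 16-bit value, 'B' any signed 32-bit value.
//   'A' accepts inline constants for the operand type, e.g. 1.0 as f32
//       (0x3f800000) or 0.5 as f16, including 1/(2*pi) when the subtarget
//       has the inv2pi inline immediate.
//   "DA" splits a 64-bit value and checks each 32-bit half like 'A';
//   "DB" places no restriction on either half.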
15613
15614static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
15615 switch (UnalignedClassID) {
15616 case AMDGPU::VReg_64RegClassID:
15617 return AMDGPU::VReg_64_Align2RegClassID;
15618 case AMDGPU::VReg_96RegClassID:
15619 return AMDGPU::VReg_96_Align2RegClassID;
15620 case AMDGPU::VReg_128RegClassID:
15621 return AMDGPU::VReg_128_Align2RegClassID;
15622 case AMDGPU::VReg_160RegClassID:
15623 return AMDGPU::VReg_160_Align2RegClassID;
15624 case AMDGPU::VReg_192RegClassID:
15625 return AMDGPU::VReg_192_Align2RegClassID;
15626 case AMDGPU::VReg_224RegClassID:
15627 return AMDGPU::VReg_224_Align2RegClassID;
15628 case AMDGPU::VReg_256RegClassID:
15629 return AMDGPU::VReg_256_Align2RegClassID;
15630 case AMDGPU::VReg_288RegClassID:
15631 return AMDGPU::VReg_288_Align2RegClassID;
15632 case AMDGPU::VReg_320RegClassID:
15633 return AMDGPU::VReg_320_Align2RegClassID;
15634 case AMDGPU::VReg_352RegClassID:
15635 return AMDGPU::VReg_352_Align2RegClassID;
15636 case AMDGPU::VReg_384RegClassID:
15637 return AMDGPU::VReg_384_Align2RegClassID;
15638 case AMDGPU::VReg_512RegClassID:
15639 return AMDGPU::VReg_512_Align2RegClassID;
15640 case AMDGPU::VReg_1024RegClassID:
15641 return AMDGPU::VReg_1024_Align2RegClassID;
15642 case AMDGPU::AReg_64RegClassID:
15643 return AMDGPU::AReg_64_Align2RegClassID;
15644 case AMDGPU::AReg_96RegClassID:
15645 return AMDGPU::AReg_96_Align2RegClassID;
15646 case AMDGPU::AReg_128RegClassID:
15647 return AMDGPU::AReg_128_Align2RegClassID;
15648 case AMDGPU::AReg_160RegClassID:
15649 return AMDGPU::AReg_160_Align2RegClassID;
15650 case AMDGPU::AReg_192RegClassID:
15651 return AMDGPU::AReg_192_Align2RegClassID;
15652 case AMDGPU::AReg_256RegClassID:
15653 return AMDGPU::AReg_256_Align2RegClassID;
15654 case AMDGPU::AReg_512RegClassID:
15655 return AMDGPU::AReg_512_Align2RegClassID;
15656 case AMDGPU::AReg_1024RegClassID:
15657 return AMDGPU::AReg_1024_Align2RegClassID;
15658 default:
15659 return -1;
15660 }
15661}
15662
15663// Figure out which registers should be reserved for stack access. Only after
15664// the function is legalized do we know all of the non-spill stack objects or if
15665// calls are present.
15666 void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
15667 MachineRegisterInfo &MRI = MF.getRegInfo();
15668 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15669 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
15670 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15671 const SIInstrInfo *TII = ST.getInstrInfo();
15672
15673 if (Info->isEntryFunction()) {
15674 // Callable functions have fixed registers used for stack access.
15675 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
15676 }
15677
15678 // TODO: Move this logic to getReservedRegs()
15679 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
15680 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
15681 Register SReg = ST.isWave32()
15682 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
15683 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
15684 &AMDGPU::SGPR_64RegClass);
15685 Info->setSGPRForEXECCopy(SReg);
15686
15687 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
15688 Info->getStackPtrOffsetReg()));
15689 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
15690 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
15691
15692 // We need to worry about replacing the default register with itself in case
15693 // of MIR testcases missing the MFI.
15694 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
15695 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
15696
15697 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
15698 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
15699
15700 Info->limitOccupancy(MF);
15701
15702 if (ST.isWave32() && !MF.empty()) {
15703 for (auto &MBB : MF) {
15704 for (auto &MI : MBB) {
15705 TII->fixImplicitOperands(MI);
15706 }
15707 }
15708 }
15709
15710 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
15711 // classes if required. Ideally the register class constraints would differ
15712 // per-subtarget, but there's no easy way to achieve that right now. This is
15713 // not a problem for VGPRs because the correctly aligned VGPR class is implied
15714 // from using them as the register class for legal types.
15715 if (ST.needsAlignedVGPRs()) {
15716 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
15717 const Register Reg = Register::index2VirtReg(I);
15718 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
15719 if (!RC)
15720 continue;
15721 int NewClassID = getAlignedAGPRClassID(RC->getID());
15722 if (NewClassID != -1)
15723 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
15724 }
15725 }
15726
15727 TargetLoweringBase::finalizeLowering(MF);
15728 }
15729
15730 void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
15731 KnownBits &Known,
15732 const APInt &DemandedElts,
15733 const SelectionDAG &DAG,
15734 unsigned Depth) const {
15735 Known.resetAll();
15736 unsigned Opc = Op.getOpcode();
15737 switch (Opc) {
15738 case ISD::INTRINSIC_WO_CHAIN: {
15739 unsigned IID = Op.getConstantOperandVal(0);
15740 switch (IID) {
15741 case Intrinsic::amdgcn_mbcnt_lo:
15742 case Intrinsic::amdgcn_mbcnt_hi: {
15743 const GCNSubtarget &ST =
15744 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
15745 // These return at most the (wavefront size - 1) + src1.
15746 // As long as src1 is an immediate we can calculate the known bits.
15747 KnownBits Src1Known = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
15748 unsigned Src1ValBits = Src1Known.countMaxActiveBits();
15749 unsigned MaxActiveBits = std::max(Src1ValBits, ST.getWavefrontSizeLog2());
15750 // Cater for potential carry
15751 MaxActiveBits += Src1ValBits ? 1 : 0;
15752 unsigned Size = Op.getValueType().getSizeInBits();
15753 if (MaxActiveBits < Size)
15754 Known.Zero.setHighBits(Size - MaxActiveBits);
15755 return;
15756 }
15757 }
15758 break;
15759 }
15760 }
15761 return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
15762 Op, Known, DemandedElts, DAG, Depth);
15763}
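// Worked example of the mbcnt bound above (wave64 assumed): the result is at
// most (wavefront size - 1) + src1. If src1 is known to fit in 8 bits,
// MaxActiveBits = max(8, log2(64)) + 1 = 9 after the carry, so bits [31:9]
// of the 32-bit result are reported as known zero.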
15764
15765 void SITargetLowering::computeKnownBitsForFrameIndex(
15766 const int FI, KnownBits &Known, const MachineFunction &MF) const {
15767 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
15768
15769 // Set the high bits to zero based on the maximum allowed scratch size per
15770 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
15771 // calculation won't overflow, so assume the sign bit is never set.
15772 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
15773}
15774
15775 static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB,
15776 KnownBits &Known, unsigned Dim) {
15777 unsigned MaxValue =
15778 ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
15779 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
15780}
15781
15782 void SITargetLowering::computeKnownBitsForTargetInstr(
15783 GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
15784 const MachineRegisterInfo &MRI, unsigned Depth) const {
15785 const MachineInstr *MI = MRI.getVRegDef(R);
15786 switch (MI->getOpcode()) {
15787 case AMDGPU::G_INTRINSIC:
15788 case AMDGPU::G_INTRINSIC_CONVERGENT: {
15789 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
15790 case Intrinsic::amdgcn_workitem_id_x:
15791 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
15792 break;
15793 case Intrinsic::amdgcn_workitem_id_y:
15794 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
15795 break;
15796 case Intrinsic::amdgcn_workitem_id_z:
15797 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
15798 break;
15799 case Intrinsic::amdgcn_mbcnt_lo:
15800 case Intrinsic::amdgcn_mbcnt_hi: {
15801 // These return at most the wavefront size - 1.
15802 unsigned Size = MRI.getType(R).getSizeInBits();
15803 Known.Zero.setHighBits(Size - getSubtarget()->getWavefrontSizeLog2());
15804 break;
15805 }
15806 case Intrinsic::amdgcn_groupstaticsize: {
15807 // We can report everything over the maximum size as 0. We can't report
15808 // based on the actual size because we don't know if it's accurate or not
15809 // at any given point.
15810 Known.Zero.setHighBits(
15811 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
15812 break;
15813 }
15814 }
15815 break;
15816 }
15817 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
15818 Known.Zero.setHighBits(24);
15819 break;
15820 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
15821 Known.Zero.setHighBits(16);
15822 break;
15823 case AMDGPU::G_AMDGPU_SMED3:
15824 case AMDGPU::G_AMDGPU_UMED3: {
15825 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
15826
15827 KnownBits Known2;
15828 KB.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
15829 if (Known2.isUnknown())
15830 break;
15831
15832 KnownBits Known1;
15833 KB.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
15834 if (Known1.isUnknown())
15835 break;
15836
15837 KnownBits Known0;
15838 KB.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
15839 if (Known0.isUnknown())
15840 break;
15841
15842 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
15843 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
15844 Known.One = Known0.One & Known1.One & Known2.One;
15845 break;
15846 }
15847 }
15848}
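// Example of the workitem.id bound (values assumed): with a
// reqd_work_group_size of 64x1x1 the maximum workitem.id.x is 63, so
// countl_zero(63) = 26 high bits of the 32-bit id are known zero, and
// workitem.id.y / workitem.id.z are known to be 0.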
15849
15850 Align SITargetLowering::computeKnownAlignForTargetInstr(
15851 GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,
15852 unsigned Depth) const {
15853 const MachineInstr *MI = MRI.getVRegDef(R);
15854 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
15855 // FIXME: Can this move to generic code? What about the case where the call
15856 // site specifies a lower alignment?
15857 Intrinsic::ID IID = GI->getIntrinsicID();
15858 LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext();
15859 AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
15860 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
15861 return *RetAlign;
15862 }
15863 return Align(1);
15864}
15865
15866 Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
15867 const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
15868 const Align CacheLineAlign = Align(64);
15869
15870 // Pre-GFX10 targets did not benefit from loop alignment.
15871 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
15872 getSubtarget()->hasInstFwdPrefetchBug())
15873 return PrefAlign;
15874
15875 // On GFX10 I$ is 4 x 64 bytes cache lines.
15876 // By default prefetcher keeps one cache line behind and reads two ahead.
15877 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
15878 // behind and one ahead.
15879 // Therefore we can benefit from aligning loop headers if the loop fits in 192 bytes.
15880 // If the loop fits in 64 bytes it always spans no more than two cache lines and
15881 // does not need an alignment.
15882 // Else, if the loop is no larger than 128 bytes we do not need to modify the prefetch.
15883 // Else, if the loop is no larger than 192 bytes we need two lines behind.
15884
15885 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15886 const MachineBasicBlock *Header = ML->getHeader();
15887 if (Header->getAlignment() != PrefAlign)
15888 return Header->getAlignment(); // Already processed.
15889
15890 unsigned LoopSize = 0;
15891 for (const MachineBasicBlock *MBB : ML->blocks()) {
15892 // If an inner loop block is aligned, assume on average half of the alignment
15893 // size is added as nops.
15894 if (MBB != Header)
15895 LoopSize += MBB->getAlignment().value() / 2;
15896
15897 for (const MachineInstr &MI : *MBB) {
15898 LoopSize += TII->getInstSizeInBytes(MI);
15899 if (LoopSize > 192)
15900 return PrefAlign;
15901 }
15902 }
15903
15904 if (LoopSize <= 64)
15905 return PrefAlign;
15906
15907 if (LoopSize <= 128)
15908 return CacheLineAlign;
15909
15910 // If any of parent loops is surrounded by prefetch instructions do not
15911 // insert new for inner loop, which would reset parent's settings.
15912 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
15913 if (MachineBasicBlock *Exit = P->getExitBlock()) {
15914 auto I = Exit->getFirstNonDebugInstr();
15915 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
15916 return CacheLineAlign;
15917 }
15918 }
15919
15920 MachineBasicBlock *Pre = ML->getLoopPreheader();
15921 MachineBasicBlock *Exit = ML->getExitBlock();
15922
15923 if (Pre && Exit) {
15924 auto PreTerm = Pre->getFirstTerminator();
15925 if (PreTerm == Pre->begin() ||
15926 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
15927 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
15928 .addImm(1); // prefetch 2 lines behind PC
15929
15930 auto ExitHead = Exit->getFirstNonDebugInstr();
15931 if (ExitHead == Exit->end() ||
15932 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
15933 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
15934 .addImm(2); // prefetch 1 line behind PC
15935 }
15936
15937 return CacheLineAlign;
15938}
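// Summary of the thresholds above (GFX10+ with the instruction prefetcher):
//   loop size <= 64 bytes   -> keep the default alignment, no prefetch change
//   loop size <= 128 bytes  -> align the header to the 64-byte cache line
//   loop size <= 192 bytes  -> also wrap the loop in S_INST_PREFETCH: two
//                              lines behind inside the loop, one line behind
//                              after the exit block
//   larger loops            -> keep the default alignment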
15939
15940 LLVM_ATTRIBUTE_UNUSED
15941 static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
15942 assert(N->getOpcode() == ISD::CopyFromReg);
15943 do {
15944 // Follow the chain until we find an INLINEASM node.
15945 N = N->getOperand(0).getNode();
15946 if (N->getOpcode() == ISD::INLINEASM ||
15947 N->getOpcode() == ISD::INLINEASM_BR)
15948 return true;
15949 } while (N->getOpcode() == ISD::CopyFromReg);
15950 return false;
15951}
15952
15953 bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
15954 FunctionLoweringInfo *FLI,
15955 UniformityInfo *UA) const {
15956 switch (N->getOpcode()) {
15957 case ISD::CopyFromReg: {
15958 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
15959 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
15960 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15961 Register Reg = R->getReg();
15962
15963 // FIXME: Why does this need to consider isLiveIn?
15964 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
15965 return !TRI->isSGPRReg(MRI, Reg);
15966
15967 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
15968 return UA->isDivergent(V);
15969
15969
15970 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
15971 return !TRI->isSGPRReg(MRI, Reg);
15972 }
15973 case ISD::LOAD: {
15974 const LoadSDNode *L = cast<LoadSDNode>(N);
15975 unsigned AS = L->getAddressSpace();
15976 // A flat load may access private memory.
15977 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
15978 }
15979 case ISD::CALLSEQ_END:
15980 return true;
15981 case ISD::INTRINSIC_WO_CHAIN:
15982 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
15983 case ISD::INTRINSIC_W_CHAIN:
15984 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
15985 case AMDGPUISD::ATOMIC_CMP_SWAP:
15986 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
15987 case AMDGPUISD::BUFFER_ATOMIC_ADD:
15988 case AMDGPUISD::BUFFER_ATOMIC_SUB:
15989 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
15990 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
15991 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
15992 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
15993 case AMDGPUISD::BUFFER_ATOMIC_AND:
15994 case AMDGPUISD::BUFFER_ATOMIC_OR:
15995 case AMDGPUISD::BUFFER_ATOMIC_XOR:
15996 case AMDGPUISD::BUFFER_ATOMIC_INC:
15997 case AMDGPUISD::BUFFER_ATOMIC_DEC:
15998 case AMDGPUISD::BUFFER_ATOMIC_FADD:
15999 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
16000 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
16001 case AMDGPUISD::BUFFER_ATOMIC_CSUB:
16002 case AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32:
16003 // Target-specific read-modify-write atomics are sources of divergence.
16004 return true;
16005 default:
16006 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
16007 // Generic read-modify-write atomics are sources of divergence.
16008 return A->readMem() && A->writeMem();
16009 }
16010 return false;
16011 }
16012}
16013
16014 bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
16015 EVT VT) const {
16016 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
16017 case MVT::f32:
16018 return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
16019 case MVT::f64:
16020 case MVT::f16:
16021 return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
16022 default:
16023 return false;
16024 }
16025}
16026
16027 bool SITargetLowering::denormalsEnabledForType(
16028 LLT Ty, const MachineFunction &MF) const {
16029 switch (Ty.getScalarSizeInBits()) {
16030 case 32:
16031 return !denormalModeIsFlushAllF32(MF);
16032 case 64:
16033 case 16:
16034 return !denormalModeIsFlushAllF64F16(MF);
16035 default:
16036 return false;
16037 }
16038}
16039
16040 bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
16041 const SelectionDAG &DAG,
16042 bool SNaN,
16043 unsigned Depth) const {
16044 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
16045 const MachineFunction &MF = DAG.getMachineFunction();
16046 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16047
16048 if (Info->getMode().DX10Clamp)
16049 return true; // Clamped to 0.
16050 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
16051 }
16052
16053 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
16054 SNaN, Depth);
16055}
16056
16057#if 0
16058// FIXME: This should be checked before unsafe fp atomics are enabled
16059// Global FP atomic instructions have a hardcoded FP mode and do not support
16060// FP32 denormals, and only support v2f16 denormals.
16061static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
16063 auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
16064 if (&Flt == &APFloat::IEEEsingle())
16065 return DenormMode == DenormalMode::getPreserveSign();
16066 return DenormMode == DenormalMode::getIEEE();
16067}
16068#endif
16069
16070// The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
16071// floating point atomic instructions. May generate more efficient code,
16072// but may not respect rounding and denormal modes, and may give incorrect
16073// results for certain memory destinations.
16074 static bool unsafeFPAtomicsDisabled(Function *F) {
16075 return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
16076 "true";
16077}
16078
16079 static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
16080 LLVMContext &Ctx = RMW->getContext();
16081 SmallVector<StringRef> SSNs;
16082 Ctx.getSyncScopeNames(SSNs);
16083 StringRef MemScope = SSNs[RMW->getSyncScopeID()].empty()
16084 ? "system"
16085 : SSNs[RMW->getSyncScopeID()];
16086
16087 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
16088 << "Hardware instruction generated for atomic "
16089 << RMW->getOperationName(RMW->getOperation())
16090 << " operation at memory scope " << MemScope;
16091}
16092
16093static bool isHalf2OrBFloat2(Type *Ty) {
16094 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
16095 Type *EltTy = VT->getElementType();
16096 return VT->getNumElements() == 2 &&
16097 (EltTy->isHalfTy() || EltTy->isBFloatTy());
16098 }
16099
16100 return false;
16101}
16102
16103static bool isHalf2(Type *Ty) {
16104 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
16105 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
16106}
16107
16108static bool isBFloat2(Type *Ty) {
16109 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
16110 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
16111}
16112
16113 TargetLowering::AtomicExpansionKind
16114 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
16115 unsigned AS = RMW->getPointerAddressSpace();
16116 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
16117 return AtomicExpansionKind::NotAtomic;
16118
16119 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
16120 OptimizationRemarkEmitter ORE(RMW->getFunction());
16121 ORE.emit([=]() {
16122 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
16123 });
16124 return Kind;
16125 };
16126
16127 auto SSID = RMW->getSyncScopeID();
16128 bool HasSystemScope =
16129 SSID == SyncScope::System ||
16130 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
16131
16132 switch (RMW->getOperation()) {
16133 case AtomicRMWInst::Sub:
16134 case AtomicRMWInst::Or:
16135 case AtomicRMWInst::Xor: {
16136 // Atomic sub/or/xor do not work over PCI express, but atomic add
16137 // does. InstCombine transforms these with 0 to or, so undo that.
16138 if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
16139 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
16140 ConstVal && ConstVal->isNullValue())
16141 return AtomicExpansionKind::Expand;
16142 }
16143
16144 break;
16145 }
16146 case AtomicRMWInst::FAdd: {
16147 Type *Ty = RMW->getType();
16148
16149 // TODO: Handle REGION_ADDRESS
16150 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16151 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
16152 // is fixed to round-to-nearest-even.
16153 //
16154 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
16155 // round-to-nearest-even.
16156 //
16157 // We ignore the rounding mode problem, even in strictfp. The C++ standard
16158 // suggests it is OK if the floating-point mode may not match the calling
16159 // thread.
16160 if (Ty->isFloatTy()) {
16161 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
16162 : AtomicExpansionKind::CmpXChg;
16163 }
16164
16165 if (Ty->isDoubleTy()) {
16166 // Ignores denormal mode, but we don't consider flushing mandatory.
16167 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
16168 : AtomicExpansionKind::CmpXChg;
16169 }
16170
16171 if (Subtarget->hasAtomicDsPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16172 return AtomicExpansionKind::None;
16173
16174 return AtomicExpansionKind::CmpXChg;
16175 }
16176
16177 if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
16178 AS != AMDGPUAS::BUFFER_FAT_POINTER)
16179 return AtomicExpansionKind::CmpXChg;
16180
16181 if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
16182 return AtomicExpansionKind::None;
16183
16184 if (AS == AMDGPUAS::FLAT_ADDRESS) {
16185 // gfx940, gfx12
16186 // FIXME: Needs to account for no fine-grained memory
16187 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16188 return AtomicExpansionKind::None;
16189 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16190 // gfx90a, gfx940, gfx12
16191 // FIXME: Needs to account for no fine-grained memory
16192 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16193 return AtomicExpansionKind::None;
16194
16195 // gfx940, gfx12
16196 // FIXME: Needs to account for no fine-grained memory
16197 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
16198 return AtomicExpansionKind::None;
16199 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16200 // gfx90a, gfx940, gfx12
16201 // FIXME: Needs to account for no fine-grained memory
16202 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16203 return AtomicExpansionKind::None;
16204
16205 // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16206 // buffer. gfx12 does have the buffer version.
16207 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
16208 return AtomicExpansionKind::None;
16209 }
16210
16211 if (unsafeFPAtomicsDisabled(RMW->getFunction()))
16212 return AtomicExpansionKind::CmpXChg;
16213
16214 // Always expand system scope fp atomics.
16215 if (HasSystemScope)
16216 return AtomicExpansionKind::CmpXChg;
16217
16218 // global and flat atomic fadd f64: gfx90a, gfx940.
16219 if (Subtarget->hasGFX90AInsts() && Ty->isDoubleTy())
16220 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16221
16222 if (AS != AMDGPUAS::FLAT_ADDRESS) {
16223 if (Ty->isFloatTy()) {
16224 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
16225 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16226 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16227 // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16228 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16229 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16230 } else {
16231 // gfx908
16232 if (RMW->use_empty() &&
16233 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() && isHalf2(Ty))
16234 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16235 }
16236 }
16237
16238 // flat atomic fadd f32: gfx940, gfx11+.
16239 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16240 if (Subtarget->hasFlatAtomicFaddF32Inst())
16241 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16242
16243 // If it is in flat address space, and the type is float, we will try to
16244 // expand it, if the target supports global and lds atomic fadd. The
16245 // reason we need that is, in the expansion, we emit the check of address
16246 // space. If it is in global address space, we emit the global atomic
16247 // fadd; if it is in shared address space, we emit the LDS atomic fadd.
16248 if (Subtarget->hasLDSFPAtomicAddF32()) {
16249 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16250 return AtomicExpansionKind::Expand;
16251 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16252 return AtomicExpansionKind::Expand;
16253 }
16254 }
16255
16256 return AtomicExpansionKind::CmpXChg;
16257 }
16258 case AtomicRMWInst::FMin:
16259 case AtomicRMWInst::FMax: {
16260 Type *Ty = RMW->getType();
16261
16262 // LDS float and double fmin/fmax were always supported.
16263 if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy()))
16264 return AtomicExpansionKind::None;
16265
16266 if (unsafeFPAtomicsDisabled(RMW->getFunction()))
16267 return AtomicExpansionKind::CmpXChg;
16268
16269 // Always expand system scope fp atomics.
16270 if (HasSystemScope)
16271 return AtomicExpansionKind::CmpXChg;
16272
16273 // For flat and global cases:
16274 // float, double in gfx7. Manual claims denormal support.
16275 // Removed in gfx8.
16276 // float, double restored in gfx10.
16277 // double removed again in gfx11, so only f32 for gfx11/gfx12.
16278 //
16279 // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but no
16280 // f32.
16281 //
16282 // FIXME: Check scope and fine grained memory
16283 if (AS == AMDGPUAS::FLAT_ADDRESS) {
16284 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
16285 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16286 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
16287 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16288 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
16289 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16290 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
16291 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16292 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
16293 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16294 }
16295
16296 return AtomicExpansionKind::CmpXChg;
16297 }
16298 case AtomicRMWInst::Min:
16299 case AtomicRMWInst::Max:
16300 case AtomicRMWInst::UMin:
16301 case AtomicRMWInst::UMax: {
16302 if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
16303 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16304 // Always expand system scope min/max atomics.
16305 if (HasSystemScope)
16306 return AtomicExpansionKind::CmpXChg;
16307 }
16308 break;
16309 }
16310 default:
16311 break;
16312 }
16313
16315}
16316
16322}
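// Illustrative outcomes of shouldExpandAtomicRMWInIR (subtarget-dependent,
// values assumed): on a subtarget with LDS FP add support,
//   atomicrmw fadd ptr addrspace(3) %p, float 1.0     -> None (ds_add_f32)
// while a system-scope global fadd, or one in a function where
// "amdgpu-unsafe-fp-atomics" is not "true", is expanded to a cmpxchg loop,
// and a system-scope sub/or/xor of 0 in a flat or global address space takes
// the Expand path, which rewrites it into an equivalent atomic add of 0.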
16323
16324 TargetLowering::AtomicExpansionKind
16325 SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
16326 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16327 ? AtomicExpansionKind::NotAtomic
16328 : AtomicExpansionKind::None;
16329 }
16330
16331 TargetLowering::AtomicExpansionKind
16332 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
16333 return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16334 ? AtomicExpansionKind::NotAtomic
16335 : AtomicExpansionKind::None;
16336 }
16337
16338const TargetRegisterClass *
16339SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
16340 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
16341 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16342 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16343 return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
16344 : &AMDGPU::SReg_32RegClass;
16345 if (!TRI->isSGPRClass(RC) && !isDivergent)
16346 return TRI->getEquivalentSGPRClass(RC);
16347 else if (TRI->isSGPRClass(RC) && isDivergent)
16348 return TRI->getEquivalentVGPRClass(RC);
16349
16350 return RC;
16351}
16352
16353// FIXME: This is a workaround for DivergenceAnalysis not understanding always
16354// uniform values (as produced by the mask results of control flow intrinsics)
16355// used outside of divergent blocks. The phi users need to also be treated as
16356// always uniform.
16357//
16358// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
16359static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
16360 unsigned WaveSize) {
16361 // FIXME: We assume we never cast the mask results of a control flow
16362 // intrinsic.
16363 // Early exit if the type won't be consistent as a compile time hack.
16364 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
16365 if (!IT || IT->getBitWidth() != WaveSize)
16366 return false;
16367
16368 if (!isa<Instruction>(V))
16369 return false;
16370 if (!Visited.insert(V).second)
16371 return false;
16372 bool Result = false;
16373 for (const auto *U : V->users()) {
16374 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
16375 if (V == U->getOperand(1)) {
16376 switch (Intrinsic->getIntrinsicID()) {
16377 default:
16378 Result = false;
16379 break;
16380 case Intrinsic::amdgcn_if_break:
16381 case Intrinsic::amdgcn_if:
16382 case Intrinsic::amdgcn_else:
16383 Result = true;
16384 break;
16385 }
16386 }
16387 if (V == U->getOperand(0)) {
16388 switch (Intrinsic->getIntrinsicID()) {
16389 default:
16390 Result = false;
16391 break;
16392 case Intrinsic::amdgcn_end_cf:
16393 case Intrinsic::amdgcn_loop:
16394 Result = true;
16395 break;
16396 }
16397 }
16398 } else {
16399 Result = hasCFUser(U, Visited, WaveSize);
16400 }
16401 if (Result)
16402 break;
16403 }
16404 return Result;
16405}
16406
16407 bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
16408 const Value *V) const {
16409 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
16410 if (CI->isInlineAsm()) {
16411 // FIXME: This cannot give a correct answer. This should only trigger in
16412 // the case where inline asm returns mixed SGPR and VGPR results, used
16413 // outside the defining block. We don't have a specific result to
16414 // consider, so this assumes if any value is SGPR, the overall register
16415 // also needs to be SGPR.
16416 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
16417 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
16418 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
16419 for (auto &TC : TargetConstraints) {
16420 if (TC.Type == InlineAsm::isOutput) {
16421 ComputeConstraintToUse(TC, SDValue());
16422 const TargetRegisterClass *RC = getRegForInlineAsmConstraint(
16423 SIRI, TC.ConstraintCode, TC.ConstraintVT).second;
16424 if (RC && SIRI->isSGPRClass(RC))
16425 return true;
16426 }
16427 }
16428 }
16429 }
16430 SmallPtrSet<const Value *, 16> Visited;
16431 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
16432}
16433
16434 bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
16435 SDNode::use_iterator I = N->use_begin(), E = N->use_end();
16436 for (; I != E; ++I) {
16437 if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) {
16438 if (getBasePtrIndex(M) == I.getOperandNo())
16439 return true;
16440 }
16441 }
16442 return false;
16443}
16444
16445 bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
16446 SDValue N1) const {
16447 if (!N0.hasOneUse())
16448 return false;
16449 // Take care of the opportunity to keep N0 uniform
16450 if (N0->isDivergent() || !N1->isDivergent())
16451 return true;
16452 // Check if we have a good chance to form the memory access pattern with the
16453 // base and offset
16454 return (DAG.isBaseWithConstantOffset(N0) &&
16455 hasMemSDNodeUser(*N0->use_begin()));
16456}
16457
16458 bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
16459 Register N0, Register N1) const {
16460 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
16461}
16462
16463 MachineMemOperand::Flags
16464 SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
16465 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
16466 MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
16467 if (I.getMetadata("amdgpu.noclobber"))
16468 Flags |= MONoClobber;
16469 if (I.getMetadata("amdgpu.last.use"))
16470 Flags |= MOLastUse;
16471 return Flags;
16472}
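// Example of the metadata consumed above (hypothetical IR): a load annotated
// by AMDGPUAnnotateUniformValues such as
//   %v = load i32, ptr addrspace(1) %p, !amdgpu.noclobber !0
// gets MONoClobber set on its MachineMemOperand, which lets later selection
// treat the uniform global load as safe for the scalar (SMEM) path;
// !amdgpu.last.use similarly maps to MOLastUse.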
16473
16474 bool SITargetLowering::checkForPhysRegDependency(
16475 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
16476 const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const {
16477 if (User->getOpcode() != ISD::CopyToReg)
16478 return false;
16479 if (!Def->isMachineOpcode())
16480 return false;
16481 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
16482 if (!MDef)
16483 return false;
16484
16485 unsigned ResNo = User->getOperand(Op).getResNo();
16486 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
16487 return false;
16488 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
16489 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
16490 PhysReg = AMDGPU::SCC;
16491 const TargetRegisterClass *RC =
16492 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
16493 Cost = RC->getCopyCost();
16494 return true;
16495 }
16496 return false;
16497}
16498
16499 void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16500 AtomicRMWInst::BinOp Op = AI->getOperation();
16501
16502 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
16503 Op == AtomicRMWInst::Xor) {
16504 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16505 assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
16506 "this cannot be replaced with add");
16507 AI->setOperation(AtomicRMWInst::Add);
16508 return;
16509 }
16510
16511 assert(Subtarget->hasAtomicFaddInsts() &&
16512 "target should have atomic fadd instructions");
16513 assert(AI->getType()->isFloatTy() &&
16515 "generic atomicrmw expansion only supports FP32 operand in flat "
16516 "address space");
16517 assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
16518
16519 // Given: atomicrmw fadd ptr %addr, float %val ordering
16520 //
16521 // With this expansion we produce the following code:
16522 // [...]
16523 // br label %atomicrmw.check.shared
16524 //
16525 // atomicrmw.check.shared:
16526 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
16527 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
16528 //
16529 // atomicrmw.shared:
16530 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
16531 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
16532 // float %val ordering
16533 // br label %atomicrmw.phi
16534 //
16535 // atomicrmw.check.private:
16536 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
16537 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
16538 //
16539 // atomicrmw.private:
16540 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
16541 // %loaded.private = load float, ptr addrspace(5) %cast.private
16542 // %val.new = fadd float %loaded.private, %val
16543 // store float %val.new, ptr addrspace(5) %cast.private
16544 // br label %atomicrmw.phi
16545 //
16546 // atomicrmw.global:
16547 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
16548 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
16549 // float %val ordering
16550 // br label %atomicrmw.phi
16551 //
16552 // atomicrmw.phi:
16553 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
16554 // [ %loaded.private, %atomicrmw.private ],
16555 // [ %loaded.global, %atomicrmw.global ]
16556 // br label %atomicrmw.end
16557 //
16558 // atomicrmw.end:
16559 // [...]
16560
16561 IRBuilder<> Builder(AI);
16562 LLVMContext &Ctx = Builder.getContext();
16563
16564 BasicBlock *BB = Builder.GetInsertBlock();
16565 Function *F = BB->getParent();
16566 BasicBlock *ExitBB =
16567 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
16568 BasicBlock *CheckSharedBB =
16569 BasicBlock::Create(Ctx, "atomicrmw.check.shared", F, ExitBB);
16570 BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
16571 BasicBlock *CheckPrivateBB =
16572 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
16573 BasicBlock *PrivateBB =
16574 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
16575 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
16576 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
16577
16578 Value *Val = AI->getValOperand();
16579 Type *ValTy = Val->getType();
16580 Value *Addr = AI->getPointerOperand();
16581
16582 auto CreateNewAtomicRMW = [AI](IRBuilder<> &Builder, Value *Addr,
16583 Value *Val) -> Value * {
16584 AtomicRMWInst *OldVal =
16585 Builder.CreateAtomicRMW(AI->getOperation(), Addr, Val, AI->getAlign(),
16586 AI->getOrdering(), AI->getSyncScopeID());
16587 SmallVector<std::pair<unsigned, MDNode *>> MDs;
16588 AI->getAllMetadata(MDs);
16589 for (auto &P : MDs)
16590 OldVal->setMetadata(P.first, P.second);
16591 return OldVal;
16592 };
16593
16594 std::prev(BB->end())->eraseFromParent();
16595 Builder.SetInsertPoint(BB);
16596 Builder.CreateBr(CheckSharedBB);
16597
16598 Builder.SetInsertPoint(CheckSharedBB);
16599 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
16600 {Addr}, nullptr, "is.shared");
16601 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
16602
16603 Builder.SetInsertPoint(SharedBB);
16604 Value *CastToLocal = Builder.CreateAddrSpaceCast(
16605 Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
16606 Value *LoadedShared = CreateNewAtomicRMW(Builder, CastToLocal, Val);
16607 Builder.CreateBr(PhiBB);
16608
16609 Builder.SetInsertPoint(CheckPrivateBB);
16610 CallInst *IsPrivate = Builder.CreateIntrinsic(
16611 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
16612 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
16613
16614 Builder.SetInsertPoint(PrivateBB);
16615 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
16616 Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
16617 Value *LoadedPrivate =
16618 Builder.CreateLoad(ValTy, CastToPrivate, "loaded.private");
16619 Value *NewVal = Builder.CreateFAdd(LoadedPrivate, Val, "val.new");
16620 Builder.CreateStore(NewVal, CastToPrivate);
16621 Builder.CreateBr(PhiBB);
16622
16623 Builder.SetInsertPoint(GlobalBB);
16624 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
16625 Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
16626 Value *LoadedGlobal = CreateNewAtomicRMW(Builder, CastToGlobal, Val);
16627 Builder.CreateBr(PhiBB);
16628
16629 Builder.SetInsertPoint(PhiBB);
16630 PHINode *Loaded = Builder.CreatePHI(ValTy, 3, "loaded.phi");
16631 Loaded->addIncoming(LoadedShared, SharedBB);
16632 Loaded->addIncoming(LoadedPrivate, PrivateBB);
16633 Loaded->addIncoming(LoadedGlobal, GlobalBB);
16634 Builder.CreateBr(ExitBB);
16635
16636 AI->replaceAllUsesWith(Loaded);
16637 AI->eraseFromParent();
16638}
16639
16640LoadInst *
16641 SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
16642 IRBuilder<> Builder(AI);
16643 auto Order = AI->getOrdering();
16644
16645 // The optimization removes store aspect of the atomicrmw. Therefore, cache
16646 // must be flushed if the atomic ordering has release semantics. This is
16647 // not necessarily a fence; a release fence just happens to do that flush.
16648 // Avoid replacing an atomicrmw that has release semantics.
16649 if (isReleaseOrStronger(Order))
16650 return nullptr;
16651
16652 LoadInst *LI = Builder.CreateAlignedLoad(
16653 AI->getType(), AI->getPointerOperand(), AI->getAlign());
16654 LI->setAtomic(Order, AI->getSyncScopeID());
16655 LI->copyMetadata(*AI);
16656 LI->takeName(AI);
16657 AI->replaceAllUsesWith(LI);
16658 AI->eraseFromParent();
16659 return LI;
16660}
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isHalf2OrBFloat2(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static bool isHalf2(Type *Ty)
bool unsafeFPAtomicsDisabled(Function *F)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool isBFloat2(Type *Ty)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
raw_pwrite_stream & OS
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
LLVM IR instance of the generic uniformity analysis.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
static constexpr int Concat[]
Value * RHS
Value * LHS
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool hasMadMacF32Insts() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition: APFloat.h:1020
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition: APFloat.cpp:5282
bool isNegative() const
Definition: APFloat.h:1348
APInt bitcastToAPInt() const
Definition: APFloat.h:1254
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition: APFloat.h:1038
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:998
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition: APFloat.h:982
bool isInfinity() const
Definition: APFloat.h:1345
Class for arbitrary precision integers.
Definition: APInt.h:77
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1371
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:237
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition: APInt.h:445
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1597
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:275
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1216
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1200
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:494
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:631
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:695
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:808
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:707
@ Add
*p = old + v
Definition: Instructions.h:711
@ FAdd
*p = old + v
Definition: Instructions.h:732
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:725
@ Or
*p = old | v
Definition: Instructions.h:719
@ Sub
*p = old - v
Definition: Instructions.h:713
@ Xor
*p = old ^ v
Definition: Instructions.h:721
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:723
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:729
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:743
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:727
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:739
Value * getPointerOperand()
Definition: Instructions.h:851
void setOperation(BinOp Operation)
Definition: Instructions.h:802
BinOp getOperation() const
Definition: Instructions.h:786
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
Definition: Instructions.h:842
Value * getValOperand()
Definition: Instructions.h:855
static StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
Definition: Instructions.h:828
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:859
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator end()
Definition: BasicBlock.h:451
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:202
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:575
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:209
BitVector & set()
Definition: BitVector.h:351
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
Definition: ByteProvider.h:30
static ByteProvider getConstantZero()
Definition: ByteProvider.h:73
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
Definition: ByteProvider.h:66
std::optional< ISelOp > Src
Definition: ByteProvider.h:57
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool isMemLoc() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1465
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
Definition: InstrTypes.h:1551
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1410
unsigned arg_size() const
Definition: InstrTypes.h:1408
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:757
@ ICMP_NE
not equal
Definition: InstrTypes.h:779
bool isSigned() const
Definition: InstrTypes.h:1007
bool isFPPredicate() const
Definition: InstrTypes.h:864
bool isIntPredicate() const
Definition: InstrTypes.h:865
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:206
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition: Constant.h:41
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
Definition: Constants.cpp:90
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:865
bool isBigEndian() const
Definition: DataLayout.h:239
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
unsigned getNumElements() const
Definition: DerivedTypes.h:582
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowerinInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Definition: DerivedTypes.h:103
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:135
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:207
iterator_range< arg_iterator > args()
Definition: Function.h:855
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:274
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:358
bool hasPrefetch() const
Definition: GCNSubtarget.h:920
bool hasD16Images() const
Definition: GCNSubtarget.h:695
bool hasAtomicDsPkAdd16Insts() const
Definition: GCNSubtarget.h:844
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:473
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
Definition: GCNSubtarget.h:464
bool hasAtomicFMinFMaxF64FlatInsts() const
Definition: GCNSubtarget.h:840
bool hasDot7Insts() const
Definition: GCNSubtarget.h:794
bool hasApertureRegs() const
Definition: GCNSubtarget.h:593
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:623
bool hasAtomicFMinFMaxF32FlatInsts() const
Definition: GCNSubtarget.h:836
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasDLInsts() const
Definition: GCNSubtarget.h:764
bool hasBCNT(unsigned Size) const
Definition: GCNSubtarget.h:407
bool hasMAIInsts() const
Definition: GCNSubtarget.h:814
bool hasMultiDwordFlatScratchAddressing() const
Definition: GCNSubtarget.h:675
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
Definition: GCNSubtarget.h:523
bool hasPrivEnabledTrap2NopBug() const
bool hasUnalignedDSAccessEnabled() const
Definition: GCNSubtarget.h:581
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:262
bool hasDot1Insts() const
Definition: GCNSubtarget.h:770
bool hasAtomicFaddRtnInsts() const
Definition: GCNSubtarget.h:852
Align getStackAlignment() const
Definition: GCNSubtarget.h:933
bool hasScalarSubwordLoads() const
Definition: GCNSubtarget.h:451
bool enableFlatScratch() const
Definition: GCNSubtarget.h:648
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
Definition: GCNSubtarget.h:619
bool supportsGetDoorbellID() const
Definition: GCNSubtarget.h:457
bool hasFlatAtomicFaddF32Inst() const
Definition: GCNSubtarget.h:872
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:274
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasMad64_32() const
Definition: GCNSubtarget.h:740
bool useDS128() const
Definition: GCNSubtarget.h:533
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
Definition: GCNSubtarget.h:453
const SIFrameLowering * getFrameLowering() const override
Definition: GCNSubtarget.h:266
bool hasUnalignedScratchAccess() const
Definition: GCNSubtarget.h:585
bool hasAtomicFMinFMaxF32GlobalInsts() const
Definition: GCNSubtarget.h:828
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
Definition: GCNSubtarget.h:423
bool hasIntClamp() const
Definition: GCNSubtarget.h:353
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
bool hasGFX940Insts() const
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:373
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:597
bool hasLDSFPAtomicAddF64() const
Definition: GCNSubtarget.h:994
bool hasFlatGlobalInsts() const
Definition: GCNSubtarget.h:627
bool getScalarizeGlobalBehavior() const
Definition: GCNSubtarget.h:946
bool hasScalarSMulU64() const
Definition: GCNSubtarget.h:729
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
Definition: GCNSubtarget.h:332
bool hasShaderCyclesHiLoRegisters() const
Definition: GCNSubtarget.h:900
bool hasFFBL() const
Definition: GCNSubtarget.h:411
bool hasNSAEncoding() const
bool hasSMemRealTime() const
Definition: GCNSubtarget.h:965
bool usePRTStrictNull() const
Definition: GCNSubtarget.h:555
bool hasAtomicFMinFMaxF64GlobalInsts() const
Definition: GCNSubtarget.h:832
bool hasMed3_16() const
Definition: GCNSubtarget.h:419
bool hasMovrel() const
Definition: GCNSubtarget.h:969
bool hasAtomicFlatPkAdd16Insts() const
Definition: GCNSubtarget.h:846
bool hasBFI() const
Definition: GCNSubtarget.h:399
bool hasUnalignedBufferAccessEnabled() const
Definition: GCNSubtarget.h:573
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:340
bool hasImageGather4D16Bug() const
bool supportsMinMaxDenormModes() const
Definition: GCNSubtarget.h:518
bool hasFFBH() const
Definition: GCNSubtarget.h:415
bool hasAtomicFaddInsts() const
Definition: GCNSubtarget.h:848
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const
Definition: GCNSubtarget.h:856
bool hasAtomicBufferPkAddBF16Inst() const
Definition: GCNSubtarget.h:868
bool hasAtomicFaddNoRtnInsts() const
Definition: GCNSubtarget.h:854
bool hasScalarDwordx3Loads() const
Definition: GCNSubtarget.h:983
bool hasLDSFPAtomicAddF32() const
Definition: GCNSubtarget.h:993
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
Definition: GCNSubtarget.h:543
bool hasDot8Insts() const
Definition: GCNSubtarget.h:798
bool hasDS96AndDS128() const
Definition: GCNSubtarget.h:538
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:527
Generation getGeneration() const
Definition: GCNSubtarget.h:313
bool hasAtomicBufferGlobalPkAddF16Insts() const
Definition: GCNSubtarget.h:860
bool hasScalarAddSub64() const
Definition: GCNSubtarget.h:727
bool hasIEEEMinMax3() const
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:731
bool hasAtomicGlobalPkAddBF16Inst() const
Definition: GCNSubtarget.h:864
bool hasAddr64() const
Definition: GCNSubtarget.h:377
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
Definition: GCNSubtarget.h:427
bool hasPackedTID() const
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:723
bool hasFractBug() const
Definition: GCNSubtarget.h:391
bool hasGDS() const
bool hasBFE() const
Definition: GCNSubtarget.h:395
bool hasGWSAutoReplay() const
Definition: GCNSubtarget.h:710
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
Definition: GlobalValue.h:511
unsigned getAddressSpace() const
Definition: GlobalValue.h:205
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
Type * getValueType() const
Definition: GlobalValue.h:296
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1805
Value * CreateFAdd(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1531
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:173
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:172
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2395
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1118
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1788
LLVMContext & getContext() const
Definition: IRBuilder.h:174
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1801
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1852
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1112
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:178
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2130
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2664
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
Definition: Instruction.h:363
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:92
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:70
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1635
void getAllMetadata(SmallVectorImpl< std::pair< unsigned, MDNode * > > &MDs) const
Get all metadata attached to this Instruction.
Definition: Instruction.h:399
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
Definition: DerivedTypes.h:40
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:267
constexpr bool isScalar() const
Definition: LowLevelType.h:146
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
Definition: LowLevelType.h:57
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:193
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
Definition: LowLevelType.h:221
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
void getSyncScopeNames(SmallVectorImpl< StringRef > &SSNs) const
getSyncScopeNames - Populates client supplied SmallVector with synchronization scope names registered...
An instruction for reading from memory.
Definition: Instructions.h:173
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:258
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:238
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
Metadata node.
Definition: Metadata.h:1067
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:230
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Align getAlignment() const
Return alignment of the basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
Definition: MachineInstr.h:69
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:579
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition: ModRef.h:198
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition: ModRef.h:192
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition: ModRef.h:195
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1814
Register getReg() const
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition: Register.h:84
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if be combined with to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isMemOpUniform(const SDNode *N) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, tests whether Op is known to never be any NaN; if SNaN is true, tests whether it is known to never be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns whether it is reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
bool getAddrModeArguments(IntrinsicInst *, SmallVectorImpl< Value * > &, Type *&) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
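Illustrative caller-side sketch (not code from this file): the isLegalAddressingMode hook listed above is queried with a filled-in TargetLowering::AddrMode describing the base register, constant offset, and scale. The wrapper function and its name are hypothetical.
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
using namespace llvm;
// Hypothetical wrapper: ask whether "base register + constant offset"
// addressing is legal for an access of type AccessTy in address space AS.
static bool isBasePlusOffsetLegal(const TargetLowering &TLI,
                                  const DataLayout &DL, Type *AccessTy,
                                  int64_t Offset, unsigned AS) {
  TargetLowering::AddrMode AM;
  AM.HasBaseReg = true; // one base register
  AM.BaseOffs = Offset; // plus a constant displacement
  AM.Scale = 0;         // no scaled index register
  return TLI.isLegalAddressingMode(DL, AM, AccessTy, AS);
}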
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:227
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:736
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
Definition: SelectionDAG.h:968
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:565
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const Pass * getPass() const
Definition: SelectionDAG.h:482
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:492
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:842
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:486
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, which starts a new call frame, in which InSize bytes are set up inside ...
SDValue getRegister(unsigned Reg, EVT VT)
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:487
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:787
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:690
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:481
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:813
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:859
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:499
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:574
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:568
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
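Illustrative sketch (not from this file) of how the SelectionDAG builder helpers listed above are typically combined; the function name is hypothetical, and using i1 as the condition type is a simplification (real code would query the target's setcc result type).
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
// Build "select (X == 0), A, A + 1" for 32-bit integers.
static SDValue buildSelectOfIncrement(SelectionDAG &DAG, const SDLoc &DL,
                                      SDValue X, SDValue A) {
  EVT VT = MVT::i32;
  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue Cond = DAG.getSetCC(DL, MVT::i1, X, Zero, ISD::SETEQ);
  SDValue Inc = DAG.getNode(ISD::ADD, DL, VT, A,
                            DAG.getConstant(1, DL, VT));
  return DAG.getSelect(DL, VT, Cond, A, Inc);
}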
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:344
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:479
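Illustrative sketch of the insert() contract described above (the returned bool is true only for a first-time insertion); the helper function is hypothetical.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallPtrSet.h"
using namespace llvm;
// Count distinct pointers: insert().second is true only on first insertion.
static unsigned countUnique(ArrayRef<const void *> Ptrs) {
  SmallPtrSet<const void *, 8> Seen;
  unsigned Unique = 0;
  for (const void *P : Ptrs)
    if (Seen.insert(P).second)
      ++Unique;
  return Unique;
}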
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
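Illustrative sketch of the SmallVector operations listed above (push_back, append, resize); the function is hypothetical.
#include "llvm/ADT/SmallVector.h"
using namespace llvm;
// Collect the first N squares in a vector with 4 elements of inline storage;
// it spills to the heap transparently once it grows past that.
static SmallVector<int, 4> firstSquares(unsigned N) {
  SmallVector<int, 4> Squares;
  for (unsigned I = 0; I != N; ++I)
    Squares.push_back(int(I * I));
  SmallVector<int, 4> Padded;
  Padded.append(Squares.begin(), Squares.end()); // append a whole range
  Padded.resize(N + 1);                          // value-initializes the tail
  return Padded;
}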
An instruction for storing to memory.
Definition: Instructions.h:289
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:846
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:258
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition: StringRef.h:131
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:270
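Illustrative sketch of the StringRef queries listed above; the prefix and helper name are hypothetical, and no characters are copied since StringRef is only a pointer/length pair.
#include "llvm/ADT/StringRef.h"
using namespace llvm;
// Return the part of Name after a fixed prefix, or an empty StringRef if the
// prefix does not match.
static StringRef amdgcnSuffix(StringRef Name) {
  StringRef Prefix = "llvm.amdgcn.";
  if (!Name.starts_with(Prefix))
    return StringRef();
  return StringRef(Name.data() + Prefix.size(), Name.size() - Prefix.size());
}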
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
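Illustrative sketch of the StringSwitch pattern described above; the strings and mapped values are arbitrary examples.
#include "llvm/ADT/StringSwitch.h"
using namespace llvm;
// Map a feature-like string to a wave size, falling back to 0 when unknown.
static unsigned parseWaveSize(StringRef S) {
  return StringSwitch<unsigned>(S)
      .Case("wavefrontsize32", 32)
      .Case("wavefrontsize64", 64)
      .Default(0);
}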
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
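Illustrative excerpt, as it might appear inside some target's TargetLowering constructor (these setters are protected members of TargetLoweringBase, so they cannot be called from free code). The specific opcodes, types, and actions are arbitrary examples, not the settings of any real backend, and TRI is assumed to be the target's TargetRegisterInfo.
  // Let the legalizer expand 64-bit signed division into a generic sequence.
  setOperationAction(ISD::SDIV, MVT::i64, Expand);
  // Lower FP_ROUND nodes producing f32 with target-specific code.
  setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
  // Truncating i64 stores down to i16 memory are not supported directly.
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  // Booleans materialize as 0 or 1 in integer registers.
  setBooleanContents(ZeroOrOneBooleanContent);
  // Ask for a DAG-combine callback on these generic nodes.
  setTargetDAGCombine({ISD::FADD, ISD::FMINNUM});
  // Once all register classes are registered, derive the cached properties.
  computeRegisterProperties(TRI); // TRI: the target's TargetRegisterInfo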
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:382
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:342
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:154
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:146
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:302
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:143
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:157
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition: Type.h:246
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:140
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
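Illustrative sketch combining the Type predicates listed above; the helper name is hypothetical.
#include "llvm/IR/Type.h"
using namespace llvm;
// Report the bit width of the (scalar) floating-point element of Ty,
// or 0 if it is not one of the FP types checked here.
static unsigned scalarFPBits(Type *Ty) {
  Type *Scalar = Ty->getScalarType(); // element type for vectors
  if (Scalar->isHalfTy() || Scalar->isBFloatTy())
    return 16;
  if (Scalar->isFloatTy())
    return 32;
  if (Scalar->isDoubleTy())
    return 64;
  return 0;
}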
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
iterator_range< use_iterator > uses()
Definition: Value.h:376
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
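Illustrative sketch of the Value use-list operations listed above; the helper name and replacement policy are hypothetical.
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"
using namespace llvm;
// If Old has exactly one use, transfer its name to New and rewrite that use.
static void replaceIfSingleUse(Instruction *Old, Value *New) {
  if (Old->hasOneUse()) {
    New->takeName(Old);
    Old->replaceAllUsesWith(New);
  }
}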
Type * getElementType() const
Definition: DerivedTypes.h:436
constexpr bool isZero() const
Definition: TypeSize.h:156
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: Lint.cpp:86
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:415
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:422
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
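Illustrative sketch (hypothetical helper) combining the llvm::AMDGPU calling-convention predicates listed above, on the assumption that an entry function which is not a shader is a kernel.
#include "llvm/IR/CallingConv.h"
// The llvm::AMDGPU predicates used below are declared in the AMDGPU target's
// AMDGPUBaseInfo.h.
static bool isKernelEntryCC(llvm::CallingConv::ID CC) {
  return llvm::AMDGPU::isEntryFunctionCC(CC) && !llvm::AMDGPU::isShader(CC);
}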
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ MaxID
The highest possible ID. Must be some 2^k - 1.
Definition: CallingConv.h:274
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:764
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:243
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1143
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:737
@ ATOMIC_LOAD_FMAX
Definition: ISDOpcodes.h:1297
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ SET_FPENV
Sets the current floating-point environment.
Definition: ISDOpcodes.h:1019
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1290
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:567
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:728
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1292
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1262
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1293
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:495
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1052
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:797
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:491
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:205
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:1275
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:804
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:551
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:702
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:927
@ FPTRUNC_ROUND
Definition: ISDOpcodes.h:488
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1288
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:917
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1289
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:954
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1431
@ ATOMIC_LOAD_FADD
Definition: ISDOpcodes.h:1295
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:899
@ CONVERGENCECTRL_GLUE
Definition: ISDOpcodes.h:1417
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:788
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:628
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readfixedcounter intrinsic.
Definition: ISDOpcodes.h:1209
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1068
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:736
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1242
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1009
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:944
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1098
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1291
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:508
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition: ISDOpcodes.h:515
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:741
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1258
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:218
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:229
@ ATOMIC_LOAD_FMIN
Definition: ISDOpcodes.h:1298
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:215
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined; 0 Round to 0; 1 Round to nearest, ties to even; 2 Round to ...
Definition: ISDOpcodes.h:894
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:659
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1037
@ GET_FPENV
Gets the current floating-point environment.
Definition: ISDOpcodes.h:1014
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:719
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:608
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1286
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:581
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:999
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:543
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:794
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1232
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:756
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1269
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1294
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:986
@ SMULO
Multiply two integers and also produce a boolean indicating whether the (signed/unsigned) multiplication overflowed.
Definition: ISDOpcodes.h:338
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1062
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:812
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:682
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:902
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1118
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:936
@ ATOMIC_LOAD_UDEC_WRAP
Definition: ISDOpcodes.h:1300
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1284
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:473
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1005
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1285
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:850
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1203
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:478
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:694
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1229
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:532
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1283
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:959
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:883
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:421
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:921
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1115
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:800
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1091
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:777
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ ATOMIC_LOAD_UINC_WRAP
Definition: ISDOpcodes.h:1299
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:501
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ AssertZext
Definition: ISDOpcodes.h:62
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:523
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1554
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1534
StringRef getName(ID id)
Return the LLVM name for an intrinsic, such as "llvm.ppc.altivec.lvx".
Definition: Function.cpp:1042
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Offsets
Offsets in bytes from the start of the input buffer.
Definition: SIInstrInfo.h:1542
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:57
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
constexpr double inv_pi
Definition: MathExtras.h:38
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Offset
Definition: DWP.cpp:480
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition: Analysis.cpp:233
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition: MathExtras.h:228
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:431
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
@ Done
Definition: Threading.h:61
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2067
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:372
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:269
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:41
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition: bit.h:281
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:138
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition: Analysis.cpp:199
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:143
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition: DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
DWARFExpression::Operation Op
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
Definition: VE.h:375
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition: MathExtras.h:219
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
@ DS_Warning
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:45
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value that is Skew mod Align.
Definition: MathExtras.h:483
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
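Illustrative sketch exercising the arithmetic helpers listed above; the expected results are noted in the comments.
#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
using namespace llvm;
static void mathHelperExamples() {
  (void)isPowerOf2_32(64);            // true: 64 == 1 << 6
  (void)Log2_32(64);                  // 6 (floor log base 2)
  (void)divideCeil(10, 4);            // 3 == ceil(10 / 4)
  (void)alignTo(10, Align(8));        // 16, the next multiple of 8
  (void)Hi_32(0x1234567800000000ULL); // 0x12345678
  (void)Lo_32(0x00000000ABCDEF01ULL); // 0xABCDEF01
}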
#define N
SDValue SrcOp
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:271
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:246
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:269
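IEEEsingle, IEEEhalf, and rmNearestTiesToEven are typically used together when converting between float formats. A hedged sketch (truncateToHalf is a hypothetical helper, not part of this file):
#include "llvm/ADT/APFloat.h"
using namespace llvm;
APFloat truncateToHalf(float F) {
  APFloat Val(F); // starts with IEEEsingle semantics
  bool LosesInfo = false;
  Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
  return Val;
}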
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition: SCCPSolver.h:41
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
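DenormalMode describes how denormal (subnormal) inputs and outputs are treated. A small illustrative predicate, assuming llvm/ADT/FloatingPointMode.h; the helper names are hypothetical:
#include "llvm/ADT/FloatingPointMode.h"
using namespace llvm;
// True when denormal outputs are flushed to zero while keeping the sign bit.
bool flushesToSignedZero(DenormalMode Mode) {
  return Mode == DenormalMode::getPreserveSign();
}
// True when denormals are handled per IEEE-754.
bool keepsDenormals(DenormalMode Mode) {
  return Mode == DenormalMode::getIEEE();
}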
Extended Value Type.
Definition: ValueTypes.h:34
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:290
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:233
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
bool isPow2VectorType() const
Returns true if the vector's number of elements is a power of 2.
Definition: ValueTypes.h:455
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:246
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:203
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:151
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
unsigned getOrigArgIndex() const
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:62
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:70
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:285
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:237
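The KnownBits queries above track which bits of a value are provably zero or one. A small standalone sketch with an illustrative width and mask:
#include "llvm/Support/KnownBits.h"
#include <cassert>
using namespace llvm;
void knownBitsExample() {
  KnownBits Known(16);
  assert(Known.isUnknown());               // nothing known yet
  Known.Zero.setHighBits(8);               // top 8 bits proven zero
  assert(Known.countMinLeadingZeros() == 8);
  assert(Known.countMaxActiveBits() == 8); // value fits in the low 8 bits
  Known.resetAll();                        // forget everything
  assert(Known.isUnknown());
}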
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
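The MachinePointerInfo factories above describe what a memory operand points at. A hedged sketch of stack-relative pointer info; the helper names are hypothetical and assume an existing MachineFunction and frame index:
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
using namespace llvm;
MachinePointerInfo describeSpillSlot(MachineFunction &MF, int FrameIndex) {
  // Pointer info for a fixed frame-index slot (e.g. a register spill).
  return MachinePointerInfo::getFixedStack(MF, FrameIndex);
}
MachinePointerInfo describeOutgoingArg(MachineFunction &MF, int64_t Offset) {
  // Stack-pointer-relative access, e.g. an outgoing call argument.
  return MachinePointerInfo::getStack(MF, Offset);
}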
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals