1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/IRBuilder.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
44#include "llvm/Support/ModRef.h"
45#include <optional>
46
47using namespace llvm;
48
49#define DEBUG_TYPE "si-lower"
50
51STATISTIC(NumTailCalls, "Number of tail calls");
52
54 "amdgpu-disable-loop-alignment",
55 cl::desc("Do not align and prefetch loops"),
56 cl::init(false));
57
59 "amdgpu-use-divergent-register-indexing",
61 cl::desc("Use indirect register addressing for divergent indexes"),
62 cl::init(false));
63
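// Helpers: return true if the current function's FP mode flushes f32
// (respectively f64/f16) denormals, i.e. the denormal mode is preserve-sign.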
64static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
65 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
66 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
67}
68
69static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
70 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
71 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
72}
73
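// Find the first SGPR not yet allocated in the calling convention state; used
// below when an unallocated SGPR is needed for special system inputs.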
74static unsigned findFirstFreeSGPR(CCState &CCInfo) {
75 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
76 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
77 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
78 return AMDGPU::SGPR0 + Reg;
79 }
80 }
81 llvm_unreachable("Cannot allocate sgpr");
82}
83
84SITargetLowering::SITargetLowering(const TargetMachine &TM,
85 const GCNSubtarget &STI)
86 : AMDGPUTargetLowering(TM, STI),
87 Subtarget(&STI) {
88 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
89 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
90
91 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
92 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
93
94 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
95
96 const SIRegisterInfo *TRI = STI.getRegisterInfo();
97 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
98
99 addRegisterClass(MVT::f64, V64RegClass);
100 addRegisterClass(MVT::v2f32, V64RegClass);
101 addRegisterClass(MVT::Untyped, V64RegClass);
102
103 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
104 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
105
106 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
107 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
108
109 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
110 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
111
112 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
113 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
114
115 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
116 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
117
118 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
119 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
120
121 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
122 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
123
124 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
125 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
126
127 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
128 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
129
130 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
131 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
132
133 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
134 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
135
136 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
137 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
138
139 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
140 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
141
142 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
143 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
144
145 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
146 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
147
148 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
149 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
150
151 if (Subtarget->has16BitInsts()) {
152 if (Subtarget->useRealTrue16Insts()) {
153 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
154 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
155 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
156 } else {
157 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
158 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
159 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
160 }
161
162 // Unless there are also VOP3P operations, no operations are really legal.
163 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
164 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
165 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
166 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
167 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
168 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
169 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
170 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
171 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
172 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
173 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
174 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
175 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
176 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
177 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
178 }
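// Note: outside of true16 mode, the 16-bit scalar types and all of the packed
// 16-bit vector types above live in 32-bit or wider register classes.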
179
180 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
181 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
182
184
185 // The boolean content concept here is too inflexible. Compares only ever
186 // really produce a 1-bit result. Any copy/extend from these will turn into a
187 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
188 // it's what most targets use.
191
192 // We need to custom lower vector stores from local memory
194 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
195 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
196 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
197 MVT::i1, MVT::v32i32},
198 Custom);
199
201 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
202 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
203 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
204 MVT::i1, MVT::v32i32},
205 Custom);
206
207 if (isTypeLegal(MVT::bf16)) {
208 for (unsigned Opc :
217 ISD::SETCC}) {
218 // FIXME: The promoted-to type shouldn't need to be explicit
219 setOperationAction(Opc, MVT::bf16, Promote);
220 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
221 }
222
224
226 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
227
231
232 // We only need to custom lower because we can't specify an action for bf16
233 // sources.
236 }
237
238 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
239 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
240 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
241 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
242 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
243 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
244 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
245 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
246 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
247 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
248 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
249 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
250 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
251 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
252 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
253 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
254
255 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
256 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
257 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
258 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
259 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
260 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
261 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
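// There are no native truncating vector stores, so all of the truncating
// vector store combinations above are expanded by the legalizer.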
262
263 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
264
268 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
269
270 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
271
273 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
274
276 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
277 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
278
280 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
281 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
282 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
283 Expand);
285 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
286 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
287 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
288 Expand);
289
291 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
292 MVT::v3i16, MVT::v4i16, MVT::Other},
293 Custom);
294
297 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
298
300
302
304 Expand);
305
306#if 0
308#endif
309
310 // We only support LOAD/STORE and vector manipulation ops for vectors
311 // with > 4 elements.
312 for (MVT VT :
313 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
314 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
315 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
316 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
317 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
318 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
319 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
320 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
321 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
322 switch (Op) {
323 case ISD::LOAD:
324 case ISD::STORE:
326 case ISD::BITCAST:
327 case ISD::UNDEF:
331 case ISD::IS_FPCLASS:
332 break;
337 break;
338 default:
340 break;
341 }
342 }
343 }
344
346
347 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
348 // is expanded to avoid having two separate loops in case the index is a VGPR.
349
350 // Most operations are naturally 32-bit vector operations. We only support
351 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
352 for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
354 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
355
357 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
358
360 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
361
363 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
364 }
365
366 for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
368 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
369
371 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
372
374 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
375
377 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
378 }
379
380 for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
382 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
383
385 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
386
388 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
389
391 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
392 }
393
394 for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
396 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
397
399 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
400
402 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
403
405 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
406 }
407
408 for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
410 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
411
413 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
414
416 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
417
419 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
420 }
421
423 {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
424 Expand);
425
426 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
427 Custom);
428
429 // Avoid stack access for these.
430 // TODO: Generalize to more vector types.
432 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
433 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
434 Custom);
435
436 // Deal with vec3 vector operations when widened to vec4.
438 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
439
440 // Deal with vec5/6/7 vector operations when widened to vec8.
442 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
443 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
444 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
445 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
446 Custom);
447
448 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
449 // and output demarshalling
450 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
451
452 // We can't return success/failure, only the old value;
453 // let LLVM add the comparison.
455 Expand);
456
457 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
458
459 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
460
461 // FIXME: This should be narrowed to i32, but that only happens if i64 is
462 // illegal.
463 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
464 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
465
466 // This is s_memtime on SI and s_memrealtime on VI.
468
469 if (Subtarget->hasSMemRealTime() ||
473
474 if (Subtarget->has16BitInsts()) {
477 } else {
479 }
480
481 if (Subtarget->hasMadMacF32Insts())
483
484 if (!Subtarget->hasBFI())
485 // fcopysign can be done in a single instruction with BFI.
486 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
487
488 if (!Subtarget->hasBCNT(32))
490
491 if (!Subtarget->hasBCNT(64))
493
494 if (Subtarget->hasFFBH())
496
497 if (Subtarget->hasFFBL())
499
500 // We only really have 32-bit BFE instructions (and 16-bit on VI).
501 //
502 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
503 // effort to match them now. We want this to be false for i64 cases when the
504 // extraction isn't restricted to the upper or lower half. Ideally we would
505 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
506 // span the midpoint are probably relatively rare, so don't worry about them
507 // for now.
508 if (Subtarget->hasBFE())
510
511 // Clamp modifier on add/sub
512 if (Subtarget->hasIntClamp())
514
515 if (Subtarget->hasAddNoCarry())
516 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
517 Legal);
518
519 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
520 Custom);
521
522 // These are really only legal for ieee_mode functions. We should be avoiding
523 // them for functions that don't have ieee_mode enabled, so just say they are
524 // legal.
526 {MVT::f32, MVT::f64}, Legal);
527
528 if (Subtarget->haveRoundOpsF64())
530 Legal);
531 else
533 MVT::f64, Custom);
534
536 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
537 Legal);
538 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
539
542
543 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
544 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
545
546 // Custom lower these because we can't specify a rule based on an illegal
547 // source bf16.
550
551 if (Subtarget->has16BitInsts()) {
554 MVT::i16, Legal);
555
556 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
557
559 MVT::i16, Expand);
560
564 ISD::CTPOP},
565 MVT::i16, Promote);
566
568
569 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
570
572 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
574 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
575
579
581
582 // F16 - Constant Actions.
585
586 // F16 - Load/Store Actions.
588 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
590 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
591
592 // BF16 - Load/Store Actions.
594 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
596 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
597
598 // F16 - VOP1 Actions.
601 MVT::f16, Custom);
602
605
606 // F16 - VOP2 Actions.
607 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
608 Expand);
612
613 // F16 - VOP3 Actions.
615 if (STI.hasMadF16())
617
618 for (MVT VT :
619 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
620 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
621 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
622 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
623 switch (Op) {
624 case ISD::LOAD:
625 case ISD::STORE:
627 case ISD::BITCAST:
628 case ISD::UNDEF:
634 case ISD::IS_FPCLASS:
635 break;
638 break;
639 default:
641 break;
642 }
643 }
644 }
645
646 // v_perm_b32 can handle either of these.
647 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
649
650 // XXX - Do these do anything? Vector constants turn into build_vector.
651 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
652
653 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
654 Legal);
655
657 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
659 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
660
662 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
664 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
665
666 setOperationAction(ISD::AND, MVT::v2i16, Promote);
667 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
668 setOperationAction(ISD::OR, MVT::v2i16, Promote);
669 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
670 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
671 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
672
674 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
676 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
677 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
678 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
679
681 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
683 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
685 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
686
688 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
690 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
691 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
692 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
693
695 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
697 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
698
700 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
702 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
704 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
705
706 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
707 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
708 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
709 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
710 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
711 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
712
714 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
716 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
717 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
718 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
719
720 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
721 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
722 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
723 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
724 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
725 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
726
728 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
730 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
731 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
732 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
733
735 MVT::v2i32, Expand);
737
739 MVT::v4i32, Expand);
740
742 MVT::v8i32, Expand);
743
744 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
745 Subtarget->hasVOP3PInsts() ? Legal : Custom);
746
747 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
748 // This isn't really legal, but this avoids the legalizer unrolling it (and
749 // allows matching fneg (fabs x) patterns)
750 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
751
754
756 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
757 Custom);
758
760 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
761 Expand);
762
763 for (MVT Vec16 :
764 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
765 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
768 Vec16, Custom);
770 }
771 }
772
773 if (Subtarget->hasVOP3PInsts()) {
777 MVT::v2i16, Legal);
778
781 MVT::v2f16, Legal);
782
783 setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
784 Custom);
785
787 {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
788 MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
789 Custom);
790
791 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
792 // Split vector operations.
797 VT, Custom);
798
799 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
800 // Split vector operations.
802 VT, Custom);
803
804 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
805 Custom);
806
807 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
808 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
809 Custom);
810
811 if (Subtarget->hasPackedFP32Ops()) {
813 MVT::v2f32, Legal);
815 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
816 Custom);
817 }
818 }
819
821
822 if (Subtarget->has16BitInsts()) {
824 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
826 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
827 } else {
828 // Legalization hack.
829 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
830
832 }
833
835 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
836 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
837 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
838 MVT::v32f16, MVT::v32bf16},
839 Custom);
840
842
843 if (Subtarget->hasScalarSMulU64())
845
846 if (Subtarget->hasMad64_32())
848
849 if (Subtarget->hasPrefetch())
851
852 if (Subtarget->hasIEEEMinMax()) {
854 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
856 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
857 Custom);
858 }
859
861 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
862 MVT::v2i16, MVT::v2f16, MVT::i128, MVT::i8},
863 Custom);
864
866 {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16,
867 MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16,
868 MVT::i16, MVT::i8, MVT::i128},
869 Custom);
870
872 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16,
873 MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16,
874 MVT::i8, MVT::i128},
875 Custom);
876
882
883 // TODO: Could move this to custom lowering, could benefit from combines on
884 // extract of relevant bits.
886
888
891 ISD::SUB,
893 ISD::FADD,
894 ISD::FSUB,
895 ISD::FDIV,
902 ISD::FMA,
903 ISD::SMIN,
904 ISD::SMAX,
905 ISD::UMIN,
906 ISD::UMAX,
908 ISD::AND,
909 ISD::OR,
910 ISD::XOR,
911 ISD::FSHR,
921
922 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
924
925 // All memory operations. Some folding on the pointer operand is done to help
926 // matching the constant offsets in the addressing modes.
949
950 // FIXME: In other contexts we pretend this is a per-function property.
952
954}
955
956const GCNSubtarget *SITargetLowering::getSubtarget() const {
957 return Subtarget;
958}
959
960//===----------------------------------------------------------------------===//
961// TargetLowering queries
962//===----------------------------------------------------------------------===//
963
964// v_mad_mix* support a conversion from f16 to f32.
965//
966 // There is only one special case, when denormals are enabled, where this is
967 // OK to use, and we don't currently handle it.
968bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
969 EVT DestVT, EVT SrcVT) const {
970 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
971 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
972 DestVT.getScalarType() == MVT::f32 &&
973 SrcVT.getScalarType() == MVT::f16 &&
974 // TODO: This probably only requires no input flushing?
976}
977
979 LLT DestTy, LLT SrcTy) const {
980 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
981 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
982 DestTy.getScalarSizeInBits() == 32 &&
983 SrcTy.getScalarSizeInBits() == 16 &&
984 // TODO: This probably only requires no input flushing?
986}
987
989 // SI has some legal vector types, but no legal vector operations. Say no
990 // shuffles are legal in order to prefer scalarizing some vector operations.
991 return false;
992}
993
996 EVT VT) const {
999
1000 if (VT.isVector()) {
1001 EVT ScalarVT = VT.getScalarType();
1002 unsigned Size = ScalarVT.getSizeInBits();
1003 if (Size == 16) {
1004 if (Subtarget->has16BitInsts()) {
1005 if (VT.isInteger())
1006 return MVT::v2i16;
1007 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1008 }
1009 return VT.isInteger() ? MVT::i32 : MVT::f32;
1010 }
1011
1012 if (Size < 16)
1013 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1014 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1015 }
1016
1017 if (VT.getSizeInBits() > 32)
1018 return MVT::i32;
1019
1021}
1022
1025 EVT VT) const {
1028
1029 if (VT.isVector()) {
1030 unsigned NumElts = VT.getVectorNumElements();
1031 EVT ScalarVT = VT.getScalarType();
1032 unsigned Size = ScalarVT.getSizeInBits();
1033
1034 // FIXME: Should probably promote 8-bit vectors to i16.
1035 if (Size == 16 && Subtarget->has16BitInsts())
1036 return (NumElts + 1) / 2;
1037
1038 if (Size <= 32)
1039 return NumElts;
1040
1041 if (Size > 32)
1042 return NumElts * ((Size + 31) / 32);
1043 } else if (VT.getSizeInBits() > 32)
1044 return (VT.getSizeInBits() + 31) / 32;
1045
1047}
1048
1050 LLVMContext &Context, CallingConv::ID CC,
1051 EVT VT, EVT &IntermediateVT,
1052 unsigned &NumIntermediates, MVT &RegisterVT) const {
1053 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1054 unsigned NumElts = VT.getVectorNumElements();
1055 EVT ScalarVT = VT.getScalarType();
1056 unsigned Size = ScalarVT.getSizeInBits();
1057 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1058 // support, but unless we can properly handle 3-vectors, it will still be
1059 // inconsistent.
1060 if (Size == 16 && Subtarget->has16BitInsts()) {
1061 if (ScalarVT == MVT::bf16) {
1062 RegisterVT = MVT::i32;
1063 IntermediateVT = MVT::v2bf16;
1064 } else {
1065 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1066 IntermediateVT = RegisterVT;
1067 }
1068 NumIntermediates = (NumElts + 1) / 2;
1069 return NumIntermediates;
1070 }
1071
1072 if (Size == 32) {
1073 RegisterVT = ScalarVT.getSimpleVT();
1074 IntermediateVT = RegisterVT;
1075 NumIntermediates = NumElts;
1076 return NumIntermediates;
1077 }
1078
1079 if (Size < 16 && Subtarget->has16BitInsts()) {
1080 // FIXME: Should probably form v2i16 pieces
1081 RegisterVT = MVT::i16;
1082 IntermediateVT = ScalarVT;
1083 NumIntermediates = NumElts;
1084 return NumIntermediates;
1085 }
1086
1087
1088 if (Size != 16 && Size <= 32) {
1089 RegisterVT = MVT::i32;
1090 IntermediateVT = ScalarVT;
1091 NumIntermediates = NumElts;
1092 return NumIntermediates;
1093 }
1094
1095 if (Size > 32) {
1096 RegisterVT = MVT::i32;
1097 IntermediateVT = RegisterVT;
1098 NumIntermediates = NumElts * ((Size + 31) / 32);
1099 return NumIntermediates;
1100 }
1101 }
1102
1104 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1105}
1106
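// Compute the memory VT for a buffer/image load intrinsic's returned data,
// clamping the vector width to the number of lanes actually used.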
1107static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes) {
1108 assert(MaxNumLanes != 0);
1109
1110 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1111 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1112 return EVT::getVectorVT(Ty->getContext(),
1113 EVT::getEVT(VT->getElementType()),
1114 NumElts);
1115 }
1116
1117 return EVT::getEVT(Ty);
1118}
1119
1120// Peek through TFE struct returns to only use the data size.
1121static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) {
1122 auto *ST = dyn_cast<StructType>(Ty);
1123 if (!ST)
1124 return memVTFromLoadIntrData(Ty, MaxNumLanes);
1125
1126 // TFE intrinsics return an aggregate type.
1127 assert(ST->getNumContainedTypes() == 2 &&
1128 ST->getContainedType(1)->isIntegerTy(32));
1129 return memVTFromLoadIntrData(ST->getContainedType(0), MaxNumLanes);
1130}
1131
1132/// Map address space 7 to MVT::v5i32 because that's its in-memory
1133/// representation. This return value is vector-typed because there is no
1134/// MVT::i160 and it is not clear if one can be added. While this could
1135/// cause issues during codegen, these address space 7 pointers will be
1136/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
1137/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1138/// modeling, to work.
1139MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
1140 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1141 return MVT::v5i32;
1142 if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1143 DL.getPointerSizeInBits(AS) == 192)
1144 return MVT::v6i32;
1145 return AMDGPUTargetLowering::getPointerTy(DL, AS);
1146}
1147/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1148/// v8i32 when padding is added.
1149/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1150/// also v8i32 with padding.
1151MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
1152 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1153 DL.getPointerSizeInBits(AS) == 160) ||
1154 (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1155 DL.getPointerSizeInBits(AS) == 192))
1156 return MVT::v8i32;
1157 return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
1158}
1159
1160bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
1161 const CallInst &CI,
1162 MachineFunction &MF,
1163 unsigned IntrID) const {
1165 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1167
1168 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1171 (Intrinsic::ID)IntrID);
1172 MemoryEffects ME = Attr.getMemoryEffects();
1173 if (ME.doesNotAccessMemory())
1174 return false;
1175
1176 // TODO: Should images get their own address space?
1177 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1178
1179 if (RsrcIntr->IsImage)
1180 Info.align.reset();
1181
1182 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1183 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1184 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1185 // We conservatively set the memory operand of a buffer intrinsic to the
1186 // base resource pointer, so that we can access alias information about
1187 // those pointers. Cases like "this points at the same value
1188 // but with a different offset" are handled in
1189 // areMemAccessesTriviallyDisjoint.
1190 Info.ptrVal = RsrcArg;
1191 }
1192
1193 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1194 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1197 if (ME.onlyReadsMemory()) {
1198 unsigned MaxNumLanes = 4;
1199
1200 if (RsrcIntr->IsImage) {
1203 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1205
1206 if (!BaseOpcode->Gather4) {
1207 // If this isn't a gather, we may have excess loaded elements in the
1208 // IR type. Check the dmask for the real number of elements loaded.
1209 unsigned DMask
1210 = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1211 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1212 }
1213 }
1214
1215 Info.memVT = memVTFromLoadIntrReturn(CI.getType(), MaxNumLanes);
1216
1217 // FIXME: What does alignment mean for an image?
1220 } else if (ME.onlyWritesMemory()) {
1222
1223 Type *DataTy = CI.getArgOperand(0)->getType();
1224 if (RsrcIntr->IsImage) {
1225 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1226 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1227 Info.memVT = memVTFromLoadIntrData(DataTy, DMaskLanes);
1228 } else
1229 Info.memVT = EVT::getEVT(DataTy);
1230
1232 } else {
1233 // Atomic
1234 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
1236 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1240
1241 switch (IntrID) {
1242 default:
1243 // XXX - Should this be volatile without known ordering?
1245 break;
1246 case Intrinsic::amdgcn_raw_buffer_load_lds:
1247 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1248 case Intrinsic::amdgcn_struct_buffer_load_lds:
1249 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1250 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1251 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1252 Info.ptrVal = CI.getArgOperand(1);
1253 return true;
1254 }
1255 }
1256 }
1257 return true;
1258 }
1259
1260 switch (IntrID) {
1261 case Intrinsic::amdgcn_ds_ordered_add:
1262 case Intrinsic::amdgcn_ds_ordered_swap:
1263 case Intrinsic::amdgcn_ds_fadd:
1264 case Intrinsic::amdgcn_ds_fmin:
1265 case Intrinsic::amdgcn_ds_fmax: {
1267 Info.memVT = MVT::getVT(CI.getType());
1268 Info.ptrVal = CI.getOperand(0);
1269 Info.align.reset();
1271
1272 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1273 if (!Vol->isZero())
1275
1276 return true;
1277 }
1278 case Intrinsic::amdgcn_buffer_atomic_fadd: {
1280 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1281 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1282 Info.align.reset();
1284
1285 const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
1286 if (!Vol || !Vol->isZero())
1288
1289 return true;
1290 }
1291 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1292 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1294 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1295 Info.ptrVal = nullptr;
1296 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1298 return true;
1299 }
1300 case Intrinsic::amdgcn_ds_append:
1301 case Intrinsic::amdgcn_ds_consume: {
1303 Info.memVT = MVT::getVT(CI.getType());
1304 Info.ptrVal = CI.getOperand(0);
1305 Info.align.reset();
1307
1308 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1309 if (!Vol->isZero())
1311
1312 return true;
1313 }
1314 case Intrinsic::amdgcn_global_atomic_csub: {
1316 Info.memVT = MVT::getVT(CI.getType());
1317 Info.ptrVal = CI.getOperand(0);
1318 Info.align.reset();
1322 return true;
1323 }
1324 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1326 Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1327
1328 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1329 Info.align.reset();
1332 return true;
1333 }
1334 case Intrinsic::amdgcn_global_atomic_fadd:
1335 case Intrinsic::amdgcn_global_atomic_fmin:
1336 case Intrinsic::amdgcn_global_atomic_fmax:
1337 case Intrinsic::amdgcn_global_atomic_fmin_num:
1338 case Intrinsic::amdgcn_global_atomic_fmax_num:
1339 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1340 case Intrinsic::amdgcn_flat_atomic_fadd:
1341 case Intrinsic::amdgcn_flat_atomic_fmin:
1342 case Intrinsic::amdgcn_flat_atomic_fmax:
1343 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1344 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1345 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1346 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1347 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
1349 Info.memVT = MVT::getVT(CI.getType());
1350 Info.ptrVal = CI.getOperand(0);
1351 Info.align.reset();
1356 return true;
1357 }
1358 case Intrinsic::amdgcn_global_load_tr_b64:
1359 case Intrinsic::amdgcn_global_load_tr_b128: {
1361 Info.memVT = MVT::getVT(CI.getType());
1362 Info.ptrVal = CI.getOperand(0);
1363 Info.align.reset();
1365 return true;
1366 }
1367 case Intrinsic::amdgcn_ds_gws_init:
1368 case Intrinsic::amdgcn_ds_gws_barrier:
1369 case Intrinsic::amdgcn_ds_gws_sema_v:
1370 case Intrinsic::amdgcn_ds_gws_sema_br:
1371 case Intrinsic::amdgcn_ds_gws_sema_p:
1372 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1374
1375 const GCNTargetMachine &TM =
1376 static_cast<const GCNTargetMachine &>(getTargetMachine());
1377
1379 Info.ptrVal = MFI->getGWSPSV(TM);
1380
1381 // This is an abstract access, but we need to specify a type and size.
1382 Info.memVT = MVT::i32;
1383 Info.size = 4;
1384 Info.align = Align(4);
1385
1386 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1388 else
1390 return true;
1391 }
1392 case Intrinsic::amdgcn_global_load_lds: {
1394 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1395 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1396 Info.ptrVal = CI.getArgOperand(1);
1398 return true;
1399 }
1400 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1402
1403 const GCNTargetMachine &TM =
1404 static_cast<const GCNTargetMachine &>(getTargetMachine());
1405
1407 Info.ptrVal = MFI->getGWSPSV(TM);
1408
1409 // This is an abstract access, but we need to specify a type and size.
1410 Info.memVT = MVT::i32;
1411 Info.size = 4;
1412 Info.align = Align(4);
1413
1415 return true;
1416 }
1417 default:
1418 return false;
1419 }
1420}
1421
1422void SITargetLowering::CollectTargetIntrinsicOperands(
1423 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1424 switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
1425 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1426 // The DAG's ValueType loses the addrspaces.
1427 // Add them as 2 extra Constant operands "from" and "to".
1428 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1429 unsigned DstAS = I.getType()->getPointerAddressSpace();
1430 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1431 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1432 break;
1433 }
1434 default:
1435 break;
1436 }
1437}
1438
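// Expose the pointer operand of memory intrinsics so that addressing-mode
// analyses can fold offsets into them.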
1439bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
1440 SmallVectorImpl<Value *> &Ops,
1441 Type *&AccessTy) const {
1442 Value *Ptr = nullptr;
1443 switch (II->getIntrinsicID()) {
1444 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1445 case Intrinsic::amdgcn_ds_append:
1446 case Intrinsic::amdgcn_ds_consume:
1447 case Intrinsic::amdgcn_ds_fadd:
1448 case Intrinsic::amdgcn_ds_fmax:
1449 case Intrinsic::amdgcn_ds_fmin:
1450 case Intrinsic::amdgcn_ds_ordered_add:
1451 case Intrinsic::amdgcn_ds_ordered_swap:
1452 case Intrinsic::amdgcn_flat_atomic_fadd:
1453 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
1454 case Intrinsic::amdgcn_flat_atomic_fmax:
1455 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1456 case Intrinsic::amdgcn_flat_atomic_fmin:
1457 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1458 case Intrinsic::amdgcn_global_atomic_csub:
1459 case Intrinsic::amdgcn_global_atomic_fadd:
1460 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1461 case Intrinsic::amdgcn_global_atomic_fmax:
1462 case Intrinsic::amdgcn_global_atomic_fmax_num:
1463 case Intrinsic::amdgcn_global_atomic_fmin:
1464 case Intrinsic::amdgcn_global_atomic_fmin_num:
1465 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1466 case Intrinsic::amdgcn_global_load_tr_b64:
1467 case Intrinsic::amdgcn_global_load_tr_b128:
1468 Ptr = II->getArgOperand(0);
1469 break;
1470 case Intrinsic::amdgcn_global_load_lds:
1471 Ptr = II->getArgOperand(1);
1472 break;
1473 default:
1474 return false;
1475 }
1476 AccessTy = II->getType();
1477 Ops.push_back(Ptr);
1478 return true;
1479}
1480
1481bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
1482 unsigned AddrSpace,
1483 uint64_t FlatVariant) const {
1484 if (!Subtarget->hasFlatInstOffsets()) {
1485 // Flat instructions do not have offsets, and only have the register
1486 // address.
1487 return AM.BaseOffs == 0 && AM.Scale == 0;
1488 }
1489
1490 return AM.Scale == 0 &&
1491 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1492 AM.BaseOffs, AddrSpace, FlatVariant));
1493}
1494
1495bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
1496 if (Subtarget->hasFlatGlobalInsts())
1497 return isLegalFlatAddressingMode(AM, AMDGPUAS::GLOBAL_ADDRESS,
1499
1500 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1501 // Assume that we will use FLAT for all global memory accesses
1502 // on VI.
1503 // FIXME: This assumption is currently wrong. On VI we still use
1504 // MUBUF instructions for the r + i addressing mode. As currently
1505 // implemented, the MUBUF instructions only work on buffers < 4GB.
1506 // It may be possible to support > 4GB buffers with MUBUF instructions,
1507 // by setting the stride value in the resource descriptor which would
1508 // increase the size limit to (stride * 4GB). However, this is risky,
1509 // because it has never been validated.
1510 return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS,
1512 }
1513
1514 return isLegalMUBUFAddressingMode(AM);
1515}
1516
1517bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1518 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1519 // additionally can do r + r + i with addr64. 32-bit has more addressing
1520 // mode options. Depending on the resource constant, it can also do
1521 // (i64 r0) + (i32 r1) * (i14 i).
1522 //
1523 // Private arrays end up using a scratch buffer most of the time, so also
1524 // assume those use MUBUF instructions. Scratch loads / stores are currently
1525 // implemented as mubuf instructions with the offen bit set, so they are
1526 // slightly different from the normal addr64.
1527 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1528 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1529 return false;
1530
1531 // FIXME: Since we can split the immediate into soffset and immediate offset,
1532 // would it make sense to allow any immediate?
1533
1534 switch (AM.Scale) {
1535 case 0: // r + i or just i, depending on HasBaseReg.
1536 return true;
1537 case 1:
1538 return true; // We have r + r or r + i.
1539 case 2:
1540 if (AM.HasBaseReg) {
1541 // Reject 2 * r + r.
1542 return false;
1543 }
1544
1545 // Allow 2 * r as r + r
1546 // Or 2 * r + i is allowed as r + r + i.
1547 return true;
1548 default: // Don't allow n * r
1549 return false;
1550 }
1551}
1552
1553bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1554 const AddrMode &AM, Type *Ty,
1555 unsigned AS, Instruction *I) const {
1556 // No global is ever allowed as a base.
1557 if (AM.BaseGV)
1558 return false;
1559
1560 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1561 return isLegalGlobalAddressingMode(AM);
1562
1563 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1567 // If the offset isn't a multiple of 4, it probably isn't going to be
1568 // correctly aligned.
1569 // FIXME: Can we get the real alignment here?
1570 if (AM.BaseOffs % 4 != 0)
1571 return isLegalMUBUFAddressingMode(AM);
1572
1573 if (!Subtarget->hasScalarSubwordLoads()) {
1574 // There are no SMRD extloads, so if we have to do a small type access we
1575 // will use a MUBUF load.
1576 // FIXME?: We also need to do this if unaligned, but we don't know the
1577 // alignment here.
1578 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1579 return isLegalGlobalAddressingMode(AM);
1580 }
1581
1583 // SMRD instructions have an 8-bit, dword offset on SI.
1584 if (!isUInt<8>(AM.BaseOffs / 4))
1585 return false;
1586 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1587 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1588 // in 8-bits, it can use a smaller encoding.
1589 if (!isUInt<32>(AM.BaseOffs / 4))
1590 return false;
1591 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1592 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1593 if (!isUInt<20>(AM.BaseOffs))
1594 return false;
1595 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1596 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1597 // for S_BUFFER_* instructions).
1598 if (!isInt<21>(AM.BaseOffs))
1599 return false;
1600 } else {
1601 // On GFX12, all offsets are signed 24-bit in bytes.
1602 if (!isInt<24>(AM.BaseOffs))
1603 return false;
1604 }
1605
1606 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1607 return true;
1608
1609 if (AM.Scale == 1 && AM.HasBaseReg)
1610 return true;
1611
1612 return false;
1613 }
1614
1615 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1616 return Subtarget->enableFlatScratch()
1617 ? isLegalFlatAddressingMode(AM, AMDGPUAS::PRIVATE_ADDRESS,
1619 : isLegalMUBUFAddressingMode(AM);
1620
1621 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1622 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1623 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1624 // field.
1625 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1626 // an 8-bit dword offset but we don't know the alignment here.
1627 if (!isUInt<16>(AM.BaseOffs))
1628 return false;
1629
1630 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1631 return true;
1632
1633 if (AM.Scale == 1 && AM.HasBaseReg)
1634 return true;
1635
1636 return false;
1637 }
1638
1640 // For an unknown address space, this usually means that this is for some
1641 // reason being used for pure arithmetic, and not based on some addressing
1642 // computation. We don't have instructions that compute pointers with any
1643 // addressing modes, so treat them as having no offset like flat
1644 // instructions.
1645 return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS,
1647 }
1648
1649 // Assume a user alias of global for unknown address spaces.
1650 return isLegalGlobalAddressingMode(AM);
1651}
1652
1654 const MachineFunction &MF) const {
1656 return (MemVT.getSizeInBits() <= 4 * 32);
1657 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1658 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1659 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1660 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1661 return (MemVT.getSizeInBits() <= 2 * 32);
1662 }
1663 return true;
1664}
1665
1666bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
1667 unsigned Size, unsigned AddrSpace, Align Alignment,
1668 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1669 if (IsFast)
1670 *IsFast = 0;
1671
1672 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1673 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1674 // Check if alignment requirements for ds_read/write instructions are
1675 // disabled.
1676 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1677 return false;
1678
1679 Align RequiredAlignment(PowerOf2Ceil(Size/8)); // Natural alignment.
1680 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1681 Alignment < RequiredAlignment)
1682 return false;
1683
1684 // Either the alignment requirements are "enabled", or there is an
1685 // unaligned-LDS-access-related hardware bug even though alignment
1686 // requirements are "disabled". In either case, we need to check for proper
1687 // alignment requirements.
1688 //
1689 switch (Size) {
1690 case 64:
1691 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1692 // address is negative, then the instruction is incorrectly treated as
1693 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1694 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1695 // load later in the SILoadStoreOptimizer.
1696 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1697 return false;
1698
1699 // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
1700 // can do a 4-byte aligned, 8-byte access in a single operation using
1701 // ds_read2/write2_b32 with adjacent offsets.
1702 RequiredAlignment = Align(4);
1703
1704 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1705 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1706 // ds_write2_b32 depending on the alignment. In either case with either
1707 // alignment there is no faster way of doing this.
1708
1709 // The numbers returned here and below are not additive; they form a "speed
1710 // rank". They are just meant to be compared to decide if a certain way
1711 // of lowering an operation is faster than another. For that purpose a
1712 // naturally aligned operation gets its bitsize to indicate that "it
1713 // operates with a speed comparable to an N-bit wide load". With full
1714 // alignment ds128 is slower than ds96, for example. If underaligned it
1715 // is comparable to the speed of a single dword access, which would then
1716 // mean 32 < 128 and it is faster to issue a wide load regardless.
1717 // 1 simply means "slow, don't do it". I.e., when comparing an aligned load
1718 // to a wider load which will not be aligned anymore, the latter is slower.
1719 if (IsFast)
1720 *IsFast = (Alignment >= RequiredAlignment) ? 64
1721 : (Alignment < Align(4)) ? 32
1722 : 1;
1723 return true;
1724 }
1725
1726 break;
1727 case 96:
1728 if (!Subtarget->hasDS96AndDS128())
1729 return false;
1730
1731 // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
1732 // gfx8 and older.
1733
1734 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1735 // Naturally aligned access is fastest. However, also report it as Fast
1736 // if memory is aligned to less than a DWORD. A narrow load or store will
1737 // be equally slow as a single ds_read_b96/ds_write_b96, but there will
1738 // be more of them, so overall we will pay less penalty issuing a single
1739 // instruction.
1740
1741 // See comment on the values above.
1742 if (IsFast)
1743 *IsFast = (Alignment >= RequiredAlignment) ? 96
1744 : (Alignment < Align(4)) ? 32
1745 : 1;
1746 return true;
1747 }
1748
1749 break;
1750 case 128:
1751 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1752 return false;
1753
1754 // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
1755 // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
1756 // single operation using ds_read2/write2_b64.
1757 RequiredAlignment = Align(8);
1758
1759 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1760 // Naturally aligned access is fastest. However, also report it as Fast
1761 // if memory is aligned to less than a DWORD. A narrow load or store will
1762 // be equally slow as a single ds_read_b128/ds_write_b128, but there
1763 // will be more of them, so overall we will pay less penalty issuing a
1764 // single instruction.
1765
1766 // See comment on the values above.
1767 if (IsFast)
1768 *IsFast = (Alignment >= RequiredAlignment) ? 128
1769 : (Alignment < Align(4)) ? 32
1770 : 1;
1771 return true;
1772 }
1773
1774 break;
1775 default:
1776 if (Size > 32)
1777 return false;
1778
1779 break;
1780 }
1781
1782 // See comment on the values above.
1783 // Note that we have a single-dword or sub-dword access here, so if it is
1784 // underaligned it is the slowest possible access, hence the returned value is 0.
1785 if (IsFast)
1786 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1787
1788 return Alignment >= RequiredAlignment ||
1789 Subtarget->hasUnalignedDSAccessEnabled();
1790 }
1791
1792 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
1793 bool AlignedBy4 = Alignment >= Align(4);
1794 if (IsFast)
1795 *IsFast = AlignedBy4;
1796
1797 return AlignedBy4 ||
1798 Subtarget->enableFlatScratch() ||
1799 Subtarget->hasUnalignedScratchAccess();
1800 }
1801
1802 // FIXME: We have to be conservative here and assume that flat operations
1803 // will access scratch. If we had access to the IR function, then we
1804 // could determine if any private memory was used in the function.
1805 if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
1806 !Subtarget->hasUnalignedScratchAccess()) {
1807 bool AlignedBy4 = Alignment >= Align(4);
1808 if (IsFast)
1809 *IsFast = AlignedBy4;
1810
1811 return AlignedBy4;
1812 }
1813
1814 // So long as they are correct, wide global memory operations perform better
1815 // than multiple smaller memory ops -- even when misaligned
1816 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
1817 if (IsFast)
1818 *IsFast = Size;
1819
1820 return Alignment >= Align(4) ||
1822 }
1823
1824 // Values smaller than a dword must be aligned.
1825 if (Size < 32)
1826 return false;
1827
1828 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1829 // byte-address are ignored, thus forcing Dword alignment.
1830 // This applies to private, global, and constant memory.
1831 if (IsFast)
1832 *IsFast = 1;
1833
1834 return Size >= 32 && Alignment >= Align(4);
1835}
1836
1837bool SITargetLowering::allowsMisalignedMemoryAccesses(
1838 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1839 unsigned *IsFast) const {
1840 return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
1841 Alignment, Flags, IsFast);
1842}
1843
1844EVT SITargetLowering::getOptimalMemOpType(
1845 const MemOp &Op, const AttributeList &FuncAttributes) const {
1846 // FIXME: Should account for address space here.
1847
1848 // The default fallback uses the private pointer size as a guess for a type to
1849 // use. Make sure we switch these to 64-bit accesses.
1850
1851 if (Op.size() >= 16 &&
1852 Op.isDstAligned(Align(4))) // XXX: Should only do for global
1853 return MVT::v4i32;
1854
1855 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1856 return MVT::v2i32;
1857
1858 // Use the default.
1859 return MVT::Other;
1860}
1861
1863 const MemSDNode *MemNode = cast<MemSDNode>(N);
1864 return MemNode->getMemOperand()->getFlags() & MONoClobber;
1865}
1866
1868 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1870}
1871
1873 unsigned DestAS) const {
1874 // Flat -> private/local is a simple truncate.
1875 // Flat -> global is a no-op.
1876 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1877 return true;
1878
1879 const GCNTargetMachine &TM =
1880 static_cast<const GCNTargetMachine &>(getTargetMachine());
1881 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1882}
1883
1885 const MemSDNode *MemNode = cast<MemSDNode>(N);
1886
1888}
1889
1892 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1893 VT.getScalarType().bitsLE(MVT::i16))
1896}
1897
1899 Type *Ty) const {
1900 // FIXME: Could be smarter if called for vector constants.
1901 return true;
1902}
1903
1905 unsigned Index) const {
1907 return false;
1908
1909 // TODO: Add more cases that are cheap.
1910 return Index == 0;
1911}
1912
1914 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1915 switch (Op) {
1916 case ISD::LOAD:
1917 case ISD::STORE:
1918
1919 // These operations are done with 32-bit instructions anyway.
1920 case ISD::AND:
1921 case ISD::OR:
1922 case ISD::XOR:
1923 case ISD::SELECT:
1924 // TODO: Extensions?
1925 return true;
1926 default:
1927 return false;
1928 }
1929 }
1930
1931 // SimplifySetCC uses this function to determine whether or not it should
1932 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1933 if (VT == MVT::i1 && Op == ISD::SETCC)
1934 return false;
1935
1937}
1938
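// Build a pointer into the kernarg segment at the given byte offset, using the
// kernarg segment pointer argument when the function has one.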
1939SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1940 const SDLoc &SL,
1941 SDValue Chain,
1942 uint64_t Offset) const {
1943 const DataLayout &DL = DAG.getDataLayout();
1946
1947 const ArgDescriptor *InputPtrReg;
1948 const TargetRegisterClass *RC;
1949 LLT ArgTy;
1951
1952 std::tie(InputPtrReg, RC, ArgTy) =
1954
1955 // We may not have the kernarg segment argument if we have no kernel
1956 // arguments.
1957 if (!InputPtrReg)
1958 return DAG.getConstant(Offset, SL, PtrVT);
1959
1961 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1962 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1963
1964 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
1965}
1966
1967SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1968 const SDLoc &SL) const {
1971 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1972}
1973
1974SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
1975 const SDLoc &SL) const {
1976
1978 std::optional<uint32_t> KnownSize =
1980 if (KnownSize.has_value())
1981 return DAG.getConstant(*KnownSize, SL, MVT::i32);
1982 return SDValue();
1983}
1984
1985SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1986 const SDLoc &SL, SDValue Val,
1987 bool Signed,
1988 const ISD::InputArg *Arg) const {
1989 // First, if it is a widened vector, narrow it.
1990 if (VT.isVector() &&
1992 EVT NarrowedVT =
1995 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
1996 DAG.getConstant(0, SL, MVT::i32));
1997 }
1998
1999 // Then convert the vector elements or scalar value.
2000 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
2001 VT.bitsLT(MemVT)) {
2002 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2003 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2004 }
2005
2006 if (MemVT.isFloatingPoint())
2007 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2008 else if (Signed)
2009 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2010 else
2011 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2012
2013 return Val;
2014}
2015
2016SDValue SITargetLowering::lowerKernargMemParameter(
2017 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2018 uint64_t Offset, Align Alignment, bool Signed,
2019 const ISD::InputArg *Arg) const {
2020 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2021
2022 // Try to avoid using an extload by loading earlier than the argument address,
2023 // and extracting the relevant bits. The load should hopefully be merged with
2024 // the previous argument.
2025 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2026 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2027 int64_t AlignDownOffset = alignDown(Offset, 4);
2028 int64_t OffsetDiff = Offset - AlignDownOffset;
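 // For example, an i16 argument at Offset = 6 gives AlignDownOffset = 4 and
 // OffsetDiff = 2: the dword at offset 4 is loaded below, shifted right by
 // OffsetDiff * 8 = 16 bits, and truncated to recover the argument.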
2029
2030 EVT IntVT = MemVT.changeTypeToInteger();
2031
2032 // TODO: If we passed in the base kernel offset we could have a better
2033 // alignment than 4, but we don't really need it.
2034 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2035 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2036 MachineMemOperand::MODereferenceable |
2037 MachineMemOperand::MOInvariant);
2038
2039 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2040 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2041
2042 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2043 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2044 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2045
2046
2047 return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
2048 }
2049
2050 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2051 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2052 MachineMemOperand::MODereferenceable |
2053 MachineMemOperand::MOInvariant);
2054
2055 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2056 return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
2057}
2058
2059SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
2060 const SDLoc &SL, SDValue Chain,
2061 const ISD::InputArg &Arg) const {
2062 MachineFunction &MF = DAG.getMachineFunction();
2063 MachineFrameInfo &MFI = MF.getFrameInfo();
2064
2065 if (Arg.Flags.isByVal()) {
2066 unsigned Size = Arg.Flags.getByValSize();
2067 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2068 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2069 }
2070
2071 unsigned ArgOffset = VA.getLocMemOffset();
2072 unsigned ArgSize = VA.getValVT().getStoreSize();
2073
2074 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2075
2076 // Create load nodes to retrieve arguments from the stack.
2077 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2078 SDValue ArgValue;
2079
2080 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2081 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
2082 MVT MemVT = VA.getValVT();
2083
2084 switch (VA.getLocInfo()) {
2085 default:
2086 break;
2087 case CCValAssign::BCvt:
2088 MemVT = VA.getLocVT();
2089 break;
2090 case CCValAssign::SExt:
2091 ExtType = ISD::SEXTLOAD;
2092 break;
2093 case CCValAssign::ZExt:
2094 ExtType = ISD::ZEXTLOAD;
2095 break;
2096 case CCValAssign::AExt:
2097 ExtType = ISD::EXTLOAD;
2098 break;
2099 }
2100
2101 ArgValue = DAG.getExtLoad(
2102 ExtType, SL, VA.getLocVT(), Chain, FIN,
2103 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
2104 MemVT);
2105 return ArgValue;
2106}
2107
2108SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
2109 const SIMachineFunctionInfo &MFI,
2110 EVT VT,
2111 AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
2112 const ArgDescriptor *Reg = nullptr;
2113 const TargetRegisterClass *RC;
2114 LLT Ty;
2115
2117 const ArgDescriptor WorkGroupIDX =
2118 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2119 // If GridZ is not programmed in an entry function then the hardware will set
2120 // it to all zeros, so there is no need to mask the GridY value in the low
2121 // order bits.
2122 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2123 AMDGPU::TTMP7,
2124 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2125 const ArgDescriptor WorkGroupIDZ =
2126 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
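 // In other words, with architected SGPRs the hardware is expected to place
 // the workgroup ID X in TTMP9 and to pack the Y and Z IDs into the low and
 // high halves of TTMP7, which is what the masks above select.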
2127 if (Subtarget->hasArchitectedSGPRs() &&
2129 switch (PVID) {
2130 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
2131 Reg = &WorkGroupIDX;
2132 RC = &AMDGPU::SReg_32RegClass;
2133 Ty = LLT::scalar(32);
2134 break;
2135 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
2136 Reg = &WorkGroupIDY;
2137 RC = &AMDGPU::SReg_32RegClass;
2138 Ty = LLT::scalar(32);
2139 break;
2140 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
2141 Reg = &WorkGroupIDZ;
2142 RC = &AMDGPU::SReg_32RegClass;
2143 Ty = LLT::scalar(32);
2144 break;
2145 default:
2146 break;
2147 }
2148 }
2149
2150 if (!Reg)
2151 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2152 if (!Reg) {
2153 if (PVID == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
2154 // It's possible for a kernarg intrinsic call to appear in a kernel with
2155 // no allocated segment, in which case we do not add the user sgpr
2156 // argument, so just return null.
2157 return DAG.getConstant(0, SDLoc(), VT);
2158 }
2159
2160 // It's undefined behavior if a function marked with the amdgpu-no-*
2161 // attributes uses the corresponding intrinsic.
2162 return DAG.getUNDEF(VT);
2163 }
2164
2165 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2166}
2167
2169 CallingConv::ID CallConv,
2170 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2171 FunctionType *FType,
2172 SIMachineFunctionInfo *Info) {
2173 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2174 const ISD::InputArg *Arg = &Ins[I];
2175
2176 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2177 "vector type argument should have been split");
2178
2179 // First check if it's a PS input addr.
2180 if (CallConv == CallingConv::AMDGPU_PS &&
2181 !Arg->Flags.isInReg() && PSInputNum <= 15) {
2182 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2183
2184 // Inconveniently only the first part of the split is marked as isSplit,
2185 // so skip to the end. We only want to increment PSInputNum once for the
2186 // entire split argument.
2187 if (Arg->Flags.isSplit()) {
2188 while (!Arg->Flags.isSplitEnd()) {
2189 assert((!Arg->VT.isVector() ||
2190 Arg->VT.getScalarSizeInBits() == 16) &&
2191 "unexpected vector split in ps argument type");
2192 if (!SkipArg)
2193 Splits.push_back(*Arg);
2194 Arg = &Ins[++I];
2195 }
2196 }
2197
2198 if (SkipArg) {
2199 // We can safely skip PS inputs.
2200 Skipped.set(Arg->getOrigArgIndex());
2201 ++PSInputNum;
2202 continue;
2203 }
2204
2205 Info->markPSInputAllocated(PSInputNum);
2206 if (Arg->Used)
2207 Info->markPSInputEnabled(PSInputNum);
2208
2209 ++PSInputNum;
2210 }
2211
2212 Splits.push_back(*Arg);
2213 }
2214}
2215
2216// Allocate special inputs passed in VGPRs.
2218 MachineFunction &MF,
2219 const SIRegisterInfo &TRI,
2220 SIMachineFunctionInfo &Info) const {
2221 const LLT S32 = LLT::scalar(32);
2223
2224 if (Info.hasWorkItemIDX()) {
2225 Register Reg = AMDGPU::VGPR0;
2226 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2227
2228 CCInfo.AllocateReg(Reg);
2229 unsigned Mask = (Subtarget->hasPackedTID() &&
2230 Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2231 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2232 }
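 // With packed TIDs (see the 0x3ff / shifted masks here and below), a single
 // VGPR0 carries all three workitem IDs: X in bits [9:0], Y in [19:10] and
 // Z in [29:20]; otherwise each ID arrives in its own VGPR.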
2233
2234 if (Info.hasWorkItemIDY()) {
2235 assert(Info.hasWorkItemIDX());
2236 if (Subtarget->hasPackedTID()) {
2237 Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2238 0x3ff << 10));
2239 } else {
2240 unsigned Reg = AMDGPU::VGPR1;
2241 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2242
2243 CCInfo.AllocateReg(Reg);
2244 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2245 }
2246 }
2247
2248 if (Info.hasWorkItemIDZ()) {
2249 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2250 if (Subtarget->hasPackedTID()) {
2251 Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2252 0x3ff << 20));
2253 } else {
2254 unsigned Reg = AMDGPU::VGPR2;
2255 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2256
2257 CCInfo.AllocateReg(Reg);
2258 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2259 }
2260 }
2261}
2262
2263// Try to allocate a VGPR at the end of the argument list, or if no argument
2264 // VGPRs are left, allocate a stack slot.
2265 // If \p Mask is given it indicates the bitfield position in the register.
2266 // If \p Arg is given, reuse it with the new \p Mask instead of allocating a new register.
2267static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2268 ArgDescriptor Arg = ArgDescriptor()) {
2269 if (Arg.isSet())
2270 return ArgDescriptor::createArg(Arg, Mask);
2271
2272 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2273 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2274 if (RegIdx == ArgVGPRs.size()) {
2275 // Spill to stack required.
2276 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2277
2278 return ArgDescriptor::createStack(Offset, Mask);
2279 }
2280
2281 unsigned Reg = ArgVGPRs[RegIdx];
2282 Reg = CCInfo.AllocateReg(Reg);
2283 assert(Reg != AMDGPU::NoRegister);
2284
2285 MachineFunction &MF = CCInfo.getMachineFunction();
2286 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2287 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2288 return ArgDescriptor::createRegister(Reg, Mask);
2289}
2290
2292 const TargetRegisterClass *RC,
2293 unsigned NumArgRegs) {
2294 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2295 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2296 if (RegIdx == ArgSGPRs.size())
2297 report_fatal_error("ran out of SGPRs for arguments");
2298
2299 unsigned Reg = ArgSGPRs[RegIdx];
2300 Reg = CCInfo.AllocateReg(Reg);
2301 assert(Reg != AMDGPU::NoRegister);
2302
2303 MachineFunction &MF = CCInfo.getMachineFunction();
2304 MF.addLiveIn(Reg, RC);
2306}
2307
2308// If this has a fixed position, we still should allocate the register in the
2309// CCInfo state. Technically we could get away with this for values passed
2310// outside of the normal argument range.
2312 const TargetRegisterClass *RC,
2313 MCRegister Reg) {
2314 Reg = CCInfo.AllocateReg(Reg);
2315 assert(Reg != AMDGPU::NoRegister);
2316 MachineFunction &MF = CCInfo.getMachineFunction();
2317 MF.addLiveIn(Reg, RC);
2318}
2319
2320static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2321 if (Arg) {
2322 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2323 Arg.getRegister());
2324 } else
2325 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2326}
2327
2328static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2329 if (Arg) {
2330 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2331 Arg.getRegister());
2332 } else
2333 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2334}
2335
2336/// Allocate implicit function VGPR arguments at the end of allocated user
2337/// arguments.
2339 CCState &CCInfo, MachineFunction &MF,
2340 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2341 const unsigned Mask = 0x3ff;
2342 ArgDescriptor Arg;
2343
2344 if (Info.hasWorkItemIDX()) {
2345 Arg = allocateVGPR32Input(CCInfo, Mask);
2346 Info.setWorkItemIDX(Arg);
2347 }
2348
2349 if (Info.hasWorkItemIDY()) {
2350 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2351 Info.setWorkItemIDY(Arg);
2352 }
2353
2354 if (Info.hasWorkItemIDZ())
2355 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2356}
2357
2358/// Allocate implicit function VGPR arguments in fixed registers.
2360 CCState &CCInfo, MachineFunction &MF,
2361 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2362 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2363 if (!Reg)
2364 report_fatal_error("failed to allocated VGPR for implicit arguments");
2365
2366 const unsigned Mask = 0x3ff;
2367 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2368 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2369 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2370}
2371
2373 CCState &CCInfo,
2374 MachineFunction &MF,
2375 const SIRegisterInfo &TRI,
2376 SIMachineFunctionInfo &Info) const {
2377 auto &ArgInfo = Info.getArgInfo();
2378 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2379
2380 // TODO: Unify handling with private memory pointers.
2381 if (UserSGPRInfo.hasDispatchPtr())
2382 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2383
2384 const Module *M = MF.getFunction().getParent();
2385 if (UserSGPRInfo.hasQueuePtr() &&
2387 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2388
2389 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2390 // constant offset from the kernarg segment.
2391 if (Info.hasImplicitArgPtr())
2392 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2393
2394 if (UserSGPRInfo.hasDispatchID())
2395 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2396
2397 // flat_scratch_init is not applicable for non-kernel functions.
2398
2399 if (Info.hasWorkGroupIDX())
2400 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2401
2402 if (Info.hasWorkGroupIDY())
2403 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2404
2405 if (Info.hasWorkGroupIDZ())
2406 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2407
2408 if (Info.hasLDSKernelId())
2409 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2410}
2411
2412// Allocate special inputs passed in user SGPRs.
2414 MachineFunction &MF,
2415 const SIRegisterInfo &TRI,
2416 SIMachineFunctionInfo &Info) const {
2417 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2418 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2419 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2420 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2421 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2422 }
2423
2424 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2425 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2426 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2427 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2428 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2429 }
2430
2431 if (UserSGPRInfo.hasDispatchPtr()) {
2432 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2433 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2434 CCInfo.AllocateReg(DispatchPtrReg);
2435 }
2436
2437 const Module *M = MF.getFunction().getParent();
2438 if (UserSGPRInfo.hasQueuePtr() &&
2440 Register QueuePtrReg = Info.addQueuePtr(TRI);
2441 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2442 CCInfo.AllocateReg(QueuePtrReg);
2443 }
2444
2445 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2447 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2448 CCInfo.AllocateReg(InputPtrReg);
2449
2450 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2451 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2452 }
2453
2454 if (UserSGPRInfo.hasDispatchID()) {
2455 Register DispatchIDReg = Info.addDispatchID(TRI);
2456 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2457 CCInfo.AllocateReg(DispatchIDReg);
2458 }
2459
2460 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2461 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2462 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2463 CCInfo.AllocateReg(FlatScratchInitReg);
2464 }
2465
2466 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2467 // these from the dispatch pointer.
2468}
2469
2470 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2471 // sequential, starting from the first argument.
2473 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2475 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2476 Function &F = MF.getFunction();
2477 unsigned LastExplicitArgOffset =
2478 MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset();
2479 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2480 bool InPreloadSequence = true;
2481 unsigned InIdx = 0;
2482 for (auto &Arg : F.args()) {
2483 if (!InPreloadSequence || !Arg.hasInRegAttr())
2484 break;
2485
2486 int ArgIdx = Arg.getArgNo();
2487 // Don't preload non-original args or parts not in the current preload
2488 // sequence.
2489 if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
2490 (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
2491 break;
2492
2493 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2494 (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
2495 InIdx++) {
2496 assert(ArgLocs[ArgIdx].isMemLoc());
2497 auto &ArgLoc = ArgLocs[InIdx];
2498 const Align KernelArgBaseAlign = Align(16);
2499 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2500 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2501 unsigned NumAllocSGPRs =
2502 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
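 // e.g. an i64 or 64-bit pointer argument needs 2 SGPRs, while an i32, i16
 // or i8 argument rounds up to a single SGPR.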
2503
2504 // Arg is preloaded into the previous SGPR.
2505 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2506 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2507 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2508 continue;
2509 }
2510
2511 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2512 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
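 // For instance, if the previous argument ended at byte offset 8 and this
 // one starts at offset 16 due to alignment, Padding is 8 bytes and two
 // user SGPRs are skipped before the preload registers for this argument.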
2513 // Check for free user SGPRs for preloading.
2514 if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
2515 SGPRInfo.getNumFreeUserSGPRs()) {
2516 InPreloadSequence = false;
2517 break;
2518 }
2519
2520 // Preload this argument.
2521 const TargetRegisterClass *RC =
2522 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2523 SmallVectorImpl<MCRegister> *PreloadRegs =
2524 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2525
2526 if (PreloadRegs->size() > 1)
2527 RC = &AMDGPU::SGPR_32RegClass;
2528 for (auto &Reg : *PreloadRegs) {
2529 assert(Reg);
2530 MF.addLiveIn(Reg, RC);
2531 CCInfo.AllocateReg(Reg);
2532 }
2533
2534 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2535 }
2536 }
2537}
2538
2540 const SIRegisterInfo &TRI,
2541 SIMachineFunctionInfo &Info) const {
2542 // Always allocate this last since it is a synthetic preload.
2543 if (Info.hasLDSKernelId()) {
2544 Register Reg = Info.addLDSKernelId();
2545 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2546 CCInfo.AllocateReg(Reg);
2547 }
2548}
2549
2550// Allocate special input registers that are initialized per-wave.
2552 MachineFunction &MF,
2554 CallingConv::ID CallConv,
2555 bool IsShader) const {
2556 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2557 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2558 // Note: user SGPRs are handled by the front-end for graphics shaders.
2559 // Pad up the used user SGPRs with dead inputs.
2560
2561 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2562 // before enabling architected SGPRs for workgroup IDs.
2563 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2564
2565 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2566 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2567 // rely on it to reach 16 since if we end up having no stack usage, it will
2568 // not really be added.
2569 unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() +
2570 Info.hasWorkGroupIDY() +
2571 Info.hasWorkGroupIDZ() +
2572 Info.hasWorkGroupInfo();
2573 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2574 Register Reg = Info.addReservedUserSGPR();
2575 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2576 CCInfo.AllocateReg(Reg);
2577 }
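 // As a rough example, a kernel with 6 allocated user SGPRs and workgroup
 // IDs X and Y enabled (2 required system SGPRs) gets 8 extra reserved user
 // SGPRs here so that the preloaded input count reaches 16.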
2578 }
2579
2580 if (!HasArchitectedSGPRs) {
2581 if (Info.hasWorkGroupIDX()) {
2582 Register Reg = Info.addWorkGroupIDX();
2583 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2584 CCInfo.AllocateReg(Reg);
2585 }
2586
2587 if (Info.hasWorkGroupIDY()) {
2588 Register Reg = Info.addWorkGroupIDY();
2589 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2590 CCInfo.AllocateReg(Reg);
2591 }
2592
2593 if (Info.hasWorkGroupIDZ()) {
2594 Register Reg = Info.addWorkGroupIDZ();
2595 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2596 CCInfo.AllocateReg(Reg);
2597 }
2598 }
2599
2600 if (Info.hasWorkGroupInfo()) {
2601 Register Reg = Info.addWorkGroupInfo();
2602 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2603 CCInfo.AllocateReg(Reg);
2604 }
2605
2606 if (Info.hasPrivateSegmentWaveByteOffset()) {
2607 // Scratch wave offset passed in system SGPR.
2608 unsigned PrivateSegmentWaveByteOffsetReg;
2609
2610 if (IsShader) {
2611 PrivateSegmentWaveByteOffsetReg =
2612 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2613
2614 // This is true if the scratch wave byte offset doesn't have a fixed
2615 // location.
2616 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2617 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2618 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2619 }
2620 } else
2621 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2622
2623 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2624 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2625 }
2626
2627 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2628 Info.getNumPreloadedSGPRs() >= 16);
2629}
2630
2632 MachineFunction &MF,
2633 const SIRegisterInfo &TRI,
2634 SIMachineFunctionInfo &Info) {
2635 // Now that we've figured out where the scratch register inputs are, see if
2636 // we should reserve the arguments and use them directly.
2637 MachineFrameInfo &MFI = MF.getFrameInfo();
2638 bool HasStackObjects = MFI.hasStackObjects();
2639 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2640
2641 // Record that we know we have non-spill stack objects so we don't need to
2642 // check all stack objects later.
2643 if (HasStackObjects)
2644 Info.setHasNonSpillStackObjects(true);
2645
2646 // Everything live out of a block is spilled with fast regalloc, so it's
2647 // almost certain that spilling will be required.
2648 if (TM.getOptLevel() == CodeGenOptLevel::None)
2649 HasStackObjects = true;
2650
2651 // For now assume stack access is needed in any callee function, so we need
2652 // to pass in the scratch registers.
2653 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2654
2655 if (!ST.enableFlatScratch()) {
2656 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2657 // If we have stack objects, we unquestionably need the private buffer
2658 // resource. For the Code Object V2 ABI, this will be the first 4 user
2659 // SGPR inputs. We can reserve those and use them directly.
2660
2661 Register PrivateSegmentBufferReg =
2663 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2664 } else {
2665 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2666 // We tentatively reserve the last available registers (skipping those
2667 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
2668 // we'll replace these with the ones immediately after those which were
2669 // really allocated. In the prologue copies will be inserted from the
2670 // argument to these reserved registers.
2671
2672 // Without HSA, relocations are used for the scratch pointer and the
2673 // buffer resource setup is always inserted in the prologue. Scratch wave
2674 // offset is still in an input SGPR.
2675 Info.setScratchRSrcReg(ReservedBufferReg);
2676 }
2677 }
2678
2680
2681 // For entry functions we have to set up the stack pointer if we use it,
2682 // whereas non-entry functions get this "for free". This means there is no
2683 // intrinsic advantage to using S32 over S34 in cases where we do not have
2684 // calls but do need a frame pointer (i.e. if we are requested to have one
2685 // because frame pointer elimination is disabled). To keep things simple we
2686 // only ever use S32 as the call ABI stack pointer, and so using it does not
2687 // imply we need a separate frame pointer.
2688 //
2689 // Try to use s32 as the SP, but move it if it would interfere with input
2690 // arguments. This won't work with calls though.
2691 //
2692 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2693 // registers.
2694 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2695 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2696 } else {
2698
2699 if (MFI.hasCalls())
2700 report_fatal_error("call in graphics shader with too many input SGPRs");
2701
2702 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2703 if (!MRI.isLiveIn(Reg)) {
2704 Info.setStackPtrOffsetReg(Reg);
2705 break;
2706 }
2707 }
2708
2709 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2710 report_fatal_error("failed to find register for SP");
2711 }
2712
2713 // hasFP should be accurate for entry functions even before the frame is
2714 // finalized, because it does not rely on the known stack size, only
2715 // properties like whether variable sized objects are present.
2716 if (ST.getFrameLowering()->hasFP(MF)) {
2717 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2718 }
2719}
2720
2723 return !Info->isEntryFunction();
2724}
2725
2727
2728}
2729
2731 MachineBasicBlock *Entry,
2732 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2734
2735 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2736 if (!IStart)
2737 return;
2738
2739 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2740 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2741 MachineBasicBlock::iterator MBBI = Entry->begin();
2742 for (const MCPhysReg *I = IStart; *I; ++I) {
2743 const TargetRegisterClass *RC = nullptr;
2744 if (AMDGPU::SReg_64RegClass.contains(*I))
2745 RC = &AMDGPU::SGPR_64RegClass;
2746 else if (AMDGPU::SReg_32RegClass.contains(*I))
2747 RC = &AMDGPU::SGPR_32RegClass;
2748 else
2749 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2750
2751 Register NewVR = MRI->createVirtualRegister(RC);
2752 // Create copy from CSR to a virtual register.
2753 Entry->addLiveIn(*I);
2754 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2755 .addReg(*I);
2756
2757 // Insert the copy-back instructions right before the terminator.
2758 for (auto *Exit : Exits)
2759 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2760 TII->get(TargetOpcode::COPY), *I)
2761 .addReg(NewVR);
2762 }
2763}
2764
2766 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2767 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2768 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2770
2772 const Function &Fn = MF.getFunction();
2775
2776 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2777 DiagnosticInfoUnsupported NoGraphicsHSA(
2778 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2779 DAG.getContext()->diagnose(NoGraphicsHSA);
2780 return DAG.getEntryNode();
2781 }
2782
2785 BitVector Skipped(Ins.size());
2786 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2787 *DAG.getContext());
2788
2789 bool IsGraphics = AMDGPU::isGraphics(CallConv);
2790 bool IsKernel = AMDGPU::isKernel(CallConv);
2791 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2792
2793 if (IsGraphics) {
2794 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2795 assert(!UserSGPRInfo.hasDispatchPtr() &&
2796 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2797 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2798 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2799 (void)UserSGPRInfo;
2800 if (!Subtarget->enableFlatScratch())
2801 assert(!UserSGPRInfo.hasFlatScratchInit());
2802 if ((CallConv != CallingConv::AMDGPU_CS &&
2803 CallConv != CallingConv::AMDGPU_Gfx) ||
2804 !Subtarget->hasArchitectedSGPRs())
2805 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2806 !Info->hasWorkGroupIDZ());
2807 }
2808
2809 if (CallConv == CallingConv::AMDGPU_PS) {
2810 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2811
2812 // At least one interpolation mode must be enabled or else the GPU will
2813 // hang.
2814 //
2815 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2816 // set PSInputAddr, the user wants to enable some bits after compilation
2817 // based on run-time states. Since we can't know what the final PSInputEna
2818 // will look like, we shouldn't do anything here and the user should take
2819 // responsibility for the correct programming.
2820 //
2821 // Otherwise, the following restrictions apply:
2822 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2823 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2824 // enabled too.
2825 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2826 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2827 CCInfo.AllocateReg(AMDGPU::VGPR0);
2828 CCInfo.AllocateReg(AMDGPU::VGPR1);
2829 Info->markPSInputAllocated(0);
2830 Info->markPSInputEnabled(0);
2831 }
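 // Input 0 is PERSP_SAMPLE, which supplies the I/J coordinates in VGPR0 and
 // VGPR1, so enabling it is presumably the cheapest way to satisfy the
 // "at least one PERSP_* or LINEAR_* mode" requirement described above.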
2832 if (Subtarget->isAmdPalOS()) {
2833 // For isAmdPalOS, the user does not enable some bits after compilation
2834 // based on run-time states; the register values being generated here are
2835 // the final ones set in hardware. Therefore we need to apply the
2836 // workaround to PSInputAddr and PSInputEnable together. (The case where
2837 // a bit is set in PSInputAddr but not PSInputEnable is where the
2838 // frontend set up an input arg for a particular interpolation mode, but
2839 // nothing uses that input arg. Really we should have an earlier pass
2840 // that removes such an arg.)
2841 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2842 if ((PsInputBits & 0x7F) == 0 ||
2843 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2844 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
2845 }
2846 } else if (IsKernel) {
2847 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2848 } else {
2849 Splits.append(Ins.begin(), Ins.end());
2850 }
2851
2852 if (IsKernel)
2853 analyzeFormalArgumentsCompute(CCInfo, Ins);
2854
2855 if (IsEntryFunc) {
2856 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2857 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2858 if (IsKernel && Subtarget->hasKernargPreload())
2859 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
2860
2861 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
2862 } else if (!IsGraphics) {
2863 // For the fixed ABI, pass workitem IDs in the last argument register.
2864 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2865
2866 // FIXME: Sink this into allocateSpecialInputSGPRs
2867 if (!Subtarget->enableFlatScratch())
2868 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2869
2870 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2871 }
2872
2873 if (!IsKernel) {
2874 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2875 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2876 }
2877
2879
2880 // FIXME: This is the minimum kernel argument alignment. We should improve
2881 // this to the maximum alignment of the arguments.
2882 //
2883 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2884 // kern arg offset.
2885 const Align KernelArgBaseAlign = Align(16);
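 // commonAlignment(Align(16), Offset) below yields the largest power of two
 // that divides both 16 and the offset: e.g. offsets 0 or 32 keep Align(16),
 // offset 8 gives Align(8), and offset 4 gives Align(4).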
2886
2887 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2888 const ISD::InputArg &Arg = Ins[i];
2889 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2890 InVals.push_back(DAG.getUNDEF(Arg.VT));
2891 continue;
2892 }
2893
2894 CCValAssign &VA = ArgLocs[ArgIdx++];
2895 MVT VT = VA.getLocVT();
2896
2897 if (IsEntryFunc && VA.isMemLoc()) {
2898 VT = Ins[i].VT;
2899 EVT MemVT = VA.getLocVT();
2900
2901 const uint64_t Offset = VA.getLocMemOffset();
2902 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2903
2904 if (Arg.Flags.isByRef()) {
2905 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2906
2907 const GCNTargetMachine &TM =
2908 static_cast<const GCNTargetMachine &>(getTargetMachine());
2909 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2910 Arg.Flags.getPointerAddrSpace())) {
2913 }
2914
2915 InVals.push_back(Ptr);
2916 continue;
2917 }
2918
2919 SDValue NewArg;
2920 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
2921 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2922 // In this case the argument is packed into the previous preload SGPR.
2923 int64_t AlignDownOffset = alignDown(Offset, 4);
2924 int64_t OffsetDiff = Offset - AlignDownOffset;
2925 EVT IntVT = MemVT.changeTypeToInteger();
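 // This mirrors the recovery in lowerKernargMemParameter: the small argument
 // was preloaded as part of the surrounding dword, so copy that SGPR, shift
 // the field down and truncate to the in-memory type.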
2926
2930 Register Reg =
2931 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2932
2933 assert(Reg);
2934 Register VReg = MRI.getLiveInVirtReg(Reg);
2935 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2936
2937 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
2938 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
2939
2940 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
2941 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
2942 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
2943 Ins[i].Flags.isSExt(), &Ins[i]);
2944
2945 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
2946 } else {
2950 const SmallVectorImpl<MCRegister> &PreloadRegs =
2951 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
2952
2953 SDValue Copy;
2954 if (PreloadRegs.size() == 1) {
2955 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
2956 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
2957 NewArg = DAG.getCopyFromReg(
2958 Chain, DL, VReg,
2960 TRI->getRegSizeInBits(*RC)));
2961
2962 } else {
2963 // If the kernarg alignment does not match the alignment of the SGPR
2964 // tuple RC that can accommodate this argument, it will be built up
2965 // via copies from the individual SGPRs that the argument was
2966 // preloaded to.
2968 for (auto Reg : PreloadRegs) {
2969 Register VReg = MRI.getLiveInVirtReg(Reg);
2970 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2971 Elts.push_back(Copy);
2972 }
2973 NewArg =
2974 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
2975 PreloadRegs.size()),
2976 DL, Elts);
2977 }
2978
2979 SDValue CMemVT;
2980 if (VT.isScalarInteger() && VT.bitsLT(NewArg.getSimpleValueType()))
2981 CMemVT = DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewArg);
2982 else
2983 CMemVT = DAG.getBitcast(MemVT, NewArg);
2984 NewArg = convertArgType(DAG, VT, MemVT, DL, CMemVT,
2985 Ins[i].Flags.isSExt(), &Ins[i]);
2986 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
2987 }
2988 } else {
2989 NewArg =
2990 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
2991 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
2992 }
2993 Chains.push_back(NewArg.getValue(1));
2994
2995 auto *ParamTy =
2996 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
2998 ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
2999 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3000 // On SI local pointers are just offsets into LDS, so they are always
3001 // less than 16-bits. On CI and newer they could potentially be
3002 // real pointers, so we can't guarantee their size.
3003 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3004 DAG.getValueType(MVT::i16));
3005 }
3006
3007 InVals.push_back(NewArg);
3008 continue;
3009 } else if (!IsEntryFunc && VA.isMemLoc()) {
3010 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3011 InVals.push_back(Val);
3012 if (!Arg.Flags.isByVal())
3013 Chains.push_back(Val.getValue(1));
3014 continue;
3015 }
3016
3017 assert(VA.isRegLoc() && "Parameter must be in a register!");
3018
3019 Register Reg = VA.getLocReg();
3020 const TargetRegisterClass *RC = nullptr;
3021 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3022 RC = &AMDGPU::VGPR_32RegClass;
3023 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3024 RC = &AMDGPU::SGPR_32RegClass;
3025 else
3026 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3027 EVT ValVT = VA.getValVT();
3028
3029 Reg = MF.addLiveIn(Reg, RC);
3030 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3031
3032 if (Arg.Flags.isSRet()) {
3033 // The return object should be reasonably addressable.
3034
3035 // FIXME: This helps when the return is a real sret. If it is an
3036 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3037 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3038 unsigned NumBits
3040 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
3041 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3042 }
3043
3044 // If this is an 8 or 16-bit value, it is really passed promoted
3045 // to 32 bits. Insert an assert[sz]ext to capture this, then
3046 // truncate to the right size.
3047 switch (VA.getLocInfo()) {
3048 case CCValAssign::Full:
3049 break;
3050 case CCValAssign::BCvt:
3051 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3052 break;
3053 case CCValAssign::SExt:
3054 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
3055 DAG.getValueType(ValVT));
3056 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3057 break;
3058 case CCValAssign::ZExt:
3059 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
3060 DAG.getValueType(ValVT));
3061 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3062 break;
3063 case CCValAssign::AExt:
3064 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3065 break;
3066 default:
3067 llvm_unreachable("Unknown loc info!");
3068 }
3069
3070 InVals.push_back(Val);
3071 }
3072
3073 // Start adding system SGPRs.
3074 if (IsEntryFunc)
3075 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3076
3077 auto &ArgUsageInfo =
3079 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3080
3081 unsigned StackArgSize = CCInfo.getStackSize();
3082 Info->setBytesInStackArgArea(StackArgSize);
3083
3084 return Chains.empty() ? Chain :
3085 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3086}
3087
3088// TODO: If return values can't fit in registers, we should return as many as
3089// possible in registers before passing on stack.
3091 CallingConv::ID CallConv,
3092 MachineFunction &MF, bool IsVarArg,
3094 LLVMContext &Context) const {
3095 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3096 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3097 // for shaders. Vector types should be explicitly handled by CC.
3098 if (AMDGPU::isEntryFunctionCC(CallConv))
3099 return true;
3100
3102 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3103 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3104 return false;
3105
3106 // We must use the stack if return would require unavailable registers.
3107 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3108 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3109 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3110 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3111 return false;
3112
3113 return true;
3114}
3115
3116SDValue
3118 bool isVarArg,
3120 const SmallVectorImpl<SDValue> &OutVals,
3121 const SDLoc &DL, SelectionDAG &DAG) const {
3124
3125 if (AMDGPU::isKernel(CallConv)) {
3126 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3127 OutVals, DL, DAG);
3128 }
3129
3130 bool IsShader = AMDGPU::isShader(CallConv);
3131
3132 Info->setIfReturnsVoid(Outs.empty());
3133 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3134
3135 // CCValAssign - represent the assignment of the return value to a location.
3138
3139 // CCState - Info about the registers and stack slots.
3140 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3141 *DAG.getContext());
3142
3143 // Analyze outgoing return values.
3144 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3145
3146 SDValue Glue;
3148 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3149
3150 // Copy the result values into the output registers.
3151 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3152 ++I, ++RealRVLocIdx) {
3153 CCValAssign &VA = RVLocs[I];
3154 assert(VA.isRegLoc() && "Can only return in registers!");
3155 // TODO: Partially return in registers if return values don't fit.
3156 SDValue Arg = OutVals[RealRVLocIdx];
3157
3158 // Copied from other backends.
3159 switch (VA.getLocInfo()) {
3160 case CCValAssign::Full:
3161 break;
3162 case CCValAssign::BCvt:
3163 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3164 break;
3165 case CCValAssign::SExt:
3166 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3167 break;
3168 case CCValAssign::ZExt:
3169 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3170 break;
3171 case CCValAssign::AExt:
3172 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3173 break;
3174 default:
3175 llvm_unreachable("Unknown loc info!");
3176 }
3177
3178 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3179 Glue = Chain.getValue(1);
3180 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3181 }
3182
3183 // FIXME: Does sret work properly?
3184 if (!Info->isEntryFunction()) {
3185 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3186 const MCPhysReg *I =
3187 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3188 if (I) {
3189 for (; *I; ++I) {
3190 if (AMDGPU::SReg_64RegClass.contains(*I))
3191 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3192 else if (AMDGPU::SReg_32RegClass.contains(*I))
3193 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3194 else
3195 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3196 }
3197 }
3198 }
3199
3200 // Update chain and glue.
3201 RetOps[0] = Chain;
3202 if (Glue.getNode())
3203 RetOps.push_back(Glue);
3204
3205 unsigned Opc = AMDGPUISD::ENDPGM;
3206 if (!IsWaveEnd)
3208 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3209}
3210
3212 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3213 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3214 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3215 SDValue ThisVal) const {
3216 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3217
3218 // Assign locations to each value returned by this call.
3220 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3221 *DAG.getContext());
3222 CCInfo.AnalyzeCallResult(Ins, RetCC);
3223
3224 // Copy all of the result registers out of their specified physreg.
3225 for (unsigned i = 0; i != RVLocs.size(); ++i) {
3226 CCValAssign VA = RVLocs[i];
3227 SDValue Val;
3228
3229 if (VA.isRegLoc()) {
3230 Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3231 Chain = Val.getValue(1);
3232 InGlue = Val.getValue(2);
3233 } else if (VA.isMemLoc()) {
3234 report_fatal_error("TODO: return values in memory");
3235 } else
3236 llvm_unreachable("unknown argument location type");
3237
3238 switch (VA.getLocInfo()) {
3239 case CCValAssign::Full:
3240 break;
3241 case CCValAssign::BCvt:
3242 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3243 break;
3244 case CCValAssign::ZExt:
3245 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3246 DAG.getValueType(VA.getValVT()));
3247 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3248 break;
3249 case CCValAssign::SExt:
3250 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3251 DAG.getValueType(VA.getValVT()));
3252 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3253 break;
3254 case CCValAssign::AExt:
3255 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3256 break;
3257 default:
3258 llvm_unreachable("Unknown loc info!");
3259 }
3260
3261 InVals.push_back(Val);
3262 }
3263
3264 return Chain;
3265}
3266
3267// Add code to pass special inputs required depending on used features separate
3268// from the explicit user arguments present in the IR.
3270 CallLoweringInfo &CLI,
3271 CCState &CCInfo,
3272 const SIMachineFunctionInfo &Info,
3273 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3274 SmallVectorImpl<SDValue> &MemOpChains,
3275 SDValue Chain) const {
3276 // If we don't have a call site, this was a call inserted by
3277 // legalization. These can never use special inputs.
3278 if (!CLI.CB)
3279 return;
3280
3281 SelectionDAG &DAG = CLI.DAG;
3282 const SDLoc &DL = CLI.DL;
3283 const Function &F = DAG.getMachineFunction().getFunction();
3284
3285 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3286 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3287
3288 const AMDGPUFunctionArgInfo *CalleeArgInfo
3290 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3291 auto &ArgUsageInfo =
3293 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3294 }
3295
3296 // TODO: Unify with private memory register handling. This is complicated by
3297 // the fact that at least in kernels, the input argument is not necessarily
3298 // in the same location as the input.
3299 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3300 StringLiteral> ImplicitAttrs[] = {
3301 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3302 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
3303 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3304 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3305 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3306 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
3307 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
3308 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
3309 };
3310
3311 for (auto Attr : ImplicitAttrs) {
3312 const ArgDescriptor *OutgoingArg;
3313 const TargetRegisterClass *ArgRC;
3314 LLT ArgTy;
3315
3316 AMDGPUFunctionArgInfo::PreloadedValue InputID = Attr.first;
3317
3318 // If the callee does not use the attribute value, skip copying the value.
3319 if (CLI.CB->hasFnAttr(Attr.second))
3320 continue;
3321
3322 std::tie(OutgoingArg, ArgRC, ArgTy) =
3323 CalleeArgInfo->getPreloadedValue(InputID);
3324 if (!OutgoingArg)
3325 continue;
3326
3327 const ArgDescriptor *IncomingArg;
3328 const TargetRegisterClass *IncomingArgRC;
3329 LLT Ty;
3330 std::tie(IncomingArg, IncomingArgRC, Ty) =
3331 CallerArgInfo.getPreloadedValue(InputID);
3332 assert(IncomingArgRC == ArgRC);
3333
3334 // All special arguments are ints for now.
3335 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
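 // i.e. pointer-like inputs such as the dispatch, queue and implicit-arg
 // pointers travel as i64 in an SGPR pair, while the 32-bit workgroup IDs
 // and the LDS kernel id travel as i32.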
3336 SDValue InputReg;
3337
3338 if (IncomingArg) {
3339 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3340 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3341 // The implicit arg ptr is special because it doesn't have a corresponding
3342 // input for kernels, and is computed from the kernarg segment pointer.
3343 InputReg = getImplicitArgPtr(DAG, DL);
3344 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3345 std::optional<uint32_t> Id =
3347 if (Id.has_value()) {
3348 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3349 } else {
3350 InputReg = DAG.getUNDEF(ArgVT);
3351 }
3352 } else {
3353 // We may have proven the input wasn't needed, although the ABI is
3354 // requiring it. We just need to allocate the register appropriately.
3355 InputReg = DAG.getUNDEF(ArgVT);
3356 }
3357
3358 if (OutgoingArg->isRegister()) {
3359 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3360 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3361 report_fatal_error("failed to allocate implicit input argument");
3362 } else {
3363 unsigned SpecialArgOffset =
3364 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3365 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
3366 SpecialArgOffset);
3367 MemOpChains.push_back(ArgStore);
3368 }
3369 }
3370
3371 // Pack workitem IDs into a single register, or pass them as-is if already
3372 // packed.
3373 const ArgDescriptor *OutgoingArg;
3374 const TargetRegisterClass *ArgRC;
3375 LLT Ty;
3376
3377 std::tie(OutgoingArg, ArgRC, Ty) =
3379 if (!OutgoingArg)
3380 std::tie(OutgoingArg, ArgRC, Ty) =
3382 if (!OutgoingArg)
3383 std::tie(OutgoingArg, ArgRC, Ty) =
3385 if (!OutgoingArg)
3386 return;
3387
3388 const ArgDescriptor *IncomingArgX = std::get<0>(
3390 const ArgDescriptor *IncomingArgY = std::get<0>(
3392 const ArgDescriptor *IncomingArgZ = std::get<0>(
3394
3395 SDValue InputReg;
3396 SDLoc SL;
3397
3398 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3399 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3400 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3401
3402 // If incoming ids are not packed we need to pack them.
3403 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3404 NeedWorkItemIDX) {
3405 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3406 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3407 } else {
3408 InputReg = DAG.getConstant(0, DL, MVT::i32);
3409 }
3410 }
3411
3412 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3413 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3414 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3415 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3416 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3417 InputReg = InputReg.getNode() ?
3418 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
3419 }
3420
3421 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3422 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3423 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3424 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3425 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3426 InputReg = InputReg.getNode() ?
3427 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
3428 }
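 // The resulting packed value mirrors the fixed-ABI layout set up in
 // allocateSpecialInputVGPRsFixed: X in bits [9:0], Y in [19:10], Z in
 // [29:20] of a single 32-bit register.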
3429
3430 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3431 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3432 // We're in a situation where the outgoing function requires the workitem
3433 // ID, but the calling function does not have it (e.g. a graphics function
3434 // calling a C calling convention function). This is illegal, but we need
3435 // to produce something.
3436 InputReg = DAG.getUNDEF(MVT::i32);
3437 } else {
3438 // Workitem IDs are already packed; any of the present incoming arguments
3439 // will carry all required fields.
3441 IncomingArgX ? *IncomingArgX :
3442 IncomingArgY ? *IncomingArgY :
3443 *IncomingArgZ, ~0u);
3444 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3445 }
3446 }
3447
3448 if (OutgoingArg->isRegister()) {
3449 if (InputReg)
3450 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3451
3452 CCInfo.AllocateReg(OutgoingArg->getRegister());
3453 } else {
3454 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3455 if (InputReg) {
3456 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
3457 SpecialArgOffset);
3458 MemOpChains.push_back(ArgStore);
3459 }
3460 }
3461}
3462
3464 return CC == CallingConv::Fast;
3465}
3466
3467/// Return true if we might ever do TCO for calls with this calling convention.
3469 switch (CC) {
3470 case CallingConv::C:
3472 return true;
3473 default:
3474 return canGuaranteeTCO(CC);
3475 }
3476}
3477
3479 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3481 const SmallVectorImpl<SDValue> &OutVals,
3482 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3483 if (AMDGPU::isChainCC(CalleeCC))
3484 return true;
3485
3486 if (!mayTailCallThisCC(CalleeCC))
3487 return false;
3488
3489 // For a divergent call target, we need to do a waterfall loop over the
3490 // possible callees which precludes us from using a simple jump.
3491 if (Callee->isDivergent())
3492 return false;
3493
3495 const Function &CallerF = MF.getFunction();
3496 CallingConv::ID CallerCC = CallerF.getCallingConv();
3498 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3499
3500 // Kernels aren't callable, and don't have a live in return address so it
3501 // doesn't make sense to do a tail call with entry functions.
3502 if (!CallerPreserved)
3503 return false;
3504
3505 bool CCMatch = CallerCC == CalleeCC;
3506
3508 if (canGuaranteeTCO(CalleeCC) && CCMatch)
3509 return true;
3510 return false;
3511 }
3512
3513 // TODO: Can we handle var args?
3514 if (IsVarArg)
3515 return false;
3516
3517 for (const Argument &Arg : CallerF.args()) {
3518 if (Arg.hasByValAttr())
3519 return false;
3520 }
3521
3522 LLVMContext &Ctx = *DAG.getContext();
3523
3524 // Check that the call results are passed in the same way.
3525 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3526 CCAssignFnForCall(CalleeCC, IsVarArg),
3527 CCAssignFnForCall(CallerCC, IsVarArg)))
3528 return false;
3529
3530 // The callee has to preserve all registers the caller needs to preserve.
3531 if (!CCMatch) {
3532 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3533 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3534 return false;
3535 }
3536
3537 // Nothing more to check if the callee is taking no arguments.
3538 if (Outs.empty())
3539 return true;
3540
3542 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3543
3544 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3545
3546 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3547 // If the stack arguments for this call do not fit into our own save area then
3548 // the call cannot be made tail.
3549 // TODO: Is this really necessary?
3550 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3551 return false;
3552
3553 const MachineRegisterInfo &MRI = MF.getRegInfo();
3554 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3555}
3556
3558 if (!CI->isTailCall())
3559 return false;
3560
3561 const Function *ParentFn = CI->getParent()->getParent();
3562 if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
3563 return false;
3564 return true;
3565}
3566
3567// The wave scratch offset register is used as the global base pointer.
3569 SmallVectorImpl<SDValue> &InVals) const {
3570 CallingConv::ID CallConv = CLI.CallConv;
3571 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3572
3573 SelectionDAG &DAG = CLI.DAG;
3574
3575 TargetLowering::ArgListEntry RequestedExec;
3576 if (IsChainCallConv) {
3577 // The last argument should be the value that we need to put in EXEC.
3578 // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
3579 // don't treat it like the rest of the arguments.
3580 RequestedExec = CLI.Args.back();
3581 assert(RequestedExec.Node && "No node for EXEC");
3582
3583 if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3584 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3585
3586 assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
3587 CLI.Outs.pop_back();
3588 CLI.OutVals.pop_back();
3589
3590 if (RequestedExec.Ty->isIntegerTy(64)) {
3591 assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
3592 CLI.Outs.pop_back();
3593 CLI.OutVals.pop_back();
3594 }
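 // (For a 64-bit EXEC value the argument has been legalized into two i32
 // pieces with the same original argument index, which is why a second
 // Outs/OutVals entry is popped above.)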
3595
3596 assert(CLI.Outs.back().OrigArgIndex != 2 &&
3597 "Haven't popped all the pieces of the EXEC mask");
3598 }
3599
3600 const SDLoc &DL = CLI.DL;
3602 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3604 SDValue Chain = CLI.Chain;
3605 SDValue Callee = CLI.Callee;
3606 bool &IsTailCall = CLI.IsTailCall;
3607 bool IsVarArg = CLI.IsVarArg;
3608 bool IsSibCall = false;
3610
3611 if (Callee.isUndef() || isNullConstant(Callee)) {
3612 if (!CLI.IsTailCall) {
3613 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
3614 InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
3615 }
3616
3617 return Chain;
3618 }
3619
3620 if (IsVarArg) {
3621 return lowerUnhandledCall(CLI, InVals,
3622 "unsupported call to variadic function ");
3623 }
3624
3625 if (!CLI.CB)
3626 report_fatal_error("unsupported libcall legalization");
3627
3628 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3629 return lowerUnhandledCall(CLI, InVals,
3630 "unsupported required tail call to function ");
3631 }
3632
3633 if (IsTailCall) {
3634 IsTailCall = isEligibleForTailCallOptimization(
3635 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3636 if (!IsTailCall &&
3637 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3638 report_fatal_error("failed to perform tail call elimination on a call "
3639 "site marked musttail or on llvm.amdgcn.cs.chain");
3640 }
3641
3642 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3643
3644 // A sibling call is one where we're under the usual C ABI and not planning
3645 // to change that but can still do a tail call:
3646 if (!TailCallOpt && IsTailCall)
3647 IsSibCall = true;
3648
3649 if (IsTailCall)
3650 ++NumTailCalls;
3651 }
3652
3655 SmallVector<SDValue, 8> MemOpChains;
3656
3657 // Analyze operands of the call, assigning locations to each operand.
3659 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3660 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3661
3662 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
3663 // With a fixed ABI, allocate fixed registers before user arguments.
3664 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3665 }
3666
3667 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3668
3669 // Get a count of how many bytes are to be pushed on the stack.
3670 unsigned NumBytes = CCInfo.getStackSize();
3671
3672 if (IsSibCall) {
3673 // Since we're not changing the ABI to make this a tail call, the memory
3674 // operands are already available in the caller's incoming argument space.
3675 NumBytes = 0;
3676 }
3677
3678 // FPDiff is the byte offset of the call's argument area from the callee's.
3679 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3680 // by this amount for a tail call. In a sibling call it must be 0 because the
3681 // caller will deallocate the entire stack and the callee still expects its
3682 // arguments to begin at SP+0. Completely unused for non-tail calls.
3683 int32_t FPDiff = 0;
3684 MachineFrameInfo &MFI = MF.getFrameInfo();
3685
3686 // Adjust the stack pointer for the new arguments...
3687 // These operations are automatically eliminated by the prolog/epilog pass
3688 if (!IsSibCall)
3689 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3690
3691 if (!IsSibCall || IsChainCallConv) {
3692 if (!Subtarget->enableFlatScratch()) {
3693 SmallVector<SDValue, 4> CopyFromChains;
3694
3695 // In the HSA case, this should be an identity copy.
3696 SDValue ScratchRSrcReg
3697 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3698 RegsToPass.emplace_back(IsChainCallConv
3699 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3700 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3701 ScratchRSrcReg);
3702 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3703 Chain = DAG.getTokenFactor(DL, CopyFromChains);
3704 }
3705 }
3706
3707 MVT PtrVT = MVT::i32;
3708
3709 // Walk the register/memloc assignments, inserting copies/loads.
3710 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3711 CCValAssign &VA = ArgLocs[i];
3712 SDValue Arg = OutVals[i];
3713
3714 // Promote the value if needed.
3715 switch (VA.getLocInfo()) {
3716 case CCValAssign::Full:
3717 break;
3718 case CCValAssign::BCvt:
3719 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3720 break;
3721 case CCValAssign::ZExt:
3722 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3723 break;
3724 case CCValAssign::SExt:
3725 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3726 break;
3727 case CCValAssign::AExt:
3728 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3729 break;
3730 case CCValAssign::FPExt:
3731 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3732 break;
3733 default:
3734 llvm_unreachable("Unknown loc info!");
3735 }
3736
3737 if (VA.isRegLoc()) {
3738 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
3739 } else {
3740 assert(VA.isMemLoc());
3741
3742 SDValue DstAddr;
3743 MachinePointerInfo DstInfo;
3744
3745 unsigned LocMemOffset = VA.getLocMemOffset();
3746 int32_t Offset = LocMemOffset;
3747
3748 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3749 MaybeAlign Alignment;
3750
3751 if (IsTailCall) {
3752 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3753 unsigned OpSize = Flags.isByVal() ?
3754 Flags.getByValSize() : VA.getValVT().getStoreSize();
3755
3756 // FIXME: We can do better than the minimum required byval alignment.
3757 Alignment =
3758 Flags.isByVal()
3759 ? Flags.getNonZeroByValAlign()
3760 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3761
3762 Offset = Offset + FPDiff;
3763 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3764
3765 DstAddr = DAG.getFrameIndex(FI, PtrVT);
3766 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3767
3768 // Make sure any stack arguments overlapping with where we're storing
3769 // are loaded before this eventual operation. Otherwise they'll be
3770 // clobbered.
3771
3772 // FIXME: Why is this really necessary? This seems to just result in a
3773 // lot of code to copy the stack arguments and write them back to the same
3774 // locations, which are supposed to be immutable?
3775 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3776 } else {
3777 // Stores to the argument stack area are relative to the stack pointer.
3778 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3779 MVT::i32);
3780 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3781 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3782 Alignment =
3783 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3784 }
3785
3786 if (Outs[i].Flags.isByVal()) {
3787 SDValue SizeNode =
3788 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3789 SDValue Cpy =
3790 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3791 Outs[i].Flags.getNonZeroByValAlign(),
3792 /*isVol = */ false, /*AlwaysInline = */ true,
3793 /*isTailCall = */ false, DstInfo,
3795
3796 MemOpChains.push_back(Cpy);
3797 } else {
3798 SDValue Store =
3799 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3800 MemOpChains.push_back(Store);
3801 }
3802 }
3803 }
3804
3805 if (!MemOpChains.empty())
3806 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3807
3808 // Build a sequence of copy-to-reg nodes chained together with token chain
3809 // and flag operands which copy the outgoing args into the appropriate regs.
3810 SDValue InGlue;
3811 for (auto &RegToPass : RegsToPass) {
3812 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3813 RegToPass.second, InGlue);
3814 InGlue = Chain.getValue(1);
3815 }
3816
3817
3818 // We don't usually want to end the call-sequence here because we would tidy
3819 // the frame up *after* the call, however in the ABI-changing tail-call case
3820 // we've carefully laid out the parameters so that when sp is reset they'll be
3821 // in the correct location.
3822 if (IsTailCall && !IsSibCall) {
3823 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
3824 InGlue = Chain.getValue(1);
3825 }
3826
3827 std::vector<SDValue> Ops;
3828 Ops.push_back(Chain);
3829 Ops.push_back(Callee);
3830 // Add a redundant copy of the callee global which will not be legalized, as
3831 // we need direct access to the callee later.
3832 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3833 const GlobalValue *GV = GSD->getGlobal();
3834 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3835 } else {
3836 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3837 }
3838
3839 if (IsTailCall) {
3840 // Each tail call may have to adjust the stack by a different amount, so
3841 // this information must travel along with the operation for eventual
3842 // consumption by emitEpilogue.
3843 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3844 }
3845
3846 if (IsChainCallConv)
3847 Ops.push_back(RequestedExec.Node);
3848
3849 // Add argument registers to the end of the list so that they are known live
3850 // into the call.
3851 for (auto &RegToPass : RegsToPass) {
3852 Ops.push_back(DAG.getRegister(RegToPass.first,
3853 RegToPass.second.getValueType()));
3854 }
3855
3856 // Add a register mask operand representing the call-preserved registers.
3857 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3858 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3859 assert(Mask && "Missing call preserved mask for calling convention");
3860 Ops.push_back(DAG.getRegisterMask(Mask));
3861
3862 if (SDValue Token = CLI.ConvergenceControlToken) {
3864 GlueOps.push_back(Token);
3865 if (InGlue)
3866 GlueOps.push_back(InGlue);
3867
3868 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
3869 MVT::Glue, GlueOps),
3870 0);
3871 }
3872
3873 if (InGlue)
3874 Ops.push_back(InGlue);
3875
3876 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3877
3878 // If we're doing a tail call, use a TC_RETURN here rather than an
3879 // actual call instruction.
3880 if (IsTailCall) {
3881 MFI.setHasTailCall();
3882 unsigned OPC = AMDGPUISD::TC_RETURN;
3883 switch (CallConv) {
3886 break;
3890 break;
3891 }
3892
3893 return DAG.getNode(OPC, DL, NodeTys, Ops);
3894 }
3895
3896 // Returns a chain and a flag for retval copy to use.
3897 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
3898 Chain = Call.getValue(0);
3899 InGlue = Call.getValue(1);
3900
3901 uint64_t CalleePopBytes = NumBytes;
3902 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
3903 if (!Ins.empty())
3904 InGlue = Chain.getValue(1);
3905
3906 // Handle result values, copying them out of physregs into vregs that we
3907 // return.
3908 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
3909 InVals, /*IsThisReturn=*/false, SDValue());
3910}
3911
3912// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
3913// except for applying the wave size scale to the increment amount.
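// As a rough illustration (assuming a wave64 target): an 8-byte per-lane
// allocation bumps the wave-uniform SGPR stack pointer by 8 << 6 = 512 bytes,
// i.e. 8 bytes for each of the 64 lanes, which is what the shift by
// getWavefrontSizeLog2() below implements.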
3915 SDValue Op, SelectionDAG &DAG) const {
3916 const MachineFunction &MF = DAG.getMachineFunction();
3918
3919 SDLoc dl(Op);
3920 EVT VT = Op.getValueType();
3921 SDValue Tmp1 = Op;
3922 SDValue Tmp2 = Op.getValue(1);
3923 SDValue Tmp3 = Op.getOperand(2);
3924 SDValue Chain = Tmp1.getOperand(0);
3925
3926 Register SPReg = Info->getStackPtrOffsetReg();
3927
3928 // Chain the dynamic stack allocation so that it doesn't modify the stack
3929 // pointer when other instructions are using the stack.
3930 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
3931
3932 SDValue Size = Tmp2.getOperand(1);
3933 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
3934 Chain = SP.getValue(1);
3935 MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
3936 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
3937 unsigned Opc =
3940
3941 SDValue ScaledSize = DAG.getNode(
3942 ISD::SHL, dl, VT, Size,
3943 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
3944
3945 Align StackAlign = TFL->getStackAlign();
3946 Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
3947 if (Alignment && *Alignment > StackAlign) {
3948 Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
3949 DAG.getConstant(-(uint64_t)Alignment->value()
3950 << Subtarget->getWavefrontSizeLog2(),
3951 dl, VT));
3952 }
3953
3954 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
3955 Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
3956
3957 return DAG.getMergeValues({Tmp1, Tmp2}, dl);
3958}
3959
3961 SelectionDAG &DAG) const {
3962 // We only handle constant sizes here to allow non-entry block, static sized
3963 // allocas. A truly dynamic value is more difficult to support because we
3964 // don't know if the size value is uniform or not. If the size isn't uniform,
3965 // we would need to do a wave reduction to get the maximum size to know how
3966 // much to increment the uniform stack pointer.
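  // As a hypothetical IR-level illustration (not taken from a test): a
  // statically sized alloca in a non-entry block such as
  //   %buf = alloca [4 x i32], align 4, addrspace(5)
  // has a ConstantSDNode size and takes the path below, while
  //   %buf = alloca i32, i32 %n, addrspace(5)
  // may have a divergent size and is not handled by this expansion.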
3967 SDValue Size = Op.getOperand(1);
3968 if (isa<ConstantSDNode>(Size))
3969 return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
3970
3972}
3973
3975 if (Op.getValueType() != MVT::i32)
3976 return Op; // Defer to cannot select error.
3977
3979 SDLoc SL(Op);
3980
3981 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
3982
3983 // Convert from wave uniform to swizzled vector address. This should protect
3984 // from any edge cases where the stacksave result isn't directly used with
3985 // stackrestore.
3986 SDValue VectorAddress =
3987 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
3988 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
3989}
3990
3992 SelectionDAG &DAG) const {
3993 SDLoc SL(Op);
3994 assert(Op.getValueType() == MVT::i32);
3995
3996 uint32_t BothRoundHwReg =
3998 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
3999
4000 SDValue IntrinID =
4001 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4002 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4003 Op.getOperand(0), IntrinID, GetRoundBothImm);
4004
4005 // There are two rounding modes, one for f32 and one for f64/f16. We only
4006 // report in the standard value range if both are the same.
4007 //
4008 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4009 // ties away from zero is not supported, and the other values are rotated by
4010 // 1.
4011 //
4012 // If the two rounding modes are not the same, report a target defined value.
4013
4014 // Mode register rounding mode fields:
4015 //
4016 // [1:0] Single-precision round mode.
4017 // [3:2] Double/Half-precision round mode.
4018 //
4019 // 0 = nearest even; 1 = +infinity; 2 = -infinity; 3 = toward zero.
4020 //
4021 //              Hardware   Spec
4022 // Toward-0        3        0
4023 // Nearest Even    0        1
4024 // +Inf            1        2
4025 // -Inf            2        3
4026 // NearestAway0   N/A       4
4027 //
4028 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4029 // table we can index by the raw hardware mode.
4030 //
4031 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
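  // As a worked example of the common case: if MODE.fp_round == 0 (both fields
  // round-to-nearest-even), the shift amount below is 0 * 4, so the low nibble
  // of the table is extracted, which per the spec column above is 1 ("to
  // nearest"). Entries for mixed f32 / f64-f16 modes are reported as extended
  // values and get the +4 adjustment applied further down.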
4032
4033 SDValue BitTable =
4035
4036 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4037 SDValue RoundModeTimesNumBits =
4038 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4039
4040 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4041 // knew only one mode was demanded.
4042 SDValue TableValue =
4043 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4044 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4045
4046 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4047 SDValue TableEntry =
4048 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4049
4050 // There's a gap between the 4-bit encoded table entries and the actual enum
4051 // values, so offset the result if it's an extended value.
4052 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4053 SDValue IsStandardValue =
4054 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4055 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4056 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4057 TableEntry, EnumOffset);
4058
4059 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4060}
4061
4063 SelectionDAG &DAG) const {
4064 SDLoc SL(Op);
4065
4066 SDValue NewMode = Op.getOperand(1);
4067 assert(NewMode.getValueType() == MVT::i32);
4068
4069 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4070 // hardware MODE.fp_round values.
4071 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4072 uint32_t ClampedVal = std::min(
4073 static_cast<uint32_t>(ConstMode->getZExtValue()),
4075 NewMode = DAG.getConstant(
4076 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4077 } else {
4078 // If we know the input can only be one of the supported standard modes in
4079 // the range 0-3, we can use a simplified mapping to hardware values.
4080 KnownBits KB = DAG.computeKnownBits(NewMode);
4081 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4082 // The supported standard values are 0-3. The extended values start at 8. We
4083 // need to offset by 4 if the value is in the extended range.
4084
4085 if (UseReducedTable) {
4086 // Truncate to the low 32-bits.
4087 SDValue BitTable = DAG.getConstant(
4088 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4089
4090 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4091 SDValue RoundModeTimesNumBits =
4092 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4093
4094 NewMode =
4095 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4096
4097 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4098 // the table extracted bits into inline immediates.
4099 } else {
4100 // table_index = umin(value, value - 4)
4101 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
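      // For example, a requested FLT_ROUNDS value of 2 (toward +infinity) gives
      // table_index = umin(2, 2 - 4) = 2 (the subtraction wraps as unsigned), so
      // bits [11:8] of the table are written to MODE.fp_round. An extended value
      // of 8 gives table_index = umin(8, 4) = 4, the first entry past the four
      // standard modes.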
4102 SDValue BitTable =
4104
4105 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4106 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4107 SDValue IndexVal =
4108 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4109
4110 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4111 SDValue RoundModeTimesNumBits =
4112 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4113
4114 SDValue TableValue =
4115 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4116 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4117
4118 // No need to mask out the high bits since the setreg will ignore them
4119 // anyway.
4120 NewMode = TruncTable;
4121 }
4122
4123 // Insert a readfirstlane in case the value is a VGPR. We could do this
4124 // earlier and keep more operations scalar, but that interferes with
4125 // combining the source.
4126 SDValue ReadFirstLaneID =
4127 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4128 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4129 ReadFirstLaneID, NewMode);
4130 }
4131
4132 // N.B. The setreg will be later folded into s_round_mode on supported
4133 // targets.
4134 SDValue IntrinID =
4135 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4136 uint32_t BothRoundHwReg =
4138 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4139
4140 SDValue SetReg =
4141 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4142 IntrinID, RoundBothImm, NewMode);
4143
4144 return SetReg;
4145}
4146
4148 if (Op->isDivergent())
4149 return SDValue();
4150
4151 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4156 break;
4157 default:
4158 return SDValue();
4159 }
4160
4161 return Op;
4162}
4163
4164// Work around DAG legality rules only based on the result type.
4166 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4167 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4168 EVT SrcVT = Src.getValueType();
4169
4170 if (SrcVT.getScalarType() != MVT::bf16)
4171 return Op;
4172
4173 SDLoc SL(Op);
4174 SDValue BitCast =
4175 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4176
4177 EVT DstVT = Op.getValueType();
4178 if (IsStrict)
4179 llvm_unreachable("Need STRICT_BF16_TO_FP");
4180
4181 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4182}
4183
4185 SDLoc SL(Op);
4186 if (Op.getValueType() != MVT::i64)
4187 return Op;
4188
4189 uint32_t ModeHwReg =
4191 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4192 uint32_t TrapHwReg =
4194 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4195
4196 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4197 SDValue IntrinID =
4198 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4199 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4200 Op.getOperand(0), IntrinID, ModeHwRegImm);
4201 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4202 Op.getOperand(0), IntrinID, TrapHwRegImm);
4203 SDValue TokenReg =
4204 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4205 GetTrapReg.getValue(1));
4206
4207 SDValue CvtPtr =
4208 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4209 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4210
4211 return DAG.getMergeValues({Result, TokenReg}, SL);
4212}
4213
4215 SDLoc SL(Op);
4216 if (Op.getOperand(1).getValueType() != MVT::i64)
4217 return Op;
4218
4219 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4220 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4221 DAG.getConstant(0, SL, MVT::i32));
4222 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4223 DAG.getConstant(1, SL, MVT::i32));
4224
4225 SDValue ReadFirstLaneID =
4226 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4227 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4228 ReadFirstLaneID, NewModeReg);
4229 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4230 ReadFirstLaneID, NewTrapReg);
4231
4232 unsigned ModeHwReg =
4234 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4235 unsigned TrapHwReg =
4237 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4238
4239 SDValue IntrinID =
4240 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4241 SDValue SetModeReg =
4242 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4243 IntrinID, ModeHwRegImm, NewModeReg);
4244 SDValue SetTrapReg =
4245 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4246 IntrinID, TrapHwRegImm, NewTrapReg);
4247 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4248}
4249
4251 const MachineFunction &MF) const {
4253 .Case("m0", AMDGPU::M0)
4254 .Case("exec", AMDGPU::EXEC)
4255 .Case("exec_lo", AMDGPU::EXEC_LO)
4256 .Case("exec_hi", AMDGPU::EXEC_HI)
4257 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4258 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4259 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4260 .Default(Register());
4261
4262 if (Reg == AMDGPU::NoRegister) {
4263 report_fatal_error(Twine("invalid register name \""
4264 + StringRef(RegName) + "\"."));
4265
4266 }
4267
4268 if (!Subtarget->hasFlatScrRegister() &&
4269 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4270 report_fatal_error(Twine("invalid register \""
4271 + StringRef(RegName) + "\" for subtarget."));
4272 }
4273
4274 switch (Reg) {
4275 case AMDGPU::M0:
4276 case AMDGPU::EXEC_LO:
4277 case AMDGPU::EXEC_HI:
4278 case AMDGPU::FLAT_SCR_LO:
4279 case AMDGPU::FLAT_SCR_HI:
4280 if (VT.getSizeInBits() == 32)
4281 return Reg;
4282 break;
4283 case AMDGPU::EXEC:
4284 case AMDGPU::FLAT_SCR:
4285 if (VT.getSizeInBits() == 64)
4286 return Reg;
4287 break;
4288 default:
4289 llvm_unreachable("missing register type checking");
4290 }
4291
4292 report_fatal_error(Twine("invalid type for register \""
4293 + StringRef(RegName) + "\"."));
4294}
4295
4296// If kill is not the last instruction, split the block so kill is always a
4297// proper terminator.
4300 MachineBasicBlock *BB) const {
4301 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
4303 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4304 return SplitBB;
4305}
4306
4307// Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4308// \p MI will be the only instruction in the loop body block. Otherwise, it will
4309// be the first instruction in the remainder block.
4310//
4311/// \returns { LoopBody, Remainder }
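// A sketch of the resulting control flow (the original successors of \p MBB
// are transferred to Remainder; the caller inserts the loop body and the
// back-branch):
//
//   MBB -> LoopBB -> Remainder -> <original successors of MBB>
//           ^    |
//           +----+   (LoopBB is also its own successor)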
4312static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4316
4317 // To insert the loop we need to split the block. Move everything after this
4318 // point to a new block, and insert a new empty block between the two.
4320 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4322 ++MBBI;
4323
4324 MF->insert(MBBI, LoopBB);
4325 MF->insert(MBBI, RemainderBB);
4326
4327 LoopBB->addSuccessor(LoopBB);
4328 LoopBB->addSuccessor(RemainderBB);
4329
4330 // Move the rest of the block into a new block.
4331 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4332
4333 if (InstInLoop) {
4334 auto Next = std::next(I);
4335
4336 // Move instruction to loop body.
4337 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4338
4339 // Move the rest of the block.
4340 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4341 } else {
4342 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4343 }
4344
4345 MBB.addSuccessor(LoopBB);
4346
4347 return std::pair(LoopBB, RemainderBB);
4348}
4349
4350/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4352 MachineBasicBlock *MBB = MI.getParent();
4354 auto I = MI.getIterator();
4355 auto E = std::next(I);
4356
4357 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4358 .addImm(0);
4359
4360 MIBundleBuilder Bundler(*MBB, I, E);
4361 finalizeBundle(*MBB, Bundler.begin());
4362}
4363
4366 MachineBasicBlock *BB) const {
4367 const DebugLoc &DL = MI.getDebugLoc();
4368
4370
4371 MachineBasicBlock *LoopBB;
4372 MachineBasicBlock *RemainderBB;
4374
4375 // Apparently kill flags are only valid if the def is in the same block?
4376 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4377 Src->setIsKill(false);
4378
4379 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
4380
4381 MachineBasicBlock::iterator I = LoopBB->end();
4382
4383 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4385
4386 // Clear TRAP_STS.MEM_VIOL
4387 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4388 .addImm(0)
4389 .addImm(EncodedReg);
4390
4392
4393 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4394
4395 // Load and check TRAP_STS.MEM_VIOL
4396 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4397 .addImm(EncodedReg);
4398
4399 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4400 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4401 .addReg(Reg, RegState::Kill)
4402 .addImm(0);
4403 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4404 .addMBB(LoopBB);
4405
4406 return RemainderBB;
4407}
4408
4409// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4410// wavefront. If the value is uniform and just happens to be in a VGPR, this
4411// will only do one iteration. In the worst case, this will loop 64 times.
4412//
4413// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
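// A sketch of the waterfall loop built below, with illustrative register
// names (the PHIs for the result and the saved EXEC are omitted):
//
//   loop:
//     v_readfirstlane_b32     s_idx, v_idx
//     v_cmp_eq_u32            cond, s_idx, v_idx
//     s_and_saveexec_b(32|64) new_exec, cond
//     ; move s_idx (plus any constant offset) into M0 or an SGPR index
//     s_xor_b(32|64)_term     exec, exec, new_exec
//     s_cbranch_execnz        loop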
4416 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4417 const DebugLoc &DL, const MachineOperand &Idx,
4418 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4419 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4420 Register &SGPRIdxReg) {
4421
4422 MachineFunction *MF = OrigBB.getParent();
4423 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4424 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4426
4427 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4428 Register PhiExec = MRI.createVirtualRegister(BoolRC);
4429 Register NewExec = MRI.createVirtualRegister(BoolRC);
4430 Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4431 Register CondReg = MRI.createVirtualRegister(BoolRC);
4432
4433 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4434 .addReg(InitReg)
4435 .addMBB(&OrigBB)
4436 .addReg(ResultReg)
4437 .addMBB(&LoopBB);
4438
4439 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4440 .addReg(InitSaveExecReg)
4441 .addMBB(&OrigBB)
4442 .addReg(NewExec)
4443 .addMBB(&LoopBB);
4444
4445 // Read the next variant <- also loop target.
4446 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4447 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
4448
4449 // Compare the just read M0 value to all possible Idx values.
4450 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4451 .addReg(CurrentIdxReg)
4452 .addReg(Idx.getReg(), 0, Idx.getSubReg());
4453
4454 // Update EXEC, save the original EXEC value to VCC.
4455 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4456 : AMDGPU::S_AND_SAVEEXEC_B64),
4457 NewExec)
4458 .addReg(CondReg, RegState::Kill);
4459
4460 MRI.setSimpleHint(NewExec, CondReg);
4461
4462 if (UseGPRIdxMode) {
4463 if (Offset == 0) {
4464 SGPRIdxReg = CurrentIdxReg;
4465 } else {
4466 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4467 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4468 .addReg(CurrentIdxReg, RegState::Kill)
4469 .addImm(Offset);
4470 }
4471 } else {
4472 // Move index from VCC into M0
4473 if (Offset == 0) {
4474 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4475 .addReg(CurrentIdxReg, RegState::Kill);
4476 } else {
4477 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4478 .addReg(CurrentIdxReg, RegState::Kill)
4479 .addImm(Offset);
4480 }
4481 }
4482
4483 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4484 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4485 MachineInstr *InsertPt =
4486 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4487 : AMDGPU::S_XOR_B64_term), Exec)
4488 .addReg(Exec)
4489 .addReg(NewExec);
4490
4491 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4492 // s_cbranch_scc0?
4493
4494 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4495 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4496 .addMBB(&LoopBB);
4497
4498 return InsertPt->getIterator();
4499}
4500
4501// This has slightly sub-optimal regalloc when the source vector is killed by
4502// the read. The register allocator does not understand that the kill is
4503// per-workitem, so the source is kept alive for the whole loop and we end up
4504// not reusing a subregister from it, using one more VGPR than necessary. This
4505// extra VGPR was saved when this was expanded after register allocation.
4508 unsigned InitResultReg, unsigned PhiReg, int Offset,
4509 bool UseGPRIdxMode, Register &SGPRIdxReg) {
4511 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4512 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4514 const DebugLoc &DL = MI.getDebugLoc();
4516
4517 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4518 Register DstReg = MI.getOperand(0).getReg();
4519 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4520 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
4521 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4522 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4523
4524 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4525
4526 // Save the EXEC mask
4527 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
4528 .addReg(Exec);
4529
4530 MachineBasicBlock *LoopBB;
4531 MachineBasicBlock *RemainderBB;
4532 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
4533
4534 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4535
4536 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
4537 InitResultReg, DstReg, PhiReg, TmpExec,
4538 Offset, UseGPRIdxMode, SGPRIdxReg);
4539
4540 MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock();
4542 ++MBBI;
4543 MF->insert(MBBI, LandingPad);
4544 LoopBB->removeSuccessor(RemainderBB);
4545 LandingPad->addSuccessor(RemainderBB);
4546 LoopBB->addSuccessor(LandingPad);
4547 MachineBasicBlock::iterator First = LandingPad->begin();
4548 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
4549 .addReg(SaveExec);
4550
4551 return InsPt;
4552}
4553
4554// Returns subreg index, offset
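// For example, indexing a 128-bit (4 x 32-bit) register with a constant
// offset of 2 yields {sub2, 0}, while an out-of-bounds offset such as 5
// yields {sub0, 5} so that the callers fold the offset into the index
// register instead.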
4555static std::pair<unsigned, int>
4557 const TargetRegisterClass *SuperRC,
4558 unsigned VecReg,
4559 int Offset) {
4560 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
4561
4562 // Skip out of bounds offsets, or else we would end up using an undefined
4563 // register.
4564 if (Offset >= NumElts || Offset < 0)
4565 return std::pair(AMDGPU::sub0, Offset);
4566
4567 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
4568}
4569
4572 int Offset) {
4573 MachineBasicBlock *MBB = MI.getParent();
4574 const DebugLoc &DL = MI.getDebugLoc();
4576
4577 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4578
4579 assert(Idx->getReg() != AMDGPU::NoRegister);
4580
4581 if (Offset == 0) {
4582 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx);
4583 } else {
4584 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4585 .add(*Idx)
4586 .addImm(Offset);
4587 }
4588}
4589
4592 int Offset) {
4593 MachineBasicBlock *MBB = MI.getParent();
4594 const DebugLoc &DL = MI.getDebugLoc();
4596
4597 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4598
4599 if (Offset == 0)
4600 return Idx->getReg();
4601
4602 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4603 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
4604 .add(*Idx)
4605 .addImm(Offset);
4606 return Tmp;
4607}
4608
4611 const GCNSubtarget &ST) {
4612 const SIInstrInfo *TII = ST.getInstrInfo();
4613 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4616
4617 Register Dst = MI.getOperand(0).getReg();
4618 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4619 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
4620 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4621
4622 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
4623 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4624
4625 unsigned SubReg;
4626 std::tie(SubReg, Offset)
4627 = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
4628
4629 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4630
4631 // Check for an SGPR index.
4632 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4634 const DebugLoc &DL = MI.getDebugLoc();
4635
4636 if (UseGPRIdxMode) {
4637 // TODO: Look at the uses to avoid the copy. This may require rescheduling
4638 // to avoid interfering with other uses, so probably requires a new
4639 // optimization pass.
4641
4642 const MCInstrDesc &GPRIDXDesc =
4643 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4644 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4645 .addReg(SrcReg)
4646 .addReg(Idx)
4647 .addImm(SubReg);
4648 } else {
4650
4651 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4652 .addReg(SrcReg, 0, SubReg)
4653 .addReg(SrcReg, RegState::Implicit);
4654 }
4655
4656 MI.eraseFromParent();
4657
4658 return &MBB;
4659 }
4660
4661 // Control flow needs to be inserted if indexing with a VGPR.
4662 const DebugLoc &DL = MI.getDebugLoc();
4664
4665 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4666 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4667
4668 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
4669
4670 Register SGPRIdxReg;
4671 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
4672 UseGPRIdxMode, SGPRIdxReg);
4673
4674 MachineBasicBlock *LoopBB = InsPt->getParent();
4675
4676 if (UseGPRIdxMode) {
4677 const MCInstrDesc &GPRIDXDesc =
4678 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4679
4680 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4681 .addReg(SrcReg)
4682 .addReg(SGPRIdxReg)
4683 .addImm(SubReg);
4684 } else {
4685 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4686 .addReg(SrcReg, 0, SubReg)
4687 .addReg(SrcReg, RegState::Implicit);
4688 }
4689
4690 MI.eraseFromParent();
4691
4692 return LoopBB;
4693}
4694
4697 const GCNSubtarget &ST) {
4698 const SIInstrInfo *TII = ST.getInstrInfo();
4699 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4702
4703 Register Dst = MI.getOperand(0).getReg();
4704 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
4705 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4706 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
4707 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4708 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
4709 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4710
4711 // This can be an immediate, but will be folded later.
4712 assert(Val->getReg());
4713
4714 unsigned SubReg;
4715 std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
4716 SrcVec->getReg(),
4717 Offset);
4718 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4719
4720 if (Idx->getReg() == AMDGPU::NoRegister) {
4722 const DebugLoc &DL = MI.getDebugLoc();
4723
4724 assert(Offset == 0);
4725
4726 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
4727 .add(*SrcVec)
4728 .add(*Val)
4729 .addImm(SubReg);
4730
4731 MI.eraseFromParent();
4732 return &MBB;
4733 }
4734
4735 // Check for an SGPR index.
4736 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4738 const DebugLoc &DL = MI.getDebugLoc();
4739
4740 if (UseGPRIdxMode) {
4742
4743 const MCInstrDesc &GPRIDXDesc =
4744 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4745 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4746 .addReg(SrcVec->getReg())
4747 .add(*Val)
4748 .addReg(Idx)
4749 .addImm(SubReg);
4750 } else {
4752
4753 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4754 TRI.getRegSizeInBits(*VecRC), 32, false);
4755 BuildMI(MBB, I, DL, MovRelDesc, Dst)
4756 .addReg(SrcVec->getReg())
4757 .add(*Val)
4758 .addImm(SubReg);
4759 }
4760 MI.eraseFromParent();
4761 return &MBB;
4762 }
4763
4764 // Control flow needs to be inserted if indexing with a VGPR.
4765 if (Val->isReg())
4766 MRI.clearKillFlags(Val->getReg());
4767
4768 const DebugLoc &DL = MI.getDebugLoc();
4769
4770 Register PhiReg = MRI.createVirtualRegister(VecRC);
4771
4772 Register SGPRIdxReg;
4773 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
4774 UseGPRIdxMode, SGPRIdxReg);
4775 MachineBasicBlock *LoopBB = InsPt->getParent();
4776
4777 if (UseGPRIdxMode) {
4778 const MCInstrDesc &GPRIDXDesc =
4779 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4780
4781 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4782 .addReg(PhiReg)
4783 .add(*Val)
4784 .addReg(SGPRIdxReg)
4785 .addImm(AMDGPU::sub0);
4786 } else {
4787 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4788 TRI.getRegSizeInBits(*VecRC), 32, false);
4789 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
4790 .addReg(PhiReg)
4791 .add(*Val)
4792 .addImm(AMDGPU::sub0);
4793 }
4794
4795 MI.eraseFromParent();
4796 return LoopBB;
4797}
4798
4801 const GCNSubtarget &ST,
4802 unsigned Opc) {
4804 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4805 const DebugLoc &DL = MI.getDebugLoc();
4806 const SIInstrInfo *TII = ST.getInstrInfo();
4807
4808 // Reduction operations depend on whether the input operand is SGPR or VGPR.
4809 Register SrcReg = MI.getOperand(1).getReg();
4810 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
4811 Register DstReg = MI.getOperand(0).getReg();
4812 MachineBasicBlock *RetBB = nullptr;
4813 if (isSGPR) {
4814 // These operations are idempotent for a uniform value, i.e. an SGPR input.
4815 // The reduced value will be the same as the given SGPR.
4816 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
4817 RetBB = &BB;
4818 } else {
4819 // TODO: Implement the DPP strategy and switch based on the immediate strategy
4820 // operand. For now, for all the cases (default, Iterative and DPP) we use the
4821 // iterative approach by default.
4822
4823 // To reduce the VGPR using the iterative approach, we need to iterate
4824 // over all the active lanes. Lowering consists of a ComputeLoop, which
4825 // iterates over only the active lanes. We use a copy of the EXEC register
4826 // as the induction variable, and every active lane clears its bit with
4827 // bitset0 so that the next iteration picks up the next active lane.
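    // A sketch of the ComputeLoop built below, with illustrative names (one
    // iteration per active lane):
    //
    //   ComputeLoop:
    //     accum      = phi [init, entry], [new_accum, ComputeLoop]
    //     active     = phi [exec_copy, entry], [new_active, ComputeLoop]
    //     idx        = s_ff1_i32 active             ; lowest remaining active lane
    //     lane_val   = v_readlane_b32 src, idx
    //     new_accum  = <reduction op> accum, lane_val
    //     new_active = s_bitset0 idx, active        ; clear that lane's bit
    //     s_cmp_lg   new_active, 0
    //     s_cbranch_scc1 ComputeLoop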
4829 Register SrcReg = MI.getOperand(1).getReg();
4830
4831 // Create control flow for the loop:
4832 // split MI's machine basic block into a loop.
4833 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
4834
4835 // Create virtual registers required for lowering.
4836 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4837 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
4838 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
4839 Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
4840
4841 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
4842 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4843 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4844
4845 Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
4846 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
4847
4848 bool IsWave32 = ST.isWave32();
4849 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4850 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4851
4852 // Create initial values of the induction variable from EXEC and the
4853 // accumulator, and insert a branch instruction to the newly created ComputeLoop.
4854 uint32_t InitalValue =
4855 (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
4856 auto TmpSReg =
4857 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
4858 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
4859 .addImm(InitalValue);
4860 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
4861
4862 // Start constructing ComputeLoop
4863 I = ComputeLoop->end();
4864 auto Accumulator =
4865 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
4866 .addReg(InitalValReg)
4867 .addMBB(&BB);
4868 auto ActiveBits =
4869 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
4870 .addReg(TmpSReg->getOperand(0).getReg())
4871 .addMBB(&BB);
4872
4873 // Perform the computations
4874 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
4875 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
4876 .addReg(ActiveBits->getOperand(0).getReg());
4877 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
4878 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
4879 .addReg(SrcReg)
4880 .addReg(FF1->getOperand(0).getReg());
4881 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
4882 .addReg(Accumulator->getOperand(0).getReg())
4883 .addReg(LaneValue->getOperand(0).getReg());
4884
4885 // Manipulate the iterator to get the next active lane
4886 unsigned BITSETOpc =
4887 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
4888 auto NewActiveBits =
4889 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
4890 .addReg(FF1->getOperand(0).getReg())
4891 .addReg(ActiveBits->getOperand(0).getReg());
4892
4893 // Add phi nodes
4894 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
4895 .addMBB(ComputeLoop);
4896 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
4897 .addMBB(ComputeLoop);
4898
4899 // Create the conditional branch back to ComputeLoop.
4900 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
4901 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
4902 .addReg(NewActiveBits->getOperand(0).getReg())
4903 .addImm(0);
4904 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4905 .addMBB(ComputeLoop);
4906
4907 RetBB = ComputeEnd;
4908 }
4909 MI.eraseFromParent();
4910 return RetBB;
4911}
4912
4914 MachineInstr &MI, MachineBasicBlock *BB) const {
4915
4917 MachineFunction *MF = BB->getParent();
4919
4920 switch (MI.getOpcode()) {
4921 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
4922 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
4923 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
4924 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
4925 case AMDGPU::S_UADDO_PSEUDO:
4926 case AMDGPU::S_USUBO_PSEUDO: {
4927 const DebugLoc &DL = MI.getDebugLoc();
4928 MachineOperand &Dest0 = MI.getOperand(0);
4929 MachineOperand &Dest1 = MI.getOperand(1);
4930 MachineOperand &Src0 = MI.getOperand(2);
4931 MachineOperand &Src1 = MI.getOperand(3);
4932
4933 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
4934 ? AMDGPU::S_ADD_I32
4935 : AMDGPU::S_SUB_I32;
4936 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()).add(Src0).add(Src1);
4937
4938 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
4939 .addImm(1)
4940 .addImm(0);
4941
4942 MI.eraseFromParent();
4943 return BB;
4944 }
4945 case AMDGPU::S_ADD_U64_PSEUDO:
4946 case AMDGPU::S_SUB_U64_PSEUDO: {
4947 // For targets older than GFX12, we emit a sequence of 32-bit operations.
4948 // For GFX12, we emit s_add_u64 and s_sub_u64.
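    // For example, on a pre-GFX12 target the 64-bit add becomes roughly:
    //   s_add_u32  dst.sub0, src0.sub0, src1.sub0
    //   s_addc_u32 dst.sub1, src0.sub1, src1.sub1
    // with the two halves recombined by a REG_SEQUENCE.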
4949 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4951 const DebugLoc &DL = MI.getDebugLoc();
4952 MachineOperand &Dest = MI.getOperand(0);
4953 MachineOperand &Src0 = MI.getOperand(1);
4954 MachineOperand &Src1 = MI.getOperand(2);
4955 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
4956 if (Subtarget->hasScalarAddSub64()) {
4957 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
4958 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
4959 .add(Src0)
4960 .add(Src1);
4961 } else {
4962 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4963 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4964
4965 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4966 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4967
4968 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
4969 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
4970 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
4971 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
4972
4973 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
4974 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
4975 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
4976 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
4977
4978 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
4979 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
4980 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
4981 .add(Src0Sub0)
4982 .add(Src1Sub0);
4983 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
4984 .add(Src0Sub1)
4985 .add(Src1Sub1);
4986 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
4987 .addReg(DestSub0)
4988 .addImm(AMDGPU::sub0)
4989 .addReg(DestSub1)
4990 .addImm(AMDGPU::sub1);
4991 }
4992 MI.eraseFromParent();
4993 return BB;
4994 }
4995 case AMDGPU::V_ADD_U64_PSEUDO:
4996 case AMDGPU::V_SUB_U64_PSEUDO: {
4998 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4999 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5000 const DebugLoc &DL = MI.getDebugLoc();
5001
5002 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5003
5004 MachineOperand &Dest = MI.getOperand(0);
5005 MachineOperand &Src0 = MI.getOperand(1);
5006 MachineOperand &Src1 = MI.getOperand(2);
5007
5008 if (IsAdd && ST.hasLshlAddB64()) {
5009 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
5010 Dest.getReg())
5011 .add(Src0)
5012 .addImm(0)
5013 .add(Src1);
5014 TII->legalizeOperands(*Add);
5015 MI.eraseFromParent();
5016 return BB;
5017 }
5018
5019 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5020
5021 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5022 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5023
5024 Register CarryReg = MRI.createVirtualRegister(CarryRC);
5025 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
5026
5027 const TargetRegisterClass *Src0RC = Src0.isReg()
5028 ? MRI.getRegClass(Src0.getReg())
5029 : &AMDGPU::VReg_64RegClass;
5030 const TargetRegisterClass *Src1RC = Src1.isReg()
5031 ? MRI.getRegClass(Src1.getReg())
5032 : &AMDGPU::VReg_64RegClass;
5033
5034 const TargetRegisterClass *Src0SubRC =
5035 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5036 const TargetRegisterClass *Src1SubRC =
5037 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5038
5039 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
5040 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5041 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
5042 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5043
5044 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
5045 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5046 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
5047 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5048
5049 unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5050 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5051 .addReg(CarryReg, RegState::Define)
5052 .add(SrcReg0Sub0)
5053 .add(SrcReg1Sub0)
5054 .addImm(0); // clamp bit
5055
5056 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5057 MachineInstr *HiHalf =
5058 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5059 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
5060 .add(SrcReg0Sub1)
5061 .add(SrcReg1Sub1)
5062 .addReg(CarryReg, RegState::Kill)
5063 .addImm(0); // clamp bit
5064
5065 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5066 .addReg(DestSub0)
5067 .addImm(AMDGPU::sub0)
5068 .addReg(DestSub1)
5069 .addImm(AMDGPU::sub1);
5070 TII->legalizeOperands(*LoHalf);
5071 TII->legalizeOperands(*HiHalf);
5072 MI.eraseFromParent();
5073 return BB;
5074 }
5075 case AMDGPU::S_ADD_CO_PSEUDO:
5076 case AMDGPU::S_SUB_CO_PSEUDO: {
5077 // This pseudo has a chance to be selected
5078 // only from a uniform add/subcarry node. All the VGPR operands
5079 // are therefore assumed to be splat vectors.
5081 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5082 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5084 const DebugLoc &DL = MI.getDebugLoc();
5085 MachineOperand &Dest = MI.getOperand(0);
5086 MachineOperand &CarryDest = MI.getOperand(1);
5087 MachineOperand &Src0 = MI.getOperand(2);
5088 MachineOperand &Src1 = MI.getOperand(3);
5089 MachineOperand &Src2 = MI.getOperand(4);
5090 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5091 ? AMDGPU::S_ADDC_U32
5092 : AMDGPU::S_SUBB_U32;
5093 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5094 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5095 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5096 .addReg(Src0.getReg());
5097 Src0.setReg(RegOp0);
5098 }
5099 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5100 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5101 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5102 .addReg(Src1.getReg());
5103 Src1.setReg(RegOp1);
5104 }
5105 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5106 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
5107 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5108 .addReg(Src2.getReg());
5109 Src2.setReg(RegOp2);
5110 }
5111
5112 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
5113 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
5114 assert(WaveSize == 64 || WaveSize == 32);
5115
5116 if (WaveSize == 64) {
5117 if (ST.hasScalarCompareEq64()) {
5118 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
5119 .addReg(Src2.getReg())
5120 .addImm(0);
5121 } else {
5122 const TargetRegisterClass *SubRC =
5123 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5124 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5125 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5126 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5127 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5128 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5129
5130 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
5131 .add(Src2Sub0)
5132 .add(Src2Sub1);
5133
5134 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5135 .addReg(Src2_32, RegState::Kill)
5136 .addImm(0);
5137 }
5138 } else {
5139 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5140 .addReg(Src2.getReg())
5141 .addImm(0);
5142 }
5143
5144 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
5145
5146 unsigned SelOpc =
5147 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5148
5149 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
5150 .addImm(-1)
5151 .addImm(0);
5152
5153 MI.eraseFromParent();
5154 return BB;
5155 }
5156 case AMDGPU::SI_INIT_M0: {
5157 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5158 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5159 .add(MI.getOperand(0));
5160 MI.eraseFromParent();
5161 return BB;
5162 }
5163 case AMDGPU::GET_GROUPSTATICSIZE: {
5164 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
5165 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
5166 DebugLoc DL = MI.getDebugLoc();
5167 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
5168 .add(MI.getOperand(0))
5169 .addImm(MFI->getLDSSize());
5170 MI.eraseFromParent();
5171 return BB;
5172 }
5173 case AMDGPU::GET_SHADERCYCLESHILO: {
5176 const DebugLoc &DL = MI.getDebugLoc();
5177 // The algorithm is:
5178 //
5179 // hi1 = getreg(SHADER_CYCLES_HI)
5180 // lo1 = getreg(SHADER_CYCLES_LO)
5181 // hi2 = getreg(SHADER_CYCLES_HI)
5182 //
5183 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
5184 // Otherwise there was overflow and the result is hi2:0. In both cases the
5185 // result should represent the actual time at some point during the sequence
5186 // of three getregs.
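    // For example, if the low counter wraps between the two high reads (hi1 = 5,
    // hi2 = 6), the compare fails and the select below forces the low half to 0,
    // giving 6:0; if hi1 == hi2 == 5, the result is 5:lo1.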
5187 using namespace AMDGPU::Hwreg;
5188 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5189 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
5190 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5191 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5192 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
5193 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
5194 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5195 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
5196 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5197 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
5198 .addReg(RegHi1)
5199 .addReg(RegHi2);
5200 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5201 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
5202 .addReg(RegLo1)
5203 .addImm(0);
5204 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
5205 .add(MI.getOperand(0))
5206 .addReg(RegLo)
5207 .addImm(AMDGPU::sub0)
5208 .addReg(RegHi2)
5209 .addImm(AMDGPU::sub1);
5210 MI.eraseFromParent();
5211 return BB;
5212 }
5213 case AMDGPU::SI_INDIRECT_SRC_V1:
5214 case AMDGPU::SI_INDIRECT_SRC_V2:
5215 case AMDGPU::SI_INDIRECT_SRC_V4:
5216 case AMDGPU::SI_INDIRECT_SRC_V8:
5217 case AMDGPU::SI_INDIRECT_SRC_V9:
5218 case AMDGPU::SI_INDIRECT_SRC_V10:
5219 case AMDGPU::SI_INDIRECT_SRC_V11:
5220 case AMDGPU::SI_INDIRECT_SRC_V12:
5221 case AMDGPU::SI_INDIRECT_SRC_V16:
5222 case AMDGPU::SI_INDIRECT_SRC_V32:
5223 return emitIndirectSrc(MI, *BB, *getSubtarget());
5224 case AMDGPU::SI_INDIRECT_DST_V1:
5225 case AMDGPU::SI_INDIRECT_DST_V2:
5226 case AMDGPU::SI_INDIRECT_DST_V4:
5227 case AMDGPU::SI_INDIRECT_DST_V8:
5228 case AMDGPU::SI_INDIRECT_DST_V9:
5229 case AMDGPU::SI_INDIRECT_DST_V10:
5230 case AMDGPU::SI_INDIRECT_DST_V11:
5231 case AMDGPU::SI_INDIRECT_DST_V12:
5232 case AMDGPU::SI_INDIRECT_DST_V16:
5233 case AMDGPU::SI_INDIRECT_DST_V32:
5234 return emitIndirectDst(MI, *BB, *getSubtarget());
5235 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5236 case AMDGPU::SI_KILL_I1_PSEUDO:
5237 return splitKillBlock(MI, BB);
5238 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5240 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5241 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5242
5243 Register Dst = MI.getOperand(0).getReg();
5244 const MachineOperand &Src0 = MI.getOperand(1);
5245 const MachineOperand &Src1 = MI.getOperand(2);
5246 const DebugLoc &DL = MI.getDebugLoc();
5247 Register SrcCond = MI.getOperand(3).getReg();
5248
5249 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5250 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5251 const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5252 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5253
5254 const TargetRegisterClass *Src0RC = Src0.isReg()
5255 ? MRI.getRegClass(Src0.getReg())
5256 : &AMDGPU::VReg_64RegClass;
5257 const TargetRegisterClass *Src1RC = Src1.isReg()
5258 ? MRI.getRegClass(Src1.getReg())
5259 : &AMDGPU::VReg_64RegClass;
5260
5261 const TargetRegisterClass *Src0SubRC =
5262 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5263 const TargetRegisterClass *Src1SubRC =
5264 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5265
5266 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5267 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5268 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5269 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5270
5271 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5272 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5273 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5274 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5275
5276 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
5277 .addReg(SrcCond);
5278 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5279 .addImm(0)
5280 .add(Src0Sub0)
5281 .addImm(0)
5282 .add(Src1Sub0)
5283 .addReg(SrcCondCopy);
5284 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5285 .addImm(0)
5286 .add(Src0Sub1)
5287 .addImm(0)
5288 .add(Src1Sub1)
5289 .addReg(SrcCondCopy);
5290
5291 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5292 .addReg(DstLo)
5293 .addImm(AMDGPU::sub0)
5294 .addReg(DstHi)
5295 .addImm(AMDGPU::sub1);
5296 MI.eraseFromParent();
5297 return BB;
5298 }
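// Illustrative sketch of the expansion above (register names are placeholders,
// not taken from the source): the 64-bit select pseudo becomes two 32-bit
// selects on the sub0/sub1 halves plus a REG_SEQUENCE, roughly:
//   %lo  = V_CNDMASK_B32_e64 0, %src0.sub0, 0, %src1.sub0, %cond
//   %hi  = V_CNDMASK_B32_e64 0, %src0.sub1, 0, %src1.sub1, %cond
//   %dst = REG_SEQUENCE %lo, sub0, %hi, sub1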
5299 case AMDGPU::SI_BR_UNDEF: {
5301 const DebugLoc &DL = MI.getDebugLoc();
5302 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5303 .add(MI.getOperand(0));
5304 Br->getOperand(1).setIsUndef(); // read undef SCC
5305 MI.eraseFromParent();
5306 return BB;
5307 }
5308 case AMDGPU::ADJCALLSTACKUP:
5309 case AMDGPU::ADJCALLSTACKDOWN: {
5311 MachineInstrBuilder MIB(*MF, &MI);
5312 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
5313 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
5314 return BB;
5315 }
5316 case AMDGPU::SI_CALL_ISEL: {
5318 const DebugLoc &DL = MI.getDebugLoc();
5319
5320 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
5321
5323 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5324
5325 for (const MachineOperand &MO : MI.operands())
5326 MIB.add(MO);
5327
5328 MIB.cloneMemRefs(MI);
5329 MI.eraseFromParent();
5330 return BB;
5331 }
5332 case AMDGPU::V_ADD_CO_U32_e32:
5333 case AMDGPU::V_SUB_CO_U32_e32:
5334 case AMDGPU::V_SUBREV_CO_U32_e32: {
5335 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
5336 const DebugLoc &DL = MI.getDebugLoc();
5337 unsigned Opc = MI.getOpcode();
5338
5339 bool NeedClampOperand = false;
5340 if (TII->pseudoToMCOpcode(Opc) == -1) {
5341 Opc = AMDGPU::getVOPe64(Opc);
5342 NeedClampOperand = true;
5343 }
5344
5345 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
5346 if (TII->isVOP3(*I)) {
5347 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5348 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5349 I.addReg(TRI->getVCC(), RegState::Define);
5350 }
5351 I.add(MI.getOperand(1))
5352 .add(MI.getOperand(2));
5353 if (NeedClampOperand)
5354 I.addImm(0); // clamp bit for e64 encoding
5355
5356 TII->legalizeOperands(*I);
5357
5358 MI.eraseFromParent();
5359 return BB;
5360 }
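// Rough example of the rewrite above (operand names are placeholders): when the
// e32 encoding is unavailable on the subtarget, a carry-out add such as
//   %d = V_ADD_CO_U32_e32 %a, %b          ; implicit-def of vcc
// is re-emitted in the VOP3 form with an explicit carry def and clamp bit:
//   %d, %carry = V_ADD_CO_U32_e64 %a, %b, 0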
5361 case AMDGPU::V_ADDC_U32_e32:
5362 case AMDGPU::V_SUBB_U32_e32:
5363 case AMDGPU::V_SUBBREV_U32_e32:
5364 // These instructions have an implicit use of vcc which counts towards the
5365 // constant bus limit.
5366 TII->legalizeOperands(MI);
5367 return BB;
5368 case AMDGPU::DS_GWS_INIT:
5369 case AMDGPU::DS_GWS_SEMA_BR:
5370 case AMDGPU::DS_GWS_BARRIER:
5371 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
5372 [[fallthrough]];
5373 case AMDGPU::DS_GWS_SEMA_V:
5374 case AMDGPU::DS_GWS_SEMA_P:
5375 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5376 // An s_waitcnt 0 is required to be the instruction immediately following.
5377 if (getSubtarget()->hasGWSAutoReplay()) {
5379 return BB;
5380 }
5381
5382 return emitGWSMemViolTestLoop(MI, BB);
5383 case AMDGPU::S_SETREG_B32: {
5384 // Try to optimize cases that only set the denormal mode or rounding mode.
5385 //
5386 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
5387 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
5388 // instead.
5389 //
5390 // FIXME: This could be a predicate on the immediate, but tablegen doesn't
5391 // allow you to have a no-side-effect instruction in the output of a
5392 // side-effecting pattern.
5393 auto [ID, Offset, Width] =
5394 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
5395 if (ID != AMDGPU::Hwreg::ID_MODE)
5396 return BB;
5397
5398 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5399 const unsigned SetMask = WidthMask << Offset;
5400
5401 if (getSubtarget()->hasDenormModeInst()) {
5402 unsigned SetDenormOp = 0;
5403 unsigned SetRoundOp = 0;
5404
5405 // The dedicated instructions can only set the whole denorm or round mode
5406 // at once, not a subset of bits in either.
5407 if (SetMask ==
5408 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
5409 // If this fully sets both the round and denorm mode, emit the two
5410 // dedicated instructions for these.
5411 SetRoundOp = AMDGPU::S_ROUND_MODE;
5412 SetDenormOp = AMDGPU::S_DENORM_MODE;
5413 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
5414 SetRoundOp = AMDGPU::S_ROUND_MODE;
5415 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
5416 SetDenormOp = AMDGPU::S_DENORM_MODE;
5417 }
5418
5419 if (SetRoundOp || SetDenormOp) {
5421 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
5422 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5423 unsigned ImmVal = Def->getOperand(1).getImm();
5424 if (SetRoundOp) {
5425 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
5426 .addImm(ImmVal & 0xf);
5427
5428 // If we also have the denorm mode, get just the denorm mode bits.
5429 ImmVal >>= 4;
5430 }
5431
5432 if (SetDenormOp) {
5433 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
5434 .addImm(ImmVal & 0xf);
5435 }
5436
5437 MI.eraseFromParent();
5438 return BB;
5439 }
5440 }
5441 }
5442
5443 // If only FP bits are touched, use the no-side-effects pseudo.
5444 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
5445 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
5446 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
5447
5448 return BB;
5449 }
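// Sketch of the optimization above (immediate values are hypothetical): if the
// written value is a materialized constant and the setreg covers exactly the
// four round-mode bits of the MODE register, e.g.
//   s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0   ; s0 = 0x3
// it can be emitted as the dedicated, side-effect-free form
//   s_round_mode 0x3
// and similarly s_denorm_mode for the denormal-mode bits.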
5450 case AMDGPU::S_INVERSE_BALLOT_U32:
5451 case AMDGPU::S_INVERSE_BALLOT_U64: {
5453 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5454 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5455 const DebugLoc &DL = MI.getDebugLoc();
5456 const Register DstReg = MI.getOperand(0).getReg();
5457 Register MaskReg = MI.getOperand(1).getReg();
5458
5459 const bool IsVALU = TRI->isVectorRegister(MRI, MaskReg);
5460
5461 if (IsVALU) {
5462 MaskReg = TII->readlaneVGPRToSGPR(MaskReg, MI, MRI);
5463 }
5464
5465 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::COPY), DstReg).addReg(MaskReg);
5466 MI.eraseFromParent();
5467 return BB;
5468 }
5469 case AMDGPU::ENDPGM_TRAP: {
5470 const DebugLoc &DL = MI.getDebugLoc();
5471 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
5472 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
5473 MI.addOperand(MachineOperand::CreateImm(0));
5474 return BB;
5475 }
5476
5477 // We need a block split to make the real endpgm a terminator. We also don't
5478 // want to break phis in successor blocks, so we can't just delete to the
5479 // end of the block.
5480
5481 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
5483 MF->push_back(TrapBB);
5484 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
5485 .addImm(0);
5486 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5487 .addMBB(TrapBB);
5488
5489 BB->addSuccessor(TrapBB);
5490 MI.eraseFromParent();
5491 return SplitBB;
5492 }
5493 case AMDGPU::SIMULATED_TRAP: {
5494 assert(Subtarget->hasPrivEnabledTrap2NopBug());
5496 MachineBasicBlock *SplitBB =
5497 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
5498 MI.eraseFromParent();
5499 return SplitBB;
5500 }
5501 default:
5502 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
5503 if (!MI.mayStore())
5505 return BB;
5506 }
5508 }
5509}
5510
5512 // This currently forces unfolding various combinations of fsub into fma with
5513 // free fneg'd operands. As long as we have fast FMA (controlled by
5514 // isFMAFasterThanFMulAndFAdd), we should perform these.
5515
5516 // When fma is quarter rate, for f64 where add / sub are at best half rate,
5517 // most of these combines appear to be cycle neutral but save on instruction
5518 // count / code size.
5519 return true;
5520}
5521
5523
5525 EVT VT) const {
5526 if (!VT.isVector()) {
5527 return MVT::i1;
5528 }
5529 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
5530}
5531
5533 // TODO: Should i16 be used always if legal? For now it would force VALU
5534 // shifts.
5535 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5536}
5537
5539 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
5540 ? Ty.changeElementSize(16)
5541 : Ty.changeElementSize(32);
5542}
5543
5544 // Answering this is somewhat tricky and depends on the specific device, as
5545 // different devices have different rates for fma or for all f64 operations.
5546//
5547// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
5548// regardless of which device (although the number of cycles differs between
5549// devices), so it is always profitable for f64.
5550//
5551// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
5552// only on full rate devices. Normally, we should prefer selecting v_mad_f32
5553// which we can always do even without fused FP ops since it returns the same
5554// result as the separate operations and since it is always full
5555 // rate. Therefore, we lie and report that it is not faster for f32. However,
5556 // v_mad_f32 does not support denormals, so we do report fma as faster if we
5557 // have a fast fma device and require denormals.
5558//
5560 EVT VT) const {
5561 VT = VT.getScalarType();
5562
5563 switch (VT.getSimpleVT().SimpleTy) {
5564 case MVT::f32: {
5565 // If mad is not available this depends only on if f32 fma is full rate.
5566 if (!Subtarget->hasMadMacF32Insts())
5567 return Subtarget->hasFastFMAF32();
5568
5569 // Otherwise f32 mad is always full rate and returns the same result as
5570 // the separate operations, so it should be preferred over fma.
5571 // However, it does not support denormals.
5572 if (!denormalModeIsFlushAllF32(MF))
5573 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5574
5575 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
5576 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5577 }
5578 case MVT::f64:
5579 return true;
5580 case MVT::f16:
5581 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
5582 default:
5583 break;
5584 }
5585
5586 return false;
5587}
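// Illustrative consequence of the rules above (the subtarget properties are
// assumed for the example): with fast f32 FMA and f32 denormals enabled, this
// returns true for MVT::f32, so (fadd (fmul a, b), c) may be contracted to
// (fma a, b, c); on a mad-capable subtarget with denormals flushed it returns
// false and v_mad_f32 remains the preferred selection.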
5588
5590 LLT Ty) const {
5591 switch (Ty.getScalarSizeInBits()) {
5592 case 16:
5593 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
5594 case 32:
5595 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
5596 case 64:
5597 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
5598 default:
5599 break;
5600 }
5601
5602 return false;
5603}
5604
5606 if (!Ty.isScalar())
5607 return false;
5608
5609 if (Ty.getScalarSizeInBits() == 16)
5610 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
5611 if (Ty.getScalarSizeInBits() == 32)
5612 return Subtarget->hasMadMacF32Insts() &&
5613 denormalModeIsFlushAllF32(*MI.getMF());
5614
5615 return false;
5616}
5617
5619 const SDNode *N) const {
5620 // TODO: Check future ftz flag
5621 // v_mad_f32/v_mac_f32 do not support denormals.
5622 EVT VT = N->getValueType(0);
5623 if (VT == MVT::f32)
5624 return Subtarget->hasMadMacF32Insts() &&
5625 denormalModeIsFlushAllF32(DAG.getMachineFunction());
5626 if (VT == MVT::f16) {
5627 return Subtarget->hasMadF16() &&
5628 denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
5629 }
5630
5631 return false;
5632}
5633
5634//===----------------------------------------------------------------------===//
5635// Custom DAG Lowering Operations
5636//===----------------------------------------------------------------------===//
5637
5638// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5639// wider vector type is legal.
5641 SelectionDAG &DAG) const {
5642 unsigned Opc = Op.getOpcode();
5643 EVT VT = Op.getValueType();
5644 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5645 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5646 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5647 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5648
5649 SDValue Lo, Hi;
5650 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
5651
5652 SDLoc SL(Op);
5653 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
5654 Op->getFlags());
5655 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
5656 Op->getFlags());
5657
5658 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5659}
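// Sketch of the split performed above: a unary op on a wide vector, e.g.
// (fneg v4f16:%v), is lowered as
//   (concat_vectors (fneg %v.lo:v2f16), (fneg %v.hi:v2f16))
// where %v.lo/%v.hi are the two halves produced by SplitVectorOperand.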
5660
5661// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5662// wider vector type is legal.
5664 SelectionDAG &DAG) const {
5665 unsigned Opc = Op.getOpcode();
5666 EVT VT = Op.getValueType();
5667 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5668 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5669 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5670 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5671
5672 SDValue Lo0, Hi0;
5673 std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
5674 SDValue Lo1, Hi1;
5675 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
5676
5677 SDLoc SL(Op);
5678
5679 SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
5680 Op->getFlags());
5681 SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
5682 Op->getFlags());
5683
5684 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5685}
5686
5688 SelectionDAG &DAG) const {
5689 unsigned Opc = Op.getOpcode();
5690 EVT VT = Op.getValueType();
5691 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
5692 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
5693 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5694 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
5695 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
5696 VT == MVT::v32bf16);
5697
5698 SDValue Lo0, Hi0;
5699 SDValue Op0 = Op.getOperand(0);
5700 std::tie(Lo0, Hi0) = Op0.getValueType().isVector()
5701 ? DAG.SplitVectorOperand(Op.getNode(), 0)
5702 : std::pair(Op0, Op0);
5703 SDValue Lo1, Hi1;
5704 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
5705 SDValue Lo2, Hi2;
5706 std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2);
5707
5708 SDLoc SL(Op);
5709 auto ResVT = DAG.GetSplitDestVTs(VT);
5710
5711 SDValue OpLo = DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2,
5712 Op->getFlags());
5713 SDValue OpHi = DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2,
5714 Op->getFlags());
5715
5716 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5717}
5718
5719
5721 switch (Op.getOpcode()) {
5722 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
5723 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
5724 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
5725 case ISD::LOAD: {
5726 SDValue Result = LowerLOAD(Op, DAG);
5727 assert((!Result.getNode() ||
5728 Result.getNode()->getNumValues() == 2) &&
5729 "Load should return a value and a chain");
5730 return Result;
5731 }
5732 case ISD::FSQRT: {
5733 EVT VT = Op.getValueType();
5734 if (VT == MVT::f32)
5735 return lowerFSQRTF32(Op, DAG);
5736 if (VT == MVT::f64)
5737 return lowerFSQRTF64(Op, DAG);
5738 return SDValue();
5739 }
5740 case ISD::FSIN:
5741 case ISD::FCOS:
5742 return LowerTrig(Op, DAG);
5743 case ISD::SELECT: return LowerSELECT(Op, DAG);
5744 case ISD::FDIV: return LowerFDIV(Op, DAG);
5745 case ISD::FFREXP: return LowerFFREXP(Op, DAG);
5746 case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
5747 case ISD::STORE: return LowerSTORE(Op, DAG);
5748 case ISD::GlobalAddress: {
5751 return LowerGlobalAddress(MFI, Op, DAG);
5752 }
5753 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
5754 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
5755 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
5756 case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
5757 case ISD::INSERT_SUBVECTOR:
5758 return lowerINSERT_SUBVECTOR(Op, DAG);
5759 case ISD::INSERT_VECTOR_ELT:
5760 return lowerINSERT_VECTOR_ELT(Op, DAG);
5761 case ISD::EXTRACT_VECTOR_ELT:
5762 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
5763 case ISD::VECTOR_SHUFFLE:
5764 return lowerVECTOR_SHUFFLE(Op, DAG);
5765 case ISD::SCALAR_TO_VECTOR:
5766 return lowerSCALAR_TO_VECTOR(Op, DAG);
5767 case ISD::BUILD_VECTOR:
5768 return lowerBUILD_VECTOR(Op, DAG);
5769 case ISD::FP_ROUND:
5770 case ISD::STRICT_FP_ROUND:
5771 return lowerFP_ROUND(Op, DAG);
5772 case ISD::FPTRUNC_ROUND: {
5773 unsigned Opc;
5774 SDLoc DL(Op);
5775
5776 if (Op.getOperand(0)->getValueType(0) != MVT::f32)
5777 return SDValue();
5778
5779 // Get the rounding mode from the last operand
5780 int RoundMode = Op.getConstantOperandVal(1);
5781 if (RoundMode == (int)RoundingMode::TowardPositive)
5783 else if (RoundMode == (int)RoundingMode::TowardNegative)
5785 else
5786 return SDValue();
5787
5788 return DAG.getNode(Opc, DL, Op.getNode()->getVTList(), Op->getOperand(0));
5789 }
5790 case ISD::TRAP:
5791 return lowerTRAP(Op, DAG);
5792 case ISD::DEBUGTRAP:
5793 return lowerDEBUGTRAP(Op, DAG);
5794 case ISD::FABS:
5795 case ISD::FNEG:
5796 case ISD::FCANONICALIZE:
5797 case ISD::BSWAP:
5798 return splitUnaryVectorOp(Op, DAG);
5799 case ISD::FMINNUM:
5800 case ISD::FMAXNUM:
5801 return lowerFMINNUM_FMAXNUM(Op, DAG);
5802 case ISD::FLDEXP:
5803 case ISD::STRICT_FLDEXP:
5804 return lowerFLDEXP(Op, DAG);
5805 case ISD::FMA:
5806 return splitTernaryVectorOp(Op, DAG);
5807 case ISD::FP_TO_SINT:
5808 case ISD::FP_TO_UINT:
5809 return LowerFP_TO_INT(Op, DAG);
5810 case ISD::SHL:
5811 case ISD::SRA:
5812 case ISD::SRL:
5813 case ISD::ADD:
5814 case ISD::SUB:
5815 case ISD::SMIN:
5816 case ISD::SMAX:
5817 case ISD::UMIN:
5818 case ISD::UMAX:
5819 case ISD::FADD:
5820 case ISD::FMUL:
5821 case ISD::FMINNUM_IEEE:
5822 case ISD::FMAXNUM_IEEE:
5823 case ISD::FMINIMUM:
5824 case ISD::FMAXIMUM:
5825 case ISD::UADDSAT:
5826 case ISD::USUBSAT:
5827 case ISD::SADDSAT:
5828 case ISD::SSUBSAT:
5829 return splitBinaryVectorOp(Op, DAG);
5830 case ISD::MUL:
5831 return lowerMUL(Op, DAG);
5832 case ISD::SMULO:
5833 case ISD::UMULO:
5834 return lowerXMULO(Op, DAG);
5835 case ISD::SMUL_LOHI:
5836 case ISD::UMUL_LOHI:
5837 return lowerXMUL_LOHI(Op, DAG);
5838 case ISD::DYNAMIC_STACKALLOC:
5839 return LowerDYNAMIC_STACKALLOC(Op, DAG);
5840 case ISD::STACKSAVE:
5841 return LowerSTACKSAVE(Op, DAG);
5842 case ISD::GET_ROUNDING:
5843 return lowerGET_ROUNDING(Op, DAG);
5844 case ISD::SET_ROUNDING:
5845 return lowerSET_ROUNDING(Op, DAG);
5846 case ISD::PREFETCH:
5847 return lowerPREFETCH(Op, DAG);
5848 case ISD::FP_EXTEND:
5849 case ISD::STRICT_FP_EXTEND:
5850 return lowerFP_EXTEND(Op, DAG);
5851 case ISD::GET_FPENV:
5852 return lowerGET_FPENV(Op, DAG);
5853 case ISD::SET_FPENV:
5854 return lowerSET_FPENV(Op, DAG);
5855 }
5856 return SDValue();
5857}
5858
5859// Used for D16: Casts the result of an instruction into the right vector,
5860// packs values if loads return unpacked values.
5862 const SDLoc &DL,
5863 SelectionDAG &DAG, bool Unpacked) {
5864 if (!LoadVT.isVector())
5865 return Result;
5866
5867 // Cast back to the original packed type or to a larger type that is a
5868 // multiple of 32 bits for D16. Widening the return type is required for
5869 // legalization.
5870 EVT FittingLoadVT = LoadVT;
5871 if ((LoadVT.getVectorNumElements() % 2) == 1) {
5872 FittingLoadVT =
5874 LoadVT.getVectorNumElements() + 1);
5875 }
5876
5877 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
5878 // Truncate to v2i16/v4i16.
5879 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
5880
5881 // Work around the legalizer not scalarizing the truncate after vector op
5882 // legalization and not creating an intermediate vector trunc.
5884 DAG.ExtractVectorElements(Result, Elts);
5885 for (SDValue &Elt : Elts)
5886 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
5887
5888 // Pad illegal v1i16/v3f16 to v4i16.
5889 if ((LoadVT.getVectorNumElements() % 2) == 1)
5890 Elts.push_back(DAG.getUNDEF(MVT::i16));
5891
5892 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
5893
5894 // Bitcast to original type (v2f16/v4f16).
5895 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
5896 }
5897
5898 // Cast back to the original packed type.
5899 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
5900}
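// Example of the unpacked D16 path above (types only, no real registers): a
// dword-unpacked v2f16 load yields a v2i32 result; each element is truncated
// to i16, rebuilt into a v2i16 build_vector, and finally bitcast back to
// v2f16. Odd element counts (e.g. v3f16) are first padded with an undef lane.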
5901
5902SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
5903 MemSDNode *M,
5904 SelectionDAG &DAG,
5906 bool IsIntrinsic) const {
5907 SDLoc DL(M);
5908
5909 bool Unpacked = Subtarget->hasUnpackedD16VMem();
5910 EVT LoadVT = M->getValueType(0);
5911
5912 EVT EquivLoadVT = LoadVT;
5913 if (LoadVT.isVector()) {
5914 if (Unpacked) {
5915 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
5916 LoadVT.getVectorNumElements());
5917 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
5918 // Widen v3f16 to legal type
5919 EquivLoadVT =
5921 LoadVT.getVectorNumElements() + 1);
5922 }
5923 }
5924
5925 // Change from v4f16/v2f16 to EquivLoadVT.
5926 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
5927
5929 = DAG.getMemIntrinsicNode(
5930 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
5931 VTList, Ops, M->getMemoryVT(),
5932 M->getMemOperand());
5933
5934 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
5935
5936 return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
5937}
5938
5939SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
5940 SelectionDAG &DAG,
5941 ArrayRef<SDValue> Ops) const {
5942 SDLoc DL(M);
5943 EVT LoadVT = M->getValueType(0);
5944 EVT EltType = LoadVT.getScalarType();
5945 EVT IntVT = LoadVT.changeTypeToInteger();
5946
5947 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
5948
5949 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
5950 bool IsTFE = M->getNumValues() == 3;
5951
5952 unsigned Opc;
5953 if (IsFormat) {
5956 } else {
5957 // TODO: Support non-format TFE loads.
5958 if (IsTFE)
5959 return SDValue();
5961 }
5962
5963 if (IsD16) {
5964 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
5965 }
5966
5967 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
5968 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
5969 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand());
5970
5971 if (isTypeLegal(LoadVT)) {
5972 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
5973 M->getMemOperand(), DAG);
5974 }
5975
5976 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
5977 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
5978 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
5979 M->getMemOperand(), DAG);
5980 return DAG.getMergeValues(
5981 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
5982 DL);
5983}
5984
5986 SDNode *N, SelectionDAG &DAG) {
5987 EVT VT = N->getValueType(0);
5988 unsigned CondCode = N->getConstantOperandVal(3);
5989 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
5990 return DAG.getUNDEF(VT);
5991
5992 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
5993
5994 SDValue LHS = N->getOperand(1);
5995 SDValue RHS = N->getOperand(2);
5996
5997 SDLoc DL(N);
5998
5999 EVT CmpVT = LHS.getValueType();
6000 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
6001 unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
6002 ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6003 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
6004 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
6005 }
6006
6007 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
6008
6009 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6010 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6011
6012 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
6013 DAG.getCondCode(CCOpcode));
6014 if (VT.bitsEq(CCVT))
6015 return SetCC;
6016 return DAG.getZExtOrTrunc(SetCC, DL, VT);
6017}
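// Sketch of the lowering above (wave64 assumed for the example):
//   llvm.amdgcn.icmp.i64.i32(%a, %b, 32 /*eq*/)
// becomes (AMDGPUISD::SETCC %a, %b, seteq) producing an i64 lane mask; if the
// requested result type does not match the wavefront size, the mask is
// zero-extended or truncated to fit.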
6018
6020 SDNode *N, SelectionDAG &DAG) {
6021 EVT VT = N->getValueType(0);
6022
6023 unsigned CondCode = N->getConstantOperandVal(3);
6024 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
6025 return DAG.getUNDEF(VT);
6026
6027 SDValue Src0 = N->getOperand(1);
6028 SDValue Src1 = N->getOperand(2);
6029 EVT CmpVT = Src0.getValueType();
6030 SDLoc SL(N);
6031
6032 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
6033 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6034 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6035 }
6036
6037 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
6038 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
6039 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6040 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6041 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0,
6042 Src1, DAG.getCondCode(CCOpcode));
6043 if (VT.bitsEq(CCVT))
6044 return SetCC;
6045 return DAG.getZExtOrTrunc(SetCC, SL, VT);
6046}
6047
6049 SelectionDAG &DAG) {
6050 EVT VT = N->getValueType(0);
6051 SDValue Src = N->getOperand(1);
6052 SDLoc SL(N);
6053
6054 if (Src.getOpcode() == ISD::SETCC) {
6055 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
6056 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
6057 Src.getOperand(1), Src.getOperand(2));
6058 }
6059 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
6060 // (ballot 0) -> 0
6061 if (Arg->isZero())
6062 return DAG.getConstant(0, SL, VT);
6063
6064 // (ballot 1) -> EXEC/EXEC_LO
6065 if (Arg->isOne()) {
6066 Register Exec;
6067 if (VT.getScalarSizeInBits() == 32)
6068 Exec = AMDGPU::EXEC_LO;
6069 else if (VT.getScalarSizeInBits() == 64)
6070 Exec = AMDGPU::EXEC;
6071 else
6072 return SDValue();
6073
6074 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
6075 }
6076 }
6077
6078 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
6079 // ISD::SETNE)
6080 return DAG.getNode(
6081 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
6082 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
6083}
6084
6087 SelectionDAG &DAG) const {
6088 switch (N->getOpcode()) {
6089 case ISD::INSERT_VECTOR_ELT: {
6090 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
6091 Results.push_back(Res);
6092 return;
6093 }
6094 case ISD::EXTRACT_VECTOR_ELT: {
6095 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
6096 Results.push_back(Res);
6097 return;
6098 }
6099 case ISD::INTRINSIC_WO_CHAIN: {
6100 unsigned IID = N->getConstantOperandVal(0);
6101 switch (IID) {
6102 case Intrinsic::amdgcn_make_buffer_rsrc:
6103 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
6104 return;
6105 case Intrinsic::amdgcn_cvt_pkrtz: {
6106 SDValue Src0 = N->getOperand(1);
6107 SDValue Src1 = N->getOperand(2);
6108 SDLoc SL(N);
6109 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
6110 Src0, Src1);
6111 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
6112 return;
6113 }
6114 case Intrinsic::amdgcn_cvt_pknorm_i16:
6115 case Intrinsic::amdgcn_cvt_pknorm_u16:
6116 case Intrinsic::amdgcn_cvt_pk_i16:
6117 case Intrinsic::amdgcn_cvt_pk_u16: {
6118 SDValue Src0 = N->getOperand(1);
6119 SDValue Src1 = N->getOperand(2);
6120 SDLoc SL(N);
6121 unsigned Opcode;
6122
6123 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6125 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6127 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6129 else
6131
6132 EVT VT = N->getValueType(0);
6133 if (isTypeLegal(VT))
6134 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
6135 else {
6136 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
6137 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
6138 }
6139 return;
6140 }
6141 case Intrinsic::amdgcn_s_buffer_load: {
6142 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
6143 // s_buffer_load_u8 for both signed and unsigned loads. Next, the DAG
6144 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
6145 // (performSignExtendInRegCombine()) and replaces s_buffer_load_u8 with
6146 // s_buffer_load_i8.
6147 if (!Subtarget->hasScalarSubwordLoads())
6148 return;
6149 SDValue Op = SDValue(N, 0);
6150 SDValue Rsrc = Op.getOperand(1);
6151 SDValue Offset = Op.getOperand(2);
6152 SDValue CachePolicy = Op.getOperand(3);
6153 EVT VT = Op.getValueType();
6154 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
6155 SDLoc DL(Op);
6157 const DataLayout &DataLayout = DAG.getDataLayout();
6158 Align Alignment =
6164 VT.getStoreSize(), Alignment);
6165 SDValue LoadVal;
6166 if (!Offset->isDivergent()) {
6167 SDValue Ops[] = {Rsrc, // source register
6168 Offset, CachePolicy};
6169 SDValue BufferLoad =
6171 DAG.getVTList(MVT::i32), Ops, VT, MMO);
6172 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
6173 } else {
6174 SDValue Ops[] = {
6175 DAG.getEntryNode(), // Chain
6176 Rsrc, // rsrc
6177 DAG.getConstant(0, DL, MVT::i32), // vindex
6178 {}, // voffset
6179 {}, // soffset
6180 {}, // offset
6181 CachePolicy, // cachepolicy
6182 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
6183 };
6184 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
6185 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
6186 }
6187 Results.push_back(LoadVal);
6188 return;
6189 }
6190 }
6191 break;
6192 }
6193 case ISD::INTRINSIC_W_CHAIN: {
6194 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
6195 if (Res.getOpcode() == ISD::MERGE_VALUES) {
6196 // FIXME: Hacky
6197 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
6198 Results.push_back(Res.getOperand(I));
6199 }
6200 } else {
6201 Results.push_back(Res);
6202 Results.push_back(Res.getValue(1));
6203 }
6204 return;
6205 }
6206
6207 break;
6208 }
6209 case ISD::SELECT: {
6210 SDLoc SL(N);
6211 EVT VT = N->getValueType(0);
6212 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
6213 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
6214 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
6215
6216 EVT SelectVT = NewVT;
6217 if (NewVT.bitsLT(MVT::i32)) {
6218 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
6219 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
6220 SelectVT = MVT::i32;
6221 }
6222
6223 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
6224 N->getOperand(0), LHS, RHS);
6225
6226 if (NewVT != SelectVT)
6227 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
6228 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
6229 return;
6230 }
6231 case ISD::FNEG: {
6232 if (N->getValueType(0) != MVT::v2f16)
6233 break;
6234
6235 SDLoc SL(N);
6236 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6237
6238 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
6239 BC,
6240 DAG.getConstant(0x80008000, SL, MVT::i32));
6241 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6242 return;
6243 }
6244 case ISD::FABS: {
6245 if (N->getValueType(0) != MVT::v2f16)
6246 break;
6247
6248 SDLoc SL(N);
6249 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6250
6251 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
6252 BC,
6253 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
6254 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6255 return;
6256 }
6257 case ISD::FSQRT: {
6258 if (N->getValueType(0) != MVT::f16)
6259 break;
6260 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
6261 break;
6262 }
6263 default:
6265 break;
6266 }
6267}
6268
6269/// Helper function for LowerBRCOND
6270static SDNode *findUser(SDValue Value, unsigned Opcode) {
6271
6272 SDNode *Parent = Value.getNode();
6273 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
6274 I != E; ++I) {
6275
6276 if (I.getUse().get() != Value)
6277 continue;
6278
6279 if (I->getOpcode() == Opcode)
6280 return *I;
6281 }
6282 return nullptr;
6283}
6284
6285unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
6286 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
6287 switch (Intr->getConstantOperandVal(1)) {
6288 case Intrinsic::amdgcn_if:
6289 return AMDGPUISD::IF;
6290 case Intrinsic::amdgcn_else:
6291 return AMDGPUISD::ELSE;
6292 case Intrinsic::amdgcn_loop:
6293 return AMDGPUISD::LOOP;
6294 case Intrinsic::amdgcn_end_cf:
6295 llvm_unreachable("should not occur");
6296 default:
6297 return 0;
6298 }
6299 }
6300
6301 // break, if_break, else_break are all only used as inputs to loop, not
6302 // directly as branch conditions.
6303 return 0;
6304}
6305
6307 const Triple &TT = getTargetMachine().getTargetTriple();
6311}
6312
6314 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
6315 return false;
6316
6317 // FIXME: Either avoid relying on address space here or change the default
6318 // address space for functions to avoid the explicit check.
6319 return (GV->getValueType()->isFunctionTy() ||
6322}
6323
6325 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
6326}
6327
6329 if (!GV->hasExternalLinkage())
6330 return true;
6331
6332 const auto OS = getTargetMachine().getTargetTriple().getOS();
6333 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
6334}
6335
6336 /// This transforms the control flow intrinsics to get the branch destination as
6337 /// the last parameter, and also switches the branch target with BR if the need arises.
6338SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
6339 SelectionDAG &DAG) const {
6340 SDLoc DL(BRCOND);
6341
6342 SDNode *Intr = BRCOND.getOperand(1).getNode();
6343 SDValue Target = BRCOND.getOperand(2);
6344 SDNode *BR = nullptr;
6345 SDNode *SetCC = nullptr;
6346
6347 if (Intr->getOpcode() == ISD::SETCC) {
6348 // As long as we negate the condition everything is fine
6349 SetCC = Intr;
6350 Intr = SetCC->getOperand(0).getNode();
6351
6352 } else {
6353 // Get the target from BR if we don't negate the condition
6354 BR = findUser(BRCOND, ISD::BR);
6355 assert(BR && "brcond missing unconditional branch user");
6356 Target = BR->getOperand(1);
6357 }
6358
6359 unsigned CFNode = isCFIntrinsic(Intr);
6360 if (CFNode == 0) {
6361 // This is a uniform branch so we don't need to legalize.
6362 return BRCOND;
6363 }
6364
6365 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
6366 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
6367
6368 assert(!SetCC ||
6369 (SetCC->getConstantOperandVal(1) == 1 &&
6370 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
6371 ISD::SETNE));
6372
6373 // operands of the new intrinsic call
6375 if (HaveChain)
6376 Ops.push_back(BRCOND.getOperand(0));
6377
6378 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
6379 Ops.push_back(Target);
6380
6381 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
6382
6383 // build the new intrinsic call
6384 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
6385
6386 if (!HaveChain) {
6387 SDValue Ops[] = {
6388 SDValue(Result, 0),
6389 BRCOND.getOperand(0)
6390 };
6391
6392 Result = DAG.getMergeValues(Ops, DL).getNode();
6393 }
6394
6395 if (BR) {
6396 // Give the branch instruction our target
6397 SDValue Ops[] = {
6398 BR->getOperand(0),
6399 BRCOND.getOperand(2)
6400 };
6401 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
6402 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
6403 }
6404
6405 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
6406
6407 // Copy the intrinsic results to registers
6408 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
6410 if (!CopyToReg)
6411 continue;
6412
6413 Chain = DAG.getCopyToReg(
6414 Chain, DL,
6415 CopyToReg->getOperand(1),
6416 SDValue(Result, i - 1),
6417 SDValue());
6418
6419 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
6420 }
6421
6422 // Remove the old intrinsic from the chain
6424 SDValue(Intr, Intr->getNumValues() - 1),
6425 Intr->getOperand(0));
6426
6427 return Chain;
6428}
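// Illustration of the rewrite performed above (operands abbreviated): a
// divergent branch such as
//   brcond (setcc (llvm.amdgcn.loop %mask), 1, ne), %target
// is rebuilt as (AMDGPUISD::LOOP chain, %mask, %target), i.e. the control-flow
// node takes the branch destination as its last operand, and any intrinsic
// results are copied back to their original CopyToReg users.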
6429
6430SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
6431 SelectionDAG &DAG) const {
6432 MVT VT = Op.getSimpleValueType();
6433 SDLoc DL(Op);
6434 // Checking the depth
6435 if (Op.getConstantOperandVal(0) != 0)
6436 return DAG.getConstant(0, DL, VT);
6437
6440 // Check for kernel and shader functions
6441 if (Info->isEntryFunction())
6442 return DAG.getConstant(0, DL, VT);
6443
6444 MachineFrameInfo &MFI = MF.getFrameInfo();
6445 // There is a call to @llvm.returnaddress in this function
6446 MFI.setReturnAddressIsTaken(true);
6447
6449 // Get the return address reg and mark it as an implicit live-in
6450 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF), getRegClassFor(VT, Op.getNode()->isDivergent()));
6451
6452 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
6453}
6454
6455SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG,
6456 SDValue Op,
6457 const SDLoc &DL,
6458 EVT VT) const {
6459 return Op.getValueType().bitsLE(VT) ?
6460 DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
6461 DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
6462 DAG.getTargetConstant(0, DL, MVT::i32));
6463}
6464
6465SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
6466 assert(Op.getValueType() == MVT::f16 &&
6467 "Do not know how to custom lower FP_ROUND for non-f16 type");
6468
6469 SDValue Src = Op.getOperand(0);
6470 EVT SrcVT = Src.getValueType();
6471 if (SrcVT != MVT::f64)
6472 return Op;
6473
6474 // TODO: Handle strictfp
6475 if (Op.getOpcode() != ISD::FP_ROUND)
6476 return Op;
6477
6478 SDLoc DL(Op);
6479
6480 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
6481 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
6482 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
6483}
6484
6485SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
6486 SelectionDAG &DAG) const {
6487 EVT VT = Op.getValueType();
6488 const MachineFunction &MF = DAG.getMachineFunction();
6490 bool IsIEEEMode = Info->getMode().IEEE;
6491
6492 // FIXME: Assert during selection that this is only selected for
6493 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
6494 // mode functions, but this happens to be OK since it's only done in cases
6495 // where there is known no sNaN.
6496 if (IsIEEEMode)
6497 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
6498
6499 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
6500 VT == MVT::v16bf16)
6501 return splitBinaryVectorOp(Op, DAG);
6502 return Op;
6503}
6504
6505SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
6506 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
6507 EVT VT = Op.getValueType();
6508 assert(VT == MVT::f16);
6509
6510 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
6511 EVT ExpVT = Exp.getValueType();
6512 if (ExpVT == MVT::i16)
6513 return Op;
6514
6515 SDLoc DL(Op);
6516
6517 // Correct the exponent type for f16 to i16.
6518 // Clamp the range of the exponent to the instruction's range.
6519
6520 // TODO: This should be a generic narrowing legalization, and could easily be
6521 // done for GlobalISel as well.
6522
6523 SDValue MinExp = DAG.getConstant(minIntN(16), DL, ExpVT);
6524 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
6525
6526 SDValue MaxExp = DAG.getConstant(maxIntN(16), DL, ExpVT);
6527 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
6528
6529 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
6530
6531 if (IsStrict) {
6532 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
6533 {Op.getOperand(0), Op.getOperand(1), TruncExp});
6534 }
6535
6536 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
6537}
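// Sketch of the f16 ldexp lowering above: with an i32 exponent %e,
//   (fldexp f16:%x, %e)
// becomes
//   (fldexp %x, (trunc i16 (smin (smax %e, -32768), 32767)))
// so the clamp keeps the exponent representable before narrowing to i16.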
6538
6539// Custom lowering for vector multiplications and s_mul_u64.
6540SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
6541 EVT VT = Op.getValueType();
6542
6543 // Split vector operands.
6544 if (VT.isVector())
6545 return splitBinaryVectorOp(Op, DAG);
6546
6547 assert(VT == MVT::i64 && "The following code is special for s_mul_u64");
6548
6549 // There are four ways to lower s_mul_u64:
6550 //
6551 // 1. If all the operands are uniform, then we lower it as it is.
6552 //
6553 // 2. If the operands are divergent, then we have to split s_mul_u64 into
6554 // 32-bit multiplications because there is no vector equivalent of s_mul_u64.
6555 //
6556 // 3. If the cost model decides that it is more efficient to use vector
6557 // registers, then we have to split s_mul_u64 into 32-bit multiplications.
6558 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp.
6559 //
6560 // 4. If the cost model decides to use vector registers and both of the
6561 // operands are zero-extended/sign-extended from 32 bits, then we split the
6562 // s_mul_u64 into two 32-bit multiplications. The problem is that it is not
6563 // possible to check whether the operands are zero-extended or sign-extended in
6564 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
6565 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
6566 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
6567 // If the cost model decides that we have to use vector registers, then
6568 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits s_mul_u64_u32_pseudo/
6569 // s_mul_i64_i32_pseudo into two vector multiplications. If the cost model
6570 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
6571 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
6572 // SIInstrInfo.cpp.
6573
6574 if (Op->isDivergent())
6575 return SDValue();
6576
6577 SDValue Op0 = Op.getOperand(0);
6578 SDValue Op1 = Op.getOperand(1);
6579 // If all the operands are zero-extended to 32 bits, then we replace s_mul_u64
6580 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
6581 // 32 bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
6582 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
6583 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
6584 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
6585 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
6586 SDLoc SL(Op);
6587 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
6588 return SDValue(
6589 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
6590 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
6591 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
6592 if (Op0SignBits >= 33 && Op1SignBits >= 33)
6593 return SDValue(
6594 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
6595 // If all the operands are uniform, then we lower s_mul_u64 as it is.
6596 return Op;
6597}
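// Example of case 4 above (a sketch, with %a/%b as placeholders): for a
// uniform multiply where both operands are known zero-extended from 32 bits,
//   %r = mul i64 (zext i32 %a), (zext i32 %b)
// computeKnownBits reports at least 32 leading zeros on each side, so the node
// is rewritten to S_MUL_U64_U32_PSEUDO on the original 64-bit operands.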
6598
6599SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
6600 EVT VT = Op.getValueType();
6601 SDLoc SL(Op);
6602 SDValue LHS = Op.getOperand(0);
6603 SDValue RHS = Op.getOperand(1);
6604 bool isSigned = Op.getOpcode() == ISD::SMULO;
6605
6606 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
6607 const APInt &C = RHSC->getAPIntValue();
6608 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
6609 if (C.isPowerOf2()) {
6610 // smulo(x, signed_min) is same as umulo(x, signed_min).
6611 bool UseArithShift = isSigned && !C.isMinSignedValue();
6612 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
6613 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
6614 SDValue Overflow = DAG.getSetCC(SL, MVT::i1,
6615 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL,
6616 SL, VT, Result, ShiftAmt),
6617 LHS, ISD::SETNE);
6618 return DAG.getMergeValues({ Result, Overflow }, SL);
6619 }
6620 }
6621
6622 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
6624 SL, VT, LHS, RHS);
6625
6626 SDValue Sign = isSigned
6627 ? DAG.getNode(ISD::SRA, SL, VT, Result,
6628 DAG.getConstant(VT.getScalarSizeInBits() - 1, SL, MVT::i32))
6629 : DAG.getConstant(0, SL, VT);
6630 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
6631
6632 return DAG.getMergeValues({ Result, Overflow }, SL);
6633}
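// Concrete instance of the power-of-two special case above: umulo(i32 %x, 8)
// becomes { %r = shl %x, 3; %ov = setcc ne (srl %r, 3), %x }, avoiding the
// high-half multiply that the generic path below would otherwise need.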
6634
6635SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
6636 if (Op->isDivergent()) {
6637 // Select to V_MAD_[IU]64_[IU]32.
6638 return Op;
6639 }
6640 if (Subtarget->hasSMulHi()) {
6641 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
6642 return SDValue();
6643 }
6644 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
6645 // calculate the high part, so we might as well do the whole thing with
6646 // V_MAD_[IU]64_[IU]32.
6647 return Op;
6648}
6649
6650SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
6651 if (!Subtarget->isTrapHandlerEnabled() ||
6653 return lowerTrapEndpgm(Op, DAG);
6654
6655 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) :
6656 lowerTrapHsaQueuePtr(Op, DAG);
6657}
6658
6659SDValue SITargetLowering::lowerTrapEndpgm(
6660 SDValue Op, SelectionDAG &DAG) const {
6661 SDLoc SL(Op);
6662 SDValue Chain = Op.getOperand(0);
6663 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
6664}
6665
6666SDValue SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
6667 const SDLoc &DL, Align Alignment, ImplicitParameter Param) const {
6670 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
6672 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
6675}
6676
6677SDValue SITargetLowering::lowerTrapHsaQueuePtr(
6678 SDValue Op, SelectionDAG &DAG) const {
6679 SDLoc SL(Op);
6680 SDValue Chain = Op.getOperand(0);
6681
6682 SDValue QueuePtr;
6683 // For code object version 5, QueuePtr is passed through implicit kernarg.
6684 const Module *M = DAG.getMachineFunction().getFunction().getParent();
6686 QueuePtr =
6687 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
6688 } else {
6691 Register UserSGPR = Info->getQueuePtrUserSGPR();
6692
6693 if (UserSGPR == AMDGPU::NoRegister) {
6694 // We probably are in a function incorrectly marked with
6695 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
6696 // trap, so just use a null pointer.
6697 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
6698 } else {
6699 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
6700 MVT::i64);
6701 }
6702 }
6703
6704 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
6705 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
6706 QueuePtr, SDValue());
6707
6709 SDValue Ops[] = {
6710 ToReg,
6711 DAG.getTargetConstant(TrapID, SL, MVT::i16),
6712 SGPR01,
6713 ToReg.getValue(1)
6714 };
6715 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6716}
6717
6718SDValue SITargetLowering::lowerTrapHsa(
6719 SDValue Op, SelectionDAG &DAG) const {
6720 SDLoc SL(Op);
6721 SDValue Chain = Op.getOperand(0);
6722
6723 // We need to simulate the 's_trap 2' instruction on targets that run in
6724 // PRIV=1 (where it is treated as a nop).
6725 if (Subtarget->hasPrivEnabledTrap2NopBug())
6726 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
6727
6729 SDValue Ops[] = {
6730 Chain,
6731 DAG.getTargetConstant(TrapID, SL, MVT::i16)
6732 };
6733 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6734}
6735
6736SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
6737 SDLoc SL(Op);
6738 SDValue Chain = Op.getOperand(0);
6740
6741 if (!Subtarget->isTrapHandlerEnabled() ||
6744 "debugtrap handler not supported",
6745 Op.getDebugLoc(),
6746 DS_Warning);
6747 LLVMContext &Ctx = MF.getFunction().getContext();
6748 Ctx.diagnose(NoTrap);
6749 return Chain;
6750 }
6751
6753 SDValue Ops[] = {
6754 Chain,
6755 DAG.getTargetConstant(TrapID, SL, MVT::i16)
6756 };
6757 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6758}
6759
6760SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
6761 SelectionDAG &DAG) const {
6762 if (Subtarget->hasApertureRegs()) {
6763 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
6764 ? AMDGPU::SRC_SHARED_BASE
6765 : AMDGPU::SRC_PRIVATE_BASE;
6766 // Note: this feature (register) is broken. When used as a 32-bit operand,
6767 // it returns a wrong value (all zeroes?). The real value is in the upper 32
6768 // bits.
6769 //
6770 // To work around the issue, directly emit a 64 bit mov from this register
6771 // then extract the high bits. Note that this shouldn't even result in a
6772 // shift being emitted and simply become a pair of registers (e.g.):
6773 // s_mov_b64 s[6:7], src_shared_base
6774 // v_mov_b32_e32 v1, s7
6775 //
6776 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
6777 // coalescing would kick in and it would think it's okay to use the "HI"
6778 // subregister directly (instead of extracting the HI 32 bits) which is an
6779 // artificial (unusable) register.
6780 // Register TableGen definitions would need an overhaul to get rid of the
6781 // artificial "HI" aperture registers and prevent this kind of issue from
6782 // happening.
6783 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
6784 DAG.getRegister(ApertureRegNo, MVT::i64));
6785 return DAG.getNode(
6786 ISD::TRUNCATE, DL, MVT::i32,
6787 DAG.getNode(ISD::SRL, DL, MVT::i64,
6788 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
6789 }
6790
6791 // For code object version 5, private_base and shared_base are passed through
6792 // implicit kernargs.
6793 const Module *M = DAG.getMachineFunction().getFunction().getParent();
6797 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
6798 }
6799
6802 Register UserSGPR = Info->getQueuePtrUserSGPR();
6803 if (UserSGPR == AMDGPU::NoRegister) {
6804 // We probably are in a function incorrectly marked with
6805 // amdgpu-no-queue-ptr. This is undefined.
6806 return DAG.getUNDEF(MVT::i32);
6807 }
6808
6809 SDValue QueuePtr = CreateLiveInRegister(
6810 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
6811
6812 // Offset into amd_queue_t for group_segment_aperture_base_hi /
6813 // private_segment_aperture_base_hi.
6814 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
6815
6816 SDValue Ptr =
6817 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
6818
6819 // TODO: Use custom target PseudoSourceValue.
6820 // TODO: We should use the value from the IR intrinsic call, but it might not
6821 // be available and how do we get it?
6823 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
6824 commonAlignment(Align(64), StructOffset),
6827}
6828
6829/// Return true if the value is a known valid address, such that a null check is
6830/// not necessary.
6832 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
6833 if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
6834 isa<BasicBlockSDNode>(Val))
6835 return true;
6836
6837 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
6838 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
6839
6840 // TODO: Search through arithmetic, handle arguments and loads
6841 // marked nonnull.
6842 return false;
6843}
6844
6845SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
6846 SelectionDAG &DAG) const {
6847 SDLoc SL(Op);
6848
6849 const AMDGPUTargetMachine &TM =
6850 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
6851
6852 unsigned DestAS, SrcAS;
6853 SDValue Src;
6854 bool IsNonNull = false;
6855 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
6856 SrcAS = ASC->getSrcAddressSpace();
6857 Src = ASC->getOperand(0);
6858 DestAS = ASC->getDestAddressSpace();
6859 } else {
6860 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
6861 Op.getConstantOperandVal(0) ==
6862 Intrinsic::amdgcn_addrspacecast_nonnull);
6863 Src = Op->getOperand(1);
6864 SrcAS = Op->getConstantOperandVal(2);
6865 DestAS = Op->getConstantOperandVal(3);
6866 IsNonNull = true;
6867 }
6868
6869 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
6870
6871 // flat -> local/private
6872 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
6873 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
6874 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
6875 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
6876
6877 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
6878 return Ptr;
6879
6880 unsigned NullVal = TM.getNullPointerValue(DestAS);
6881 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
6882 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
6883
6884 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
6885 SegmentNullPtr);
6886 }
6887 }
6888
6889 // local/private -> flat
6890 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
6891 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
6892 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
6893
6894 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
6895 SDValue CvtPtr =
6896 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
6897 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
6898
6899 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
6900 return CvtPtr;
6901
6902 unsigned NullVal = TM.getNullPointerValue(SrcAS);
6903 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
6904
6905 SDValue NonNull
6906 = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
6907
6908 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
6909 FlatNullPtr);
6910 }
6911 }
6912
6913 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
6914 Op.getValueType() == MVT::i64) {
6917 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
6918 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
6919 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
6920 }
6921
6922 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
6923 Src.getValueType() == MVT::i64)
6924 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
6925
6926 // global <-> flat are no-ops and never emitted.
6927
6928 const MachineFunction &MF = DAG.getMachineFunction();
6929 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
6930 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
6931 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
6932
6933 return DAG.getUNDEF(Op->getValueType(0));
6934}
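// Sketch of the local/private -> flat path above: the 32-bit segment pointer
// is packed with the 32-bit aperture base as (build_vector %ptr, %aperture)
// and bitcast to i64; unless the source is known non-null, a compare against
// the segment null value selects the flat null pointer instead of the packed
// address.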
6935
6936// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
6937// the small vector and inserting them into the big vector. That is better than
6938// the default expansion of doing it via a stack slot. Even though the use of
6939// the stack slot would be optimized away afterwards, the stack slot itself
6940// remains.
6941SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
6942 SelectionDAG &DAG) const {
6943 SDValue Vec = Op.getOperand(0);
6944 SDValue Ins = Op.getOperand(1);
6945 SDValue Idx = Op.getOperand(2);
6946 EVT VecVT = Vec.getValueType();
6947 EVT InsVT = Ins.getValueType();
6948 EVT EltVT = VecVT.getVectorElementType();
6949 unsigned InsNumElts = InsVT.getVectorNumElements();
6950 unsigned IdxVal = Idx->getAsZExtVal();
6951 SDLoc SL(Op);
6952
6953 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
6954 // Insert 32-bit registers at a time.
6955 assert(InsNumElts % 2 == 0 && "expect legal vector types");
6956
6957 unsigned VecNumElts = VecVT.getVectorNumElements();
6958 EVT NewVecVT =
6959 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
6960 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
6962 MVT::i32, InsNumElts / 2);
6963
6964 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
6965 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
6966
6967 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
6968 SDValue Elt;
6969 if (InsNumElts == 2) {
6970 Elt = Ins;
6971 } else {
6972 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
6973 DAG.getConstant(I, SL, MVT::i32));
6974 }
6975 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
6976 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
6977 }
6978
6979 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
6980 }
6981
6982 for (unsigned I = 0; I != InsNumElts; ++I) {
6983 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
6984 DAG.getConstant(I, SL, MVT::i32));
6985 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
6986 DAG.getConstant(IdxVal + I, SL, MVT::i32));
6987 }
6988 return Vec;
6989}
6990
6991SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
6992 SelectionDAG &DAG) const {
6993 SDValue Vec = Op.getOperand(0);
6994 SDValue InsVal = Op.getOperand(1);
6995 SDValue Idx = Op.getOperand(2);
6996 EVT VecVT = Vec.getValueType();
6997 EVT EltVT = VecVT.getVectorElementType();
6998 unsigned VecSize = VecVT.getSizeInBits();
6999 unsigned EltSize = EltVT.getSizeInBits();
7000 SDLoc SL(Op);
7001
7002 // Specially handle the case of v4i16 with static indexing.
7003 unsigned NumElts = VecVT.getVectorNumElements();
7004 auto KIdx = dyn_cast<ConstantSDNode>(Idx);
7005 if (NumElts == 4 && EltSize == 16 && KIdx) {
7006 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
7007
7008 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7009 DAG.getConstant(0, SL, MVT::i32));
7010 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7011 DAG.getConstant(1, SL, MVT::i32));
7012
7013 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
7014 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
7015
7016 unsigned Idx = KIdx->getZExtValue();
7017 bool InsertLo = Idx < 2;
7018 SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
7019 InsertLo ? LoVec : HiVec,
7020 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
7021 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
7022
7023 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
7024
7025 SDValue Concat = InsertLo ?
7026 DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
7027 DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
7028
7029 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
7030 }
7031
7032 // Static indexing does not lower to stack access, and hence there is no need
7033 // for special custom lowering to avoid stack access.
7034 if (isa<ConstantSDNode>(Idx))
7035 return SDValue();
7036
7037 // Avoid stack access for dynamic indexing by custom lowering to
7038 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
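// Worked example (sketch): for 16-bit elements, ScaledIdx = idx << 4; with
// idx == 1 the mask is 0xffff << 16 == 0xffff0000, so the BFI keeps the new
// value's bits in the upper half and the original vector's bits elsewhere.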
7039
7040 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
7041
7042 MVT IntVT = MVT::getIntegerVT(VecSize);
7043
7044 // Convert vector index to bit-index and get the required bit mask.
7045 assert(isPowerOf2_32(EltSize));
7046 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
7047 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7048 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7049 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
7050 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
7051
7052 // 1. Create a congruent vector with the target value in each element.
7053 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
7054 DAG.getSplatBuildVector(VecVT, SL, InsVal));
7055
7056 // 2. Mask off all other indices except the required index within (1).
7057 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
7058
7059 // 3. Mask off the required index within the target vector.
7060 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7061 SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
7062 DAG.getNOT(SL, BFM, IntVT), BCVec);
7063
7064 // 4. Get (2) and (3) ORed into the target vector.
7065 SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
7066
7067 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
7068}
7069
7070SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
7071 SelectionDAG &DAG) const {
7072 SDLoc SL(Op);
7073
7074 EVT ResultVT = Op.getValueType();
7075 SDValue Vec = Op.getOperand(0);
7076 SDValue Idx = Op.getOperand(1);
7077 EVT VecVT = Vec.getValueType();
7078 unsigned VecSize = VecVT.getSizeInBits();
7079 EVT EltVT = VecVT.getVectorElementType();
7080
7081 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
7082
7083 // Make sure we do any optimizations that will make it easier to fold
7084 // source modifiers before obscuring it with bit operations.
7085
7086 // XXX - Why doesn't this get called when vector_shuffle is expanded?
7087 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
7088 return Combined;
7089
7090 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
7091 SDValue Lo, Hi;
7092 EVT LoVT, HiVT;
7093 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
7094
7095 if (VecSize == 128) {
7096 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
7097 Lo = DAG.getBitcast(LoVT,
7098 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7099 DAG.getConstant(0, SL, MVT::i32)));
7100 Hi = DAG.getBitcast(HiVT,
7101 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7102 DAG.getConstant(1, SL, MVT::i32)));
7103 } else if (VecSize == 256) {
7104 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
7105 SDValue Parts[4];
7106 for (unsigned P = 0; P < 4; ++P) {
7107 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7108 DAG.getConstant(P, SL, MVT::i32));
7109 }
7110
7111 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7112 Parts[0], Parts[1]));
7113 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7114 Parts[2], Parts[3]));
7115 } else {
7116 assert(VecSize == 512);
7117
7118 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
7119 SDValue Parts[8];
7120 for (unsigned P = 0; P < 8; ++P) {
7121 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7122 DAG.getConstant(P, SL, MVT::i32));
7123 }
7124
7125 Lo = DAG.getBitcast(LoVT,
7126 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7127 Parts[0], Parts[1], Parts[2], Parts[3]));
7128 Hi = DAG.getBitcast(HiVT,
7129 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7130 Parts[4], Parts[5], Parts[6], Parts[7]));
7131 }
7132
7133 EVT IdxVT = Idx.getValueType();
7134 unsigned NElem = VecVT.getVectorNumElements();
7135 assert(isPowerOf2_32(NElem));
7136 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
7137 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
7138 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
7139 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
7140 }
7141
7142 assert(VecSize <= 64);
7143
7144 MVT IntVT = MVT::getIntegerVT(VecSize);
7145
7146 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
7147 SDValue VecBC = peekThroughBitcasts(Vec);
7148 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
7149 SDValue Src = VecBC.getOperand(0);
7150 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
7151 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
7152 }
7153
7154 unsigned EltSize = EltVT.getSizeInBits();
7155 assert(isPowerOf2_32(EltSize));
7156
7157 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7158
7159 // Convert vector index to bit-index (* EltSize)
7160 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7161
7162 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7163 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
7164
7165 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
7166 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
7167 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
7168 }
7169
7170 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
7171}
7172
7173static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
7174 assert(Elt % 2 == 0);
7175 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7176}
7177
7178SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
7179 SelectionDAG &DAG) const {
7180 SDLoc SL(Op);
7181 EVT ResultVT = Op.getValueType();
7182 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
7183
7184 EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
7185 EVT EltVT = PackVT.getVectorElementType();
7186 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
7187
7188 // vector_shuffle <0,1,6,7> lhs, rhs
7189 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
7190 //
7191 // vector_shuffle <6,7,2,3> lhs, rhs
7192 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
7193 //
7194 // vector_shuffle <6,7,0,1> lhs, rhs
7195 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
7196
7197 // Avoid scalarizing when both halves are reading from consecutive elements.
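// A non-contiguous pair such as <1,4> cannot be taken as one packed slice, so
// the else branch below extracts both elements individually and rebuilds them
// with a two-element build_vector before the final concat_vectors.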
7198 SmallVector<SDValue, 16> Pieces;
7199 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
7200 if (elementPairIsContiguous(SVN->getMask(), I)) {
7201 const int Idx = SVN->getMaskElt(I);
7202 int VecIdx = Idx < SrcNumElts ? 0 : 1;
7203 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
7204 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL,
7205 PackVT, SVN->getOperand(VecIdx),
7206 DAG.getConstant(EltIdx, SL, MVT::i32));
7207 Pieces.push_back(SubVec);
7208 } else {
7209 const int Idx0 = SVN->getMaskElt(I);
7210 const int Idx1 = SVN->getMaskElt(I + 1);
7211 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
7212 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
7213 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
7214 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
7215
7216 SDValue Vec0 = SVN->getOperand(VecIdx0);
7217 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7218 Vec0, DAG.getConstant(EltIdx0, SL, MVT::i32));
7219
7220 SDValue Vec1 = SVN->getOperand(VecIdx1);
7221 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7222 Vec1, DAG.getConstant(EltIdx1, SL, MVT::i32));
7223 Pieces.push_back(DAG.getBuildVector(PackVT, SL, { Elt0, Elt1 }));
7224 }
7225 }
7226
7227 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
7228}
7229
7230SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
7231 SelectionDAG &DAG) const {
7232 SDValue SVal = Op.getOperand(0);
7233 EVT ResultVT = Op.getValueType();
7234 EVT SValVT = SVal.getValueType();
7235 SDValue UndefVal = DAG.getUNDEF(SValVT);
7236 SDLoc SL(Op);
7237
7238 SmallVector<SDValue, 16> VElts;
7239 VElts.push_back(SVal);
7240 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
7241 VElts.push_back(UndefVal);
7242
7243 return DAG.getBuildVector(ResultVT, SL, VElts);
7244}
7245
7246SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
7247 SelectionDAG &DAG) const {
7248 SDLoc SL(Op);
7249 EVT VT = Op.getValueType();
7250
7251 if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
7252 VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
7253 EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
7254 VT.getVectorNumElements() / 2);
7255 MVT HalfIntVT = MVT::getIntegerVT(HalfVT.getSizeInBits());
7256
7257 // Turn into pair of packed build_vectors.
7258 // TODO: Special case for constants that can be materialized with s_mov_b64.
7259 SmallVector<SDValue, 4> LoOps, HiOps;
7260 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I != E; ++I) {
7261 LoOps.push_back(Op.getOperand(I));
7262 HiOps.push_back(Op.getOperand(I + E));
7263 }
7264 SDValue Lo = DAG.getBuildVector(HalfVT, SL, LoOps);
7265 SDValue Hi = DAG.getBuildVector(HalfVT, SL, HiOps);
7266
7267 SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Lo);
7268 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Hi);
7269
7270 SDValue Blend = DAG.getBuildVector(MVT::getVectorVT(HalfIntVT, 2), SL,
7271 { CastLo, CastHi });
7272 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7273 }
7274
7275 if (VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16) {
7276 EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
7277 VT.getVectorNumElements() / 4);
7278 MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
7279
7280 SmallVector<SDValue, 4> Parts[4];
7281 for (unsigned I = 0, E = VT.getVectorNumElements() / 4; I != E; ++I) {
7282 for (unsigned P = 0; P < 4; ++P)
7283 Parts[P].push_back(Op.getOperand(I + P * E));
7284 }
7285 SDValue Casts[4];
7286 for (unsigned P = 0; P < 4; ++P) {
7287 SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
7288 Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
7289 }
7290
7291 SDValue Blend =
7292 DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 4), SL, Casts);
7293 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7294 }
7295
7296 if (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v32bf16) {
7297 EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
7298 VT.getVectorNumElements() / 8);
7299 MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
7300
7301 SmallVector<SDValue, 8> Parts[8];
7302 for (unsigned I = 0, E = VT.getVectorNumElements() / 8; I != E; ++I) {
7303 for (unsigned P = 0; P < 8; ++P)
7304 Parts[P].push_back(Op.getOperand(I + P * E));
7305 }
7306 SDValue Casts[8];
7307 for (unsigned P = 0; P < 8; ++P) {
7308 SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
7309 Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
7310 }
7311
7312 SDValue Blend =
7313 DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 8), SL, Casts);
7314 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7315 }
7316
7317 assert(VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16);
7318 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
7319
7320 SDValue Lo = Op.getOperand(0);
7321 SDValue Hi = Op.getOperand(1);
7322
7323 // Avoid adding defined bits with the zero_extend.
7324 if (Hi.isUndef()) {
7325 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7326 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
7327 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
7328 }
7329
7330 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
7331 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
7332
7333 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
7334 DAG.getConstant(16, SL, MVT::i32));
7335 if (Lo.isUndef())
7336 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
7337
7338 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7339 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
7340
7341 SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
7342 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
7343}
7344
7345bool
7346SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
7347 // OSes that use ELF REL relocations (instead of RELA) can only store a
7348 // 32-bit addend in the instruction, so it is not safe to allow offset folding
7349 // which can create arbitrary 64-bit addends. (This is only a problem for
7350 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
7351 // the high 32 bits of the addend.)
7352 //
7353 // This should be kept in sync with how HasRelocationAddend is initialized in
7354 // the constructor of ELFAMDGPUAsmBackend.
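// For example, folding an offset like 0x100000000 into a global address would
// need the high half of a 64-bit addend, which a REL-style *_HI relocation
// cannot carry in its 32-bit instruction field.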
7355 if (!Subtarget->isAmdHsaOS())
7356 return false;
7357
7358 // We can fold offsets for anything that doesn't require a GOT relocation.
7359 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
7360 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
7361 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
7362 !shouldEmitGOTReloc(GA->getGlobal());
7363}
7364
7365static SDValue
7366buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
7367 const SDLoc &DL, int64_t Offset, EVT PtrVT,
7368 unsigned GAFlags = SIInstrInfo::MO_NONE) {
7369 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
7370 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
7371 // lowered to the following code sequence:
7372 //
7373 // For constant address space:
7374 // s_getpc_b64 s[0:1]
7375 // s_add_u32 s0, s0, $symbol
7376 // s_addc_u32 s1, s1, 0
7377 //
7378 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7379 // a fixup or relocation is emitted to replace $symbol with a literal
7380 // constant, which is a pc-relative offset from the encoding of the $symbol
7381 // operand to the global variable.
7382 //
7383 // For global address space:
7384 // s_getpc_b64 s[0:1]
7385 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
7386 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
7387 //
7388 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7389 // fixups or relocations are emitted to replace $symbol@*@lo and
7390 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
7391 // which is a 64-bit pc-relative offset from the encoding of the $symbol
7392 // operand to the global variable.
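// On ELF these fixups typically become R_AMDGPU_REL32_LO/R_AMDGPU_REL32_HI (or
// R_AMDGPU_GOTPCREL32_LO/_HI when the address is loaded through the GOT); the
// exact relocation kind is chosen later by the MC layer, not here.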
7393 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
7394 SDValue PtrHi;
7395 if (GAFlags == SIInstrInfo::MO_NONE)
7396 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
7397 else
7398 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
7399 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
7400}
7401
7402SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
7403 SDValue Op,
7404 SelectionDAG &DAG) const {
7405 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
7406 SDLoc DL(GSD);
7407 EVT PtrVT = Op.getValueType();
7408
7409 const GlobalValue *GV = GSD->getGlobal();
7410 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
7411 shouldUseLDSConstAddress(GV)) ||
7412 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
7413 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
7414 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
7415 GV->hasExternalLinkage()) {
7416 Type *Ty = GV->getValueType();
7417 // HIP uses an unsized array `extern __shared__ T s[]` (or a similar
7418 // zero-sized type in other languages) to declare dynamic shared
7419 // memory whose size is not known at compile time. Such arrays are
7420 // allocated by the runtime and placed directly after the statically
7421 // allocated ones, so they all share the same offset.
7422 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
7423 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
7424 // Adjust alignment for that dynamic shared memory array.
7425 Function &F = DAG.getMachineFunction().getFunction();
7426 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
7427 MFI->setUsesDynamicLDS(true);
7428 return SDValue(
7429 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
7430 }
7431 }
7432 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
7433 }
7434
7435 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
7436 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
7437 SIInstrInfo::MO_ABS32_LO);
7438 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
7439 }
7440
7441 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
7442 SDValue AddrLo = DAG.getTargetGlobalAddress(
7443 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
7444 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
7445
7446 SDValue AddrHi = DAG.getTargetGlobalAddress(
7447 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
7448 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
7449
7450 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
7451 }
7452
7453 if (shouldEmitFixup(GV))
7454 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
7455
7456 if (shouldEmitPCReloc(GV))
7457 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
7458 SIInstrInfo::MO_REL32);
7459
7460 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
7461 SIInstrInfo::MO_GOTPCREL32);
7462
7463 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
7464 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
7465 const DataLayout &DataLayout = DAG.getDataLayout();
7466 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
7467 MachinePointerInfo PtrInfo
7468 = MachinePointerInfo::getGOT(DAG.getMachineFunction());
7469
7470 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
7471 MachineMemOperand::MODereferenceable |
7472 MachineMemOperand::MOInvariant);
7473}
7474
7475SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
7476 const SDLoc &DL, SDValue V) const {
7477 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
7478 // the destination register.
7479 //
7480 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
7481 // so we will end up with redundant moves to m0.
7482 //
7483 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
7484
7485 // A Null SDValue creates a glue result.
7486 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
7487 V, Chain);
7488 return SDValue(M0, 0);
7489}
7490
7491SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
7492 SDValue Op,
7493 MVT VT,
7494 unsigned Offset) const {
7495 SDLoc SL(Op);
7496 SDValue Param = lowerKernargMemParameter(
7497 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
7498 // The local size values will have the hi 16-bits as zero.
7499 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
7500 DAG.getValueType(VT));
7501}
7502
7503static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
7504 EVT VT) {
7505 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
7506 "non-hsa intrinsic with hsa target",
7507 DL.getDebugLoc());
7508 DAG.getContext()->diagnose(BadIntrin);
7509 return DAG.getUNDEF(VT);
7510}
7511
7512static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
7513 EVT VT) {
7514 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
7515 "intrinsic not supported on subtarget",
7516 DL.getDebugLoc());
7517 DAG.getContext()->diagnose(BadIntrin);
7518 return DAG.getUNDEF(VT);
7519}
7520
7521static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
7522 ArrayRef<SDValue> Elts) {
7523 assert(!Elts.empty());
7524 MVT Type;
7525 unsigned NumElts = Elts.size();
7526
7527 if (NumElts <= 12) {
7528 Type = MVT::getVectorVT(MVT::f32, NumElts);
7529 } else {
7530 assert(Elts.size() <= 16);
7531 Type = MVT::v16f32;
7532 NumElts = 16;
7533 }
7534
7535 SmallVector<SDValue, 16> VecElts(NumElts);
7536 for (unsigned i = 0; i < Elts.size(); ++i) {
7537 SDValue Elt = Elts[i];
7538 if (Elt.getValueType() != MVT::f32)
7539 Elt = DAG.getBitcast(MVT::f32, Elt);
7540 VecElts[i] = Elt;
7541 }
7542 for (unsigned i = Elts.size(); i < NumElts; ++i)
7543 VecElts[i] = DAG.getUNDEF(MVT::f32);
7544
7545 if (NumElts == 1)
7546 return VecElts[0];
7547 return DAG.getBuildVector(Type, DL, VecElts);
7548}
7549
7550static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
7551 SDValue Src, int ExtraElts) {
7552 EVT SrcVT = Src.getValueType();
7553
7554 SmallVector<SDValue, 8> Elts;
7555
7556 if (SrcVT.isVector())
7557 DAG.ExtractVectorElements(Src, Elts);
7558 else
7559 Elts.push_back(Src);
7560
7561 SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
7562 while (ExtraElts--)
7563 Elts.push_back(Undef);
7564
7565 return DAG.getBuildVector(CastVT, DL, Elts);
7566}
7567
7568// Re-construct the required return value for an image load intrinsic.
7569// This is more complicated due to the optional use of TexFailCtrl, which means
7570// the required return type is an aggregate.
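// For example, a d16 load with four dmask lanes on a packed-d16 target pops
// (4 + 1) / 2 = 2 dwords of data; if TFE/LWE is enabled, one extra dword holds
// the texture-fail status and is returned as the second aggregate member.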
7571static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
7572 ArrayRef<EVT> ResultTypes, bool IsTexFail,
7573 bool Unpacked, bool IsD16, int DMaskPop,
7574 int NumVDataDwords, bool IsAtomicPacked16Bit,
7575 const SDLoc &DL) {
7576 // Determine the required return type. This is the same regardless of IsTexFail flag
7577 EVT ReqRetVT = ResultTypes[0];
7578 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
7579 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
7580 ? (ReqRetNumElts + 1) / 2
7581 : ReqRetNumElts;
7582
7583 int MaskPopDwords = (!IsD16 || (IsD16 && Unpacked)) ?
7584 DMaskPop : (DMaskPop + 1) / 2;
7585
7586 MVT DataDwordVT = NumDataDwords == 1 ?
7587 MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
7588
7589 MVT MaskPopVT = MaskPopDwords == 1 ?
7590 MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
7591
7592 SDValue Data(Result, 0);
7593 SDValue TexFail;
7594
7595 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
7596 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
7597 if (MaskPopVT.isVector()) {
7598 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
7599 SDValue(Result, 0), ZeroIdx);
7600 } else {
7601 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
7602 SDValue(Result, 0), ZeroIdx);
7603 }
7604 }
7605
7606 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
7607 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
7608 NumDataDwords - MaskPopDwords);
7609
7610 if (IsD16)
7611 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
7612
7613 EVT LegalReqRetVT = ReqRetVT;
7614 if (!ReqRetVT.isVector()) {
7615 if (!Data.getValueType().isInteger())
7616 Data = DAG.getNode(ISD::BITCAST, DL,
7617 Data.getValueType().changeTypeToInteger(), Data);
7618 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
7619 } else {
7620 // We need to widen the return vector to a legal type
7621 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
7622 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
7623 LegalReqRetVT =
7624 EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
7625 ReqRetVT.getVectorNumElements() + 1);
7626 }
7627 }
7628 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
7629
7630 if (IsTexFail) {
7631 TexFail =
7632 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
7633 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
7634
7635 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
7636 }
7637
7638 if (Result->getNumValues() == 1)
7639 return Data;
7640
7641 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
7642}
7643
7644static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
7645 SDValue *LWE, bool &IsTexFail) {
7646 auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
7647
7648 uint64_t Value = TexFailCtrlConst->getZExtValue();
7649 if (Value) {
7650 IsTexFail = true;
7651 }
7652
7653 SDLoc DL(TexFailCtrlConst);
7654 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
7655 Value &= ~(uint64_t)0x1;
7656 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
7657 Value &= ~(uint64_t)0x2;
7658
7659 return Value == 0;
7660}
7661
7662static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
7663 MVT PackVectorVT,
7664 SmallVectorImpl<SDValue> &PackedAddrs,
7665 unsigned DimIdx, unsigned EndIdx,
7666 unsigned NumGradients) {
7667 SDLoc DL(Op);
7668 for (unsigned I = DimIdx; I < EndIdx; I++) {
7669 SDValue Addr = Op.getOperand(I);
7670
7671 // Gradients are packed with undef for each coordinate.
7672 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
7673 // 1D: undef,dx/dh; undef,dx/dv
7674 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
7675 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
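// For example (2D): dx/dh and dy/dh share one dword and dx/dv, dy/dv share the
// next; in the 3D case dz/dh and dz/dv each get their own dword with an
// undefined upper half, which is the any_extend path below.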
7676 if (((I + 1) >= EndIdx) ||
7677 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
7678 I == DimIdx + NumGradients - 1))) {
7679 if (Addr.getValueType() != MVT::i16)
7680 Addr = DAG.getBitcast(MVT::i16, Addr);
7681 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
7682 } else {
7683 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
7684 I++;
7685 }
7686 Addr = DAG.getBitcast(MVT::f32, Addr);
7687 PackedAddrs.push_back(Addr);
7688 }
7689}
7690
7691SDValue SITargetLowering::lowerImage(SDValue Op,
7692 const AMDGPU::ImageDimIntrinsicInfo *Intr,
7693 SelectionDAG &DAG, bool WithChain) const {
7694 SDLoc DL(Op);
7695 MachineFunction &MF = DAG.getMachineFunction();
7696 const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
7697 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
7698 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
7699 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
7700 unsigned IntrOpcode = Intr->BaseOpcode;
7701 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
7702 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
7703 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
7704
7705 SmallVector<EVT, 3> ResultTypes(Op->values());
7706 SmallVector<EVT, 3> OrigResultTypes(Op->values());
7707 bool IsD16 = false;
7708 bool IsG16 = false;
7709 bool IsA16 = false;
7710 SDValue VData;
7711 int NumVDataDwords;
7712 bool AdjustRetType = false;
7713 bool IsAtomicPacked16Bit = false;
7714
7715 // Offset of intrinsic arguments
7716 const unsigned ArgOffset = WithChain ? 2 : 1;
7717
7718 unsigned DMask;
7719 unsigned DMaskLanes = 0;
7720
7721 if (BaseOpcode->Atomic) {
7722 VData = Op.getOperand(2);
7723
7724 IsAtomicPacked16Bit =
7725 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7726 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7727
7728 bool Is64Bit = VData.getValueSizeInBits() == 64;
7729 if (BaseOpcode->AtomicX2) {
7730 SDValue VData2 = Op.getOperand(3);
7731 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
7732 {VData, VData2});
7733 if (Is64Bit)
7734 VData = DAG.getBitcast(MVT::v4i32, VData);
7735
7736 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
7737 DMask = Is64Bit ? 0xf : 0x3;
7738 NumVDataDwords = Is64Bit ? 4 : 2;
7739 } else {
7740 DMask = Is64Bit ? 0x3 : 0x1;
7741 NumVDataDwords = Is64Bit ? 2 : 1;
7742 }
7743 } else {
7744 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
7745 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
7746
7747 if (BaseOpcode->Store) {
7748 VData = Op.getOperand(2);
7749
7750 MVT StoreVT = VData.getSimpleValueType();
7751 if (StoreVT.getScalarType() == MVT::f16) {
7752 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
7753 return Op; // D16 is unsupported for this instruction
7754
7755 IsD16 = true;
7756 VData = handleD16VData(VData, DAG, true);
7757 }
7758
7759 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
7760 } else {
7761 // Work out the num dwords based on the dmask popcount and underlying type
7762 // and whether packing is supported.
7763 MVT LoadVT = ResultTypes[0].getSimpleVT();
7764 if (LoadVT.getScalarType() == MVT::f16) {
7765 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
7766 return Op; // D16 is unsupported for this instruction
7767
7768 IsD16 = true;
7769 }
7770
7771 // Confirm that the return type is large enough for the dmask specified
7772 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
7773 (!LoadVT.isVector() && DMaskLanes > 1))
7774 return Op;
7775
7776 // The sq block of gfx8 and gfx9 does not estimate register use correctly
7777 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
7778 // instructions.
7779 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
7780 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
7781 NumVDataDwords = (DMaskLanes + 1) / 2;
7782 else
7783 NumVDataDwords = DMaskLanes;
7784
7785 AdjustRetType = true;
7786 }
7787 }
7788
7789 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
7790 SmallVector<SDValue, 4> VAddrs;
7791
7792 // Check for 16 bit addresses or derivatives and pack if true.
7793 MVT VAddrVT =
7794 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
7795 MVT VAddrScalarVT = VAddrVT.getScalarType();
7796 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
7797 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
7798
7799 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
7800 VAddrScalarVT = VAddrVT.getScalarType();
7801 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
7802 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
7803
7804 // Push back extra arguments.
7805 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
7806 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
7807 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
7808 // Special handling of bias when A16 is on. Bias is of type half but
7809 // occupies full 32-bit.
7810 SDValue Bias = DAG.getBuildVector(
7811 MVT::v2f16, DL,
7812 {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
7813 VAddrs.push_back(Bias);
7814 } else {
7815 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
7816 "Bias needs to be converted to 16 bit in A16 mode");
7817 VAddrs.push_back(Op.getOperand(ArgOffset + I));
7818 }
7819 }
7820
7821 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
7822 // 16 bit gradients are supported, but are tied to the A16 control
7823 // so both gradients and addresses must be 16 bit
7824 LLVM_DEBUG(
7825 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
7826 "require 16 bit args for both gradients and addresses");
7827 return Op;
7828 }
7829
7830 if (IsA16) {
7831 if (!ST->hasA16()) {
7832 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
7833 "support 16 bit addresses\n");
7834 return Op;
7835 }
7836 }
7837
7838 // We've dealt with incorrect input so we know that if IsA16, IsG16
7839 // are set then we have to compress/pack operands (either address,
7840 // gradient or both)
7841 // In the case where a16 and gradients are tied (no G16 support) then we
7842 // have already verified that both IsA16 and IsG16 are true
7843 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
7844 // Activate g16
7845 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
7846 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
7847 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
7848 }
7849
7850 // Add gradients (packed or unpacked)
7851 if (IsG16) {
7852 // Pack the gradients
7853 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
7854 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
7855 ArgOffset + Intr->GradientStart,
7856 ArgOffset + Intr->CoordStart, Intr->NumGradients);
7857 } else {
7858 for (unsigned I = ArgOffset + Intr->GradientStart;
7859 I < ArgOffset + Intr->CoordStart; I++)
7860 VAddrs.push_back(Op.getOperand(I));
7861 }
7862
7863 // Add addresses (packed or unpacked)
7864 if (IsA16) {
7865 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
7866 ArgOffset + Intr->CoordStart, VAddrEnd,
7867 0 /* No gradients */);
7868 } else {
7869 // Add uncompressed address
7870 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
7871 VAddrs.push_back(Op.getOperand(I));
7872 }
7873
7874 // If the register allocator cannot place the address registers contiguously
7875 // without introducing moves, then using the non-sequential address encoding
7876 // is always preferable, since it saves VALU instructions and is usually a
7877 // wash in terms of code size or even better.
7878 //
7879 // However, we currently have no way of hinting to the register allocator that
7880 // MIMG addresses should be placed contiguously when it is possible to do so,
7881 // so force non-NSA for the common 2-address case as a heuristic.
7882 //
7883 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
7884 // allocation when possible.
7885 //
7886 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
7887 // set of the remaining addresses.
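// For example (sketch): with an NSA limit of 4 and partial NSA available, a
// 6-dword address keeps its first 3 dwords as individual registers and packs
// the remaining 3 into one contiguous tuple via getBuildDwordsVector below.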
7888 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
7889 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
7890 const bool UseNSA = ST->hasNSAEncoding() &&
7891 VAddrs.size() >= ST->getNSAThreshold(MF) &&
7892 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
7893 const bool UsePartialNSA =
7894 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
7895
7896 SDValue VAddr;
7897 if (UsePartialNSA) {
7898 VAddr = getBuildDwordsVector(DAG, DL,
7899 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
7900 }
7901 else if (!UseNSA) {
7902 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
7903 }
7904
7905 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
7906 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
7907 SDValue Unorm;
7908 if (!BaseOpcode->Sampler) {
7909 Unorm = True;
7910 } else {
7911 uint64_t UnormConst =
7912 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
7913
7914 Unorm = UnormConst ? True : False;
7915 }
7916
7917 SDValue TFE;
7918 SDValue LWE;
7919 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
7920 bool IsTexFail = false;
7921 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
7922 return Op;
7923
7924 if (IsTexFail) {
7925 if (!DMaskLanes) {
7926 // Expecting to get an error flag since TFC is on - and dmask is 0
7927 // Force dmask to be at least 1 otherwise the instruction will fail
7928 DMask = 0x1;
7929 DMaskLanes = 1;
7930 NumVDataDwords = 1;
7931 }
7932 NumVDataDwords += 1;
7933 AdjustRetType = true;
7934 }
7935
7936 // Something earlier has tagged the return type as needing adjustment.
7937 // This happens if the instruction is a load or has TexFailCtrl flags set.
7938 if (AdjustRetType) {
7939 // NumVDataDwords reflects the true number of dwords required in the return type
7940 if (DMaskLanes == 0 && !BaseOpcode->Store) {
7941 // This is a no-op load. This can be eliminated
7942 SDValue Undef = DAG.getUNDEF(Op.getValueType());
7943 if (isa<MemSDNode>(Op))
7944 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
7945 return Undef;
7946 }
7947
7948 EVT NewVT = NumVDataDwords > 1 ?
7949 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumVDataDwords)
7950 : MVT::i32;
7951
7952 ResultTypes[0] = NewVT;
7953 if (ResultTypes.size() == 3) {
7954 // Original result was aggregate type used for TexFailCtrl results
7955 // The actual instruction returns as a vector type which has now been
7956 // created. Remove the aggregate result.
7957 ResultTypes.erase(&ResultTypes[1]);
7958 }
7959 }
7960
7961 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
7962 if (BaseOpcode->Atomic)
7963 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
7964 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
7965 AMDGPU::CPol::VOLATILE))
7966 return Op;
7967
7968 SmallVector<SDValue, 26> Ops;
7969 if (BaseOpcode->Store || BaseOpcode->Atomic)
7970 Ops.push_back(VData); // vdata
7971 if (UsePartialNSA) {
7972 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
7973 Ops.push_back(VAddr);
7974 }
7975 else if (UseNSA)
7976 append_range(Ops, VAddrs);
7977 else
7978 Ops.push_back(VAddr);
7979 Ops.push_back(Op.getOperand(ArgOffset + Intr->RsrcIndex));
7980 if (BaseOpcode->Sampler)
7981 Ops.push_back(Op.getOperand(ArgOffset + Intr->SampIndex));
7982 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
7983 if (IsGFX10Plus)
7984 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
7985 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
7986 Ops.push_back(Unorm);
7987 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
7988 Ops.push_back(IsA16 && // r128, a16 for gfx9
7989 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
7990 if (IsGFX10Plus)
7991 Ops.push_back(IsA16 ? True : False);
7992 if (!Subtarget->hasGFX90AInsts()) {
7993 Ops.push_back(TFE); //tfe
7994 } else if (TFE->getAsZExtVal()) {
7995 report_fatal_error("TFE is not supported on this GPU");
7996 }
7997 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
7998 Ops.push_back(LWE); // lwe
7999 if (!IsGFX10Plus)
8000 Ops.push_back(DimInfo->DA ? True : False);
8001 if (BaseOpcode->HasD16)
8002 Ops.push_back(IsD16 ? True : False);
8003 if (isa<MemSDNode>(Op))
8004 Ops.push_back(Op.getOperand(0)); // chain
8005
8006 int NumVAddrDwords =
8007 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
8008 int Opcode = -1;
8009
8010 if (IsGFX12Plus) {
8011 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
8012 NumVDataDwords, NumVAddrDwords);
8013 } else if (IsGFX11Plus) {
8014 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8015 UseNSA ? AMDGPU::MIMGEncGfx11NSA
8016 : AMDGPU::MIMGEncGfx11Default,
8017 NumVDataDwords, NumVAddrDwords);
8018 } else if (IsGFX10Plus) {
8019 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8020 UseNSA ? AMDGPU::MIMGEncGfx10NSA
8021 : AMDGPU::MIMGEncGfx10Default,
8022 NumVDataDwords, NumVAddrDwords);
8023 } else {
8024 if (Subtarget->hasGFX90AInsts()) {
8025 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
8026 NumVDataDwords, NumVAddrDwords);
8027 if (Opcode == -1)
8028 report_fatal_error(
8029 "requested image instruction is not supported on this GPU");
8030 }
8031 if (Opcode == -1 &&
8032 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8033 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
8034 NumVDataDwords, NumVAddrDwords);
8035 if (Opcode == -1)
8036 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
8037 NumVDataDwords, NumVAddrDwords);
8038 }
8039 if (Opcode == -1)
8040 return Op;
8041
8042 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
8043 if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
8044 MachineMemOperand *MemRef = MemOp->getMemOperand();
8045 DAG.setNodeMemRefs(NewNode, {MemRef});
8046 }
8047
8048 if (BaseOpcode->AtomicX2) {
8049 SmallVector<SDValue, 1> Elt;
8050 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
8051 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
8052 }
8053 if (BaseOpcode->Store)
8054 return SDValue(NewNode, 0);
8055 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
8056 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
8057 NumVDataDwords, IsAtomicPacked16Bit, DL);
8058}
8059
8060SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
8061 SDValue Offset, SDValue CachePolicy,
8062 SelectionDAG &DAG) const {
8063 MachineFunction &MF = DAG.getMachineFunction();
8064
8065 const DataLayout &DataLayout = DAG.getDataLayout();
8066 Align Alignment =
8067 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
8068
8069 MachineMemOperand *MMO = MF.getMachineMemOperand(
8070 MachinePointerInfo(),
8071 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
8072 MachineMemOperand::MOInvariant,
8073 VT.getStoreSize(), Alignment);
8074
8075 if (!Offset->isDivergent()) {
8076 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
8077
8078 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
8079 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
8080 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
8081 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
8082 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8083 SDValue BufferLoad =
8084 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
8085 DAG.getVTList(MVT::i32), Ops, VT, MMO);
8086 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
8087 }
8088
8089 // Widen vec3 load to vec4.
8090 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
8091 !Subtarget->hasScalarDwordx3Loads()) {
8092 EVT WidenedVT =
8093 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
8094 auto WidenedOp = DAG.getMemIntrinsicNode(
8095 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
8096 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
8097 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
8098 DAG.getVectorIdxConstant(0, DL));
8099 return Subvector;
8100 }
8101
8102 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
8103 DAG.getVTList(VT), Ops, VT, MMO);
8104 }
8105
8106 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
8107 // assume that the buffer is unswizzled.
8108 SDValue Ops[] = {
8109 DAG.getEntryNode(), // Chain
8110 Rsrc, // rsrc
8111 DAG.getConstant(0, DL, MVT::i32), // vindex
8112 {}, // voffset
8113 {}, // soffset
8114 {}, // offset
8115 CachePolicy, // cachepolicy
8116 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8117 };
8118 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8119 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
8120 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
8121 }
8122
8123 SmallVector<SDValue, 4> Loads;
8124 unsigned NumLoads = 1;
8125 MVT LoadVT = VT.getSimpleVT();
8126 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
8127 assert((LoadVT.getScalarType() == MVT::i32 ||
8128 LoadVT.getScalarType() == MVT::f32));
8129
8130 if (NumElts == 8 || NumElts == 16) {
8131 NumLoads = NumElts / 4;
8132 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
8133 }
8134
8135 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
8136
8137 // Use the alignment to ensure that the required offsets will fit into the
8138 // immediate offsets.
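// For example, a 16-dword result with a divergent offset is split into four
// 4-dword loads at byte offsets +0, +16, +32 and +48, so the base offset is
// aligned to 64 bytes to keep each split offset encodable as an immediate.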
8139 setBufferOffsets(Offset, DAG, &Ops[3],
8140 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
8141
8142 uint64_t InstOffset = Ops[5]->getAsZExtVal();
8143 for (unsigned i = 0; i < NumLoads; ++i) {
8144 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
8145 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
8146 LoadVT, MMO, DAG));
8147 }
8148
8149 if (NumElts == 8 || NumElts == 16)
8150 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
8151
8152 return Loads[0];
8153}
8154
8155SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
8156 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
8157 if (!Subtarget->hasArchitectedSGPRs())
8158 return {};
8159 SDLoc SL(Op);
8160 MVT VT = MVT::i32;
8161 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
8162 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
8163 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
8164}
8165
8166SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
8167 unsigned Dim,
8168 const ArgDescriptor &Arg) const {
8169 SDLoc SL(Op);
8170 MachineFunction &MF = DAG.getMachineFunction();
8171 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
8172 if (MaxID == 0)
8173 return DAG.getConstant(0, SL, MVT::i32);
8174
8175 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
8176 SDLoc(DAG.getEntryNode()), Arg);
8177
8178 // Don't bother inserting AssertZext for packed IDs since we're emitting the
8179 // masking operations anyway.
8180 //
8181 // TODO: We could assert the top bit is 0 for the source copy.
8182 if (Arg.isMasked())
8183 return Val;
8184
8185 // Preserve the known bits after expansion to a copy.
8186 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
8187 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
8188 DAG.getValueType(SmallVT));
8189}
8190
8191SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
8192 SelectionDAG &DAG) const {
8193 MachineFunction &MF = DAG.getMachineFunction();
8194 auto MFI = MF.getInfo<SIMachineFunctionInfo>();
8195
8196 EVT VT = Op.getValueType();
8197 SDLoc DL(Op);
8198 unsigned IntrinsicID = Op.getConstantOperandVal(0);
8199
8200 // TODO: Should this propagate fast-math-flags?
8201
8202 switch (IntrinsicID) {
8203 case Intrinsic::amdgcn_implicit_buffer_ptr: {
8204 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
8205 return emitNonHSAIntrinsicError(DAG, DL, VT);
8206 return getPreloadedValue(DAG, *MFI, VT,
8207 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
8208 }
8209 case Intrinsic::amdgcn_dispatch_ptr:
8210 case Intrinsic::amdgcn_queue_ptr: {
8211 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
8212 DiagnosticInfoUnsupported BadIntrin(
8213 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
8214 DL.getDebugLoc());
8215 DAG.getContext()->diagnose(BadIntrin);
8216 return DAG.getUNDEF(VT);
8217 }
8218
8219 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
8220 AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
8221 return getPreloadedValue(DAG, *MFI, VT, RegID);
8222 }
8223 case Intrinsic::amdgcn_implicitarg_ptr: {
8224 if (MFI->isEntryFunction())
8225 return getImplicitArgPtr(DAG, DL);
8226 return getPreloadedValue(DAG, *MFI, VT,
8227 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
8228 }
8229 case Intrinsic::amdgcn_kernarg_segment_ptr: {
8230 if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
8231 // This only makes sense to call in a kernel, so just lower to null.
8232 return DAG.getConstant(0, DL, VT);
8233 }
8234
8235 return getPreloadedValue(DAG, *MFI, VT,
8236 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
8237 }
8238 case Intrinsic::amdgcn_dispatch_id: {
8239 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
8240 }
8241 case Intrinsic::amdgcn_rcp:
8242 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
8243 case Intrinsic::amdgcn_rsq:
8244 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8245 case Intrinsic::amdgcn_rsq_legacy:
8246 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8247 return emitRemovedIntrinsicError(DAG, DL, VT);
8248 return SDValue();
8249 case Intrinsic::amdgcn_rcp_legacy:
8250 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8251 return emitRemovedIntrinsicError(DAG, DL, VT);
8252 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
8253 case Intrinsic::amdgcn_rsq_clamp: {
8254 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8255 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
8256
8257 Type *Type = VT.getTypeForEVT(*DAG.getContext());
8258 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
8259 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
8260
8261 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8262 SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
8263 DAG.getConstantFP(Max, DL, VT));
8264 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
8265 DAG.getConstantFP(Min, DL, VT));
8266 }
8267 case Intrinsic::r600_read_ngroups_x:
8268 if (Subtarget->isAmdHsaOS())
8269 return emitNonHSAIntrinsicError(DAG, DL, VT);
8270
8271 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8272 SI::KernelInputOffsets::NGROUPS_X, Align(4),
8273 false);
8274 case Intrinsic::r600_read_ngroups_y:
8275 if (Subtarget->isAmdHsaOS())
8276 return emitNonHSAIntrinsicError(DAG, DL, VT);
8277
8278 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8279 SI::KernelInputOffsets::NGROUPS_Y, Align(4),
8280 false);
8281 case Intrinsic::r600_read_ngroups_z:
8282 if (Subtarget->isAmdHsaOS())
8283 return emitNonHSAIntrinsicError(DAG, DL, VT);
8284
8285 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8286 SI::KernelInputOffsets::NGROUPS_Z, Align(4),
8287 false);
8288 case Intrinsic::r600_read_global_size_x:
8289 if (Subtarget->isAmdHsaOS())
8290 return emitNonHSAIntrinsicError(DAG, DL, VT);
8291
8292 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8293 SI::KernelInputOffsets::GLOBAL_SIZE_X,
8294 Align(4), false);
8295 case Intrinsic::r600_read_global_size_y:
8296 if (Subtarget->isAmdHsaOS())
8297 return emitNonHSAIntrinsicError(DAG, DL, VT);
8298
8299 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8300 SI::KernelInputOffsets::GLOBAL_SIZE_Y,
8301 Align(4), false);
8302 case Intrinsic::r600_read_global_size_z:
8303 if (Subtarget->isAmdHsaOS())
8304 return emitNonHSAIntrinsicError(DAG, DL, VT);
8305
8306 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8307 SI::KernelInputOffsets::GLOBAL_SIZE_Z,
8308 Align(4), false);
8309 case Intrinsic::r600_read_local_size_x:
8310 if (Subtarget->isAmdHsaOS())
8311 return emitNonHSAIntrinsicError(DAG, DL, VT);
8312
8313 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8314 SI::KernelInputOffsets::LOCAL_SIZE_X);
8315 case Intrinsic::r600_read_local_size_y:
8316 if (Subtarget->isAmdHsaOS())
8317 return emitNonHSAIntrinsicError(DAG, DL, VT);
8318
8319 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8320 SI::KernelInputOffsets::LOCAL_SIZE_Y);
8321 case Intrinsic::r600_read_local_size_z:
8322 if (Subtarget->isAmdHsaOS())
8323 return emitNonHSAIntrinsicError(DAG, DL, VT);
8324
8325 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8326 SI::KernelInputOffsets::LOCAL_SIZE_Z);
8327 case Intrinsic::amdgcn_workgroup_id_x:
8328 return getPreloadedValue(DAG, *MFI, VT,
8329 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
8330 case Intrinsic::amdgcn_workgroup_id_y:
8331 return getPreloadedValue(DAG, *MFI, VT,
8332 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
8333 case Intrinsic::amdgcn_workgroup_id_z:
8334 return getPreloadedValue(DAG, *MFI, VT,
8335 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
8336 case Intrinsic::amdgcn_wave_id:
8337 return lowerWaveID(DAG, Op);
8338 case Intrinsic::amdgcn_lds_kernel_id: {
8339 if (MFI->isEntryFunction())
8340 return getLDSKernelId(DAG, DL);
8341 return getPreloadedValue(DAG, *MFI, VT,
8342 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
8343 }
8344 case Intrinsic::amdgcn_workitem_id_x:
8345 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
8346 case Intrinsic::amdgcn_workitem_id_y:
8347 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
8348 case Intrinsic::amdgcn_workitem_id_z:
8349 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
8350 case Intrinsic::amdgcn_wavefrontsize:
8351 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
8352 SDLoc(Op), MVT::i32);
8353 case Intrinsic::amdgcn_s_buffer_load: {
8354 unsigned CPol = Op.getConstantOperandVal(3);
8355 // s_buffer_load, because of how it's optimized, can't be volatile
8356 // so reject ones with the volatile bit set.
8357 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
8358 ? AMDGPU::CPol::ALL
8359 : AMDGPU::CPol::ALL_pregfx12))
8360 return Op;
8361 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8362 DAG);
8363 }
8364 case Intrinsic::amdgcn_fdiv_fast:
8365 return lowerFDIV_FAST(Op, DAG);
8366 case Intrinsic::amdgcn_sin:
8367 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
8368
8369 case Intrinsic::amdgcn_cos:
8370 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
8371
8372 case Intrinsic::amdgcn_mul_u24:
8373 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1), Op.getOperand(2));
8374 case Intrinsic::amdgcn_mul_i24:
8375 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1), Op.getOperand(2));
8376
8377 case Intrinsic::amdgcn_log_clamp: {
8378 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8379 return SDValue();
8380
8381 return emitRemovedIntrinsicError(DAG, DL, VT);
8382 }
8383 case Intrinsic::amdgcn_fract:
8384 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
8385
8386 case Intrinsic::amdgcn_class:
8387 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
8388 Op.getOperand(1), Op.getOperand(2));
8389 case Intrinsic::amdgcn_div_fmas:
8390 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
8391 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8392 Op.getOperand(4));
8393
8394 case Intrinsic::amdgcn_div_fixup:
8395 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
8396 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8397
8398 case Intrinsic::amdgcn_div_scale: {
8399 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
8400
8401 // Translate to the operands expected by the machine instruction. The
8402 // first parameter must be the same as the first instruction.
8403 SDValue Numerator = Op.getOperand(1);
8404 SDValue Denominator = Op.getOperand(2);
8405
8406 // Note this order is opposite of the machine instruction's operations,
8407 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
8408 // intrinsic has the numerator as the first operand to match a normal
8409 // division operation.
8410
8411 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
8412
8413 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
8414 Denominator, Numerator);
8415 }
8416 case Intrinsic::amdgcn_icmp: {
8417 // There is a Pat that handles this variant, so return it as-is.
8418 if (Op.getOperand(1).getValueType() == MVT::i1 &&
8419 Op.getConstantOperandVal(2) == 0 &&
8420 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
8421 return Op;
8422 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
8423 }
8424 case Intrinsic::amdgcn_fcmp: {
8425 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
8426 }
8427 case Intrinsic::amdgcn_ballot:
8428 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
8429 case Intrinsic::amdgcn_fmed3:
8430 return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
8431 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8432 case Intrinsic::amdgcn_fdot2:
8433 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
8434 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8435 Op.getOperand(4));
8436 case Intrinsic::amdgcn_fmul_legacy:
8437 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
8438 Op.getOperand(1), Op.getOperand(2));
8439 case Intrinsic::amdgcn_sffbh:
8440 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
8441 case Intrinsic::amdgcn_sbfe:
8442 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
8443 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8444 case Intrinsic::amdgcn_ubfe:
8445 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
8446 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8447 case Intrinsic::amdgcn_cvt_pkrtz:
8448 case Intrinsic::amdgcn_cvt_pknorm_i16:
8449 case Intrinsic::amdgcn_cvt_pknorm_u16:
8450 case Intrinsic::amdgcn_cvt_pk_i16:
8451 case Intrinsic::amdgcn_cvt_pk_u16: {
8452 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
8453 EVT VT = Op.getValueType();
8454 unsigned Opcode;
8455
8456 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8457 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
8458 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8459 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
8460 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8461 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
8462 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8463 Opcode = AMDGPUISD::CVT_PK_I16_I32;
8464 else
8465 Opcode = AMDGPUISD::CVT_PK_U16_U32;
8466
8467 if (isTypeLegal(VT))
8468 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
8469
8470 SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
8471 Op.getOperand(1), Op.getOperand(2));
8472 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
8473 }
8474 case Intrinsic::amdgcn_fmad_ftz:
8475 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
8476 Op.getOperand(2), Op.getOperand(3));
8477
8478 case Intrinsic::amdgcn_if_break:
8479 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
8480 Op->getOperand(1), Op->getOperand(2)), 0);
8481
8482 case Intrinsic::amdgcn_groupstaticsize: {
8483 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
8484 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
8485 return Op;
8486
8487 const Module *M = MF.getFunction().getParent();
8488 const GlobalValue *GV =
8489 M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
8490 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
8491 SIInstrInfo::MO_ABS32_LO);
8492 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8493 }
8494 case Intrinsic::amdgcn_is_shared:
8495 case Intrinsic::amdgcn_is_private: {
8496 SDLoc SL(Op);
8497 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ?
8498 AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
8499 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
8500 SDValue SrcVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32,
8501 Op.getOperand(1));
8502
8503 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
8504 DAG.getConstant(1, SL, MVT::i32));
8505 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
8506 }
8507 case Intrinsic::amdgcn_perm:
8508 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
8509 Op.getOperand(2), Op.getOperand(3));
8510 case Intrinsic::amdgcn_reloc_constant: {
8511 Module *M = const_cast<Module *>(MF.getFunction().getParent());
8512 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
8513 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
8514 auto RelocSymbol = cast<GlobalVariable>(
8515 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
8516 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
8517 SIInstrInfo::MO_ABS32_LO);
8518 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8519 }
8520 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8521 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8522 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8523 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8524 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8525 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8526 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8527 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8528 if (Op.getOperand(4).getValueType() == MVT::i32)
8529 return SDValue();
8530
8531 SDLoc SL(Op);
8532 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
8533 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8534 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8535 Op.getOperand(3), IndexKeyi32);
8536 }
8537 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8538 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8539 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8540 if (Op.getOperand(6).getValueType() == MVT::i32)
8541 return SDValue();
8542
8543 SDLoc SL(Op);
8544 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
8545 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8546 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8547 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
8548 IndexKeyi32, Op.getOperand(7)});
8549 }
8550 case Intrinsic::amdgcn_addrspacecast_nonnull:
8551 return lowerADDRSPACECAST(Op, DAG);
8552 default:
8553 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8554 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
8555 return lowerImage(Op, ImageDimIntr, DAG, false);
8556
8557 return Op;
8558 }
8559}
8560
8561// On targets that do not support a constant in the soffset field, turn a zero
8562// soffset into SGPR_NULL to avoid generating an extra s_mov with zero.
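// For example, a raw buffer atomic whose soffset operand is a constant zero can
// then use the null SGPR rather than materializing the zero with s_mov_b32.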
8563static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
8564 const GCNSubtarget *Subtarget) {
8565 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
8566 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
8567 return SOffset;
8568}
8569
8570SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
8571 SelectionDAG &DAG,
8572 unsigned NewOpcode) const {
8573 SDLoc DL(Op);
8574
8575 SDValue VData = Op.getOperand(2);
8576 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8577 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8578 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8579 SDValue Ops[] = {
8580 Op.getOperand(0), // Chain
8581 VData, // vdata
8582 Rsrc, // rsrc
8583 DAG.getConstant(0, DL, MVT::i32), // vindex
8584 Offsets.first, // voffset
8585 SOffset, // soffset
8586 Offsets.second, // offset
8587 Op.getOperand(6), // cachepolicy
8588 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8589 };
8590
8591 auto *M = cast<MemSDNode>(Op);
8592
8593 EVT MemVT = VData.getValueType();
8594 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8595 M->getMemOperand());
8596}
8597
8598// Return a value to use for the idxen operand by examining the vindex operand.
8599static unsigned getIdxEn(SDValue VIndex) {
8600 // No need to set idxen if vindex is known to be zero.
8601 return isNullConstant(VIndex) ? 0 : 1;
8602}
8603
8604SDValue
8605SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
8606 unsigned NewOpcode) const {
8607 SDLoc DL(Op);
8608
8609 SDValue VData = Op.getOperand(2);
8610 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8611 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
8612 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
8613 SDValue Ops[] = {
8614 Op.getOperand(0), // Chain
8615 VData, // vdata
8616 Rsrc, // rsrc
8617 Op.getOperand(4), // vindex
8618 Offsets.first, // voffset
8619 SOffset, // soffset
8620 Offsets.second, // offset
8621 Op.getOperand(7), // cachepolicy
8622 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8623 };
8624
8625 auto *M = cast<MemSDNode>(Op);
8626
8627 EVT MemVT = VData.getValueType();
8628 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8629 M->getMemOperand());
8630}
8631
8632SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
8633 SelectionDAG &DAG) const {
8634 unsigned IntrID = Op.getConstantOperandVal(1);
8635 SDLoc DL(Op);
8636
8637 switch (IntrID) {
8638 case Intrinsic::amdgcn_ds_ordered_add:
8639 case Intrinsic::amdgcn_ds_ordered_swap: {
8640 MemSDNode *M = cast<MemSDNode>(Op);
8641 SDValue Chain = M->getOperand(0);
8642 SDValue M0 = M->getOperand(2);
8643 SDValue Value = M->getOperand(3);
8644 unsigned IndexOperand = M->getConstantOperandVal(7);
8645 unsigned WaveRelease = M->getConstantOperandVal(8);
8646 unsigned WaveDone = M->getConstantOperandVal(9);
8647
8648 unsigned OrderedCountIndex = IndexOperand & 0x3f;
8649 IndexOperand &= ~0x3f;
8650 unsigned CountDw = 0;
8651
8652 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
8653 CountDw = (IndexOperand >> 24) & 0xf;
8654 IndexOperand &= ~(0xf << 24);
8655
8656 if (CountDw < 1 || CountDw > 4) {
8658 "ds_ordered_count: dword count must be between 1 and 4");
8659 }
8660 }
8661
8662 if (IndexOperand)
8663 report_fatal_error("ds_ordered_count: bad index operand");
8664
8665 if (WaveDone && !WaveRelease)
8666 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
8667
8668 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
8669 unsigned ShaderType =
8670 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
8671 unsigned Offset0 = OrderedCountIndex << 2;
8672 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
8673
8674 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
8675 Offset1 |= (CountDw - 1) << 6;
8676
8677 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
8678 Offset1 |= ShaderType << 2;
8679
8680 unsigned Offset = Offset0 | (Offset1 << 8);
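// Layout of the packed immediate built above, as implied by the shifts in
// this block: bits [7:0] hold the ordered-count index scaled by 4, bit 8 is
// wave_release, bit 9 is wave_done, bits [11:10] carry the shader type on
// pre-GFX11 targets, bit 12 selects add (0) vs swap (1), and on GFX10+
// bits [15:14] encode dword count - 1.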
8681
8682 SDValue Ops[] = {
8683 Chain,
8684 Value,
8685 DAG.getTargetConstant(Offset, DL, MVT::i16),
8686 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
8687 };
8688 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
8689 M->getVTList(), Ops, M->getMemoryVT(),
8690 M->getMemOperand());
8691 }
8692 case Intrinsic::amdgcn_ds_fadd: {
8693 MemSDNode *M = cast<MemSDNode>(Op);
8694 unsigned Opc;
8695 switch (IntrID) {
8696 case Intrinsic::amdgcn_ds_fadd:
8697 Opc = ISD::ATOMIC_LOAD_FADD;
8698 break;
8699 }
8700
8701 return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(),
8702 M->getOperand(0), M->getOperand(2), M->getOperand(3),
8703 M->getMemOperand());
8704 }
8705 case Intrinsic::amdgcn_ds_fmin:
8706 case Intrinsic::amdgcn_ds_fmax: {
8707 MemSDNode *M = cast<MemSDNode>(Op);
8708 unsigned Opc;
8709 switch (IntrID) {
8710 case Intrinsic::amdgcn_ds_fmin:
8711 Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
8712 break;
8713 case Intrinsic::amdgcn_ds_fmax:
8714 Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
8715 break;
8716 default:
8717 llvm_unreachable("Unknown intrinsic!");
8718 }
8719 SDValue Ops[] = {
8720 M->getOperand(0), // Chain
8721 M->getOperand(2), // Ptr
8722 M->getOperand(3) // Value
8723 };
8724
8725 return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
8726 M->getMemoryVT(), M->getMemOperand());
8727 }
8728 case Intrinsic::amdgcn_buffer_load:
8729 case Intrinsic::amdgcn_buffer_load_format: {
8730 unsigned Glc = Op.getConstantOperandVal(5);
8731 unsigned Slc = Op.getConstantOperandVal(6);
8732 unsigned IdxEn = getIdxEn(Op.getOperand(3));
8733 SDValue Ops[] = {
8734 Op.getOperand(0), // Chain
8735 Op.getOperand(2), // rsrc
8736 Op.getOperand(3), // vindex
8737 SDValue(), // voffset -- will be set by setBufferOffsets
8738 SDValue(), // soffset -- will be set by setBufferOffsets
8739 SDValue(), // offset -- will be set by setBufferOffsets
8740 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
8741 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
8742 };
8743 setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
8744
8745 unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
8746 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
8747
8748 EVT VT = Op.getValueType();
8749 EVT IntVT = VT.changeTypeToInteger();
8750 auto *M = cast<MemSDNode>(Op);
8751 EVT LoadVT = Op.getValueType();
8752
8753 if (LoadVT.getScalarType() == MVT::f16)
8754 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
8755 M, DAG, Ops);
8756
8757 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
8758 if (LoadVT.getScalarType() == MVT::i8 || LoadVT.getScalarType() == MVT::i16)
8759 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops,
8760 M->getMemOperand());
8761
8762 return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
8763 M->getMemOperand(), DAG);
8764 }
8765 case Intrinsic::amdgcn_raw_buffer_load:
8766 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8767 case Intrinsic::amdgcn_raw_buffer_load_format:
8768 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
8769 const bool IsFormat =
8770 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
8771 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
8772
8773 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8774 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
8775 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
8776 SDValue Ops[] = {
8777 Op.getOperand(0), // Chain
8778 Rsrc, // rsrc
8779 DAG.getConstant(0, DL, MVT::i32), // vindex
8780 Offsets.first, // voffset
8781 SOffset, // soffset
8782 Offsets.second, // offset
8783 Op.getOperand(5), // cachepolicy, swizzled buffer
8784 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8785 };
8786
8787 auto *M = cast<MemSDNode>(Op);
8788 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
8789 }
8790 case Intrinsic::amdgcn_struct_buffer_load:
8791 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8792 case Intrinsic::amdgcn_struct_buffer_load_format:
8793 case Intrinsic::amdgcn_struct_ptr_buffer_load_format: {
8794 const bool IsFormat =
8795 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
8796 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
8797
8798 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8799 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8800 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8801 SDValue Ops[] = {
8802 Op.getOperand(0), // Chain
8803 Rsrc, // rsrc
8804 Op.getOperand(3), // vindex
8805 Offsets.first, // voffset
8806 SOffset, // soffset
8807 Offsets.second, // offset
8808 Op.getOperand(6), // cachepolicy, swizzled buffer
8809 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8810 };
8811
8812 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
8813 }
8814 case Intrinsic::amdgcn_tbuffer_load: {
8815 MemSDNode *M = cast<MemSDNode>(Op);
8816 EVT LoadVT = Op.getValueType();
8817
8818 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8819 unsigned Dfmt = Op.getConstantOperandVal(7);
8820 unsigned Nfmt = Op.getConstantOperandVal(8);
8821 unsigned Glc = Op.getConstantOperandVal(9);
8822 unsigned Slc = Op.getConstantOperandVal(10);
8823 unsigned IdxEn = getIdxEn(Op.getOperand(3));
8824 SDValue Ops[] = {
8825 Op.getOperand(0), // Chain
8826 Op.getOperand(2), // rsrc
8827 Op.getOperand(3), // vindex
8828 Op.getOperand(4), // voffset
8829 SOffset, // soffset
8830 Op.getOperand(6), // offset
8831 DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
8832 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
8833 DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen
8834 };
8835
8836 if (LoadVT.getScalarType() == MVT::f16)
8837 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8838 M, DAG, Ops);
8839 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
8840 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
8841 DAG);
8842 }
8843 case Intrinsic::amdgcn_raw_tbuffer_load:
8844 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
8845 MemSDNode *M = cast<MemSDNode>(Op);
8846 EVT LoadVT = Op.getValueType();
8847 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8848 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
8849 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
8850
8851 SDValue Ops[] = {
8852 Op.getOperand(0), // Chain
8853 Rsrc, // rsrc
8854 DAG.getConstant(0, DL, MVT::i32), // vindex
8855 Offsets.first, // voffset
8856 SOffset, // soffset
8857 Offsets.second, // offset
8858 Op.getOperand(5), // format
8859 Op.getOperand(6), // cachepolicy, swizzled buffer
8860 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8861 };
8862
8863 if (LoadVT.getScalarType() == MVT::f16)
8864 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8865 M, DAG, Ops);
8866 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
8867 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
8868 DAG);
8869 }
8870 case Intrinsic::amdgcn_struct_tbuffer_load:
8871 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
8872 MemSDNode *M = cast<MemSDNode>(Op);
8873 EVT LoadVT = Op.getValueType();
8874 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8875 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8876 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8877
8878 SDValue Ops[] = {
8879 Op.getOperand(0), // Chain
8880 Rsrc, // rsrc
8881 Op.getOperand(3), // vindex
8882 Offsets.first, // voffset
8883 SOffset, // soffset
8884 Offsets.second, // offset
8885 Op.getOperand(6), // format
8886 Op.getOperand(7), // cachepolicy, swizzled buffer
8887 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8888 };
8889
8890 if (LoadVT.getScalarType() == MVT::f16)
8891 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8892 M, DAG, Ops);
8893 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
8894 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
8895 DAG);
8896 }
8897 case Intrinsic::amdgcn_buffer_atomic_swap:
8898 case Intrinsic::amdgcn_buffer_atomic_add:
8899 case Intrinsic::amdgcn_buffer_atomic_sub:
8900 case Intrinsic::amdgcn_buffer_atomic_csub:
8901 case Intrinsic::amdgcn_buffer_atomic_smin:
8902 case Intrinsic::amdgcn_buffer_atomic_umin:
8903 case Intrinsic::amdgcn_buffer_atomic_smax:
8904 case Intrinsic::amdgcn_buffer_atomic_umax:
8905 case Intrinsic::amdgcn_buffer_atomic_and:
8906 case Intrinsic::amdgcn_buffer_atomic_or:
8907 case Intrinsic::amdgcn_buffer_atomic_xor:
8908 case Intrinsic::amdgcn_buffer_atomic_fadd: {
8909 unsigned Slc = Op.getConstantOperandVal(6);
8910 unsigned IdxEn = getIdxEn(Op.getOperand(4));
8911 SDValue Ops[] = {
8912 Op.getOperand(0), // Chain
8913 Op.getOperand(2), // vdata
8914 Op.getOperand(3), // rsrc
8915 Op.getOperand(4), // vindex
8916 SDValue(), // voffset -- will be set by setBufferOffsets
8917 SDValue(), // soffset -- will be set by setBufferOffsets
8918 SDValue(), // offset -- will be set by setBufferOffsets
8919 DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
8920 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
8921 };
8922 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
8923
8924 EVT VT = Op.getValueType();
8925
8926 auto *M = cast<MemSDNode>(Op);
8927 unsigned Opcode = 0;
8928
8929 switch (IntrID) {
8930 case Intrinsic::amdgcn_buffer_atomic_swap:
8931 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
8932 break;
8933 case Intrinsic::amdgcn_buffer_atomic_add:
8934 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
8935 break;
8936 case Intrinsic::amdgcn_buffer_atomic_sub:
8937 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
8938 break;
8939 case Intrinsic::amdgcn_buffer_atomic_csub:
8940 Opcode = AMDGPUISD::BUFFER_ATOMIC_CSUB;
8941 break;
8942 case Intrinsic::amdgcn_buffer_atomic_smin:
8943 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
8944 break;
8945 case Intrinsic::amdgcn_buffer_atomic_umin:
8946 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
8947 break;
8948 case Intrinsic::amdgcn_buffer_atomic_smax:
8949 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
8950 break;
8951 case Intrinsic::amdgcn_buffer_atomic_umax:
8952 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
8953 break;
8954 case Intrinsic::amdgcn_buffer_atomic_and:
8955 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
8956 break;
8957 case Intrinsic::amdgcn_buffer_atomic_or:
8958 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
8959 break;
8960 case Intrinsic::amdgcn_buffer_atomic_xor:
8961 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
8962 break;
8963 case Intrinsic::amdgcn_buffer_atomic_fadd:
8964 Opcode = AMDGPUISD::BUFFER_ATOMIC_FADD;
8965 break;
8966 default:
8967 llvm_unreachable("unhandled atomic opcode");
8968 }
8969
8970 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
8971 M->getMemOperand());
8972 }
8973 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8974 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8975 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
8976 case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
8977 return lowerRawBufferAtomicIntrin(Op, DAG,
8978 AMDGPUISD::BUFFER_ATOMIC_FADD_BF16);
8979 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8980 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8981 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
8982 case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
8983 return lowerStructBufferAtomicIntrin(Op, DAG,
8984 AMDGPUISD::BUFFER_ATOMIC_FADD_BF16);
8985 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8986 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8987 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
8988 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8989 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8990 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
8991 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8992 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8993 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
8994 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8995 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8996 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
8997 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8998 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8999 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
9000 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9001 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9002 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9003 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9004 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9005 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9006 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9007 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9008 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
9009 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9010 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9011 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
9012 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9013 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9014 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
9015 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9016 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9017 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
9018 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9019 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9020 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9021 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9022 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9023 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9024 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9025 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9026 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9027 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9028 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9029 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9030 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9031 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9032 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9033 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9034 return lowerRawBufferAtomicIntrin(Op, DAG,
9035 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9036 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9037 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9038 return lowerStructBufferAtomicIntrin(Op, DAG,
9039 AMDGPUISD::BUFFER_ATOMIC_SWAP);
9040 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9041 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9042 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9043 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9044 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9045 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9046 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9047 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9048 return lowerStructBufferAtomicIntrin(Op, DAG,
9049 AMDGPUISD::BUFFER_ATOMIC_SMIN);
9050 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9051 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9052 return lowerStructBufferAtomicIntrin(Op, DAG,
9053 AMDGPUISD::BUFFER_ATOMIC_UMIN);
9054 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9055 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9056 return lowerStructBufferAtomicIntrin(Op, DAG,
9057 AMDGPUISD::BUFFER_ATOMIC_SMAX);
9058 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9059 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9060 return lowerStructBufferAtomicIntrin(Op, DAG,
9061 AMDGPUISD::BUFFER_ATOMIC_UMAX);
9062 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9063 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9064 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9065 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9066 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9067 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9068 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9069 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9070 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9071 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9072 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9073 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9074 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9075 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9076 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9077 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9078 return lowerStructBufferAtomicIntrin(Op, DAG,
9079 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9080
9081 case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
9082 unsigned Slc = Op.getConstantOperandVal(7);
9083 unsigned IdxEn = getIdxEn(Op.getOperand(5));
9084 SDValue Ops[] = {
9085 Op.getOperand(0), // Chain
9086 Op.getOperand(2), // src
9087 Op.getOperand(3), // cmp
9088 Op.getOperand(4), // rsrc
9089 Op.getOperand(5), // vindex
9090 SDValue(), // voffset -- will be set by setBufferOffsets
9091 SDValue(), // soffset -- will be set by setBufferOffsets
9092 SDValue(), // offset -- will be set by setBufferOffsets
9093 DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
9094 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
9095 };
9096 setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
9097
9098 EVT VT = Op.getValueType();
9099 auto *M = cast<MemSDNode>(Op);
9100
9101 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9102 Op->getVTList(), Ops, VT, M->getMemOperand());
9103 }
9104 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9105 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9106 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
9107 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9108 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9109 SDValue Ops[] = {
9110 Op.getOperand(0), // Chain
9111 Op.getOperand(2), // src
9112 Op.getOperand(3), // cmp
9113 Rsrc, // rsrc
9114 DAG.getConstant(0, DL, MVT::i32), // vindex
9115 Offsets.first, // voffset
9116 SOffset, // soffset
9117 Offsets.second, // offset
9118 Op.getOperand(7), // cachepolicy
9119 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9120 };
9121 EVT VT = Op.getValueType();
9122 auto *M = cast<MemSDNode>(Op);
9123
9124 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9125 Op->getVTList(), Ops, VT, M->getMemOperand());
9126 }
9127 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9128 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9129 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
9130 auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
9131 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
9132 SDValue Ops[] = {
9133 Op.getOperand(0), // Chain
9134 Op.getOperand(2), // src
9135 Op.getOperand(3), // cmp
9136 Rsrc, // rsrc
9137 Op.getOperand(5), // vindex
9138 Offsets.first, // voffset
9139 SOffset, // soffset
9140 Offsets.second, // offset
9141 Op.getOperand(8), // cachepolicy
9142 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9143 };
9144 EVT VT = Op.getValueType();
9145 auto *M = cast<MemSDNode>(Op);
9146
9147 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9148 Op->getVTList(), Ops, VT, M->getMemOperand());
9149 }
9150 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9151 MemSDNode *M = cast<MemSDNode>(Op);
9152 SDValue NodePtr = M->getOperand(2);
9153 SDValue RayExtent = M->getOperand(3);
9154 SDValue RayOrigin = M->getOperand(4);
9155 SDValue RayDir = M->getOperand(5);
9156 SDValue RayInvDir = M->getOperand(6);
9157 SDValue TDescr = M->getOperand(7);
9158
9159 assert(NodePtr.getValueType() == MVT::i32 ||
9160 NodePtr.getValueType() == MVT::i64);
9161 assert(RayDir.getValueType() == MVT::v3f16 ||
9162 RayDir.getValueType() == MVT::v3f32);
9163
9164 if (!Subtarget->hasGFX10_AEncoding()) {
9165 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
9166 return SDValue();
9167 }
9168
9169 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
9170 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9171 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9172 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
9173 const bool Is64 = NodePtr.getValueType() == MVT::i64;
9174 const unsigned NumVDataDwords = 4;
9175 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9176 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9177 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
9178 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
9179 IsGFX12Plus;
9180 const unsigned BaseOpcodes[2][2] = {
9181 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9182 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9183 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9184 int Opcode;
9185 if (UseNSA) {
9186 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9187 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9188 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9189 : AMDGPU::MIMGEncGfx10NSA,
9190 NumVDataDwords, NumVAddrDwords);
9191 } else {
9192 assert(!IsGFX12Plus);
9193 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9194 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9195 : AMDGPU::MIMGEncGfx10Default,
9196 NumVDataDwords, NumVAddrDwords);
9197 }
9198 assert(Opcode != -1);
9199
9200 SmallVector<SDValue, 16> Ops;
9201
9202 auto packLanes = [&DAG, &Ops, &DL] (SDValue Op, bool IsAligned) {
9203 SmallVector<SDValue, 3> Lanes;
9204 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
9205 if (Lanes[0].getValueSizeInBits() == 32) {
9206 for (unsigned I = 0; I < 3; ++I)
9207 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
9208 } else {
9209 if (IsAligned) {
9210 Ops.push_back(
9211 DAG.getBitcast(MVT::i32,
9212 DAG.getBuildVector(MVT::v2f16, DL,
9213 { Lanes[0], Lanes[1] })));
9214 Ops.push_back(Lanes[2]);
9215 } else {
9216 SDValue Elt0 = Ops.pop_back_val();
9217 Ops.push_back(
9218 DAG.getBitcast(MVT::i32,
9219 DAG.getBuildVector(MVT::v2f16, DL,
9220 { Elt0, Lanes[0] })));
9221 Ops.push_back(
9222 DAG.getBitcast(MVT::i32,
9223 DAG.getBuildVector(MVT::v2f16, DL,
9224 { Lanes[1], Lanes[2] })));
9225 }
9226 }
9227 };
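// Summary of packLanes, derived from the lambda above: 32-bit lanes are
// pushed as three separate i32 operands; 16-bit lanes are packed two per
// dword, either as {lane0,lane1} plus lane2 when the value starts a dword
// (IsAligned), or by first completing the previously pushed half-filled
// dword when it does not.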
9228
9229 if (UseNSA && IsGFX11Plus) {
9230 Ops.push_back(NodePtr);
9231 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9232 Ops.push_back(RayOrigin);
9233 if (IsA16) {
9234 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
9235 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
9236 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
9237 for (unsigned I = 0; I < 3; ++I) {
9238 MergedLanes.push_back(DAG.getBitcast(
9239 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
9240 {DirLanes[I], InvDirLanes[I]})));
9241 }
9242 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
9243 } else {
9244 Ops.push_back(RayDir);
9245 Ops.push_back(RayInvDir);
9246 }
9247 } else {
9248 if (Is64)
9249 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
9250 2);
9251 else
9252 Ops.push_back(NodePtr);
9253
9254 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9255 packLanes(RayOrigin, true);
9256 packLanes(RayDir, true);
9257 packLanes(RayInvDir, false);
9258 }
9259
9260 if (!UseNSA) {
9261 // Build a single vector containing all the operands so far prepared.
9262 if (NumVAddrDwords > 12) {
9263 SDValue Undef = DAG.getUNDEF(MVT::i32);
9264 Ops.append(16 - Ops.size(), Undef);
9265 }
9266 assert(Ops.size() >= 8 && Ops.size() <= 12);
9267 SDValue MergedOps = DAG.getBuildVector(
9268 MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
9269 Ops.clear();
9270 Ops.push_back(MergedOps);
9271 }
9272
9273 Ops.push_back(TDescr);
9274 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
9275 Ops.push_back(M->getChain());
9276
9277 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
9278 MachineMemOperand *MemRef = M->getMemOperand();
9279 DAG.setNodeMemRefs(NewNode, {MemRef});
9280 return SDValue(NewNode, 0);
9281 }
9282 case Intrinsic::amdgcn_global_atomic_fmin:
9283 case Intrinsic::amdgcn_global_atomic_fmax:
9284 case Intrinsic::amdgcn_global_atomic_fmin_num:
9285 case Intrinsic::amdgcn_global_atomic_fmax_num:
9286 case Intrinsic::amdgcn_flat_atomic_fmin:
9287 case Intrinsic::amdgcn_flat_atomic_fmax:
9288 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9289 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9290 MemSDNode *M = cast<MemSDNode>(Op);
9291 SDValue Ops[] = {
9292 M->getOperand(0), // Chain
9293 M->getOperand(2), // Ptr
9294 M->getOperand(3) // Value
9295 };
9296 unsigned Opcode = 0;
9297 switch (IntrID) {
9298 case Intrinsic::amdgcn_global_atomic_fmin:
9299 case Intrinsic::amdgcn_global_atomic_fmin_num:
9300 case Intrinsic::amdgcn_flat_atomic_fmin:
9301 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9302 Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN;
9303 break;
9304 }
9305 case Intrinsic::amdgcn_global_atomic_fmax:
9306 case Intrinsic::amdgcn_global_atomic_fmax_num:
9307 case Intrinsic::amdgcn_flat_atomic_fmax:
9308 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9309 Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX;
9310 break;
9311 }
9312 default:
9313 llvm_unreachable("unhandled atomic opcode");
9314 }
9315 return DAG.getMemIntrinsicNode(Opcode, SDLoc(Op),
9316 M->getVTList(), Ops, M->getMemoryVT(),
9317 M->getMemOperand());
9318 }
9319 case Intrinsic::amdgcn_s_get_barrier_state: {
9320 SDValue Chain = Op->getOperand(0);
9321 SmallVector<SDValue, 2> Ops;
9322 unsigned Opc;
9323 bool IsInlinableBarID = false;
9324 int64_t BarID;
9325
9326 if (isa<ConstantSDNode>(Op->getOperand(2))) {
9327 BarID = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue();
9328 IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarID);
9329 }
9330
9331 if (IsInlinableBarID) {
9332 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9333 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
9334 Ops.push_back(K);
9335 } else {
9336 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9337 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(2));
9338 Ops.push_back(M0Val.getValue(0));
9339 }
9340
9341 auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9342 return SDValue(NewMI, 0);
9343 }
9344 default:
9345
9346 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9347 AMDGPU::getImageDimIntrinsicInfo(IntrID))
9348 return lowerImage(Op, ImageDimIntr, DAG, true);
9349
9350 return SDValue();
9351 }
9352}
9353
9354// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
9355// dwordx4 if on SI and handle TFE loads.
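// For example (sketch, not from the source): a TFE load that returns
// {v2f32, i32 status, chain} is performed below as a v3i32 load; the first
// two dwords are bitcast back to v2f32 and the trailing dword becomes the
// status result.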
9356SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
9357 SDVTList VTList,
9358 ArrayRef<SDValue> Ops, EVT MemVT,
9359 MachineMemOperand *MMO,
9360 SelectionDAG &DAG) const {
9361 LLVMContext &C = *DAG.getContext();
9362 MachineFunction &MF = DAG.getMachineFunction();
9363 EVT VT = VTList.VTs[0];
9364
9365 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
9366 bool IsTFE = VTList.NumVTs == 3;
9367 if (IsTFE) {
9368 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
9369 unsigned NumOpDWords = NumValueDWords + 1;
9370 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
9371 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
9372 MachineMemOperand *OpDWordsMMO =
9373 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
9374 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
9375 OpDWordsVT, OpDWordsMMO, DAG);
9376 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
9377 DAG.getVectorIdxConstant(NumValueDWords, DL));
9378 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
9379 SDValue ValueDWords =
9380 NumValueDWords == 1
9381 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
9382 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
9383 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
9384 ZeroIdx);
9385 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
9386 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
9387 }
9388
9389 if (!Subtarget->hasDwordx3LoadStores() &&
9390 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9391 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
9392 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
9393 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
9394 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
9395 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
9396 WidenedMemVT, WidenedMMO);
9397 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
9398 DAG.getVectorIdxConstant(0, DL));
9399 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
9400 }
9401
9402 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
9403}
9404
9405SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
9406 bool ImageStore) const {
9407 EVT StoreVT = VData.getValueType();
9408
9409 // No change for f16 and legal vector D16 types.
9410 if (!StoreVT.isVector())
9411 return VData;
9412
9413 SDLoc DL(VData);
9414 unsigned NumElements = StoreVT.getVectorNumElements();
9415
9416 if (Subtarget->hasUnpackedD16VMem()) {
9417 // We need to unpack the packed data to store.
9418 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9419 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9420
9421 EVT EquivStoreVT =
9422 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
9423 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
9424 return DAG.UnrollVectorOp(ZExt.getNode());
9425 }
9426
9427 // The sq block of gfx8.1 does not estimate register use correctly for d16
9428 // image store instructions. The data operand is computed as if it were not a
9429 // d16 image instruction.
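// Worked example (illustrative): a v4f16 store value is repacked below into
// a v4i32 whose first two dwords carry the four packed halves and whose
// remaining dwords are undef, so the operand uses the register count the
// non-d16 encoding expects.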
9430 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
9431 // Bitcast to i16
9432 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9433 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9434
9435 // Decompose into scalars
9436 SmallVector<SDValue, 4> Elts;
9437 DAG.ExtractVectorElements(IntVData, Elts);
9438
9439 // Group pairs of i16 into v2i16 and bitcast to i32
9440 SmallVector<SDValue, 4> PackedElts;
9441 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
9442 SDValue Pair =
9443 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
9444 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9445 PackedElts.push_back(IntPair);
9446 }
9447 if ((NumElements % 2) == 1) {
9448 // Handle v3i16
9449 unsigned I = Elts.size() / 2;
9450 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
9451 {Elts[I * 2], DAG.getUNDEF(MVT::i16)});
9452 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9453 PackedElts.push_back(IntPair);
9454 }
9455
9456 // Pad using UNDEF
9457 PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32));
9458
9459 // Build final vector
9460 EVT VecVT =
9461 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
9462 return DAG.getBuildVector(VecVT, DL, PackedElts);
9463 }
9464
9465 if (NumElements == 3) {
9466 EVT IntStoreVT =
9467 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
9468 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9469
9470 EVT WidenedStoreVT = EVT::getVectorVT(
9471 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
9472 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
9473 WidenedStoreVT.getStoreSizeInBits());
9474 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
9475 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
9476 }
9477
9478 assert(isTypeLegal(StoreVT));
9479 return VData;
9480}
9481
9482SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
9483 SelectionDAG &DAG) const {
9484 SDLoc DL(Op);
9485 SDValue Chain = Op.getOperand(0);
9486 unsigned IntrinsicID = Op.getConstantOperandVal(1);
9487 MachineFunction &MF = DAG.getMachineFunction();
9488
9489 switch (IntrinsicID) {
9490 case Intrinsic::amdgcn_exp_compr: {
9491 if (!Subtarget->hasCompressedExport()) {
9492 DiagnosticInfoUnsupported BadIntrin(
9494 "intrinsic not supported on subtarget", DL.getDebugLoc());
9495 DAG.getContext()->diagnose(BadIntrin);
9496 }
9497 SDValue Src0 = Op.getOperand(4);
9498 SDValue Src1 = Op.getOperand(5);
9499 // Hack around illegal type on SI by directly selecting it.
9500 if (isTypeLegal(Src0.getValueType()))
9501 return SDValue();
9502
9503 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
9504 SDValue Undef = DAG.getUNDEF(MVT::f32);
9505 const SDValue Ops[] = {
9506 Op.getOperand(2), // tgt
9507 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
9508 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
9509 Undef, // src2
9510 Undef, // src3
9511 Op.getOperand(7), // vm
9512 DAG.getTargetConstant(1, DL, MVT::i1), // compr
9513 Op.getOperand(3), // en
9514 Op.getOperand(0) // Chain
9515 };
9516
9517 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9518 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
9519 }
9520 case Intrinsic::amdgcn_s_barrier: {
9521 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
9522 if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
9523 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
9524 if (WGSize <= ST.getWavefrontSize())
9525 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
9526 Op.getOperand(0)), 0);
9527 }
9528
9529 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
9530 if (ST.hasSplitBarriers()) {
9531 SDValue K =
9532 DAG.getTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
9533 SDValue BarSignal =
9534 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
9535 MVT::Other, K, Op.getOperand(0)),
9536 0);
9537 SDValue BarWait =
9538 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
9539 BarSignal.getValue(0)),
9540 0);
9541 return BarWait;
9542 }
9543
9544 return SDValue();
9545 };
9546 case Intrinsic::amdgcn_tbuffer_store: {
9547 SDValue VData = Op.getOperand(2);
9548 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9549 if (IsD16)
9550 VData = handleD16VData(VData, DAG);
9551 unsigned Dfmt = Op.getConstantOperandVal(8);
9552 unsigned Nfmt = Op.getConstantOperandVal(9);
9553 unsigned Glc = Op.getConstantOperandVal(10);
9554 unsigned Slc = Op.getConstantOperandVal(11);
9555 unsigned IdxEn = getIdxEn(Op.getOperand(4));
9556 SDValue Ops[] = {
9557 Chain,
9558 VData, // vdata
9559 Op.getOperand(3), // rsrc
9560 Op.getOperand(4), // vindex
9561 Op.getOperand(5), // voffset
9562 Op.getOperand(6), // soffset
9563 Op.getOperand(7), // offset
9564 DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
9565 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
9566 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
9567 };
9568 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9569 AMDGPUISD::TBUFFER_STORE_FORMAT;
9570 MemSDNode *M = cast<MemSDNode>(Op);
9571 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9572 M->getMemoryVT(), M->getMemOperand());
9573 }
9574
9575 case Intrinsic::amdgcn_struct_tbuffer_store:
9576 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9577 SDValue VData = Op.getOperand(2);
9578 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9579 if (IsD16)
9580 VData = handleD16VData(VData, DAG);
9581 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9582 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9583 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9584 SDValue Ops[] = {
9585 Chain,
9586 VData, // vdata
9587 Rsrc, // rsrc
9588 Op.getOperand(4), // vindex
9589 Offsets.first, // voffset
9590 SOffset, // soffset
9591 Offsets.second, // offset
9592 Op.getOperand(7), // format
9593 Op.getOperand(8), // cachepolicy, swizzled buffer
9594 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9595 };
9596 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9597 AMDGPUISD::TBUFFER_STORE_FORMAT;
9598 MemSDNode *M = cast<MemSDNode>(Op);
9599 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9600 M->getMemoryVT(), M->getMemOperand());
9601 }
9602
9603 case Intrinsic::amdgcn_raw_tbuffer_store:
9604 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9605 SDValue VData = Op.getOperand(2);
9606 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9607 if (IsD16)
9608 VData = handleD16VData(VData, DAG);
9609 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9610 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
9611 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9612 SDValue Ops[] = {
9613 Chain,
9614 VData, // vdata
9615 Rsrc, // rsrc
9616 DAG.getConstant(0, DL, MVT::i32), // vindex
9617 Offsets.first, // voffset
9618 SOffset, // soffset
9619 Offsets.second, // offset
9620 Op.getOperand(6), // format
9621 Op.getOperand(7), // cachepolicy, swizzled buffer
9622 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9623 };
9624 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9625 AMDGPUISD::TBUFFER_STORE_FORMAT;
9626 MemSDNode *M = cast<MemSDNode>(Op);
9627 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9628 M->getMemoryVT(), M->getMemOperand());
9629 }
9630
9631 case Intrinsic::amdgcn_buffer_store:
9632 case Intrinsic::amdgcn_buffer_store_format: {
9633 SDValue VData = Op.getOperand(2);
9634 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9635 if (IsD16)
9636 VData = handleD16VData(VData, DAG);
9637 unsigned Glc = Op.getConstantOperandVal(6);
9638 unsigned Slc = Op.getConstantOperandVal(7);
9639 unsigned IdxEn = getIdxEn(Op.getOperand(4));
9640 SDValue Ops[] = {
9641 Chain,
9642 VData,
9643 Op.getOperand(3), // rsrc
9644 Op.getOperand(4), // vindex
9645 SDValue(), // voffset -- will be set by setBufferOffsets
9646 SDValue(), // soffset -- will be set by setBufferOffsets
9647 SDValue(), // offset -- will be set by setBufferOffsets
9648 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
9649 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
9650 };
9651 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
9652
9653 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
9654 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
9655 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9656 MemSDNode *M = cast<MemSDNode>(Op);
9657
9658 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9659 EVT VDataType = VData.getValueType().getScalarType();
9660 if (VDataType == MVT::i8 || VDataType == MVT::i16)
9661 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9662
9663 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9664 M->getMemoryVT(), M->getMemOperand());
9665 }
9666
9667 case Intrinsic::amdgcn_raw_buffer_store:
9668 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9669 case Intrinsic::amdgcn_raw_buffer_store_format:
9670 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9671 const bool IsFormat =
9672 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9673 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9674
9675 SDValue VData = Op.getOperand(2);
9676 EVT VDataVT = VData.getValueType();
9677 EVT EltType = VDataVT.getScalarType();
9678 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9679 if (IsD16) {
9680 VData = handleD16VData(VData, DAG);
9681 VDataVT = VData.getValueType();
9682 }
9683
9684 if (!isTypeLegal(VDataVT)) {
9685 VData =
9686 DAG.getNode(ISD::BITCAST, DL,
9687 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9688 }
9689
9690 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9691 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
9692 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9693 SDValue Ops[] = {
9694 Chain,
9695 VData,
9696 Rsrc,
9697 DAG.getConstant(0, DL, MVT::i32), // vindex
9698 Offsets.first, // voffset
9699 SOffset, // soffset
9700 Offsets.second, // offset
9701 Op.getOperand(6), // cachepolicy, swizzled buffer
9702 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9703 };
9704 unsigned Opc =
9705 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9706 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9707 MemSDNode *M = cast<MemSDNode>(Op);
9708
9709 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9710 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9711 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
9712
9713 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9714 M->getMemoryVT(), M->getMemOperand());
9715 }
9716
9717 case Intrinsic::amdgcn_struct_buffer_store:
9718 case Intrinsic::amdgcn_struct_ptr_buffer_store:
9719 case Intrinsic::amdgcn_struct_buffer_store_format:
9720 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9721 const bool IsFormat =
9722 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9723 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9724
9725 SDValue VData = Op.getOperand(2);
9726 EVT VDataVT = VData.getValueType();
9727 EVT EltType = VDataVT.getScalarType();
9728 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9729
9730 if (IsD16) {
9731 VData = handleD16VData(VData, DAG);
9732 VDataVT = VData.getValueType();
9733 }
9734
9735 if (!isTypeLegal(VDataVT)) {
9736 VData =
9737 DAG.getNode(ISD::BITCAST, DL,
9738 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9739 }
9740
9741 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9742 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9743 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9744 SDValue Ops[] = {
9745 Chain,
9746 VData,
9747 Rsrc,
9748 Op.getOperand(4), // vindex
9749 Offsets.first, // voffset
9750 SOffset, // soffset
9751 Offsets.second, // offset
9752 Op.getOperand(7), // cachepolicy, swizzled buffer
9753 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9754 };
9755 unsigned Opc =
9756 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9757 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9758 MemSDNode *M = cast<MemSDNode>(Op);
9759
9760 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9761 EVT VDataType = VData.getValueType().getScalarType();
9762 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9763 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9764
9765 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9766 M->getMemoryVT(), M->getMemOperand());
9767 }
9768 case Intrinsic::amdgcn_raw_buffer_load_lds:
9769 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9770 case Intrinsic::amdgcn_struct_buffer_load_lds:
9771 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9772 assert(!AMDGPU::isGFX12Plus(*Subtarget));
9773 unsigned Opc;
9774 bool HasVIndex =
9775 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9776 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9777 unsigned OpOffset = HasVIndex ? 1 : 0;
9778 SDValue VOffset = Op.getOperand(5 + OpOffset);
9779 bool HasVOffset = !isNullConstant(VOffset);
9780 unsigned Size = Op->getConstantOperandVal(4);
9781
9782 switch (Size) {
9783 default:
9784 return SDValue();
9785 case 1:
9786 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9787 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9788 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9789 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9790 break;
9791 case 2:
9792 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9793 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9794 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9795 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9796 break;
9797 case 4:
9798 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9799 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9800 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9801 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
9802 break;
9803 }
9804
9805 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9806
9807 SmallVector<SDValue, 8> Ops;
9808
9809 if (HasVIndex && HasVOffset)
9810 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
9811 { Op.getOperand(5), // VIndex
9812 VOffset }));
9813 else if (HasVIndex)
9814 Ops.push_back(Op.getOperand(5));
9815 else if (HasVOffset)
9816 Ops.push_back(VOffset);
9817
9818 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9819 Ops.push_back(Rsrc);
9820 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
9821 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
9822 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
9823 Ops.push_back(
9824 DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol
9825 Ops.push_back(DAG.getTargetConstant(
9826 Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0, DL, MVT::i8)); // swz
9827 Ops.push_back(M0Val.getValue(0)); // Chain
9828 Ops.push_back(M0Val.getValue(1)); // Glue
9829
9830 auto *M = cast<MemSDNode>(Op);
9831 MachineMemOperand *LoadMMO = M->getMemOperand();
9832 // Don't set the offset value here because the pointer points to the base of
9833 // the buffer.
9834 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9835
9836 MachinePointerInfo StorePtrI = LoadPtrI;
9837 LoadPtrI.V = PoisonValue::get(
9838 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
9839 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
9840 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
9841
9842 auto F = LoadMMO->getFlags() &
9843 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9844 LoadMMO =
9845 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
9846 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9847
9848 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9849 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
9850 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9851
9852 auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
9853 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9854
9855 return SDValue(Load, 0);
9856 }
9857 case Intrinsic::amdgcn_global_load_lds: {
9858 unsigned Opc;
9859 unsigned Size = Op->getConstantOperandVal(4);
9860 switch (Size) {
9861 default:
9862 return SDValue();
9863 case 1:
9864 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
9865 break;
9866 case 2:
9867 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
9868 break;
9869 case 4:
9870 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
9871 break;
9872 }
9873
9874 auto *M = cast<MemSDNode>(Op);
9875 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9876
9877 SmallVector<SDValue, 6> Ops;
9878
9879 SDValue Addr = Op.getOperand(2); // Global ptr
9880 SDValue VOffset;
9881 // Try to split SAddr and VOffset. Global and LDS pointers share the same
9882 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
9883 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
9884 SDValue LHS = Addr.getOperand(0);
9885 SDValue RHS = Addr.getOperand(1);
9886
9887 if (LHS->isDivergent())
9888 std::swap(LHS, RHS);
9889
9890 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
9891 RHS.getOperand(0).getValueType() == MVT::i32) {
9892 // add (i64 sgpr), (zero_extend (i32 vgpr))
9893 Addr = LHS;
9894 VOffset = RHS.getOperand(0);
9895 }
9896 }
9897
9898 Ops.push_back(Addr);
9899 if (!Addr->isDivergent()) {
9900 Opc = AMDGPU::getGlobalSaddrOp(Opc);
9901 if (!VOffset)
9902 VOffset = SDValue(
9903 DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
9904 DAG.getTargetConstant(0, DL, MVT::i32)), 0);
9905 Ops.push_back(VOffset);
9906 }
9907
9908 Ops.push_back(Op.getOperand(5)); // Offset
9909 Ops.push_back(Op.getOperand(6)); // CPol
9910 Ops.push_back(M0Val.getValue(0)); // Chain
9911 Ops.push_back(M0Val.getValue(1)); // Glue
9912
9913 MachineMemOperand *LoadMMO = M->getMemOperand();
9914 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9915 LoadPtrI.Offset = Op->getConstantOperandVal(5);
9916 MachinePointerInfo StorePtrI = LoadPtrI;
9917 LoadPtrI.V = PoisonValue::get(
9918 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
9919 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
9920 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
9921 auto F = LoadMMO->getFlags() &
9922 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9923 LoadMMO =
9924 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
9925 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9926 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9927 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
9928 LoadMMO->getAAInfo());
9929
9930 auto Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9931 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9932
9933 return SDValue(Load, 0);
9934 }
9935 case Intrinsic::amdgcn_end_cf:
9936 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
9937 Op->getOperand(2), Chain), 0);
9938 case Intrinsic::amdgcn_s_barrier_init:
9939 case Intrinsic::amdgcn_s_barrier_join:
9940 case Intrinsic::amdgcn_s_wakeup_barrier: {
9941 SDValue Chain = Op->getOperand(0);
9942 SmallVector<SDValue, 2> Ops;
9943 SDValue BarOp = Op->getOperand(2);
9944 unsigned Opc;
9945 bool IsInlinableBarID = false;
9946 int64_t BarVal;
9947
9948 if (isa<ConstantSDNode>(BarOp)) {
9949 BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue();
9950 IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarVal);
9951 }
9952
9953 if (IsInlinableBarID) {
9954 switch (IntrinsicID) {
9955 default:
9956 return SDValue();
9957 case Intrinsic::amdgcn_s_barrier_init:
9958 Opc = AMDGPU::S_BARRIER_INIT_IMM;
9959 break;
9960 case Intrinsic::amdgcn_s_barrier_join:
9961 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
9962 break;
9963 case Intrinsic::amdgcn_s_wakeup_barrier:
9964 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
9965 break;
9966 }
9967
9968 SDValue K = DAG.getTargetConstant(BarVal, DL, MVT::i32);
9969 Ops.push_back(K);
9970 } else {
9971 switch (IntrinsicID) {
9972 default:
9973 return SDValue();
9974 case Intrinsic::amdgcn_s_barrier_init:
9975 Opc = AMDGPU::S_BARRIER_INIT_M0;
9976 break;
9977 case Intrinsic::amdgcn_s_barrier_join:
9978 Opc = AMDGPU::S_BARRIER_JOIN_M0;
9979 break;
9980 case Intrinsic::amdgcn_s_wakeup_barrier:
9981 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
9982 break;
9983 }
9984 }
9985
9986 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) {
9987 SDValue M0Val;
9988 // Member count will be read from M0[16:22]
9989 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, Op.getOperand(3),
9990 DAG.getShiftAmountConstant(16, MVT::i32, DL));
9991
9992 if (!IsInlinableBarID) {
9993 // If reference to barrier id is not an inline constant then it must be
9994 // referenced with M0[4:0]. Perform an OR with the member count to
9995 // include it in M0.
9996 M0Val = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32,
9997 Op.getOperand(2), M0Val),
9998 0);
9999 }
10000 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10001 } else if (!IsInlinableBarID) {
10002 Ops.push_back(copyToM0(DAG, Chain, DL, BarOp).getValue(0));
10003 }
10004
10005 auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10006 return SDValue(NewMI, 0);
10007 }
10008 default: {
10009 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10011 return lowerImage(Op, ImageDimIntr, DAG, true);
10012
10013 return Op;
10014 }
10015 }
10016}
10017
10018// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
10019// offset (the offset that is included in bounds checking and swizzling, to be
10020// split between the instruction's voffset and immoffset fields) and soffset
10021// (the offset that is excluded from bounds checking and swizzling, to go in
10022// the instruction's soffset field). This function takes the first kind of
10023// offset and figures out how to split it between voffset and immoffset.
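// Worked example, assuming the pre-GFX12 immediate limit of 4095 returned by
// getMaxMUBUFImmOffset: a combined offset of 4100 is split into an immoffset
// of 4 and a voffset contribution of 4096, keeping the large power-of-two
// part in the VGPR add where it can be CSEd across neighbouring accesses.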
10024std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
10025 SDValue Offset, SelectionDAG &DAG) const {
10026 SDLoc DL(Offset);
10027 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
10028 SDValue N0 = Offset;
10029 ConstantSDNode *C1 = nullptr;
10030
10031 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
10032 N0 = SDValue();
10033 else if (DAG.isBaseWithConstantOffset(N0)) {
10034 C1 = cast<ConstantSDNode>(N0.getOperand(1));
10035 N0 = N0.getOperand(0);
10036 }
10037
10038 if (C1) {
10039 unsigned ImmOffset = C1->getZExtValue();
10040 // If the immediate value is too big for the immoffset field, put only bits
10041 // that would normally fit in the immoffset field. The remaining value that
10042 // is copied/added for the voffset field is a large power of 2, and it
10043 // stands more chance of being CSEd with the copy/add for another similar
10044 // load/store.
10045 // However, do not do that rounding down if that is a negative
10046 // number, as it appears to be illegal to have a negative offset in the
10047 // vgpr, even if adding the immediate offset makes it positive.
10048 unsigned Overflow = ImmOffset & ~MaxImm;
10049 ImmOffset -= Overflow;
10050 if ((int32_t)Overflow < 0) {
10051 Overflow += ImmOffset;
10052 ImmOffset = 0;
10053 }
10054 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
10055 if (Overflow) {
10056 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
10057 if (!N0)
10058 N0 = OverflowVal;
10059 else {
10060 SDValue Ops[] = { N0, OverflowVal };
10061 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
10062 }
10063 }
10064 }
10065 if (!N0)
10066 N0 = DAG.getConstant(0, DL, MVT::i32);
10067 if (!C1)
10068 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
10069 return {N0, SDValue(C1, 0)};
10070}
10071
10072// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
10073// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
10074// pointed to by Offsets.
10075void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
10076 SelectionDAG &DAG, SDValue *Offsets,
10077 Align Alignment) const {
10078 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10079 SDLoc DL(CombinedOffset);
10080 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
10081 uint32_t Imm = C->getZExtValue();
10082 uint32_t SOffset, ImmOffset;
10083 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10084 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
10085 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10086 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10087 return;
10088 }
10089 }
10090 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
10091 SDValue N0 = CombinedOffset.getOperand(0);
10092 SDValue N1 = CombinedOffset.getOperand(1);
10093 uint32_t SOffset, ImmOffset;
10094 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10095 if (Offset >= 0 &&
10096 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
10097 Offsets[0] = N0;
10098 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10099 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10100 return;
10101 }
10102 }
10103
10104 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
10105 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
10106 : DAG.getConstant(0, DL, MVT::i32);
10107
10108 Offsets[0] = CombinedOffset;
10109 Offsets[1] = SOffsetZero;
10110 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
10111}
10112
10113SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
10114 SelectionDAG &DAG) const {
10115 if (!MaybePointer.getValueType().isScalarInteger())
10116 return MaybePointer;
10117
10118 SDLoc DL(MaybePointer);
10119
10120 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
10121 return Rsrc;
10122}
10123
10124// Wrap a global or flat pointer into a buffer intrinsic using the flags
10125// specified in the intrinsic.
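// Shape of the v4i32 built below: word0 = low 32 bits of the base pointer,
// word1 = high 16 bits of the pointer with the 16-bit stride placed in its
// upper half, word2 = NumRecords, word3 = the caller-supplied flags. This is
// a summary of the code that follows, not a full V# field reference.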
10126SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
10127 SelectionDAG &DAG) const {
10128 SDLoc Loc(Op);
10129
10130 SDValue Pointer = Op->getOperand(1);
10131 SDValue Stride = Op->getOperand(2);
10132 SDValue NumRecords = Op->getOperand(3);
10133 SDValue Flags = Op->getOperand(4);
10134
10135 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
10136 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
10137 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
10138 std::optional<uint32_t> ConstStride = std::nullopt;
10139 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
10140 ConstStride = ConstNode->getZExtValue();
10141
10142 SDValue NewHighHalf = Masked;
10143 if (!ConstStride || *ConstStride != 0) {
10144 SDValue ShiftedStride;
10145 if (ConstStride) {
10146 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
10147 } else {
10148 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
10149 ShiftedStride =
10150 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
10151 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
10152 }
10153 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
10154 }
10155
10156 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
10157 NewHighHalf, NumRecords, Flags);
10158 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
10159 return RsrcPtr;
10160}
10161
10162// Handle 8 bit and 16 bit buffer loads
10163SDValue
10164SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT,
10165 SDLoc DL, ArrayRef<SDValue> Ops,
10166 MachineMemOperand *MMO) const {
10167 EVT IntVT = LoadVT.changeTypeToInteger();
10168 unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
10169 AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;
10170
10171 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
10172 SDValue BufferLoad =
10173 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
10174 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
10175 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
10176
10177 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
10178}
10179
10180// Handle 8 bit and 16 bit buffer stores
10181SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
10182 EVT VDataType, SDLoc DL,
10183 SDValue Ops[],
10184 MemSDNode *M) const {
10185 if (VDataType == MVT::f16)
10186 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
10187
10188 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
10189 Ops[1] = BufferStoreExt;
10190 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
10191 AMDGPUISD::BUFFER_STORE_SHORT;
10192 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
10193 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
10194 M->getMemOperand());
10195}
10196
10197 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
10198 ISD::LoadExtType ExtType, SDValue Op,
10199 const SDLoc &SL, EVT VT) {
10200 if (VT.bitsLT(Op.getValueType()))
10201 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
10202
10203 switch (ExtType) {
10204 case ISD::SEXTLOAD:
10205 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
10206 case ISD::ZEXTLOAD:
10207 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
10208 case ISD::EXTLOAD:
10209 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
10210 case ISD::NON_EXTLOAD:
10211 return Op;
10212 }
10213
10214 llvm_unreachable("invalid ext type");
10215}
10216
10217// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
10218// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
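// For example (illustrative): a naturally aligned, uniform extload of i16 from
// constant address space is rewritten as a full i32 load; the low bits are then
// masked or sign-extended back out below, so the scalar unit can issue a dword
// load instead of a sub-dword access.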
10219SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
10220 SelectionDAG &DAG = DCI.DAG;
10221 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
10222 return SDValue();
10223
10224 // FIXME: Constant loads should all be marked invariant.
10225 unsigned AS = Ld->getAddressSpace();
10226 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
10227 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
10228 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
10229 return SDValue();
10230
10231 // Don't do this early, since it may interfere with adjacent load merging for
10232 // illegal types. We can avoid losing alignment information for exotic types
10233 // pre-legalize.
10234 EVT MemVT = Ld->getMemoryVT();
10235 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
10236 MemVT.getSizeInBits() >= 32)
10237 return SDValue();
10238
10239 SDLoc SL(Ld);
10240
10241 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
10242 "unexpected vector extload");
10243
10244 // TODO: Drop only high part of range.
10245 SDValue Ptr = Ld->getBasePtr();
10246 SDValue NewLoad = DAG.getLoad(
10247 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
10248 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
10249 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
10250 nullptr); // Drop ranges
10251
10252 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
10253 if (MemVT.isFloatingPoint()) {
10254 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
10255 "unexpected fp extload");
10256 TruncVT = MemVT.changeTypeToInteger();
10257 }
10258
10259 SDValue Cvt = NewLoad;
10260 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
10261 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
10262 DAG.getValueType(TruncVT));
10263 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
10264 Ld->getExtensionType() == ISD::EXTLOAD) {
10265 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
10266 } else {
10267 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD);
10268 }
10269
10270 EVT VT = Ld->getValueType(0);
10271 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
10272
10273 DCI.AddToWorklist(Cvt.getNode());
10274
10275 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
10276 // the appropriate extension from the 32-bit load.
10277 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
10278 DCI.AddToWorklist(Cvt.getNode());
10279
10280 // Handle conversion back to floating point if necessary.
10281 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
10282
10283 return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
10284}
10285
10286 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
10287 const SIMachineFunctionInfo &Info) {
10288 // TODO: Should check if the address can definitely not access stack.
10289 if (Info.isEntryFunction())
10290 return Info.getUserSGPRInfo().hasFlatScratchInit();
10291 return true;
10292}
10293
10294SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
10295 SDLoc DL(Op);
10296 LoadSDNode *Load = cast<LoadSDNode>(Op);
10297 ISD::LoadExtType ExtType = Load->getExtensionType();
10298 EVT MemVT = Load->getMemoryVT();
10299
10300 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
10301 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
10302 return SDValue();
10303
10304 // FIXME: Copied from PPC
10305 // First, load into 32 bits, then truncate to 1 bit.
10306
10307 SDValue Chain = Load->getChain();
10308 SDValue BasePtr = Load->getBasePtr();
10309 MachineMemOperand *MMO = Load->getMemOperand();
10310
10311 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10312
10313 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
10314 BasePtr, RealMemVT, MMO);
10315
10316 if (!MemVT.isVector()) {
10317 SDValue Ops[] = {
10318 DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
10319 NewLD.getValue(1)
10320 };
10321
10322 return DAG.getMergeValues(Ops, DL);
10323 }
10324
10325 SmallVector<SDValue, 3> Elts;
10326 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
10327 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
10328 DAG.getConstant(I, DL, MVT::i32));
10329
10330 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
10331 }
10332
10333 SDValue Ops[] = {
10334 DAG.getBuildVector(MemVT, DL, Elts),
10335 NewLD.getValue(1)
10336 };
10337
10338 return DAG.getMergeValues(Ops, DL);
10339 }
10340
10341 if (!MemVT.isVector())
10342 return SDValue();
10343
10344 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
10345 "Custom lowering for non-i32 vectors hasn't been implemented.");
10346
10347 Align Alignment = Load->getAlign();
10348 unsigned AS = Load->getAddressSpace();
10349 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
10350 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
10351 return SplitVectorLoad(Op, DAG);
10352 }
10353
10354 MachineFunction &MF = DAG.getMachineFunction();
10355 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10356 // If there is a possibility that flat instruction access scratch memory
10357 // then we need to use the same legalization rules we use for private.
10358 if (AS == AMDGPUAS::FLAT_ADDRESS &&
10359 !Subtarget->hasMultiDwordFlatScratchAddressing())
10360 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI) ?
10361 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
10362
10363 unsigned NumElements = MemVT.getVectorNumElements();
10364
10365 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10366 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
10367 if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
10368 if (MemVT.isPow2VectorType() ||
10369 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10370 return SDValue();
10371 return WidenOrSplitVectorLoad(Op, DAG);
10372 }
10373 // Non-uniform loads will be selected to MUBUF instructions, so they
10374 // have the same legalization requirements as global and private
10375 // loads.
10376 //
10377 }
10378
10379 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10380 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10381 AS == AMDGPUAS::GLOBAL_ADDRESS) {
10382 if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
10383 Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) &&
10384 Alignment >= Align(4) && NumElements < 32) {
10385 if (MemVT.isPow2VectorType() ||
10386 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10387 return SDValue();
10388 return WidenOrSplitVectorLoad(Op, DAG);
10389 }
10390 // Non-uniform loads will be selected to MUBUF instructions, so they
10391 // have the same legalization requirements as global and private
10392 // loads.
10393 //
10394 }
10395 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10396 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10397 AS == AMDGPUAS::GLOBAL_ADDRESS ||
10398 AS == AMDGPUAS::FLAT_ADDRESS) {
10399 if (NumElements > 4)
10400 return SplitVectorLoad(Op, DAG);
10401 // v3 loads not supported on SI.
10402 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10403 return WidenOrSplitVectorLoad(Op, DAG);
10404
10405 // v3 and v4 loads are supported for private and global memory.
10406 return SDValue();
10407 }
10408 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10409 // Depending on the setting of the private_element_size field in the
10410 // resource descriptor, we can only make private accesses up to a certain
10411 // size.
10412 switch (Subtarget->getMaxPrivateElementSize()) {
10413 case 4: {
10414 SDValue Ops[2];
10415 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
10416 return DAG.getMergeValues(Ops, DL);
10417 }
10418 case 8:
10419 if (NumElements > 2)
10420 return SplitVectorLoad(Op, DAG);
10421 return SDValue();
10422 case 16:
10423 // Same as global/flat
10424 if (NumElements > 4)
10425 return SplitVectorLoad(Op, DAG);
10426 // v3 loads not supported on SI.
10427 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10428 return WidenOrSplitVectorLoad(Op, DAG);
10429
10430 return SDValue();
10431 default:
10432 llvm_unreachable("unsupported private_element_size");
10433 }
10434 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10435 unsigned Fast = 0;
10436 auto Flags = Load->getMemOperand()->getFlags();
10437 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
10438 Load->getAlign(), Flags, &Fast) &&
10439 Fast > 1)
10440 return SDValue();
10441
10442 if (MemVT.isVector())
10443 return SplitVectorLoad(Op, DAG);
10444 }
10445
10446 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10447 MemVT, *Load->getMemOperand())) {
10448 SDValue Ops[2];
10449 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
10450 return DAG.getMergeValues(Ops, DL);
10451 }
10452
10453 return SDValue();
10454}
10455
10456SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
10457 EVT VT = Op.getValueType();
10458 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
10459 VT.getSizeInBits() == 512)
10460 return splitTernaryVectorOp(Op, DAG);
10461
10462 assert(VT.getSizeInBits() == 64);
10463
10464 SDLoc DL(Op);
10465 SDValue Cond = Op.getOperand(0);
10466
10467 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
10468 SDValue One = DAG.getConstant(1, DL, MVT::i32);
10469
10470 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10471 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
10472
10473 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
10474 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
10475
10476 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
10477
10478 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
10479 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
10480
10481 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
10482
10483 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
10484 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
10485}
10486
10487// Catch division cases where we can use shortcuts with rcp and rsq
10488// instructions.
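// Illustrative folds performed below, only when the fast-math conditions
// checked in this function are satisfied:
//   1.0 / x  -> rcp(x)
//  -1.0 / x  -> rcp(fneg x)
//   x / y    -> x * rcp(y)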
10489SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
10490 SelectionDAG &DAG) const {
10491 SDLoc SL(Op);
10492 SDValue LHS = Op.getOperand(0);
10493 SDValue RHS = Op.getOperand(1);
10494 EVT VT = Op.getValueType();
10495 const SDNodeFlags Flags = Op->getFlags();
10496
10497 bool AllowInaccurateRcp = Flags.hasApproximateFuncs() ||
10498 DAG.getTarget().Options.UnsafeFPMath;
10499
10500 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
10501 // Without !fpmath accuracy information, we can't do more because we don't
10502 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
10503 // f16 is always accurate enough
10504 if (!AllowInaccurateRcp && VT != MVT::f16)
10505 return SDValue();
10506
10507 if (CLHS->isExactlyValue(1.0)) {
10508 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
10509 // the CI documentation have a worst case error of 1 ulp.
10510 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
10511 // use it as long as we aren't trying to use denormals.
10512 //
10513 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
10514
10515 // 1.0 / sqrt(x) -> rsq(x)
10516
10517 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
10518 // error seems really high at 2^29 ULP.
10519 // 1.0 / x -> rcp(x)
10520 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10521 }
10522
10523 // Same as for 1.0, but expand the sign out of the constant.
10524 if (CLHS->isExactlyValue(-1.0)) {
10525 // -1.0 / x -> rcp (fneg x)
10526 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
10527 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
10528 }
10529 }
10530
10531 // For f16 require afn or arcp.
10532 // For f32 require afn.
10533 if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
10534 return SDValue();
10535
10536 // Turn into multiply by the reciprocal.
10537 // x / y -> x * (1.0 / y)
10538 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10539 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
10540}
10541
10542SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
10543 SelectionDAG &DAG) const {
10544 SDLoc SL(Op);
10545 SDValue X = Op.getOperand(0);
10546 SDValue Y = Op.getOperand(1);
10547 EVT VT = Op.getValueType();
10548 const SDNodeFlags Flags = Op->getFlags();
10549
10550 bool AllowInaccurateDiv = Flags.hasApproximateFuncs() ||
10551 DAG.getTarget().Options.UnsafeFPMath;
10552 if (!AllowInaccurateDiv)
10553 return SDValue();
10554
10555 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
10556 SDValue One = DAG.getConstantFP(1.0, SL, VT);
10557
10558 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
10559 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10560
10561 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
10562 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10563 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
10564 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
10565 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
10566 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
10567}
10568
10569static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10570 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
10571 SDNodeFlags Flags) {
10572 if (GlueChain->getNumValues() <= 1) {
10573 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
10574 }
10575
10576 assert(GlueChain->getNumValues() == 3);
10577
10578 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10579 switch (Opcode) {
10580 default: llvm_unreachable("no chain equivalent for opcode");
10581 case ISD::FMUL:
10582 Opcode = AMDGPUISD::FMUL_W_CHAIN;
10583 break;
10584 }
10585
10586 return DAG.getNode(Opcode, SL, VTList,
10587 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
10588 Flags);
10589}
10590
10591static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10592 EVT VT, SDValue A, SDValue B, SDValue C,
10593 SDValue GlueChain, SDNodeFlags Flags) {
10594 if (GlueChain->getNumValues() <= 1) {
10595 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
10596 }
10597
10598 assert(GlueChain->getNumValues() == 3);
10599
10600 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10601 switch (Opcode) {
10602 default: llvm_unreachable("no chain equivalent for opcode");
10603 case ISD::FMA:
10604 Opcode = AMDGPUISD::FMA_W_CHAIN;
10605 break;
10606 }
10607
10608 return DAG.getNode(Opcode, SL, VTList,
10609 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
10610 Flags);
10611}
10612
10613SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
10614 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10615 return FastLowered;
10616
10617 SDLoc SL(Op);
10618 SDValue Src0 = Op.getOperand(0);
10619 SDValue Src1 = Op.getOperand(1);
10620
10621 SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
10622 SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
10623
10624 SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
10625 SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
10626
10627 SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
10628 SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
10629
10630 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
10631}
10632
10633// Faster 2.5 ULP division that does not support denormals.
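// Sketch of the scaling below: when |RHS| is very large (> 2^96) the
// denominator is pre-scaled by 2^-32 so v_rcp_f32 stays in range, and the
// quotient is multiplied by the same 2^-32 factor afterwards, so the scale
// cancels: r3 * (LHS * rcp(RHS * r3)) ~= LHS / RHS.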
10634SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
10635 SDNodeFlags Flags = Op->getFlags();
10636 SDLoc SL(Op);
10637 SDValue LHS = Op.getOperand(1);
10638 SDValue RHS = Op.getOperand(2);
10639
10640 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
10641
10642 const APFloat K0Val(0x1p+96f);
10643 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
10644
10645 const APFloat K1Val(0x1p-32f);
10646 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
10647
10648 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10649
10650 EVT SetCCVT =
10651 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
10652
10653 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
10654
10655 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
10656
10657 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
10658
10659 // rcp does not support denormals.
10660 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
10661
10662 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
10663
10664 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
10665}
10666
10667// Returns immediate value for setting the F32 denorm mode when using the
10668// S_DENORM_MODE instruction.
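// As built below, the immediate packs the requested FP32 denorm mode into bits
// [1:0] and keeps the function's current FP64/FP16 denorm mode in bits [3:2],
// matching the 4-bit operand of S_DENORM_MODE (see
// SPDenormMode | (DPDenormModeDefault << 2)).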
10669 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
10670 const SIMachineFunctionInfo *Info,
10671 const GCNSubtarget *ST) {
10672 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
10673 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
10674 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
10675 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
10676}
10677
10678SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
10679 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10680 return FastLowered;
10681
10682 // The selection matcher assumes anything with a chain selecting to a
10683 // mayRaiseFPException machine instruction. Since we're introducing a chain
10684 // here, we need to explicitly report nofpexcept for the regular fdiv
10685 // lowering.
10686 SDNodeFlags Flags = Op->getFlags();
10687 Flags.setNoFPExcept(true);
10688
10689 SDLoc SL(Op);
10690 SDValue LHS = Op.getOperand(0);
10691 SDValue RHS = Op.getOperand(1);
10692
10693 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10694
10695 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
10696
10697 SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
10698 {RHS, RHS, LHS}, Flags);
10699 SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
10700 {LHS, RHS, LHS}, Flags);
10701
10702 // Denominator is scaled to not be denormal, so using rcp is ok.
10703 SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
10704 DenominatorScaled, Flags);
10705 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
10706 DenominatorScaled, Flags);
10707
10708 using namespace AMDGPU::Hwreg;
10709 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
10710 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
10711
10712 const MachineFunction &MF = DAG.getMachineFunction();
10713 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
10714 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
10715
10716 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
10717 const bool HasDynamicDenormals =
10718 (DenormMode.Input == DenormalMode::Dynamic) ||
10719 (DenormMode.Output == DenormalMode::Dynamic);
10720
10721 SDValue SavedDenormMode;
10722
10723 if (!PreservesDenormals) {
10724 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
10725 // lowering. The chain dependence is insufficient, and we need glue. We do
10726 // not need the glue variants in a strictfp function.
10727
10728 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
10729
10730 SDValue Glue = DAG.getEntryNode();
10731 if (HasDynamicDenormals) {
10732 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
10733 DAG.getVTList(MVT::i32, MVT::Glue),
10734 {BitField, Glue});
10735 SavedDenormMode = SDValue(GetReg, 0);
10736
10737 Glue = DAG.getMergeValues(
10738 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
10739 }
10740
10741 SDNode *EnableDenorm;
10742 if (Subtarget->hasDenormModeInst()) {
10743 const SDValue EnableDenormValue =
10744 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
10745
10746 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
10747 EnableDenormValue)
10748 .getNode();
10749 } else {
10750 const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
10751 SL, MVT::i32);
10752 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
10753 {EnableDenormValue, BitField, Glue});
10754 }
10755
10756 SDValue Ops[3] = {
10757 NegDivScale0,
10758 SDValue(EnableDenorm, 0),
10759 SDValue(EnableDenorm, 1)
10760 };
10761
10762 NegDivScale0 = DAG.getMergeValues(Ops, SL);
10763 }
10764
10765 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
10766 ApproxRcp, One, NegDivScale0, Flags);
10767
10768 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
10769 ApproxRcp, Fma0, Flags);
10770
10771 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
10772 Fma1, Fma1, Flags);
10773
10774 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
10775 NumeratorScaled, Mul, Flags);
10776
10777 SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32,
10778 Fma2, Fma1, Mul, Fma2, Flags);
10779
10780 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
10781 NumeratorScaled, Fma3, Flags);
10782
10783 if (!PreservesDenormals) {
10784 SDNode *DisableDenorm;
10785 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
10786 const SDValue DisableDenormValue = getSPDenormModeValue(
10787 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
10788
10789 DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other,
10790 Fma4.getValue(1), DisableDenormValue,
10791 Fma4.getValue(2)).getNode();
10792 } else {
10793 assert(HasDynamicDenormals == (bool)SavedDenormMode);
10794 const SDValue DisableDenormValue =
10795 HasDynamicDenormals
10796 ? SavedDenormMode
10797 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
10798
10799 DisableDenorm = DAG.getMachineNode(
10800 AMDGPU::S_SETREG_B32, SL, MVT::Other,
10801 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
10802 }
10803
10804 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
10805 SDValue(DisableDenorm, 0), DAG.getRoot());
10806 DAG.setRoot(OutputChain);
10807 }
10808
10809 SDValue Scale = NumeratorScaled.getValue(1);
10810 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
10811 {Fma4, Fma1, Fma3, Scale}, Flags);
10812
10813 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
10814}
10815
10816SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
10817 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
10818 return FastLowered;
10819
10820 SDLoc SL(Op);
10821 SDValue X = Op.getOperand(0);
10822 SDValue Y = Op.getOperand(1);
10823
10824 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
10825
10826 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
10827
10828 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
10829
10830 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
10831
10832 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
10833
10834 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
10835
10836 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
10837
10838 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
10839
10840 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
10841
10842 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
10843 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
10844
10845 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
10846 NegDivScale0, Mul, DivScale1);
10847
10848 SDValue Scale;
10849
10850 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
10851 // Workaround a hardware bug on SI where the condition output from div_scale
10852 // is not usable.
10853
10854 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
10855
10856 // Figure out which scale to use for div_fmas.
10857 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
10858 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
10859 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
10860 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
10861
10862 SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
10863 SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
10864
10865 SDValue Scale0Hi
10866 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
10867 SDValue Scale1Hi
10868 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
10869
10870 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
10871 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
10872 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
10873 } else {
10874 Scale = DivScale1.getValue(1);
10875 }
10876
10877 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
10878 Fma4, Fma3, Mul, Scale);
10879
10880 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
10881}
10882
10883SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
10884 EVT VT = Op.getValueType();
10885
10886 if (VT == MVT::f32)
10887 return LowerFDIV32(Op, DAG);
10888
10889 if (VT == MVT::f64)
10890 return LowerFDIV64(Op, DAG);
10891
10892 if (VT == MVT::f16)
10893 return LowerFDIV16(Op, DAG);
10894
10895 llvm_unreachable("Unexpected type for fdiv");
10896}
10897
10898SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
10899 SDLoc dl(Op);
10900 SDValue Val = Op.getOperand(0);
10901 EVT VT = Val.getValueType();
10902 EVT ResultExpVT = Op->getValueType(1);
10903 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
10904
10905 SDValue Mant = DAG.getNode(
10906 ISD::INTRINSIC_WO_CHAIN, dl, VT,
10907 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
10908
10909 SDValue Exp = DAG.getNode(
10910 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
10911 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
10912
10913 if (Subtarget->hasFractBug()) {
10914 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
10915 SDValue Inf = DAG.getConstantFP(
10916 APFloat::getInf(SelectionDAG::EVTToAPFloatSemantics(VT)), dl, VT);
10917
10918 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
10919 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
10920 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
10921 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
10922 }
10923
10924 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
10925 return DAG.getMergeValues({Mant, CastExp}, dl);
10926}
10927
10928SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
10929 SDLoc DL(Op);
10930 StoreSDNode *Store = cast<StoreSDNode>(Op);
10931 EVT VT = Store->getMemoryVT();
10932
10933 if (VT == MVT::i1) {
10934 return DAG.getTruncStore(Store->getChain(), DL,
10935 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
10936 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
10937 }
10938
10939 assert(VT.isVector() &&
10940 Store->getValue().getValueType().getScalarType() == MVT::i32);
10941
10942 unsigned AS = Store->getAddressSpace();
10943 if (Subtarget->hasLDSMisalignedBug() &&
10944 AS == AMDGPUAS::FLAT_ADDRESS &&
10945 Store->getAlign().value() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
10946 return SplitVectorStore(Op, DAG);
10947 }
10948
10949 MachineFunction &MF = DAG.getMachineFunction();
10950 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10951 // If there is a possibility that flat instruction access scratch memory
10952 // then we need to use the same legalization rules we use for private.
10953 if (AS == AMDGPUAS::FLAT_ADDRESS &&
10954 !Subtarget->hasMultiDwordFlatScratchAddressing())
10955 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI) ?
10956 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
10957
10958 unsigned NumElements = VT.getVectorNumElements();
10959 if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
10960 AS == AMDGPUAS::FLAT_ADDRESS) {
10961 if (NumElements > 4)
10962 return SplitVectorStore(Op, DAG);
10963 // v3 stores not supported on SI.
10964 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10965 return SplitVectorStore(Op, DAG);
10966
10967 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10968 VT, *Store->getMemOperand()))
10969 return expandUnalignedStore(Store, DAG);
10970
10971 return SDValue();
10972 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10973 switch (Subtarget->getMaxPrivateElementSize()) {
10974 case 4:
10975 return scalarizeVectorStore(Store, DAG);
10976 case 8:
10977 if (NumElements > 2)
10978 return SplitVectorStore(Op, DAG);
10979 return SDValue();
10980 case 16:
10981 if (NumElements > 4 ||
10982 (NumElements == 3 && !Subtarget->enableFlatScratch()))
10983 return SplitVectorStore(Op, DAG);
10984 return SDValue();
10985 default:
10986 llvm_unreachable("unsupported private_element_size");
10987 }
10988 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10989 unsigned Fast = 0;
10990 auto Flags = Store->getMemOperand()->getFlags();
10991 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
10992 Store->getAlign(), Flags, &Fast) &&
10993 Fast > 1)
10994 return SDValue();
10995
10996 if (VT.isVector())
10997 return SplitVectorStore(Op, DAG);
10998
10999 return expandUnalignedStore(Store, DAG);
11000 }
11001
11002 // Probably an invalid store. If so we'll end up emitting a selection error.
11003 return SDValue();
11004}
11005
11006// Avoid the full correct expansion for f32 sqrt when promoting from f16.
11007SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
11008 SDLoc SL(Op);
11009 assert(!Subtarget->has16BitInsts());
11010 SDNodeFlags Flags = Op->getFlags();
11011 SDValue Ext =
11012 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
11013
11014 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
11015 SDValue Sqrt =
11016 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
11017
11018 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
11019 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
11020}
11021
11022SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
11023 SDLoc DL(Op);
11024 SDNodeFlags Flags = Op->getFlags();
11025 MVT VT = Op.getValueType().getSimpleVT();
11026 const SDValue X = Op.getOperand(0);
11027
11028 if (allowApproxFunc(DAG, Flags)) {
11029 // Instruction is 1ulp but ignores denormals.
11030 return DAG.getNode(
11031 ISD::INTRINSIC_WO_CHAIN, DL, VT,
11032 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
11033 }
11034
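// Scaling sketch: inputs below 2^-96 are multiplied by 2^32 so the refinement
// below operates on a value in the normal range; since sqrt(x * 2^32) ==
// sqrt(x) * 2^16, the result is compensated with the 2^-16 factor further down.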
11035 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
11036 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
11037
11038 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
11039
11040 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
11041
11042 SDValue SqrtX =
11043 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
11044
11045 SDValue SqrtS;
11046 if (needsDenormHandlingF32(DAG, X, Flags)) {
11047 SDValue SqrtID =
11048 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
11049 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
11050
11051 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
11052 SDValue SqrtSNextDownInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11053 DAG.getConstant(-1, DL, MVT::i32));
11054 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
11055
11056 SDValue NegSqrtSNextDown =
11057 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
11058
11059 SDValue SqrtVP =
11060 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
11061
11062 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11063 DAG.getConstant(1, DL, MVT::i32));
11064 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
11065
11066 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
11067 SDValue SqrtVS =
11068 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
11069
11070 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
11071 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
11072
11073 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
11074 Flags);
11075
11076 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
11077 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
11078 Flags);
11079 } else {
11080 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
11081
11082 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
11083
11084 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
11085 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
11086 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
11087
11088 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
11089 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
11090 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
11091
11092 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
11093 SDValue SqrtD =
11094 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
11095 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
11096 }
11097
11098 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
11099
11100 SDValue ScaledDown =
11101 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
11102
11103 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
11104 SDValue IsZeroOrInf =
11105 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11106 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11107
11108 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
11109}
11110
11111SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
11112 // For double type, the SQRT and RSQ instructions don't have the required
11113 // precision, so we apply Goldschmidt's algorithm to improve the result:
11114 //
11115 // y0 = rsq(x)
11116 // g0 = x * y0
11117 // h0 = 0.5 * y0
11118 //
11119 // r0 = 0.5 - h0 * g0
11120 // g1 = g0 * r0 + g0
11121 // h1 = h0 * r0 + h0
11122 //
11123 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
11124 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
11125 // h2 = h1 * r1 + h1
11126 //
11127 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
11128 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
11129 //
11130 // sqrt(x) = g3
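//
// For reference, the same iteration written as plain scalar code (an
// illustrative sketch only; the DAG built below additionally handles the
// ldexp-based scaling of very small inputs and the +/-0 / +inf cases):
//
//   double goldschmidt_sqrt(double x) {
//     double y0 = 1.0 / std::sqrt(x);   // stands in for the rsq approximation
//     double g = x * y0, h = 0.5 * y0;  // g0, h0
//     double r = std::fma(-h, g, 0.5);  // r0
//     g = std::fma(g, r, g);            // g1
//     h = std::fma(h, r, h);            // h1
//     double d = std::fma(-g, g, x);    // d0
//     g = std::fma(d, h, g);            // g2
//     d = std::fma(-g, g, x);           // d1
//     return std::fma(d, h, g);         // g3 = sqrt(x)
//   }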
11131
11132 SDNodeFlags Flags = Op->getFlags();
11133
11134 SDLoc DL(Op);
11135
11136 SDValue X = Op.getOperand(0);
11137 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
11138
11139 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
11140
11141 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
11142
11143 // Scale up input if it is too small.
11144 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
11145 SDValue ScaleUp =
11146 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
11147 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
11148
11149 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
11150
11151 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
11152
11153 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
11154 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
11155
11156 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
11157 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
11158
11159 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
11160
11161 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
11162
11163 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
11164 SDValue SqrtD0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
11165
11166 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
11167
11168 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
11169 SDValue SqrtD1 =
11170 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
11171
11172 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
11173
11174 SDValue ScaleDownFactor = DAG.getConstant(-128, DL, MVT::i32);
11175 SDValue ScaleDown =
11176 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
11177 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
11178
11179 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
11180 // with finite only or nsz because rsq(+/-0) = +/-inf
11181
11182 // TODO: Check for DAZ and expand to subnormals
11183 SDValue IsZeroOrInf =
11184 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11185 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11186
11187 // If x is +INF, +0, or -0, use its original value
11188 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
11189 Flags);
11190}
11191
11192SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
11193 SDLoc DL(Op);
11194 EVT VT = Op.getValueType();
11195 SDValue Arg = Op.getOperand(0);
11196 SDValue TrigVal;
11197
11198 // Propagate fast-math flags so that the multiply we introduce can be folded
11199 // if Arg is already the result of a multiply by constant.
11200 auto Flags = Op->getFlags();
11201
11202 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
11203
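// The hardware SIN/COS take their argument in revolutions (fractions of a full
// 2*pi period), which is why the input is pre-multiplied by 1/(2*pi) here; on
// subtargets with a reduced trig range, FRACT performs the range reduction.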
11204 if (Subtarget->hasTrigReducedRange()) {
11205 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11206 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
11207 } else {
11208 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11209 }
11210
11211 switch (Op.getOpcode()) {
11212 case ISD::FCOS:
11213 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
11214 case ISD::FSIN:
11215 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
11216 default:
11217 llvm_unreachable("Wrong trig opcode");
11218 }
11219}
11220
11221SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
11222 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
11223 assert(AtomicNode->isCompareAndSwap());
11224 unsigned AS = AtomicNode->getAddressSpace();
11225
11226 // No custom lowering required for local address space
11227 if (!AMDGPU::isFlatGlobalAddrSpace(AS))
11228 return Op;
11229
11230 // Non-local address space requires custom lowering for atomic compare
11231 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
11232 SDLoc DL(Op);
11233 SDValue ChainIn = Op.getOperand(0);
11234 SDValue Addr = Op.getOperand(1);
11235 SDValue Old = Op.getOperand(2);
11236 SDValue New = Op.getOperand(3);
11237 EVT VT = Op.getValueType();
11238 MVT SimpleVT = VT.getSimpleVT();
11239 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
11240
11241 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
11242 SDValue Ops[] = { ChainIn, Addr, NewOld };
11243
11244 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
11245 Ops, VT, AtomicNode->getMemOperand());
11246}
11247
11248//===----------------------------------------------------------------------===//
11249// Custom DAG optimizations
11250//===----------------------------------------------------------------------===//
11251
11252SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
11253 DAGCombinerInfo &DCI) const {
11254 EVT VT = N->getValueType(0);
11255 EVT ScalarVT = VT.getScalarType();
11256 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11257 return SDValue();
11258
11259 SelectionDAG &DAG = DCI.DAG;
11260 SDLoc DL(N);
11261
11262 SDValue Src = N->getOperand(0);
11263 EVT SrcVT = Src.getValueType();
11264
11265 // TODO: We could try to match extracting the higher bytes, which would be
11266 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
11267 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
11268 // about in practice.
11269 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11270 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
11271 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
11272 DCI.AddToWorklist(Cvt.getNode());
11273
11274 // For the f16 case, fold to a cast to f32 and then cast back to f16.
11275 if (ScalarVT != MVT::f32) {
11276 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
11277 DAG.getTargetConstant(0, DL, MVT::i32));
11278 }
11279 return Cvt;
11280 }
11281 }
11282
11283 return SDValue();
11284}
11285
11286SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
11287 DAGCombinerInfo &DCI) const {
11288 SDValue MagnitudeOp = N->getOperand(0);
11289 SDValue SignOp = N->getOperand(1);
11290 SelectionDAG &DAG = DCI.DAG;
11291 SDLoc DL(N);
11292
11293 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
11294 // lower half with a copy.
11295 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11296 if (MagnitudeOp.getValueType() == MVT::f64) {
11297 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
11298 SDValue MagLo =
11299 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11300 DAG.getConstant(0, DL, MVT::i32));
11301 SDValue MagHi =
11302 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11303 DAG.getConstant(1, DL, MVT::i32));
11304
11305 SDValue HiOp =
11306 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
11307
11308 SDValue Vector = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
11309
11310 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
11311 }
11312
11313 if (SignOp.getValueType() != MVT::f64)
11314 return SDValue();
11315
11316 // Reduce width of sign operand, we only need the highest bit.
11317 //
11318 // fcopysign f64:x, f64:y ->
11319 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
11320 // TODO: In some cases it might make sense to go all the way to f16.
11321 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
11322 SDValue SignAsF32 =
11323 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
11324 DAG.getConstant(1, DL, MVT::i32));
11325
11326 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
11327 SignAsF32);
11328}
11329
11330// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
11331// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
11332// bits
11333
11334// This is a variant of
11335// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
11336//
11337// The normal DAG combiner will do this, but only if the add has one use since
11338// that would increase the number of instructions.
11339//
11340// This prevents us from seeing a constant offset that can be folded into a
11341// memory instruction's addressing mode. If we know the resulting add offset of
11342// a pointer can be folded into an addressing offset, we can replace the pointer
11343// operand with the add of new constant offset. This eliminates one of the uses,
11344// and may allow the remaining use to also be simplified.
11345//
11346SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
11347 unsigned AddrSpace,
11348 EVT MemVT,
11349 DAGCombinerInfo &DCI) const {
11350 SDValue N0 = N->getOperand(0);
11351 SDValue N1 = N->getOperand(1);
11352
11353 // We only do this to handle cases where it's profitable when there are
11354 // multiple uses of the add, so defer to the standard combine.
11355 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
11356 N0->hasOneUse())
11357 return SDValue();
11358
11359 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
11360 if (!CN1)
11361 return SDValue();
11362
11363 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
11364 if (!CAdd)
11365 return SDValue();
11366
11367 SelectionDAG &DAG = DCI.DAG;
11368
11369 if (N0->getOpcode() == ISD::OR &&
11370 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
11371 return SDValue();
11372
11373 // If the resulting offset is too large, we can't fold it into the
11374 // addressing mode offset.
11375 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
11376 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
11377
11378 AddrMode AM;
11379 AM.HasBaseReg = true;
11380 AM.BaseOffs = Offset.getSExtValue();
11381 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
11382 return SDValue();
11383
11384 SDLoc SL(N);
11385 EVT VT = N->getValueType(0);
11386
11387 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
11388 SDValue COffset = DAG.getConstant(Offset, SL, VT);
11389
11390 SDNodeFlags Flags;
11391 Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
11392 (N0.getOpcode() == ISD::OR ||
11393 N0->getFlags().hasNoUnsignedWrap()));
11394
11395 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
11396}
11397
11398 /// MemSDNode::getBasePtr() does not work for intrinsics, whose pointer operand
11399 /// is offset by the chain and intrinsic ID. Theoretically we would also need to check the
11400/// specific intrinsic, but they all place the pointer operand first.
11401static unsigned getBasePtrIndex(const MemSDNode *N) {
11402 switch (N->getOpcode()) {
11403 case ISD::STORE:
11404 case ISD::ATOMIC_CMP_SWAP:
11405 case ISD::ATOMIC_STORE:
11406 return 2;
11407 default:
11408 return 1;
11409 }
11410}
11411
11412SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
11413 DAGCombinerInfo &DCI) const {
11414 SelectionDAG &DAG = DCI.DAG;
11415 SDLoc SL(N);
11416
11417 unsigned PtrIdx = getBasePtrIndex(N);
11418 SDValue Ptr = N->getOperand(PtrIdx);
11419
11420 // TODO: We could also do this for multiplies.
11421 if (Ptr.getOpcode() == ISD::SHL) {
11422 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
11423 N->getMemoryVT(), DCI);
11424 if (NewPtr) {
11425 SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
11426
11427 NewOps[PtrIdx] = NewPtr;
11428 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
11429 }
11430 }
11431
11432 return SDValue();
11433}
11434
11435static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
11436 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11437 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11438 (Opc == ISD::XOR && Val == 0);
11439}
11440
11441// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
11442// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
11443// integer combine opportunities since most 64-bit operations are decomposed
11444// this way. TODO: We won't want this for SALU especially if it is an inline
11445// immediate.
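// For example (illustrative): (and x:i64, 0x00000000ffffffff) splits into an
// AND of the low half with -1 (which folds away) and an AND of the high half
// with 0 (which becomes a constant zero), leaving only cheap 32-bit pieces.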
11446SDValue SITargetLowering::splitBinaryBitConstantOp(
11447 DAGCombinerInfo &DCI,
11448 const SDLoc &SL,
11449 unsigned Opc, SDValue LHS,
11450 const ConstantSDNode *CRHS) const {
11451 uint64_t Val = CRHS->getZExtValue();
11452 uint32_t ValLo = Lo_32(Val);
11453 uint32_t ValHi = Hi_32(Val);
11454 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11455
11456 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
11457 bitOpWithConstantIsReducible(Opc, ValHi)) ||
11458 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
11459 // If we need to materialize a 64-bit immediate, it will be split up later
11460 // anyway. Avoid creating the harder to understand 64-bit immediate
11461 // materialization.
11462 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
11463 }
11464
11465 return SDValue();
11466}
11467
11468 bool llvm::isBoolSGPR(SDValue V) {
11469 if (V.getValueType() != MVT::i1)
11470 return false;
11471 switch (V.getOpcode()) {
11472 default:
11473 break;
11474 case ISD::SETCC:
11475 case AMDGPUISD::FP_CLASS:
11476 return true;
11477 case ISD::AND:
11478 case ISD::OR:
11479 case ISD::XOR:
11480 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
11481 }
11482 return false;
11483}
11484
11485// If a constant has all zeroes or all ones within each byte return it.
11486// Otherwise return 0.
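// For example (illustrative): 0x00ff0000 is returned unchanged (byte 2 is all
// ones, the rest all zeroes), while 0x00f00000 returns 0 because byte 2 is only
// partially selected.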
11487 static uint32_t getConstantPermuteMask(uint32_t C) {
11488 // 0xff for any zero byte in the mask
11489 uint32_t ZeroByteMask = 0;
11490 if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
11491 if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
11492 if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
11493 if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
11494 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
11495 if ((NonZeroByteMask & C) != NonZeroByteMask)
11496 return 0; // Partial bytes selected.
11497 return C;
11498}
11499
11500// Check if a node selects whole bytes from its operand 0 starting at a byte
11501// boundary while masking the rest. Returns select mask as in the v_perm_b32
11502// or -1 if not succeeded.
11503// Note byte select encoding:
11504// value 0-3 selects corresponding source byte;
11505// value 0xc selects zero;
11506// value 0xff selects 0xff.
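// For example (illustrative, derived from the cases below):
//   (and x, 0x00ff0000) -> 0x0c020c0c (keep byte 2, zero the others)
//   (shl x, 16)         -> 0x01000c0c (bytes 0-1 of x move to bytes 2-3)
//   (srl x, 16)         -> 0x0c0c0302 (bytes 2-3 of x move to bytes 0-1)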
11507 static uint32_t getPermuteMask(SDValue V) {
11508 assert(V.getValueSizeInBits() == 32);
11509
11510 if (V.getNumOperands() != 2)
11511 return ~0;
11512
11513 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
11514 if (!N1)
11515 return ~0;
11516
11517 uint32_t C = N1->getZExtValue();
11518
11519 switch (V.getOpcode()) {
11520 default:
11521 break;
11522 case ISD::AND:
11523 if (uint32_t ConstMask = getConstantPermuteMask(C))
11524 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
11525 break;
11526
11527 case ISD::OR:
11528 if (uint32_t ConstMask = getConstantPermuteMask(C))
11529 return (0x03020100 & ~ConstMask) | ConstMask;
11530 break;
11531
11532 case ISD::SHL:
11533 if (C % 8)
11534 return ~0;
11535
11536 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
11537
11538 case ISD::SRL:
11539 if (C % 8)
11540 return ~0;
11541
11542 return uint32_t(0x0c0c0c0c03020100ull >> C);
11543 }
11544
11545 return ~0;
11546}
11547
11548SDValue SITargetLowering::performAndCombine(SDNode *N,
11549 DAGCombinerInfo &DCI) const {
11550 if (DCI.isBeforeLegalize())
11551 return SDValue();
11552
11553 SelectionDAG &DAG = DCI.DAG;
11554 EVT VT = N->getValueType(0);
11555 SDValue LHS = N->getOperand(0);
11556 SDValue RHS = N->getOperand(1);
11557
11558
11559 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
11560 if (VT == MVT::i64 && CRHS) {
11561 if (SDValue Split
11562 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
11563 return Split;
11564 }
11565
11566 if (CRHS && VT == MVT::i32) {
11567 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
11568 // nb = number of trailing zeroes in mask
11569 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
11570 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
11571 uint64_t Mask = CRHS->getZExtValue();
11572 unsigned Bits = llvm::popcount(Mask);
11573 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
11574 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
11575 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
11576 unsigned Shift = CShift->getZExtValue();
11577 unsigned NB = CRHS->getAPIntValue().countr_zero();
11578 unsigned Offset = NB + Shift;
11579 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
11580 SDLoc SL(N);
11581 SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
11582 LHS->getOperand(0),
11583 DAG.getConstant(Offset, SL, MVT::i32),
11584 DAG.getConstant(Bits, SL, MVT::i32));
11585 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
11586 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
11587 DAG.getValueType(NarrowVT));
11588 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
11589 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
11590 return Shl;
11591 }
11592 }
11593 }
11594
11595 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
11596 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
11597 isa<ConstantSDNode>(LHS.getOperand(2))) {
11598 uint32_t Sel = getConstantPermuteMask(Mask);
11599 if (!Sel)
11600 return SDValue();
11601
11602 // Select 0xc for all zero bytes
11603 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
11604 SDLoc DL(N);
11605 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
11606 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
11607 }
11608 }
11609
11610 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
11611 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
11612 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
11613 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11614 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
11615
11616 SDValue X = LHS.getOperand(0);
11617 SDValue Y = RHS.getOperand(0);
11618 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
11619 !isTypeLegal(X.getValueType()))
11620 return SDValue();
11621
11622 if (LCC == ISD::SETO) {
11623 if (X != LHS.getOperand(1))
11624 return SDValue();
11625
11626 if (RCC == ISD::SETUNE) {
11627 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
11628 if (!C1 || !C1->isInfinity() || C1->isNegative())
11629 return SDValue();
11630
11631 const uint32_t Mask = SIInstrFlags::N_NORMAL |
11632 SIInstrFlags::P_NORMAL |
11633 SIInstrFlags::N_SUBNORMAL |
11634 SIInstrFlags::P_SUBNORMAL |
11635 SIInstrFlags::N_ZERO |
11636 SIInstrFlags::P_ZERO;
11637
11638 static_assert(((~(SIInstrFlags::S_NAN |
11639 SIInstrFlags::Q_NAN |
11640 SIInstrFlags::N_INFINITY |
11641 SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
11642 "mask not equal");
11643
11644 SDLoc DL(N);
11645 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
11646 X, DAG.getConstant(Mask, DL, MVT::i32));
11647 }
11648 }
11649 }
11650
11651 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
11652 std::swap(LHS, RHS);
11653
11654 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
11655 RHS.hasOneUse()) {
11656 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11657 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
11658 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
11659 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
11660 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
11661 (RHS.getOperand(0) == LHS.getOperand(0) &&
11662 LHS.getOperand(0) == LHS.getOperand(1))) {
11663 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
11664 unsigned NewMask = LCC == ISD::SETO ?
11665 Mask->getZExtValue() & ~OrdMask :
11666 Mask->getZExtValue() & OrdMask;
11667
11668 SDLoc DL(N);
11669 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
11670 DAG.getConstant(NewMask, DL, MVT::i32));
11671 }
11672 }
11673
11674 if (VT == MVT::i32 &&
11675 (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
11676 // and x, (sext cc from i1) => select cc, x, 0
11677 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
11678 std::swap(LHS, RHS);
11679 if (isBoolSGPR(RHS.getOperand(0)))
11680 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
11681 LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
11682 }
11683
11684 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
11685 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11686 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
11687 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
11688 uint32_t LHSMask = getPermuteMask(LHS);
11689 uint32_t RHSMask = getPermuteMask(RHS);
11690 if (LHSMask != ~0u && RHSMask != ~0u) {
11691 // Canonicalize the expression in an attempt to have fewer unique masks
11692 // and therefore fewer registers used to hold the masks.
11693 if (LHSMask > RHSMask) {
11694 std::swap(LHSMask, RHSMask);
11695 std::swap(LHS, RHS);
11696 }
11697
11698 // Select 0xc for each lane used from the source operand. Zero has 0xc in
11699 // the mask, 0xff has 0xff in the mask, and actual lanes are in the 0-3 range.
11700 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11701 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11702
11703 // Check if we need to combine values from two sources within a byte.
11704 if (!(LHSUsedLanes & RHSUsedLanes) &&
11705 // If we select high and lower word keep it for SDWA.
11706 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
11707 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
11708 // Each byte in each mask is either a selector in the 0-3 range, or has
11709 // higher bits set: 0xff selects the constant 0xff and 0x0c selects zero.
11710 // If 0x0c appears in either mask it must stay 0x0c. Otherwise the mask
11711 // byte which is not 0xff wins. By ANDing both masks we get the correct
11712 // result, except that bytes which were 0x0c must be restored to 0x0c.
11713 uint32_t Mask = LHSMask & RHSMask;
11714 for (unsigned I = 0; I < 32; I += 8) {
11715 uint32_t ByteSel = 0xff << I;
11716 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
11717 Mask &= (0x0c << I) & 0xffffffff;
11718 }
11719
11720 // Add 4 to each active LHS lane. It will not affect any existing 0xff
11721 // or 0x0c.
11722 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
11723 SDLoc DL(N);
11724
11725 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
11726 LHS.getOperand(0), RHS.getOperand(0),
11727 DAG.getConstant(Sel, DL, MVT::i32));
11728 }
11729 }
11730 }
11731
11732 return SDValue();
11733}
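// A small worked example of the lane-mask arithmetic used above (values are
// illustrative, not taken from a particular test): with
//   LHSMask = 0x0c0c0100   (dest bytes 0-1 taken from LHS bytes 0-1, rest zero)
// the used-lane computation gives
//   LHSMask & 0x0c0c0c0c        == 0x0c0c0000
//   ~(0x0c0c0000) & 0x0c0c0c0c  == 0x00000c0c  == LHSUsedLanes
// i.e. only the two low byte lanes of the result are produced by the LHS.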
11734
11735 // A key component of v_perm is a mapping between the byte positions of the src
11736 // operands and the byte positions of the dest. To provide such a mapping, we
11737 // need: 1. the node that provides byte x of the dest of the OR, and 2. the byte
11738 // of that node used to provide byte x. calculateByteProvider finds which node
11739 // provides a certain byte of the dest of the OR, and calculateSrcByte takes
11740 // that node and finds an ultimate src and byte position. For example, the
11741 // supported LoadCombine pattern for vector loads is as follows:
11742// t1
11743// or
11744// / \
11745// t2 t3
11746// zext shl
11747// | | \
11748// t4 t5 16
11749// or anyext
11750// / \ |
11751// t6 t7 t8
11752// srl shl or
11753// / | / \ / \
11754// t9 t10 t11 t12 t13 t14
11755// trunc* 8 trunc* 8 and and
11756// | | / | | \
11757// t15 t16 t17 t18 t19 t20
11758// trunc* 255 srl -256
11759// | / \
11760// t15 t15 16
11761//
11762// *In this example, the truncs are from i32->i16
11763//
11764// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
11765// respectively. calculateSrcByte would find (given node) -> ultimate src &
11766 // byte position: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
11767// After finding the mapping, we can combine the tree into vperm t15, t16,
11768// 0x05000407
11769
11770// Find the source and byte position from a node.
11771// \p DestByte is the byte position of the dest of the or that the src
11772// ultimately provides. \p SrcIndex is the byte of the src that maps to this
11773 // byte of the dest of the or. \p Depth tracks how many recursive iterations we have
11774// performed.
11775static const std::optional<ByteProvider<SDValue>>
11776calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
11777 unsigned Depth = 0) {
11778 // We may need to recursively traverse a series of SRLs
11779 if (Depth >= 6)
11780 return std::nullopt;
11781
11782 if (Op.getValueSizeInBits() < 8)
11783 return std::nullopt;
11784
11785 if (Op.getValueType().isVector())
11786 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11787
11788 switch (Op->getOpcode()) {
11789 case ISD::TRUNCATE: {
11790 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11791 }
11792
11793 case ISD::SIGN_EXTEND:
11794 case ISD::ZERO_EXTEND:
11795 case ISD::SIGN_EXTEND_INREG: {
11796 SDValue NarrowOp = Op->getOperand(0);
11797 auto NarrowVT = NarrowOp.getValueType();
11798 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
11799 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11800 NarrowVT = VTSign->getVT();
11801 }
11802 if (!NarrowVT.isByteSized())
11803 return std::nullopt;
11804 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
11805
11806 if (SrcIndex >= NarrowByteWidth)
11807 return std::nullopt;
11808 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11809 }
11810
11811 case ISD::SRA:
11812 case ISD::SRL: {
11813 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11814 if (!ShiftOp)
11815 return std::nullopt;
11816
11817 uint64_t BitShift = ShiftOp->getZExtValue();
11818
11819 if (BitShift % 8 != 0)
11820 return std::nullopt;
11821
11822 SrcIndex += BitShift / 8;
11823
11824 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11825 }
11826
11827 default: {
11828 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11829 }
11830 }
11831 llvm_unreachable("fully handled switch");
11832}
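// For instance (illustrative only): calculateSrcByte on (srl t, 16) with
// SrcIndex = 0 sees a byte-aligned shift of 2 bytes and recurses into t with
// SrcIndex = 2, i.e. byte 0 of (srl t, 16) is ultimately byte 2 of t.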
11833
11834// For a byte position in the result of an Or, traverse the tree and find the
11835// node (and the byte of the node) which ultimately provides this {Or,
11836// BytePosition}. \p Op is the operand we are currently examining. \p Index is
11837// the byte position of the Op that corresponds with the originally requested
11838 // byte of the Or. \p Depth tracks how many recursive iterations we have
11839 // performed. \p StartingIndex is the originally requested byte of the Or.
11840static const std::optional<ByteProvider<SDValue>>
11841calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
11842 unsigned StartingIndex = 0) {
11843 // Finding Src tree of RHS of or typically requires at least 1 additional
11844 // depth
11845 if (Depth > 6)
11846 return std::nullopt;
11847
11848 unsigned BitWidth = Op.getScalarValueSizeInBits();
11849 if (BitWidth % 8 != 0)
11850 return std::nullopt;
11851 if (Index > BitWidth / 8 - 1)
11852 return std::nullopt;
11853
11854 bool IsVec = Op.getValueType().isVector();
11855 switch (Op.getOpcode()) {
11856 case ISD::OR: {
11857 if (IsVec)
11858 return std::nullopt;
11859
11860 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
11861 StartingIndex);
11862 if (!RHS)
11863 return std::nullopt;
11864 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
11865 StartingIndex);
11866 if (!LHS)
11867 return std::nullopt;
11868 // A well formed Or will have two ByteProviders for each byte, one of which
11869 // is constant zero
11870 if (!LHS->isConstantZero() && !RHS->isConstantZero())
11871 return std::nullopt;
11872 if (!LHS || LHS->isConstantZero())
11873 return RHS;
11874 if (!RHS || RHS->isConstantZero())
11875 return LHS;
11876 return std::nullopt;
11877 }
11878
11879 case ISD::AND: {
11880 if (IsVec)
11881 return std::nullopt;
11882
11883 auto BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11884 if (!BitMaskOp)
11885 return std::nullopt;
11886
11887 uint32_t BitMask = BitMaskOp->getZExtValue();
11888 // Bits we expect for our StartingIndex
11889 uint32_t IndexMask = 0xFF << (Index * 8);
11890
11891 if ((IndexMask & BitMask) != IndexMask) {
11892 // If the result of the and partially provides the byte, then it
11893 // is not well formatted
11894 if (IndexMask & BitMask)
11895 return std::nullopt;
11896 return ByteProvider<SDValue>::getConstantZero();
11897 }
11898
11899 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
11900 }
11901
11902 case ISD::FSHR: {
11903 if (IsVec)
11904 return std::nullopt;
11905
11906 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
11907 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
11908 if (!ShiftOp || Op.getValueType().isVector())
11909 return std::nullopt;
11910
11911 uint64_t BitsProvided = Op.getValueSizeInBits();
11912 if (BitsProvided % 8 != 0)
11913 return std::nullopt;
11914
11915 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
11916 if (BitShift % 8)
11917 return std::nullopt;
11918
11919 uint64_t ConcatSizeInBytes = BitsProvided / 4;
11920 uint64_t ByteShift = BitShift / 8;
11921
11922 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
11923 uint64_t BytesProvided = BitsProvided / 8;
11924 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
11925 NewIndex %= BytesProvided;
11926 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
11927 }
11928
11929 case ISD::SRA:
11930 case ISD::SRL: {
11931 if (IsVec)
11932 return std::nullopt;
11933
11934 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11935 if (!ShiftOp)
11936 return std::nullopt;
11937
11938 uint64_t BitShift = ShiftOp->getZExtValue();
11939 if (BitShift % 8)
11940 return std::nullopt;
11941
11942 auto BitsProvided = Op.getScalarValueSizeInBits();
11943 if (BitsProvided % 8 != 0)
11944 return std::nullopt;
11945
11946 uint64_t BytesProvided = BitsProvided / 8;
11947 uint64_t ByteShift = BitShift / 8;
11948 // The dest of the shift has good bytes in the range [0, BytesProvided - ByteShift).
11949 // If the byte we are trying to provide (as tracked by Index) falls in this
11950 // range, then the SRL provides the byte. The byte of interest of the src of
11951 // the SRL is Index + ByteShift.
11952 return BytesProvided - ByteShift > Index
11953 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
11954 Index + ByteShift)
11955 : ByteProvider<SDValue>::getConstantZero();
11956 }
11957
11958 case ISD::SHL: {
11959 if (IsVec)
11960 return std::nullopt;
11961
11962 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11963 if (!ShiftOp)
11964 return std::nullopt;
11965
11966 uint64_t BitShift = ShiftOp->getZExtValue();
11967 if (BitShift % 8 != 0)
11968 return std::nullopt;
11969 uint64_t ByteShift = BitShift / 8;
11970
11971 // If we are shifting by an amount greater than (or equal to)
11972 // the index we are trying to provide, then it provides 0s. If not,
11973 // then these bytes are not definitively 0s, and the corresponding byte
11974 // of interest is byte (Index - ByteShift) of the src.
11975 return Index < ByteShift
11976 ? ByteProvider<SDValue>::getConstantZero()
11977 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
11978 Depth + 1, StartingIndex);
11979 }
11980 case ISD::ANY_EXTEND:
11981 case ISD::SIGN_EXTEND:
11982 case ISD::ZERO_EXTEND:
11983 case ISD::SIGN_EXTEND_INREG:
11984 case ISD::AssertZext:
11985 case ISD::AssertSext: {
11986 if (IsVec)
11987 return std::nullopt;
11988
11989 SDValue NarrowOp = Op->getOperand(0);
11990 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
11991 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
11992 Op->getOpcode() == ISD::AssertZext ||
11993 Op->getOpcode() == ISD::AssertSext) {
11994 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11995 NarrowBitWidth = VTSign->getVT().getSizeInBits();
11996 }
11997 if (NarrowBitWidth % 8 != 0)
11998 return std::nullopt;
11999 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12000
12001 if (Index >= NarrowByteWidth)
12002 return Op.getOpcode() == ISD::ZERO_EXTEND
12003 ? std::optional<ByteProvider<SDValue>>(
12004 ByteProvider<SDValue>::getConstantZero())
12005 : std::nullopt;
12006 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
12007 }
12008
12009 case ISD::TRUNCATE: {
12010 if (IsVec)
12011 return std::nullopt;
12012
12013 uint64_t NarrowByteWidth = BitWidth / 8;
12014
12015 if (NarrowByteWidth >= Index) {
12016 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
12017 StartingIndex);
12018 }
12019
12020 return std::nullopt;
12021 }
12022
12023 case ISD::CopyFromReg: {
12024 if (BitWidth / 8 > Index)
12025 return calculateSrcByte(Op, StartingIndex, Index);
12026
12027 return std::nullopt;
12028 }
12029
12030 case ISD::LOAD: {
12031 auto L = cast<LoadSDNode>(Op.getNode());
12032
12033 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
12034 if (NarrowBitWidth % 8 != 0)
12035 return std::nullopt;
12036 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12037
12038 // If the width of the load does not reach the byte we are trying to provide,
12039 // and it is not a ZEXTLOAD, then the load does not provide the byte in
12040 // question.
12041 if (Index >= NarrowByteWidth) {
12042 return L->getExtensionType() == ISD::ZEXTLOAD
12043 ? std::optional<ByteProvider<SDValue>>(
12044 ByteProvider<SDValue>::getConstantZero())
12045 : std::nullopt;
12046 }
12047
12048 if (NarrowByteWidth > Index) {
12049 return calculateSrcByte(Op, StartingIndex, Index);
12050 }
12051
12052 return std::nullopt;
12053 }
12054
12055 case ISD::BSWAP: {
12056 if (IsVec)
12057 return std::nullopt;
12058
12059 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
12060 Depth + 1, StartingIndex);
12061 }
12062 
12063 case ISD::EXTRACT_VECTOR_ELT: {
12064 auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12065 if (!IdxOp)
12066 return std::nullopt;
12067 auto VecIdx = IdxOp->getZExtValue();
12068 auto ScalarSize = Op.getScalarValueSizeInBits();
12069 if (ScalarSize != 32) {
12070 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
12071 }
12072
12073 return calculateSrcByte(ScalarSize == 32 ? Op : Op.getOperand(0),
12074 StartingIndex, Index);
12075 }
12076
12077 case AMDGPUISD::PERM: {
12078 if (IsVec)
12079 return std::nullopt;
12080
12081 auto PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12082 if (!PermMask)
12083 return std::nullopt;
12084
12085 auto IdxMask =
12086 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
12087 if (IdxMask > 0x07 && IdxMask != 0x0c)
12088 return std::nullopt;
12089
12090 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
12091 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12092
12093 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
12094 : ByteProvider<SDValue>(
12095 ByteProvider<SDValue>::getConstantZero());
12096 }
12097
12098 default: {
12099 return std::nullopt;
12100 }
12101 }
12102
12103 llvm_unreachable("fully handled switch");
12104}
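// A short example of the recursion above (illustrative i32 pattern):
// for (or (srl x, 16), (shl y, 16)),
//   byte 0: the SRL side yields calculateSrcByte(x, ..., 2) and the SHL side
//           yields a constant zero, so byte 0 is byte 2 of x;
//   byte 3: the SRL side yields a constant zero (only bytes 0-1 survive the
//           shift) and the SHL side recurses into y at index 1, so byte 3 is
//           byte 1 of y.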
12105
12106 // Returns true if the Operand is a scalar that was extended or loaded from a 16-bit value.
12107static bool isExtendedFrom16Bits(SDValue &Operand) {
12108
12109 switch (Operand.getOpcode()) {
12110 case ISD::ANY_EXTEND:
12111 case ISD::SIGN_EXTEND:
12112 case ISD::ZERO_EXTEND: {
12113 auto OpVT = Operand.getOperand(0).getValueType();
12114 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
12115 }
12116 case ISD::LOAD: {
12117 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
12118 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12119 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
12120 ExtType == ISD::EXTLOAD) {
12121 auto MemVT = L->getMemoryVT();
12122 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
12123 }
12124 return L->getMemoryVT().getSizeInBits() == 16;
12125 }
12126 default:
12127 return false;
12128 }
12129}
12130
12131 // Returns true if the mask selects consecutive bytes, and the first byte
12132 // begins at an even (16-bit aligned) byte offset from the 0th byte.
12133static bool addresses16Bits(int Mask) {
12134 int Low8 = Mask & 0xff;
12135 int Hi8 = (Mask & 0xff00) >> 8;
12136
12137 assert(Low8 < 8 && Hi8 < 8);
12138 // Are the bytes contiguous in order of increasing addresses?
12139 bool IsConsecutive = (Hi8 - Low8 == 1);
12140 // Is the first byte at a location that is aligned for 16-bit instructions?
12141 // A counterexample is taking 2 consecutive bytes starting at byte 1 (bit 8).
12142 // In that case we would still need code to extract the 16-bit operand, so it
12143 // is better to use the byte-wise i8 v_perm.
12144 bool Is16Aligned = !(Low8 % 2);
12145
12146 return IsConsecutive && Is16Aligned;
12147}
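// For example (illustrative mask values): a mask of 0x0504 has Low8 = 4 and
// Hi8 = 5, which is consecutive and starts at an even byte, so it addresses a
// 16-bit half cleanly; a mask of 0x0201 is also consecutive but starts at the
// odd byte 1, so it does not.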
12148
12149// Do not lower into v_perm if the operands are actually 16 bit
12150// and the selected bits (based on PermMask) correspond with two
12151 // easily addressable 16 bit operands.
12152 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
12153 SDValue &OtherOp) {
12154 int Low16 = PermMask & 0xffff;
12155 int Hi16 = (PermMask & 0xffff0000) >> 16;
12156
12157 auto TempOp = peekThroughBitcasts(Op);
12158 auto TempOtherOp = peekThroughBitcasts(OtherOp);
12159
12160 auto OpIs16Bit =
12161 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
12162 if (!OpIs16Bit)
12163 return true;
12164
12165 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12166 isExtendedFrom16Bits(TempOtherOp);
12167 if (!OtherOpIs16Bit)
12168 return true;
12169
12170 // Do we cleanly address both
12171 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
12172}
12173}
12174 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
12175 unsigned DWordOffset) {
12176 SDValue Ret;
12177
12178 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12179 // ByteProvider must be at least 8 bits
12180 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12181
12182 if (TypeSize <= 32)
12183 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
12184
12185 if (Src.getValueType().isVector()) {
12186 auto ScalarTySize = Src.getScalarValueSizeInBits();
12187 auto ScalarTy = Src.getValueType().getScalarType();
12188 if (ScalarTySize == 32) {
12189 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
12190 DAG.getConstant(DWordOffset, SL, MVT::i32));
12191 }
12192 if (ScalarTySize > 32) {
12193 Ret = DAG.getNode(
12194 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
12195 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12196 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12197 if (ShiftVal)
12198 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
12199 DAG.getConstant(ShiftVal, SL, MVT::i32));
12200 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12201 }
12202
12203 assert(ScalarTySize < 32);
12204 auto NumElements = TypeSize / ScalarTySize;
12205 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12206 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12207 auto NumElementsIn32 = 32 / ScalarTySize;
12208 auto NumAvailElements = DWordOffset < Trunc32Elements
12209 ? NumElementsIn32
12210 : NumElements - NormalizedTrunc;
12211 
12212 SmallVector<SDValue, 4> VecSrcs;
12213 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
12214 NumAvailElements);
12215
12216 Ret = DAG.getBuildVector(
12217 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
12218 VecSrcs);
12219 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12220 }
12221
12222 /// Scalar Type
12223 auto ShiftVal = 32 * DWordOffset;
12224 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
12225 DAG.getConstant(ShiftVal, SL, MVT::i32));
12226 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12227}
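// For instance (illustrative types): with a v4i16 source and DWordOffset = 1,
// the sub-dword path above extracts elements 2 and 3 (NumElementsIn32 = 2),
// rebuilds them as a v2i16, and any-extends the bitcast to i32.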
12228 
12229 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
12230 SelectionDAG &DAG = DCI.DAG;
12231 [[maybe_unused]] EVT VT = N->getValueType(0);
12232 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
12233 
12234 // VT is known to be MVT::i32, so we need to provide 4 bytes.
12235 assert(VT == MVT::i32);
12236 for (int i = 0; i < 4; i++) {
12237 // Find the ByteProvider that provides the ith byte of the result of OR
12238 std::optional<ByteProvider<SDValue>> P =
12239 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
12240 // TODO support constantZero
12241 if (!P || P->isConstantZero())
12242 return SDValue();
12243
12244 PermNodes.push_back(*P);
12245 }
12246 if (PermNodes.size() != 4)
12247 return SDValue();
12248
12249 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12250 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12251 uint64_t PermMask = 0x00000000;
12252 for (size_t i = 0; i < PermNodes.size(); i++) {
12253 auto PermOp = PermNodes[i];
12254 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
12255 // by sizeof(Src2) = 4
12256 int SrcByteAdjust = 4;
12257
12258 // If the Src uses a byte from a different DWORD, then it corresponds
12259 // with a different source.
12260 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
12261 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12262 if (SecondSrc)
12263 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12264 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12265 return SDValue();
12266
12267 // Set the index of the second distinct Src node
12268 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12269 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12270 SrcByteAdjust = 0;
12271 }
12272 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12274 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12275 }
12276 SDLoc DL(N);
12277 SDValue Op = *PermNodes[FirstSrc.first].Src;
12278 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
12279 assert(Op.getValueSizeInBits() == 32);
12280
12281 // Check that we are not just extracting the bytes in order from an op
12282 if (!SecondSrc) {
12283 int Low16 = PermMask & 0xffff;
12284 int Hi16 = (PermMask & 0xffff0000) >> 16;
12285
12286 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
12287 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
12288
12289 // The perm op would really just produce Op. So combine into Op
12290 if (WellFormedLow && WellFormedHi)
12291 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
12292 }
12293
12294 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
12295
12296 if (SecondSrc) {
12297 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
12298 assert(OtherOp.getValueSizeInBits() == 32);
12299 }
12300
12301 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
12302
12303 assert(Op.getValueType().isByteSized() &&
12304 OtherOp.getValueType().isByteSized());
12305
12306 // If the ultimate src is less than 32 bits, then we will only be
12307 // using bytes 0 .. Op.getValueSizeInBytes() - 1 in the or.
12308 // calculateByteProvider would not have returned Op as a source if we
12309 // used a byte that is outside its ValueType. Thus, we are free to
12310 // ANY_EXTEND as the extended bits are don't-cares.
12311 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
12312 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
12313
12314 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
12315 DAG.getConstant(PermMask, DL, MVT::i32));
12316 }
12317 return SDValue();
12318}
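// As a sanity check of the mask construction above (illustrative case): when
// all four result bytes come, in order, from a single 32-bit source, every
// byte gets SrcByteAdjust = 4 and the loop builds PermMask = 0x07060504.
// That mask passes the WellFormedLow/WellFormedHi test, so no v_perm is
// emitted and the source is simply bitcast to i32.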
12319
12320SDValue SITargetLowering::performOrCombine(SDNode *N,
12321 DAGCombinerInfo &DCI) const {
12322 SelectionDAG &DAG = DCI.DAG;
12323 SDValue LHS = N->getOperand(0);
12324 SDValue RHS = N->getOperand(1);
12325
12326 EVT VT = N->getValueType(0);
12327 if (VT == MVT::i1) {
12328 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
12329 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12330 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
12331 SDValue Src = LHS.getOperand(0);
12332 if (Src != RHS.getOperand(0))
12333 return SDValue();
12334
12335 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
12336 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
12337 if (!CLHS || !CRHS)
12338 return SDValue();
12339
12340 // Only 10 bits are used.
12341 static const uint32_t MaxMask = 0x3ff;
12342
12343 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
12344 SDLoc DL(N);
12345 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
12346 Src, DAG.getConstant(NewMask, DL, MVT::i32));
12347 }
12348
12349 return SDValue();
12350 }
12351
12352 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12353 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
12354 LHS.getOpcode() == AMDGPUISD::PERM &&
12355 isa<ConstantSDNode>(LHS.getOperand(2))) {
12356 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
12357 if (!Sel)
12358 return SDValue();
12359
12360 Sel |= LHS.getConstantOperandVal(2);
12361 SDLoc DL(N);
12362 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12363 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
12364 }
12365
12366 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
12367 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12368 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
12369 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12370
12371 // If all the uses of an or need to extract the individual elements, do not
12372 // attempt to lower into v_perm
12373 auto usesCombinedOperand = [](SDNode *OrUse) {
12374 // If we have any non-vectorized use, then it is a candidate for v_perm
12375 if (OrUse->getOpcode() != ISD::BITCAST ||
12376 !OrUse->getValueType(0).isVector())
12377 return true;
12378
12379 // If we have any non-vectorized use, then it is a candidate for v_perm
12380 for (auto VUse : OrUse->uses()) {
12381 if (!VUse->getValueType(0).isVector())
12382 return true;
12383
12384 // If the use of a vector is a store, then combining via a v_perm
12385 // is beneficial.
12386 // TODO -- whitelist more uses
12387 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
12388 if (VUse->getOpcode() == VectorwiseOp)
12389 return true;
12390 }
12391 return false;
12392 };
12393
12394 if (!any_of(N->uses(), usesCombinedOperand))
12395 return SDValue();
12396
12397 uint32_t LHSMask = getPermuteMask(LHS);
12398 uint32_t RHSMask = getPermuteMask(RHS);
12399
12400 if (LHSMask != ~0u && RHSMask != ~0u) {
12401 // Canonicalize the expression in an attempt to have fewer unique masks
12402 // and therefore fewer registers used to hold the masks.
12403 if (LHSMask > RHSMask) {
12404 std::swap(LHSMask, RHSMask);
12405 std::swap(LHS, RHS);
12406 }
12407
12408 // Select 0xc for each lane used from the source operand. Zero has 0xc in
12409 // the mask, 0xff has 0xff in the mask, and actual lanes are in the 0-3 range.
12410 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12411 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12412
12413 // Check if we need to combine values from two sources within a byte.
12414 if (!(LHSUsedLanes & RHSUsedLanes) &&
12415 // If we select high and lower word keep it for SDWA.
12416 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
12417 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12418 // Kill zero bytes selected by other mask. Zero value is 0xc.
12419 LHSMask &= ~RHSUsedLanes;
12420 RHSMask &= ~LHSUsedLanes;
12421 // Add 4 to each active LHS lane
12422 LHSMask |= LHSUsedLanes & 0x04040404;
12423 // Combine masks
12424 uint32_t Sel = LHSMask | RHSMask;
12425 SDLoc DL(N);
12426
12427 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
12428 LHS.getOperand(0), RHS.getOperand(0),
12429 DAG.getConstant(Sel, DL, MVT::i32));
12430 }
12431 }
12432 if (LHSMask == ~0u || RHSMask == ~0u) {
12433 if (SDValue Perm = matchPERM(N, DCI))
12434 return Perm;
12435 }
12436 }
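// Worked example of the mask merge above (illustrative masks, assuming the
// usual getPermuteMask encoding where a byte value k < 4 selects source byte k
// and 0x0c marks a zeroed byte): with
//   LHSMask = 0x030c010c   (LHS supplies dest bytes 1 and 3)
//   RHSMask = 0x0c020c00   (RHS supplies dest bytes 0 and 2)
// we get LHSUsedLanes = 0x0c000c00 and RHSUsedLanes = 0x000c000c, which do not
// overlap. Killing the zero bytes gives LHSMask = 0x03000100 and
// RHSMask = 0x00020000; adding 4 to the active LHS lanes gives 0x07000500,
// and the final selector is Sel = 0x07020500.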
12437
12438 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12439 return SDValue();
12440
12441 // TODO: This could be a generic combine with a predicate for extracting the
12442 // high half of an integer being free.
12443
12444 // (or i64:x, (zero_extend i32:y)) ->
12445 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
12446 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
12447 RHS.getOpcode() != ISD::ZERO_EXTEND)
12448 std::swap(LHS, RHS);
12449
12450 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
12451 SDValue ExtSrc = RHS.getOperand(0);
12452 EVT SrcVT = ExtSrc.getValueType();
12453 if (SrcVT == MVT::i32) {
12454 SDLoc SL(N);
12455 SDValue LowLHS, HiBits;
12456 std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
12457 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
12458
12459 DCI.AddToWorklist(LowOr.getNode());
12460 DCI.AddToWorklist(HiBits.getNode());
12461
12462 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
12463 LowOr, HiBits);
12464 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
12465 }
12466 }
12467
12468 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
12469 if (CRHS) {
12470 if (SDValue Split
12471 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
12472 N->getOperand(0), CRHS))
12473 return Split;
12474 }
12475
12476 return SDValue();
12477}
12478
12479SDValue SITargetLowering::performXorCombine(SDNode *N,
12480 DAGCombinerInfo &DCI) const {
12481 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
12482 return RV;
12483
12484 SDValue LHS = N->getOperand(0);
12485 SDValue RHS = N->getOperand(1);
12486
12487 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
12488 SelectionDAG &DAG = DCI.DAG;
12489
12490 EVT VT = N->getValueType(0);
12491 if (CRHS && VT == MVT::i64) {
12492 if (SDValue Split
12493 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
12494 return Split;
12495 }
12496
12497 // Make sure to apply the 64-bit constant splitting fold before trying to fold
12498 // fneg-like xors into 64-bit select.
12499 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
12500 // This looks like an fneg, try to fold as a source modifier.
12501 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
12502 shouldFoldFNegIntoSrc(N, LHS)) {
12503 // xor (select c, a, b), 0x80000000 ->
12504 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
12505 SDLoc DL(N);
12506 SDValue CastLHS =
12507 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
12508 SDValue CastRHS =
12509 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
12510 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
12511 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
12512 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
12513 LHS->getOperand(0), FNegLHS, FNegRHS);
12514 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
12515 }
12516 }
12517
12518 return SDValue();
12519}
12520
12521SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
12522 DAGCombinerInfo &DCI) const {
12523 if (!Subtarget->has16BitInsts() ||
12524 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
12525 return SDValue();
12526
12527 EVT VT = N->getValueType(0);
12528 if (VT != MVT::i32)
12529 return SDValue();
12530
12531 SDValue Src = N->getOperand(0);
12532 if (Src.getValueType() != MVT::i16)
12533 return SDValue();
12534
12535 return SDValue();
12536}
12537
12538SDValue
12539SITargetLowering::performSignExtendInRegCombine(SDNode *N,
12540 DAGCombinerInfo &DCI) const {
12541 SDValue Src = N->getOperand(0);
12542 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
12543
12544 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
12545 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
12546 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
12547 VTSign->getVT() == MVT::i8) ||
12548 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
12549 VTSign->getVT() == MVT::i16))) {
12550 assert(Subtarget->hasScalarSubwordLoads() &&
12551 "s_buffer_load_{u8, i8} are supported "
12552 "in GFX12 (or newer) architectures.");
12553 EVT VT = Src.getValueType();
12554 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
12555 ? AMDGPUISD::SBUFFER_LOAD_BYTE
12556 : AMDGPUISD::SBUFFER_LOAD_SHORT;
12557 SDLoc DL(N);
12558 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12559 SDValue Ops[] = {
12560 Src.getOperand(0), // source register
12561 Src.getOperand(1), // offset
12562 Src.getOperand(2) // cachePolicy
12563 };
12564 auto *M = cast<MemSDNode>(Src);
12565 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12566 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12567 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
12568 return LoadVal;
12569 } else if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
12570 VTSign->getVT() == MVT::i8) ||
12571 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
12572 VTSign->getVT() == MVT::i16)) &&
12573 Src.hasOneUse()) {
12574 auto *M = cast<MemSDNode>(Src);
12575 SDValue Ops[] = {
12576 Src.getOperand(0), // Chain
12577 Src.getOperand(1), // rsrc
12578 Src.getOperand(2), // vindex
12579 Src.getOperand(3), // voffset
12580 Src.getOperand(4), // soffset
12581 Src.getOperand(5), // offset
12582 Src.getOperand(6),
12583 Src.getOperand(7)
12584 };
12585 // replace with BUFFER_LOAD_BYTE/SHORT
12586 SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
12587 Src.getOperand(0).getValueType());
12588 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ?
12589 AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT;
12590 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N),
12591 ResList,
12592 Ops, M->getMemoryVT(),
12593 M->getMemOperand());
12594 return DCI.DAG.getMergeValues({BufferLoadSignExt,
12595 BufferLoadSignExt.getValue(1)}, SDLoc(N));
12596 }
12597 return SDValue();
12598}
12599
12600SDValue SITargetLowering::performClassCombine(SDNode *N,
12601 DAGCombinerInfo &DCI) const {
12602 SelectionDAG &DAG = DCI.DAG;
12603 SDValue Mask = N->getOperand(1);
12604
12605 // fp_class x, 0 -> false
12606 if (isNullConstant(Mask))
12607 return DAG.getConstant(0, SDLoc(N), MVT::i1);
12608
12609 if (N->getOperand(0).isUndef())
12610 return DAG.getUNDEF(MVT::i1);
12611
12612 return SDValue();
12613}
12614
12615SDValue SITargetLowering::performRcpCombine(SDNode *N,
12616 DAGCombinerInfo &DCI) const {
12617 EVT VT = N->getValueType(0);
12618 SDValue N0 = N->getOperand(0);
12619
12620 if (N0.isUndef()) {
12621 return DCI.DAG.getConstantFP(
12622 APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT)), SDLoc(N),
12623 VT);
12624 }
12625
12626 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
12627 N0.getOpcode() == ISD::SINT_TO_FP)) {
12628 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
12629 N->getFlags());
12630 }
12631
12632 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
12633 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
12634 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
12635 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT,
12636 N0.getOperand(0), N->getFlags());
12637 }
12638 }
12639 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
12640}
12641 
12642 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
12643 unsigned MaxDepth) const {
12644 unsigned Opcode = Op.getOpcode();
12645 if (Opcode == ISD::FCANONICALIZE)
12646 return true;
12647
12648 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
12649 const auto &F = CFP->getValueAPF();
12650 if (F.isNaN() && F.isSignaling())
12651 return false;
12652 if (!F.isDenormal())
12653 return true;
12654
12655 DenormalMode Mode =
12656 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
12657 return Mode == DenormalMode::getIEEE();
12658 }
12659
12660 // If source is a result of another standard FP operation it is already in
12661 // canonical form.
12662 if (MaxDepth == 0)
12663 return false;
12664
12665 switch (Opcode) {
12666 // These will flush denorms if required.
12667 case ISD::FADD:
12668 case ISD::FSUB:
12669 case ISD::FMUL:
12670 case ISD::FCEIL:
12671 case ISD::FFLOOR:
12672 case ISD::FMA:
12673 case ISD::FMAD:
12674 case ISD::FSQRT:
12675 case ISD::FDIV:
12676 case ISD::FREM:
12677 case ISD::FP_ROUND:
12678 case ISD::FP_EXTEND:
12679 case ISD::FP16_TO_FP:
12680 case ISD::FP_TO_FP16:
12681 case ISD::BF16_TO_FP:
12682 case ISD::FP_TO_BF16:
12683 case ISD::FLDEXP:
12684 case AMDGPUISD::FMUL_LEGACY:
12685 case AMDGPUISD::FMAD_FTZ:
12686 case AMDGPUISD::RCP:
12687 case AMDGPUISD::RSQ:
12688 case AMDGPUISD::RSQ_CLAMP:
12689 case AMDGPUISD::RCP_LEGACY:
12690 case AMDGPUISD::RCP_IFLAG:
12691 case AMDGPUISD::LOG:
12692 case AMDGPUISD::EXP:
12693 case AMDGPUISD::DIV_SCALE:
12694 case AMDGPUISD::DIV_FMAS:
12695 case AMDGPUISD::DIV_FIXUP:
12696 case AMDGPUISD::FRACT:
12697 case AMDGPUISD::CVT_PKRTZ_F16_F32:
12698 case AMDGPUISD::CVT_F32_UBYTE0:
12699 case AMDGPUISD::CVT_F32_UBYTE1:
12700 case AMDGPUISD::CVT_F32_UBYTE2:
12701 case AMDGPUISD::CVT_F32_UBYTE3:
12702 case AMDGPUISD::FP_TO_FP16:
12703 case AMDGPUISD::SIN_HW:
12704 case AMDGPUISD::COS_HW:
12705 return true;
12706
12707 // It can/will be lowered or combined as a bit operation.
12708 // Need to check their input recursively to handle.
12709 case ISD::FNEG:
12710 case ISD::FABS:
12711 case ISD::FCOPYSIGN:
12712 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12713
12714 case ISD::AND:
12715 if (Op.getValueType() == MVT::i32) {
12716 // Be careful as we only know it is a bitcast floating point type. It
12717 // could be f32, v2f16, we have no way of knowing. Luckily the constant
12718 // value that we optimize for, which comes up in fp32 to bf16 conversions,
12719 // is valid to optimize for all types.
12720 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
12721 if (RHS->getZExtValue() == 0xffff0000) {
12722 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12723 }
12724 }
12725 }
12726 break;
12727
12728 case ISD::FSIN:
12729 case ISD::FCOS:
12730 case ISD::FSINCOS:
12731 return Op.getValueType().getScalarType() != MVT::f16;
12732
12733 case ISD::FMINNUM:
12734 case ISD::FMAXNUM:
12735 case ISD::FMINNUM_IEEE:
12736 case ISD::FMAXNUM_IEEE:
12737 case ISD::FMINIMUM:
12738 case ISD::FMAXIMUM:
12739 case AMDGPUISD::CLAMP:
12740 case AMDGPUISD::FMED3:
12741 case AMDGPUISD::FMAX3:
12742 case AMDGPUISD::FMIN3:
12743 case AMDGPUISD::FMAXIMUM3:
12744 case AMDGPUISD::FMINIMUM3: {
12745 // FIXME: Shouldn't treat the generic operations differently based on these.
12746 // However, we aren't really required to flush the result from
12747 // minnum/maxnum.
12748
12749 // snans will be quieted, so we only need to worry about denormals.
12750 if (Subtarget->supportsMinMaxDenormModes() ||
12751 // FIXME: denormalsEnabledForType is broken for dynamic
12752 denormalsEnabledForType(DAG, Op.getValueType()))
12753 return true;
12754
12755 // Flushing may be required.
12756 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
12757 // targets need to check their input recursively.
12758
12759 // FIXME: Does this apply with clamp? It's implemented with max.
12760 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
12761 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
12762 return false;
12763 }
12764
12765 return true;
12766 }
12767 case ISD::SELECT: {
12768 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
12769 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
12770 }
12771 case ISD::BUILD_VECTOR: {
12772 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
12773 SDValue SrcOp = Op.getOperand(i);
12774 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
12775 return false;
12776 }
12777
12778 return true;
12779 }
12780 case ISD::EXTRACT_VECTOR_ELT:
12781 case ISD::EXTRACT_SUBVECTOR: {
12782 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12783 }
12784 case ISD::INSERT_VECTOR_ELT: {
12785 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
12786 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
12787 }
12788 case ISD::UNDEF:
12789 // Could be anything.
12790 return false;
12791
12792 case ISD::BITCAST:
12793 // TODO: This is incorrect as it loses track of the operand's type. We may
12794 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
12795 // same bits that are canonicalized in one type need not be in the other.
12796 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12797 case ISD::TRUNCATE: {
12798 // Hack around the mess we make when legalizing extract_vector_elt.
12799 if (Op.getValueType() == MVT::i16) {
12800 SDValue TruncSrc = Op.getOperand(0);
12801 if (TruncSrc.getValueType() == MVT::i32 &&
12802 TruncSrc.getOpcode() == ISD::BITCAST &&
12803 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
12804 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
12805 }
12806 }
12807 return false;
12808 }
12809 case ISD::INTRINSIC_WO_CHAIN: {
12810 unsigned IntrinsicID = Op.getConstantOperandVal(0);
12811 // TODO: Handle more intrinsics
12812 switch (IntrinsicID) {
12813 case Intrinsic::amdgcn_cvt_pkrtz:
12814 case Intrinsic::amdgcn_cubeid:
12815 case Intrinsic::amdgcn_frexp_mant:
12816 case Intrinsic::amdgcn_fdot2:
12817 case Intrinsic::amdgcn_rcp:
12818 case Intrinsic::amdgcn_rsq:
12819 case Intrinsic::amdgcn_rsq_clamp:
12820 case Intrinsic::amdgcn_rcp_legacy:
12821 case Intrinsic::amdgcn_rsq_legacy:
12822 case Intrinsic::amdgcn_trig_preop:
12823 case Intrinsic::amdgcn_log:
12824 case Intrinsic::amdgcn_exp2:
12825 case Intrinsic::amdgcn_sqrt:
12826 return true;
12827 default:
12828 break;
12829 }
12830
12831 break;
12832 }
12833 default:
12834 break;
12835 }
12836
12837 // FIXME: denormalsEnabledForType is broken for dynamic
12838 return denormalsEnabledForType(DAG, Op.getValueType()) &&
12839 DAG.isKnownNeverSNaN(Op);
12840}
12841 
12842 bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
12843 unsigned MaxDepth) const {
12844 const MachineRegisterInfo &MRI = MF.getRegInfo();
12845 MachineInstr *MI = MRI.getVRegDef(Reg);
12846 unsigned Opcode = MI->getOpcode();
12847
12848 if (Opcode == AMDGPU::G_FCANONICALIZE)
12849 return true;
12850
12851 std::optional<FPValueAndVReg> FCR;
12852 // Constant splat (can be padded with undef) or scalar constant.
12853 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
12854 if (FCR->Value.isSignaling())
12855 return false;
12856 if (!FCR->Value.isDenormal())
12857 return true;
12858
12859 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
12860 return Mode == DenormalMode::getIEEE();
12861 }
12862
12863 if (MaxDepth == 0)
12864 return false;
12865
12866 switch (Opcode) {
12867 case AMDGPU::G_FADD:
12868 case AMDGPU::G_FSUB:
12869 case AMDGPU::G_FMUL:
12870 case AMDGPU::G_FCEIL:
12871 case AMDGPU::G_FFLOOR:
12872 case AMDGPU::G_FRINT:
12873 case AMDGPU::G_FNEARBYINT:
12874 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
12875 case AMDGPU::G_INTRINSIC_TRUNC:
12876 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
12877 case AMDGPU::G_FMA:
12878 case AMDGPU::G_FMAD:
12879 case AMDGPU::G_FSQRT:
12880 case AMDGPU::G_FDIV:
12881 case AMDGPU::G_FREM:
12882 case AMDGPU::G_FPOW:
12883 case AMDGPU::G_FPEXT:
12884 case AMDGPU::G_FLOG:
12885 case AMDGPU::G_FLOG2:
12886 case AMDGPU::G_FLOG10:
12887 case AMDGPU::G_FPTRUNC:
12888 case AMDGPU::G_AMDGPU_RCP_IFLAG:
12889 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
12890 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
12891 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
12892 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
12893 return true;
12894 case AMDGPU::G_FNEG:
12895 case AMDGPU::G_FABS:
12896 case AMDGPU::G_FCOPYSIGN:
12897 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
12898 case AMDGPU::G_FMINNUM:
12899 case AMDGPU::G_FMAXNUM:
12900 case AMDGPU::G_FMINNUM_IEEE:
12901 case AMDGPU::G_FMAXNUM_IEEE:
12902 case AMDGPU::G_FMINIMUM:
12903 case AMDGPU::G_FMAXIMUM: {
12904 if (Subtarget->supportsMinMaxDenormModes() ||
12905 // FIXME: denormalsEnabledForType is broken for dynamic
12906 denormalsEnabledForType(MRI.getType(Reg), MF))
12907 return true;
12908
12909 [[fallthrough]];
12910 }
12911 case AMDGPU::G_BUILD_VECTOR:
12912 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
12913 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
12914 return false;
12915 return true;
12916 case AMDGPU::G_INTRINSIC:
12917 case AMDGPU::G_INTRINSIC_CONVERGENT:
12918 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
12919 case Intrinsic::amdgcn_fmul_legacy:
12920 case Intrinsic::amdgcn_fmad_ftz:
12921 case Intrinsic::amdgcn_sqrt:
12922 case Intrinsic::amdgcn_fmed3:
12923 case Intrinsic::amdgcn_sin:
12924 case Intrinsic::amdgcn_cos:
12925 case Intrinsic::amdgcn_log:
12926 case Intrinsic::amdgcn_exp2:
12927 case Intrinsic::amdgcn_log_clamp:
12928 case Intrinsic::amdgcn_rcp:
12929 case Intrinsic::amdgcn_rcp_legacy:
12930 case Intrinsic::amdgcn_rsq:
12931 case Intrinsic::amdgcn_rsq_clamp:
12932 case Intrinsic::amdgcn_rsq_legacy:
12933 case Intrinsic::amdgcn_div_scale:
12934 case Intrinsic::amdgcn_div_fmas:
12935 case Intrinsic::amdgcn_div_fixup:
12936 case Intrinsic::amdgcn_fract:
12937 case Intrinsic::amdgcn_cvt_pkrtz:
12938 case Intrinsic::amdgcn_cubeid:
12939 case Intrinsic::amdgcn_cubema:
12940 case Intrinsic::amdgcn_cubesc:
12941 case Intrinsic::amdgcn_cubetc:
12942 case Intrinsic::amdgcn_frexp_mant:
12943 case Intrinsic::amdgcn_fdot2:
12944 case Intrinsic::amdgcn_trig_preop:
12945 return true;
12946 default:
12947 break;
12948 }
12949
12950 [[fallthrough]];
12951 default:
12952 return false;
12953 }
12954
12955 llvm_unreachable("invalid operation");
12956}
12957
12958// Constant fold canonicalize.
12959SDValue SITargetLowering::getCanonicalConstantFP(
12960 SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
12961 // Flush denormals to 0 if not enabled.
12962 if (C.isDenormal()) {
12963 DenormalMode Mode =
12964 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
12965 if (Mode == DenormalMode::getPreserveSign()) {
12966 return DAG.getConstantFP(
12967 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
12968 }
12969
12970 if (Mode != DenormalMode::getIEEE())
12971 return SDValue();
12972 }
12973
12974 if (C.isNaN()) {
12975 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
12976 if (C.isSignaling()) {
12977 // Quiet a signaling NaN.
12978 // FIXME: Is this supposed to preserve payload bits?
12979 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
12980 }
12981
12982 // Make sure it is the canonical NaN bitpattern.
12983 //
12984 // TODO: Can we use -1 as the canonical NaN value since it's an inline
12985 // immediate?
12986 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
12987 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
12988 }
12989
12990 // Already canonical.
12991 return DAG.getConstantFP(C, SL, VT);
12992}
12993 
12994 static bool vectorEltWillFoldAway(SDValue Op) {
12995 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
12996}
12997
12998SDValue SITargetLowering::performFCanonicalizeCombine(
12999 SDNode *N,
13000 DAGCombinerInfo &DCI) const {
13001 SelectionDAG &DAG = DCI.DAG;
13002 SDValue N0 = N->getOperand(0);
13003 EVT VT = N->getValueType(0);
13004
13005 // fcanonicalize undef -> qnan
13006 if (N0.isUndef()) {
13007 APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
13008 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
13009 }
13010
13011 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
13012 EVT VT = N->getValueType(0);
13013 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
13014 }
13015
13016 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
13017 // (fcanonicalize k)
13018 //
13019 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
13020
13021 // TODO: This could be better with wider vectors that will be split to v2f16,
13022 // and to consider uses since there aren't that many packed operations.
13023 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
13024 isTypeLegal(MVT::v2f16)) {
13025 SDLoc SL(N);
13026 SDValue NewElts[2];
13027 SDValue Lo = N0.getOperand(0);
13028 SDValue Hi = N0.getOperand(1);
13029 EVT EltVT = Lo.getValueType();
13030
13032 for (unsigned I = 0; I != 2; ++I) {
13033 SDValue Op = N0.getOperand(I);
13034 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
13035 NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
13036 CFP->getValueAPF());
13037 } else if (Op.isUndef()) {
13038 // Handled below based on what the other operand is.
13039 NewElts[I] = Op;
13040 } else {
13041 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
13042 }
13043 }
13044
13045 // If one half is undef, and one is constant, prefer a splat vector rather
13046 // than the normal qNaN. If it's a register, prefer 0.0 since that's
13047 // cheaper to use and may be free with a packed operation.
13048 if (NewElts[0].isUndef()) {
13049 if (isa<ConstantFPSDNode>(NewElts[1]))
13050 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
13051 NewElts[1]: DAG.getConstantFP(0.0f, SL, EltVT);
13052 }
13053
13054 if (NewElts[1].isUndef()) {
13055 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
13056 NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
13057 }
13058
13059 return DAG.getBuildVector(VT, SL, NewElts);
13060 }
13061 }
13062
13063 return SDValue();
13064}
13065
13066static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
13067 switch (Opc) {
13068 case ISD::FMAXNUM:
13069 case ISD::FMAXNUM_IEEE:
13070 return AMDGPUISD::FMAX3;
13071 case ISD::FMAXIMUM:
13072 return AMDGPUISD::FMAXIMUM3;
13073 case ISD::SMAX:
13074 return AMDGPUISD::SMAX3;
13075 case ISD::UMAX:
13076 return AMDGPUISD::UMAX3;
13077 case ISD::FMINNUM:
13078 case ISD::FMINNUM_IEEE:
13079 return AMDGPUISD::FMIN3;
13080 case ISD::FMINIMUM:
13081 return AMDGPUISD::FMINIMUM3;
13082 case ISD::SMIN:
13083 return AMDGPUISD::SMIN3;
13084 case ISD::UMIN:
13085 return AMDGPUISD::UMIN3;
13086 default:
13087 llvm_unreachable("Not a min/max opcode");
13088 }
13089}
13090
13091SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
13092 const SDLoc &SL, SDValue Src,
13093 SDValue MinVal,
13094 SDValue MaxVal,
13095 bool Signed) const {
13096
13097 // med3 comes from
13098 // min(max(x, K0), K1), K0 < K1
13099 // max(min(x, K0), K1), K1 < K0
13100 //
13101 // "MinVal" and "MaxVal" respectively refer to the rhs of the
13102 // min/max op.
13103 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
13104 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
13105
13106 if (!MinK || !MaxK)
13107 return SDValue();
13108
13109 if (Signed) {
13110 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
13111 return SDValue();
13112 } else {
13113 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
13114 return SDValue();
13115 }
13116
13117 EVT VT = MinK->getValueType(0);
13118 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
13119 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
13120 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
13121
13122 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
13123 // not available, but this is unlikely to be profitable as constants
13124 // will often need to be materialized & extended, especially on
13125 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
13126 return SDValue();
13127}
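// For example (illustrative constants): smin(smax(x, 2), 7) has K0 = 2 < K1 = 7,
// so on an i32 value it becomes smed3(x, 2, 7); with the constants reversed the
// pattern is rejected and no med3 is formed.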
13128 
13129 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
13130 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
13131 return C;
13132
13133 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
13134 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
13135 return C;
13136 }
13137
13138 return nullptr;
13139}
13140
13141SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
13142 const SDLoc &SL,
13143 SDValue Op0,
13144 SDValue Op1) const {
13145 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
13146 if (!K1)
13147 return SDValue();
13148 
13149 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
13150 if (!K0)
13151 return SDValue();
13152
13153 // Ordered >= (although NaN inputs should have folded away by now).
13154 if (K0->getValueAPF() > K1->getValueAPF())
13155 return SDValue();
13156
13157 const MachineFunction &MF = DAG.getMachineFunction();
13158 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13159 
13160 // TODO: Check IEEE bit enabled?
13161 EVT VT = Op0.getValueType();
13162 if (Info->getMode().DX10Clamp) {
13163 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
13164 // hardware fmed3 behavior converting to a min.
13165 // FIXME: Should this be allowing -0.0?
13166 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
13167 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
13168 }
13169
13170 // med3 for f16 is only available on gfx9+, and not available for v2f16.
13171 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
13172 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
13173 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
13174 // then give the other result, which is different from med3 with a NaN
13175 // input.
13176 SDValue Var = Op0.getOperand(0);
13177 if (!DAG.isKnownNeverSNaN(Var))
13178 return SDValue();
13179 
13180 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13181 
13182 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
13183 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
13184 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
13185 Var, SDValue(K0, 0), SDValue(K1, 0));
13186 }
13187 }
13188
13189 return SDValue();
13190}
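// For example (illustrative f32 constants): fminnum(fmaxnum(x, -2.0), 4.0)
// with x known not to be a signaling NaN becomes fmed3(x, -2.0, 4.0), while
// fminnum(fmaxnum(x, 0.0), 1.0) is turned into the cheaper clamp when
// dx10_clamp is enabled.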
13191
13192SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
13193 DAGCombinerInfo &DCI) const {
13194 SelectionDAG &DAG = DCI.DAG;
13195
13196 EVT VT = N->getValueType(0);
13197 unsigned Opc = N->getOpcode();
13198 SDValue Op0 = N->getOperand(0);
13199 SDValue Op1 = N->getOperand(1);
13200
13201 // Only do this if the inner op has one use since this will just increase
13202 // register pressure for no benefit.
13203
13204 if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
13205 !VT.isVector() &&
13206 (VT == MVT::i32 || VT == MVT::f32 ||
13207 ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) {
13208 // max(max(a, b), c) -> max3(a, b, c)
13209 // min(min(a, b), c) -> min3(a, b, c)
13210 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
13211 SDLoc DL(N);
13212 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
13213 DL,
13214 N->getValueType(0),
13215 Op0.getOperand(0),
13216 Op0.getOperand(1),
13217 Op1);
13218 }
13219
13220 // Try commuted.
13221 // max(a, max(b, c)) -> max3(a, b, c)
13222 // min(a, min(b, c)) -> min3(a, b, c)
13223 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
13224 SDLoc DL(N);
13225 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
13226 DL,
13227 N->getValueType(0),
13228 Op0,
13229 Op1.getOperand(0),
13230 Op1.getOperand(1));
13231 }
13232 }
13233
13234 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
13235 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
13236 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
13237 if (SDValue Med3 = performIntMed3ImmCombine(
13238 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
13239 return Med3;
13240 }
13241 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
13242 if (SDValue Med3 = performIntMed3ImmCombine(
13243 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
13244 return Med3;
13245 }
13246
13247 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
13248 if (SDValue Med3 = performIntMed3ImmCombine(
13249 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
13250 return Med3;
13251 }
13252 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
13253 if (SDValue Med3 = performIntMed3ImmCombine(
13254 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
13255 return Med3;
13256 }
13257
13258 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
13259 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
13260 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
13261 (Opc == AMDGPUISD::FMIN_LEGACY &&
13262 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
13263 (VT == MVT::f32 || VT == MVT::f64 ||
13264 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
13265 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
13266 Op0.hasOneUse()) {
13267 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
13268 return Res;
13269 }
13270
13271 return SDValue();
13272}
13273
13275 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
13276 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
13277 // FIXME: Should this be allowing -0.0?
13278 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13279 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
13280 }
13281 }
13282
13283 return false;
13284}
13285
13286// FIXME: Should only worry about snans for version with chain.
13287SDValue SITargetLowering::performFMed3Combine(SDNode *N,
13288 DAGCombinerInfo &DCI) const {
13289 EVT VT = N->getValueType(0);
13290 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
13291 // NaNs. With a NaN input, the order of the operands may change the result.
13292
13293 SelectionDAG &DAG = DCI.DAG;
13294 SDLoc SL(N);
13295
13296 SDValue Src0 = N->getOperand(0);
13297 SDValue Src1 = N->getOperand(1);
13298 SDValue Src2 = N->getOperand(2);
13299
13300 if (isClampZeroToOne(Src0, Src1)) {
13301 // const_a, const_b, x -> clamp is safe in all cases including signaling
13302 // nans.
13303 // FIXME: Should this be allowing -0.0?
13304 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
13305 }
13306
13307 const MachineFunction &MF = DAG.getMachineFunction();
13308 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13309 
13310 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
13311 // handling no dx10-clamp?
13312 if (Info->getMode().DX10Clamp) {
13313 // If NaNs are clamped to 0, we are free to reorder the inputs.
13314
13315 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13316 std::swap(Src0, Src1);
13317
13318 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
13319 std::swap(Src1, Src2);
13320
13321 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13322 std::swap(Src0, Src1);
13323
13324 if (isClampZeroToOne(Src1, Src2))
13325 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
13326 }
13327
13328 return SDValue();
13329}
13330
13331SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
13332 DAGCombinerInfo &DCI) const {
13333 SDValue Src0 = N->getOperand(0);
13334 SDValue Src1 = N->getOperand(1);
13335 if (Src0.isUndef() && Src1.isUndef())
13336 return DCI.DAG.getUNDEF(N->getValueType(0));
13337 return SDValue();
13338}
13339
13340// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
13341 // expanded into a set of cmp/select instructions.
13342 bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
13343 unsigned NumElem,
13344 bool IsDivergentIdx,
13345 const GCNSubtarget *Subtarget) {
13346 if (UseDivergentRegisterIndexing)
13347 return false;
13348
13349 unsigned VecSize = EltSize * NumElem;
13350
13351 // Sub-dword vectors with a total size of 2 dwords or less have a better implementation.
13352 if (VecSize <= 64 && EltSize < 32)
13353 return false;
13354
13355 // Always expand the remaining sub-dword cases, otherwise they will be
13356 // lowered via memory.
13357 if (EltSize < 32)
13358 return true;
13359
13360 // Always do this if var-idx is divergent, otherwise it will become a loop.
13361 if (IsDivergentIdx)
13362 return true;
13363
13364 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
13365 unsigned NumInsts = NumElem /* Number of compares */ +
13366 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
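  // A worked example (illustrative): for a <4 x i64> vector, EltSize = 64 and
  // NumElem = 4, so NumInsts = 4 + 2 * 4 = 12, which is then compared against
  // the thresholds below.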
13367
13368 // On some architectures (GFX9) movrel is not available and it's better
13369 // to expand.
13370 if (!Subtarget->hasMovrel())
13371 return NumInsts <= 16;
13372
13373 // If movrel is available, use it instead of expanding for vectors of 8
13374 // elements or more.
13375 return NumInsts <= 15;
13376}
13377
13378 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
13379 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
13380 if (isa<ConstantSDNode>(Idx))
13381 return false;
13382
13383 SDValue Vec = N->getOperand(0);
13384 EVT VecVT = Vec.getValueType();
13385 EVT EltVT = VecVT.getVectorElementType();
13386 unsigned EltSize = EltVT.getSizeInBits();
13387 unsigned NumElem = VecVT.getVectorNumElements();
13388
13389   return SITargetLowering::shouldExpandVectorDynExt(
13390 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
13391}
13392
13393SDValue SITargetLowering::performExtractVectorEltCombine(
13394 SDNode *N, DAGCombinerInfo &DCI) const {
13395 SDValue Vec = N->getOperand(0);
13396 SelectionDAG &DAG = DCI.DAG;
13397
13398 EVT VecVT = Vec.getValueType();
13399 EVT VecEltVT = VecVT.getVectorElementType();
13400 EVT ResVT = N->getValueType(0);
13401
13402 unsigned VecSize = VecVT.getSizeInBits();
13403 unsigned VecEltSize = VecEltVT.getSizeInBits();
13404
13405 if ((Vec.getOpcode() == ISD::FNEG ||
13406        Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
13407 SDLoc SL(N);
13408 SDValue Idx = N->getOperand(1);
13409 SDValue Elt =
13410 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
13411 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
13412 }
13413
13414 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
13415 // =>
13416 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
13417 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
13418 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
13419 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13420 SDLoc SL(N);
13421 SDValue Idx = N->getOperand(1);
13422 unsigned Opc = Vec.getOpcode();
13423
13424 switch(Opc) {
13425 default:
13426 break;
13427 // TODO: Support other binary operations.
13428 case ISD::FADD:
13429 case ISD::FSUB:
13430 case ISD::FMUL:
13431 case ISD::ADD:
13432 case ISD::UMIN:
13433 case ISD::UMAX:
13434 case ISD::SMIN:
13435 case ISD::SMAX:
13436 case ISD::FMAXNUM:
13437 case ISD::FMINNUM:
13438 case ISD::FMAXNUM_IEEE:
13439 case ISD::FMINNUM_IEEE:
13440 case ISD::FMAXIMUM:
13441 case ISD::FMINIMUM: {
13442 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13443 Vec.getOperand(0), Idx);
13444 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13445 Vec.getOperand(1), Idx);
13446
13447 DCI.AddToWorklist(Elt0.getNode());
13448 DCI.AddToWorklist(Elt1.getNode());
13449 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
13450 }
13451 }
13452 }
13453
13454 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
13455   if (shouldExpandVectorDynExt(N)) {
13456 SDLoc SL(N);
13457 SDValue Idx = N->getOperand(1);
13458 SDValue V;
13459 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13460 SDValue IC = DAG.getVectorIdxConstant(I, SL);
13461 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
13462 if (I == 0)
13463 V = Elt;
13464 else
13465 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
13466 }
13467 return V;
13468 }
13469
13470 if (!DCI.isBeforeLegalize())
13471 return SDValue();
13472
13473 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
13474 // elements. This exposes more load reduction opportunities by replacing
13475 // multiple small extract_vector_elements with a single 32-bit extract.
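  // Sketch of the rewrite (illustrative): extracting element 5 of a loaded
  // <8 x i8> becomes a bitcast to <2 x i32>, an extract of dword 1, a shift
  // right by 8, and a truncate back to i8 (bit index 40 = dword 1, bit 8).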
13476 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
13477 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
13478 VecSize > 32 && VecSize % 32 == 0 && Idx) {
13479 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
13480
13481 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
13482 unsigned EltIdx = BitIndex / 32;
13483 unsigned LeftoverBitIdx = BitIndex % 32;
13484 SDLoc SL(N);
13485
13486 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
13487 DCI.AddToWorklist(Cast.getNode());
13488
13489 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
13490 DAG.getConstant(EltIdx, SL, MVT::i32));
13491 DCI.AddToWorklist(Elt.getNode());
13492 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
13493 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
13494 DCI.AddToWorklist(Srl.getNode());
13495
13496 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
13497 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
13498 DCI.AddToWorklist(Trunc.getNode());
13499
13500 if (VecEltVT == ResVT) {
13501 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
13502 }
13503
13504 assert(ResVT.isScalarInteger());
13505 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
13506 }
13507
13508 return SDValue();
13509}
13510
13511SDValue
13512SITargetLowering::performInsertVectorEltCombine(SDNode *N,
13513 DAGCombinerInfo &DCI) const {
13514 SDValue Vec = N->getOperand(0);
13515 SDValue Idx = N->getOperand(2);
13516 EVT VecVT = Vec.getValueType();
13517 EVT EltVT = VecVT.getVectorElementType();
13518
13519 // INSERT_VECTOR_ELT (<n x e>, var-idx)
13520 // => BUILD_VECTOR n x select (e, const-idx)
13521   if (isa<ConstantSDNode>(Idx) || !shouldExpandVectorDynExt(N))
13522 return SDValue();
13523
13524 SelectionDAG &DAG = DCI.DAG;
13525 SDLoc SL(N);
13526 SDValue Ins = N->getOperand(1);
13527 EVT IdxVT = Idx.getValueType();
13528
13529   SmallVector<SDValue, 16> Ops;
13530 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13531 SDValue IC = DAG.getConstant(I, SL, IdxVT);
13532 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
13533 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
13534 Ops.push_back(V);
13535 }
13536
13537 return DAG.getBuildVector(VecVT, SL, Ops);
13538}
13539
13540/// Return the source of an fp_extend from f16 to f32, or a converted FP
13541/// constant.
13542 static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
13543 if (Src.getOpcode() == ISD::FP_EXTEND &&
13544 Src.getOperand(0).getValueType() == MVT::f16) {
13545 return Src.getOperand(0);
13546 }
13547
13548 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13549 APFloat Val = CFP->getValueAPF();
13550 bool LosesInfo = true;
13551     Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
13552 if (!LosesInfo)
13553 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
13554 }
13555
13556 return SDValue();
13557}
13558
13559SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
13560 DAGCombinerInfo &DCI) const {
13561 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
13562 "combine only useful on gfx8");
13563
13564 SDValue TruncSrc = N->getOperand(0);
13565 EVT VT = N->getValueType(0);
13566 if (VT != MVT::f16)
13567 return SDValue();
13568
13569 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
13570 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
13571 return SDValue();
13572
13573 SelectionDAG &DAG = DCI.DAG;
13574 SDLoc SL(N);
13575
13576 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
13577 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
13578 // casting back.
13579
13580 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
13581 // fmin(fmax(a, b), fmax(fmin(a, b), c))
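  // Quick check of the identity (illustrative): for a = 1.0, b = 3.0, c = 2.0
  // the expansion gives fmin(fmax(1, 3), fmax(fmin(1, 3), 2)) = fmin(3, 2) = 2,
  // the median of the three inputs, matching fmed3.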
13582 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
13583 if (!A)
13584 return SDValue();
13585
13586 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
13587 if (!B)
13588 return SDValue();
13589
13590 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
13591 if (!C)
13592 return SDValue();
13593
13594 // This changes signaling nan behavior. If an input is a signaling nan, it
13595 // would have been quieted by the fpext originally. We don't care because
13596 // these are unconstrained ops. If we needed to insert quieting canonicalizes
13597 // we would be worse off than just doing the promotion.
13598 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
13599 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
13600 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
13601 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
13602}
13603
13604unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
13605 const SDNode *N0,
13606 const SDNode *N1) const {
13607 EVT VT = N0->getValueType(0);
13608
13609 // Only do this if we are not trying to support denormals. v_mad_f32 does not
13610 // support denormals ever.
13611 if (((VT == MVT::f32 &&
13612         denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
13613 (VT == MVT::f16 && Subtarget->hasMadF16() &&
13614         denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
13615       isOperationLegal(ISD::FMAD, VT))
13616 return ISD::FMAD;
13617
13618 const TargetOptions &Options = DAG.getTarget().Options;
13619 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
13620 (N0->getFlags().hasAllowContract() &&
13621 N1->getFlags().hasAllowContract())) &&
13622       isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
13623 return ISD::FMA;
13624 }
13625
13626 return 0;
13627}
13628
13629// For a reassociatable opcode perform:
13630// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
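// For example (illustrative): add s0, (add v0, s1) can be rewritten as
// add (add s0, s1), v0, keeping the uniform add on the scalar unit so only a
// single VALU add remains.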
13631SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
13632 SelectionDAG &DAG) const {
13633 EVT VT = N->getValueType(0);
13634 if (VT != MVT::i32 && VT != MVT::i64)
13635 return SDValue();
13636
13637 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
13638 return SDValue();
13639
13640 unsigned Opc = N->getOpcode();
13641 SDValue Op0 = N->getOperand(0);
13642 SDValue Op1 = N->getOperand(1);
13643
13644 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
13645 return SDValue();
13646
13647 if (Op0->isDivergent())
13648 std::swap(Op0, Op1);
13649
13650 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
13651 return SDValue();
13652
13653 SDValue Op2 = Op1.getOperand(1);
13654 Op1 = Op1.getOperand(0);
13655 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
13656 return SDValue();
13657
13658 if (Op1->isDivergent())
13659 std::swap(Op1, Op2);
13660
13661 SDLoc SL(N);
13662 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
13663 return DAG.getNode(Opc, SL, VT, Add1, Op2);
13664}
13665
13666static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
13667 EVT VT,
13668 SDValue N0, SDValue N1, SDValue N2,
13669 bool Signed) {
13670   unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
13671 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
13672 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
13673 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
13674}
13675
13676// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
13677// multiplies, if any.
13678//
13679// Full 64-bit multiplies that feed into an addition are lowered here instead
13680// of using the generic expansion. The generic expansion ends up with
13681// a tree of ADD nodes that prevents us from using the "add" part of the
13682// MAD instruction. The expansion produced here results in a chain of ADDs
13683// instead of a tree.
13684SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
13685 DAGCombinerInfo &DCI) const {
13686 assert(N->getOpcode() == ISD::ADD);
13687
13688 SelectionDAG &DAG = DCI.DAG;
13689 EVT VT = N->getValueType(0);
13690 SDLoc SL(N);
13691 SDValue LHS = N->getOperand(0);
13692 SDValue RHS = N->getOperand(1);
13693
13694 if (VT.isVector())
13695 return SDValue();
13696
13697 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
13698 // result in scalar registers for uniform values.
13699 if (!N->isDivergent() && Subtarget->hasSMulHi())
13700 return SDValue();
13701
13702 unsigned NumBits = VT.getScalarSizeInBits();
13703 if (NumBits <= 32 || NumBits > 64)
13704 return SDValue();
13705
13706 if (LHS.getOpcode() != ISD::MUL) {
13707 assert(RHS.getOpcode() == ISD::MUL);
13708 std::swap(LHS, RHS);
13709 }
13710
13711 // Avoid the fold if it would unduly increase the number of multiplies due to
13712 // multiple uses, except on hardware with full-rate multiply-add (which is
13713 // part of full-rate 64-bit ops).
13714 if (!Subtarget->hasFullRate64Ops()) {
13715 unsigned NumUsers = 0;
13716 for (SDNode *Use : LHS->uses()) {
13717 // There is a use that does not feed into addition, so the multiply can't
13718 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
13719 if (Use->getOpcode() != ISD::ADD)
13720 return SDValue();
13721
13722 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
13723 // MUL + 3xADD + 3xADDC over 3xMAD.
13724 ++NumUsers;
13725 if (NumUsers >= 3)
13726 return SDValue();
13727 }
13728 }
13729
13730 SDValue MulLHS = LHS.getOperand(0);
13731 SDValue MulRHS = LHS.getOperand(1);
13732 SDValue AddRHS = RHS;
13733
13734 // Always check whether operands are small unsigned values, since that
13735 // knowledge is useful in more cases. Check for small signed values only if
13736 // doing so can unlock a shorter code sequence.
13737 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
13738 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
13739
13740 bool MulSignedLo = false;
13741 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
13742 MulSignedLo = numBitsSigned(MulLHS, DAG) <= 32 &&
13743 numBitsSigned(MulRHS, DAG) <= 32;
13744 }
13745
13746 // The operands and final result all have the same number of bits. If
13747 // operands need to be extended, they can be extended with garbage. The
13748 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
13749 // truncated away in the end.
13750 if (VT != MVT::i64) {
13751 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
13752 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
13753 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
13754 }
13755
13756 // The basic code generated is conceptually straightforward. Pseudo code:
13757 //
13758 // accum = mad_64_32 lhs.lo, rhs.lo, accum
13759 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
13760 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
13761 //
13762 // The second and third lines are optional, depending on whether the factors
13763 // are {sign,zero}-extended or not.
13764 //
13765 // The actual DAG is noisier than the pseudo code, but only due to
13766 // instructions that disassemble values into low and high parts, and
13767 // assemble the final result.
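  // The underlying algebra (illustrative), writing lhs = lhs.hi * 2^32 + lhs.lo:
  //   lhs * rhs + accum = lhs.lo * rhs.lo + accum
  //                       + ((lhs.hi * rhs.lo + lhs.lo * rhs.hi) << 32)  (mod 2^64)
  // so the lhs.hi * rhs.hi product never affects the low 64 bits.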
13768 SDValue One = DAG.getConstant(1, SL, MVT::i32);
13769
13770 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
13771 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
13772 SDValue Accum =
13773 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
13774
13775 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
13776 SDValue AccumLo, AccumHi;
13777 std::tie(AccumLo, AccumHi) = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
13778
13779 if (!MulLHSUnsigned32) {
13780 auto MulLHSHi =
13781 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
13782 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
13783 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13784 }
13785
13786 if (!MulRHSUnsigned32) {
13787 auto MulRHSHi =
13788 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
13789 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
13790 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13791 }
13792
13793 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
13794 Accum = DAG.getBitcast(MVT::i64, Accum);
13795 }
13796
13797 if (VT != MVT::i64)
13798 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
13799 return Accum;
13800}
13801
13802// Collect the ultimate src of each of the mul node's operands, and confirm
13803 // each operand is effectively only 8 bits wide.
13804static std::optional<ByteProvider<SDValue>>
13805handleMulOperand(const SDValue &MulOperand) {
13806 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
13807 if (!Byte0 || Byte0->isConstantZero()) {
13808 return std::nullopt;
13809 }
13810 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
13811 if (Byte1 && !Byte1->isConstantZero()) {
13812 return std::nullopt;
13813 }
13814 return Byte0;
13815}
13816
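// addPermMasks merges two v_perm byte-select masks, where a select byte of
// 0x0c means "constant zero". A rough worked example (illustrative):
//   addPermMasks(0x0c0c0c03, 0x0c0c070c) == 0x0c0c0703
// i.e. each byte lane takes whichever input selects a real byte, and remains
// 0x0c only where both inputs are 0x0c.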
13817static unsigned addPermMasks(unsigned First, unsigned Second) {
13818 unsigned FirstCs = First & 0x0c0c0c0c;
13819 unsigned SecondCs = Second & 0x0c0c0c0c;
13820 unsigned FirstNoCs = First & ~0x0c0c0c0c;
13821 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
13822
13823 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
13824 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
13825 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
13826 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
13827
13828 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
13829}
13830
13831struct DotSrc {
13832   SDValue SrcOp;
13833 int64_t PermMask;
13834   int64_t DWordOffset;
13835};
13836
13837 static void placeSources(ByteProvider<SDValue> &Src0,
13838                          ByteProvider<SDValue> &Src1,
13839                          SmallVectorImpl<DotSrc> &Src0s,
13840 SmallVectorImpl<DotSrc> &Src1s, int Step) {
13841
13842 assert(Src0.Src.has_value() && Src1.Src.has_value());
13843 // Src0s and Src1s are empty, just place arbitrarily.
13844 if (Step == 0) {
13845 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
13846 Src0.SrcOffset / 4});
13847 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
13848 Src1.SrcOffset / 4});
13849 return;
13850 }
13851
13852 for (int BPI = 0; BPI < 2; BPI++) {
13853 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
13854 if (BPI == 1) {
13855 BPP = {Src1, Src0};
13856 }
13857 unsigned ZeroMask = 0x0c0c0c0c;
13858 unsigned FMask = 0xFF << (8 * (3 - Step));
13859
13860 unsigned FirstMask =
13861 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13862 unsigned SecondMask =
13863 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13864 // Attempt to find a Src vector which contains our SDValue; if found, add our
13865 // perm mask to the existing one. If we are unable to find a match for the
13866 // first SDValue, attempt to find a match for the second.
13867 int FirstGroup = -1;
13868 for (int I = 0; I < 2; I++) {
13869 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
13870 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
13871 return IterElt.SrcOp == *BPP.first.Src &&
13872 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
13873 };
13874
13875 auto Match = llvm::find_if(Srcs, MatchesFirst);
13876 if (Match != Srcs.end()) {
13877 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
13878 FirstGroup = I;
13879 break;
13880 }
13881 }
13882 if (FirstGroup != -1) {
13883 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
13884 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
13885 return IterElt.SrcOp == *BPP.second.Src &&
13886 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
13887 };
13888 auto Match = llvm::find_if(Srcs, MatchesSecond);
13889 if (Match != Srcs.end()) {
13890 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
13891 } else
13892 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
13893 return;
13894 }
13895 }
13896
13897 // If we have made it here, then we could not find a match in Src0s or Src1s
13898 // for either Src0 or Src1, so just place them arbitrarily.
13899
13900 unsigned ZeroMask = 0x0c0c0c0c;
13901 unsigned FMask = 0xFF << (8 * (3 - Step));
13902
13903 Src0s.push_back(
13904 {*Src0.Src,
13905 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13906        Src0.SrcOffset / 4});
13907 Src1s.push_back(
13908 {*Src1.Src,
13909 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13910 Src1.SrcOffset / 4});
13911
13912 return;
13913}
13914
13915 static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
13916 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
13917 bool IsAny) {
13918
13919 // If we just have one source, just permute it accordingly.
13920 if (Srcs.size() == 1) {
13921 auto Elt = Srcs.begin();
13922 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
13923
13924 // v_perm will produce the original value
13925 if (Elt->PermMask == 0x3020100)
13926 return EltOp;
13927
13928 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
13929 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
13930 }
13931
13932 auto FirstElt = Srcs.begin();
13933 auto SecondElt = std::next(FirstElt);
13934
13935   SmallVector<SDValue, 2> Perms;
13936
13937 // If we have multiple sources in the chain, combine them via perms (using
13938 // calculated perm mask) and Ors.
13939 while (true) {
13940 auto FirstMask = FirstElt->PermMask;
13941 auto SecondMask = SecondElt->PermMask;
13942
13943 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
13944 unsigned FirstPlusFour = FirstMask | 0x04040404;
13945 // ORing in 0x04 moves the byte selects from 0-3 into the 4-7 range; the
13946 // original 0x0C (constant zero) selects are preserved via FirstCs.
13947 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
13948
13949 auto PermMask = addPermMasks(FirstMask, SecondMask);
13950 auto FirstVal =
13951 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
13952 auto SecondVal =
13953 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
13954
13955 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
13956 SecondVal,
13957 DAG.getConstant(PermMask, SL, MVT::i32)));
13958
13959 FirstElt = std::next(SecondElt);
13960 if (FirstElt == Srcs.end())
13961 break;
13962
13963 SecondElt = std::next(FirstElt);
13964 // If we only have a FirstElt, then just combine that into the cumulative
13965 // source node.
13966 if (SecondElt == Srcs.end()) {
13967 auto EltOp =
13968 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
13969
13970 Perms.push_back(
13971 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
13972 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
13973 break;
13974 }
13975 }
13976
13977 assert(Perms.size() == 1 || Perms.size() == 2);
13978 return Perms.size() == 2
13979 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
13980 : Perms[0];
13981}
13982
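// For chains shorter than 4, shift the byte selects down into the low lanes and
// zero-fill the unused high lanes. A worked example (illustrative), for
// ChainLength == 2: fixMasks rewrites a mask of 0x01020c0c into 0x0c0c0102.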
13983static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
13984 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
13985 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
13986 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
13987 EntryMask += ZeroMask;
13988 }
13989}
13990
13991static bool isMul(const SDValue Op) {
13992 auto Opcode = Op.getOpcode();
13993
13994 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
13995 Opcode == AMDGPUISD::MUL_I24);
13996}
13997
13998static std::optional<bool>
13999 checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
14000 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
14001 const SDValue &S1Op, const SelectionDAG &DAG) {
14002 // If both ops are i8s (pre legalize-dag), then the signedness semantics
14003 // of the dot4 are irrelevant.
14004 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
14005 return false;
14006
14007 auto Known0 = DAG.computeKnownBits(S0Op, 0);
14008 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
14009 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
14010 auto Known1 = DAG.computeKnownBits(S1Op, 0);
14011 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
14012 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
14013
14014 assert(!(S0IsUnsigned && S0IsSigned));
14015 assert(!(S1IsUnsigned && S1IsSigned));
14016
14017 // There are 9 possible permutations of
14018 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
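  // A compact summary of the cases handled below (illustrative), writing U for
  // a known-zero sign bit, S for a known-one sign bit and ? for unknown:
  //   (U,U), (S,S)        -> return that signedness
  //   (U,S), (S,U)        -> std::nullopt (mixed signedness, bad match)
  //   (S,?), (?,S), (?,?) -> signed is safe
  //   (U,?), (?,U)        -> std::nullopt (bad match)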
14019
14020 // In two permutations, the sign bits are known to be the same for both Ops,
14021 // so simply return Signed / Unsigned corresponding to the MSB
14022
14023 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
14024 return S0IsSigned;
14025
14026 // In another two permutations, the sign bits are known to be opposite. In
14027 // this case return std::nullopt to indicate a bad match.
14028
14029 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14030 return std::nullopt;
14031
14032 // In the remaining five permutations, we don't know the value of the sign
14033 // bit for at least one Op. Since we have a valid ByteProvider, we know that
14034 // the upper bits must be extension bits. Thus, the only ways for the sign
14035 // bit to be unknown are if it was sign-extended from an unknown value, or if
14036 // it was any-extended. In either case, it is correct to use the signed
14037 // version of the dot4 signedness semantics.
14038
14039 // In two such permutations, we know the sign bit is set for one op and
14040 // unknown for the other. It is okay to use the signed version of
14041 // dot4.
14042 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14043 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14044 return true;
14045
14046 // In one such permutation, we don't know either of the sign bits. It is okay
14047 // to use the signed version of dot4.
14048 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14049 return true;
14050
14051 // In two such permutations, we know the sign bit is unset for one op and
14052 // unknown for the other. Return std::nullopt to indicate a
14053 // bad match.
14054 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14055 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14056 return std::nullopt;
14057
14058 llvm_unreachable("Fully covered condition");
14059}
14060
14061SDValue SITargetLowering::performAddCombine(SDNode *N,
14062 DAGCombinerInfo &DCI) const {
14063 SelectionDAG &DAG = DCI.DAG;
14064 EVT VT = N->getValueType(0);
14065 SDLoc SL(N);
14066 SDValue LHS = N->getOperand(0);
14067 SDValue RHS = N->getOperand(1);
14068
14069 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
14070 if (Subtarget->hasMad64_32()) {
14071 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14072 return Folded;
14073 }
14074 }
14075
14076 if (SDValue V = reassociateScalarOps(N, DAG)) {
14077 return V;
14078 }
14079
14080 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
14081 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
14082 SDValue TempNode(N, 0);
14083 std::optional<bool> IsSigned;
14084     SmallVector<DotSrc, 4> Src0s;
14085     SmallVector<DotSrc, 4> Src1s;
14086     SmallVector<SDValue, 4> Src2s;
14087
14088 // Match the v_dot4 tree, while collecting src nodes.
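      // The shape being matched is roughly (illustrative):
      //   add (mul a0, b0), (add (mul a1, b1), (add (mul a2, b2), (mul a3, b3)))
      // where each a_i and b_i is effectively an 8-bit value; the chain is then
      // rewritten as a single v_dot4 with the tail of the chain as accumulator.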
14089 int ChainLength = 0;
14090 for (int I = 0; I < 4; I++) {
14091 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
14092 if (MulIdx == -1)
14093 break;
14094 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14095 if (!Src0)
14096 break;
14097 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14098 if (!Src1)
14099 break;
14100
14101 auto IterIsSigned = checkDot4MulSignedness(
14102 TempNode->getOperand(MulIdx), *Src0, *Src1,
14103 TempNode->getOperand(MulIdx)->getOperand(0),
14104 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14105 if (!IterIsSigned)
14106 break;
14107 if (!IsSigned)
14108 IsSigned = *IterIsSigned;
14109 if (*IterIsSigned != *IsSigned)
14110 break;
14111 placeSources(*Src0, *Src1, Src0s, Src1s, I);
14112 auto AddIdx = 1 - MulIdx;
14113 // Allow the special case where add (add (mul24, 0), mul24) has already been
14114 // folded into add (mul24, mul24).
14115 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14116 Src2s.push_back(TempNode->getOperand(AddIdx));
14117 auto Src0 =
14118 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
14119 if (!Src0)
14120 break;
14121 auto Src1 =
14122 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
14123 if (!Src1)
14124 break;
14125 auto IterIsSigned = checkDot4MulSignedness(
14126 TempNode->getOperand(AddIdx), *Src0, *Src1,
14127 TempNode->getOperand(AddIdx)->getOperand(0),
14128 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14129 if (!IterIsSigned)
14130 break;
14131 assert(IsSigned);
14132 if (*IterIsSigned != *IsSigned)
14133 break;
14134 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
14135 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
14136 ChainLength = I + 2;
14137 break;
14138 }
14139
14140 TempNode = TempNode->getOperand(AddIdx);
14141 Src2s.push_back(TempNode);
14142 ChainLength = I + 1;
14143 if (TempNode->getNumOperands() < 2)
14144 break;
14145 LHS = TempNode->getOperand(0);
14146 RHS = TempNode->getOperand(1);
14147 }
14148
14149 if (ChainLength < 2)
14150 return SDValue();
14151
14152 // Masks were constructed with the assumption that we would find a chain of
14153 // length 4. If not, then we need to zero out the unused high-order bytes (via
14154 // a perm select of 0x0c) so they do not affect the dot calculation.
14155 if (ChainLength < 4) {
14156 fixMasks(Src0s, ChainLength);
14157 fixMasks(Src1s, ChainLength);
14158 }
14159
14160 SDValue Src0, Src1;
14161
14162 // If we are just using a single source for both, and have permuted the
14163 // bytes consistently, we can just use the sources without permuting
14164 // (commutation).
14165 bool UseOriginalSrc = false;
14166 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
14167 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14168 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14169 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
14170 SmallVector<unsigned, 4> SrcBytes;
14171 auto Src0Mask = Src0s.begin()->PermMask;
14172 SrcBytes.push_back(Src0Mask & 0xFF000000);
14173 bool UniqueEntries = true;
14174 for (auto I = 1; I < 4; I++) {
14175 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14176
14177 if (is_contained(SrcBytes, NextByte)) {
14178 UniqueEntries = false;
14179 break;
14180 }
14181 SrcBytes.push_back(NextByte);
14182 }
14183
14184 if (UniqueEntries) {
14185 UseOriginalSrc = true;
14186
14187 auto FirstElt = Src0s.begin();
14188 auto FirstEltOp =
14189 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14190
14191 auto SecondElt = Src1s.begin();
14192 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
14193 SecondElt->DWordOffset);
14194
14195 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
14196 MVT::getIntegerVT(32));
14197 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
14198 MVT::getIntegerVT(32));
14199 }
14200 }
14201
14202 if (!UseOriginalSrc) {
14203 Src0 = resolveSources(DAG, SL, Src0s, false, true);
14204 Src1 = resolveSources(DAG, SL, Src1s, false, true);
14205 }
14206
14207 assert(IsSigned);
14208 SDValue Src2 =
14209 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14210
14211 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
14212 : Intrinsic::amdgcn_udot4,
14213 SL, MVT::i64);
14214
14215 assert(!VT.isVector());
14216 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
14217 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
14218
14219 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
14220 }
14221
14222 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14223 return SDValue();
14224
14225 // add x, zext (setcc) => uaddo_carry x, 0, setcc
14226 // add x, sext (setcc) => usubo_carry x, 0, setcc
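  // The sext case works because sext of an i1 is 0 or -1, so adding it is the
  // same as subtracting the carry bit (a rough justification: x + (-1) is
  // x - 1).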
14227 unsigned Opc = LHS.getOpcode();
14228 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
14229 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
14230 std::swap(RHS, LHS);
14231
14232 Opc = RHS.getOpcode();
14233 switch (Opc) {
14234 default: break;
14235 case ISD::ZERO_EXTEND:
14236 case ISD::SIGN_EXTEND:
14237 case ISD::ANY_EXTEND: {
14238 auto Cond = RHS.getOperand(0);
14239 // If this won't be a real VOPC output, we would still need to insert an
14240 // extra instruction anyway.
14241 if (!isBoolSGPR(Cond))
14242 break;
14243 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14244 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
14245     Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
14246 return DAG.getNode(Opc, SL, VTList, Args);
14247 }
14248 case ISD::UADDO_CARRY: {
14249 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
14250 if (!isNullConstant(RHS.getOperand(1)))
14251 break;
14252 SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
14253 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
14254 }
14255 }
14256 return SDValue();
14257}
14258
14259SDValue SITargetLowering::performSubCombine(SDNode *N,
14260 DAGCombinerInfo &DCI) const {
14261 SelectionDAG &DAG = DCI.DAG;
14262 EVT VT = N->getValueType(0);
14263
14264 if (VT != MVT::i32)
14265 return SDValue();
14266
14267 SDLoc SL(N);
14268 SDValue LHS = N->getOperand(0);
14269 SDValue RHS = N->getOperand(1);
14270
14271 // sub x, zext (setcc) => usubo_carry x, 0, setcc
14272 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
14273 unsigned Opc = RHS.getOpcode();
14274 switch (Opc) {
14275 default: break;
14276 case ISD::ZERO_EXTEND:
14277 case ISD::SIGN_EXTEND:
14278 case ISD::ANY_EXTEND: {
14279 auto Cond = RHS.getOperand(0);
14280 // If this won't be a real VOPC output, we would still need to insert an
14281 // extra instruction anyway.
14282 if (!isBoolSGPR(Cond))
14283 break;
14284 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14285 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
14286     Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
14287 return DAG.getNode(Opc, SL, VTList, Args);
14288 }
14289 }
14290
14291 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
14292 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
14293 if (!isNullConstant(LHS.getOperand(1)))
14294 return SDValue();
14295 SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
14296 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
14297 }
14298 return SDValue();
14299}
14300
14301SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
14302 DAGCombinerInfo &DCI) const {
14303
14304 if (N->getValueType(0) != MVT::i32)
14305 return SDValue();
14306
14307 if (!isNullConstant(N->getOperand(1)))
14308 return SDValue();
14309
14310 SelectionDAG &DAG = DCI.DAG;
14311 SDValue LHS = N->getOperand(0);
14312
14313 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
14314 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
14315 unsigned LHSOpc = LHS.getOpcode();
14316 unsigned Opc = N->getOpcode();
14317 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
14318 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
14319 SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
14320 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
14321 }
14322 return SDValue();
14323}
14324
14325SDValue SITargetLowering::performFAddCombine(SDNode *N,
14326 DAGCombinerInfo &DCI) const {
14327 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14328 return SDValue();
14329
14330 SelectionDAG &DAG = DCI.DAG;
14331 EVT VT = N->getValueType(0);
14332
14333 SDLoc SL(N);
14334 SDValue LHS = N->getOperand(0);
14335 SDValue RHS = N->getOperand(1);
14336
14337 // These should really be instruction patterns, but writing patterns with
14338 // source modifiers is a pain.
14339
14340 // fadd (fadd (a, a), b) -> mad 2.0, a, b
14341 if (LHS.getOpcode() == ISD::FADD) {
14342 SDValue A = LHS.getOperand(0);
14343 if (A == LHS.getOperand(1)) {
14344 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14345 if (FusedOp != 0) {
14346 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14347 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
14348 }
14349 }
14350 }
14351
14352 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
14353 if (RHS.getOpcode() == ISD::FADD) {
14354 SDValue A = RHS.getOperand(0);
14355 if (A == RHS.getOperand(1)) {
14356 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14357 if (FusedOp != 0) {
14358 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14359 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
14360 }
14361 }
14362 }
14363
14364 return SDValue();
14365}
14366
14367SDValue SITargetLowering::performFSubCombine(SDNode *N,
14368 DAGCombinerInfo &DCI) const {
14369 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14370 return SDValue();
14371
14372 SelectionDAG &DAG = DCI.DAG;
14373 SDLoc SL(N);
14374 EVT VT = N->getValueType(0);
14375 assert(!VT.isVector());
14376
14377 // Try to get the fneg to fold into the source modifier. This undoes generic
14378 // DAG combines and folds them into the mad.
14379 //
14380 // Only do this if we are not trying to support denormals. v_mad_f32 does
14381 // not support denormals ever.
14382 SDValue LHS = N->getOperand(0);
14383 SDValue RHS = N->getOperand(1);
14384 if (LHS.getOpcode() == ISD::FADD) {
14385 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
14386 SDValue A = LHS.getOperand(0);
14387 if (A == LHS.getOperand(1)) {
14388 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14389 if (FusedOp != 0){
14390 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14391 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
14392
14393 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
14394 }
14395 }
14396 }
14397
14398 if (RHS.getOpcode() == ISD::FADD) {
14399 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
14400
14401 SDValue A = RHS.getOperand(0);
14402 if (A == RHS.getOperand(1)) {
14403 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14404 if (FusedOp != 0){
14405 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
14406 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
14407 }
14408 }
14409 }
14410
14411 return SDValue();
14412}
14413
14414SDValue SITargetLowering::performFDivCombine(SDNode *N,
14415 DAGCombinerInfo &DCI) const {
14416 SelectionDAG &DAG = DCI.DAG;
14417 SDLoc SL(N);
14418 EVT VT = N->getValueType(0);
14419 if (VT != MVT::f16 || !Subtarget->has16BitInsts())
14420 return SDValue();
14421
14422 SDValue LHS = N->getOperand(0);
14423 SDValue RHS = N->getOperand(1);
14424
14425 SDNodeFlags Flags = N->getFlags();
14426 SDNodeFlags RHSFlags = RHS->getFlags();
14427 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
14428 !RHS->hasOneUse())
14429 return SDValue();
14430
14431 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
14432 bool IsNegative = false;
14433 if (CLHS->isExactlyValue(1.0) ||
14434 (IsNegative = CLHS->isExactlyValue(-1.0))) {
14435 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
14436 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
14437 if (RHS.getOpcode() == ISD::FSQRT) {
14438 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
14439 SDValue Rsq =
14440 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
14441 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
14442 }
14443 }
14444 }
14445
14446 return SDValue();
14447}
14448
14449SDValue SITargetLowering::performFMACombine(SDNode *N,
14450 DAGCombinerInfo &DCI) const {
14451 SelectionDAG &DAG = DCI.DAG;
14452 EVT VT = N->getValueType(0);
14453 SDLoc SL(N);
14454
14455 if (!Subtarget->hasDot7Insts() || VT != MVT::f32)
14456 return SDValue();
14457
14458 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
14459 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
14460 SDValue Op1 = N->getOperand(0);
14461 SDValue Op2 = N->getOperand(1);
14462 SDValue FMA = N->getOperand(2);
14463
14464 if (FMA.getOpcode() != ISD::FMA ||
14465 Op1.getOpcode() != ISD::FP_EXTEND ||
14466 Op2.getOpcode() != ISD::FP_EXTEND)
14467 return SDValue();
14468
14469 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
14470 // regardless of the denorm mode setting. Therefore,
14471 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
14472 const TargetOptions &Options = DAG.getTarget().Options;
14473 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
14474 (N->getFlags().hasAllowContract() &&
14475 FMA->getFlags().hasAllowContract())) {
14476 Op1 = Op1.getOperand(0);
14477 Op2 = Op2.getOperand(0);
14478 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14479         Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14480 return SDValue();
14481
14482 SDValue Vec1 = Op1.getOperand(0);
14483 SDValue Idx1 = Op1.getOperand(1);
14484 SDValue Vec2 = Op2.getOperand(0);
14485
14486 SDValue FMAOp1 = FMA.getOperand(0);
14487 SDValue FMAOp2 = FMA.getOperand(1);
14488 SDValue FMAAcc = FMA.getOperand(2);
14489
14490 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
14491 FMAOp2.getOpcode() != ISD::FP_EXTEND)
14492 return SDValue();
14493
14494 FMAOp1 = FMAOp1.getOperand(0);
14495 FMAOp2 = FMAOp2.getOperand(0);
14496 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14497         FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14498 return SDValue();
14499
14500 SDValue Vec3 = FMAOp1.getOperand(0);
14501 SDValue Vec4 = FMAOp2.getOperand(0);
14502 SDValue Idx2 = FMAOp1.getOperand(1);
14503
14504 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
14505 // Idx1 and Idx2 cannot be the same.
14506 Idx1 == Idx2)
14507 return SDValue();
14508
14509 if (Vec1 == Vec2 || Vec3 == Vec4)
14510 return SDValue();
14511
14512 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
14513 return SDValue();
14514
14515 if ((Vec1 == Vec3 && Vec2 == Vec4) ||
14516 (Vec1 == Vec4 && Vec2 == Vec3)) {
14517 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
14518 DAG.getTargetConstant(0, SL, MVT::i1));
14519 }
14520 }
14521 return SDValue();
14522}
14523
14524SDValue SITargetLowering::performSetCCCombine(SDNode *N,
14525 DAGCombinerInfo &DCI) const {
14526 SelectionDAG &DAG = DCI.DAG;
14527 SDLoc SL(N);
14528
14529 SDValue LHS = N->getOperand(0);
14530 SDValue RHS = N->getOperand(1);
14531 EVT VT = LHS.getValueType();
14532 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
14533
14534 auto CRHS = dyn_cast<ConstantSDNode>(RHS);
14535 if (!CRHS) {
14536 CRHS = dyn_cast<ConstantSDNode>(LHS);
14537 if (CRHS) {
14538 std::swap(LHS, RHS);
14539       CC = ISD::getSetCCSwappedOperands(CC);
14540 }
14541 }
14542
14543 if (CRHS) {
14544 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
14545 isBoolSGPR(LHS.getOperand(0))) {
14546 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
14547 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
14548 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
14549 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
14550 if ((CRHS->isAllOnes() &&
14551 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
14552 (CRHS->isZero() &&
14553 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
14554 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14555 DAG.getConstant(-1, SL, MVT::i1));
14556 if ((CRHS->isAllOnes() &&
14557 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
14558 (CRHS->isZero() &&
14559 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
14560 return LHS.getOperand(0);
14561 }
14562
14563 const APInt &CRHSVal = CRHS->getAPIntValue();
14564 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
14565 LHS.getOpcode() == ISD::SELECT &&
14566 isa<ConstantSDNode>(LHS.getOperand(1)) &&
14567 isa<ConstantSDNode>(LHS.getOperand(2)) &&
14568 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
14569 isBoolSGPR(LHS.getOperand(0))) {
14570 // Given CT != FT:
14571 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
14572 // setcc (select cc, CT, CF), CF, ne => cc
14573 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
14574 // setcc (select cc, CT, CF), CT, eq => cc
14575 const APInt &CT = LHS.getConstantOperandAPInt(1);
14576 const APInt &CF = LHS.getConstantOperandAPInt(2);
14577
14578 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
14579 (CT == CRHSVal && CC == ISD::SETNE))
14580 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14581 DAG.getConstant(-1, SL, MVT::i1));
14582 if ((CF == CRHSVal && CC == ISD::SETNE) ||
14583 (CT == CRHSVal && CC == ISD::SETEQ))
14584 return LHS.getOperand(0);
14585 }
14586 }
14587
14588 if (VT != MVT::f32 && VT != MVT::f64 &&
14589 (!Subtarget->has16BitInsts() || VT != MVT::f16))
14590 return SDValue();
14591
14592 // Match isinf/isfinite pattern
14593 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
14594 // (fcmp one (fabs x), inf) -> (fp_class x,
14595 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
14596 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
14597 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
14598 if (!CRHS)
14599 return SDValue();
14600
14601 const APFloat &APF = CRHS->getValueAPF();
14602 if (APF.isInfinity() && !APF.isNegative()) {
14603 const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
14604                                  SIInstrFlags::N_INFINITY;
14605 const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
14606                                     SIInstrFlags::P_ZERO |
14607                                     SIInstrFlags::N_NORMAL |
14608                                     SIInstrFlags::P_NORMAL |
14609                                     SIInstrFlags::N_SUBNORMAL |
14610                                     SIInstrFlags::P_SUBNORMAL;
14611 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
14612 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
14613 DAG.getConstant(Mask, SL, MVT::i32));
14614 }
14615 }
14616
14617 return SDValue();
14618}
14619
14620SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
14621 DAGCombinerInfo &DCI) const {
14622 SelectionDAG &DAG = DCI.DAG;
14623 SDLoc SL(N);
14624 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
14625
14626 SDValue Src = N->getOperand(0);
14627 SDValue Shift = N->getOperand(0);
14628
14629 // TODO: Extend type shouldn't matter (assuming legal types).
14630 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
14631 Shift = Shift.getOperand(0);
14632
14633 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
14634 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
14635 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
14636 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
14637 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
14638 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
14639 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
14640 SDValue Shifted = DAG.getZExtOrTrunc(Shift.getOperand(0),
14641 SDLoc(Shift.getOperand(0)), MVT::i32);
14642
14643 unsigned ShiftOffset = 8 * Offset;
14644 if (Shift.getOpcode() == ISD::SHL)
14645 ShiftOffset -= C->getZExtValue();
14646 else
14647 ShiftOffset += C->getZExtValue();
14648
14649 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
14650 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
14651 MVT::f32, Shifted);
14652 }
14653 }
14654 }
14655
14656 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14657 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
14658 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
14659 // We simplified Src. If this node is not dead, visit it again so it is
14660 // folded properly.
14661 if (N->getOpcode() != ISD::DELETED_NODE)
14662 DCI.AddToWorklist(N);
14663 return SDValue(N, 0);
14664 }
14665
14666 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
14667 if (SDValue DemandedSrc =
14668           TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
14669 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
14670
14671 return SDValue();
14672}
14673
14674SDValue SITargetLowering::performClampCombine(SDNode *N,
14675 DAGCombinerInfo &DCI) const {
14676 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
14677 if (!CSrc)
14678 return SDValue();
14679
14680 const MachineFunction &MF = DCI.DAG.getMachineFunction();
14681 const APFloat &F = CSrc->getValueAPF();
14682 APFloat Zero = APFloat::getZero(F.getSemantics());
14683 if (F < Zero ||
14684 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
14685 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
14686 }
14687
14688 APFloat One(F.getSemantics(), "1.0");
14689 if (F > One)
14690 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
14691
14692 return SDValue(CSrc, 0);
14693}
14694
14695
14696 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
14697 DAGCombinerInfo &DCI) const {
14698 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
14699 return SDValue();
14700 switch (N->getOpcode()) {
14701 case ISD::ADD:
14702 return performAddCombine(N, DCI);
14703 case ISD::SUB:
14704 return performSubCombine(N, DCI);
14705 case ISD::UADDO_CARRY:
14706 case ISD::USUBO_CARRY:
14707 return performAddCarrySubCarryCombine(N, DCI);
14708 case ISD::FADD:
14709 return performFAddCombine(N, DCI);
14710 case ISD::FSUB:
14711 return performFSubCombine(N, DCI);
14712 case ISD::FDIV:
14713 return performFDivCombine(N, DCI);
14714 case ISD::SETCC:
14715 return performSetCCCombine(N, DCI);
14716 case ISD::FMAXNUM:
14717 case ISD::FMINNUM:
14718 case ISD::FMAXNUM_IEEE:
14719 case ISD::FMINNUM_IEEE:
14720 case ISD::FMAXIMUM:
14721 case ISD::FMINIMUM:
14722 case ISD::SMAX:
14723 case ISD::SMIN:
14724 case ISD::UMAX:
14725 case ISD::UMIN:
14726   case AMDGPUISD::FMIN_LEGACY:
14727   case AMDGPUISD::FMAX_LEGACY:
14728 return performMinMaxCombine(N, DCI);
14729 case ISD::FMA:
14730 return performFMACombine(N, DCI);
14731 case ISD::AND:
14732 return performAndCombine(N, DCI);
14733 case ISD::OR:
14734 return performOrCombine(N, DCI);
14735 case ISD::FSHR: {
14736     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14737 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
14738 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14739 return matchPERM(N, DCI);
14740 }
14741 break;
14742 }
14743 case ISD::XOR:
14744 return performXorCombine(N, DCI);
14745 case ISD::ZERO_EXTEND:
14746 return performZeroExtendCombine(N, DCI);
14747   case ISD::SIGN_EXTEND_INREG:
14748 return performSignExtendInRegCombine(N, DCI);
14749   case AMDGPUISD::FP_CLASS:
14750 return performClassCombine(N, DCI);
14751 case ISD::FCANONICALIZE:
14752 return performFCanonicalizeCombine(N, DCI);
14753 case AMDGPUISD::RCP:
14754 return performRcpCombine(N, DCI);
14755 case ISD::FLDEXP:
14756 case AMDGPUISD::FRACT:
14757 case AMDGPUISD::RSQ:
14758   case AMDGPUISD::RCP_LEGACY:
14759   case AMDGPUISD::RCP_IFLAG:
14760 case AMDGPUISD::RSQ_CLAMP: {
14761 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
14762 SDValue Src = N->getOperand(0);
14763 if (Src.isUndef())
14764 return Src;
14765 break;
14766 }
14767 case ISD::SINT_TO_FP:
14768 case ISD::UINT_TO_FP:
14769 return performUCharToFloatCombine(N, DCI);
14770 case ISD::FCOPYSIGN:
14771 return performFCopySignCombine(N, DCI);
14772   case AMDGPUISD::CVT_F32_UBYTE0:
14773   case AMDGPUISD::CVT_F32_UBYTE1:
14774   case AMDGPUISD::CVT_F32_UBYTE2:
14775   case AMDGPUISD::CVT_F32_UBYTE3:
14776 return performCvtF32UByteNCombine(N, DCI);
14777 case AMDGPUISD::FMED3:
14778 return performFMed3Combine(N, DCI);
14779   case AMDGPUISD::CVT_PKRTZ_F16_F32:
14780 return performCvtPkRTZCombine(N, DCI);
14781 case AMDGPUISD::CLAMP:
14782 return performClampCombine(N, DCI);
14783 case ISD::SCALAR_TO_VECTOR: {
14784 SelectionDAG &DAG = DCI.DAG;
14785 EVT VT = N->getValueType(0);
14786
14787 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
14788 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
14789 SDLoc SL(N);
14790 SDValue Src = N->getOperand(0);
14791 EVT EltVT = Src.getValueType();
14792 if (EltVT != MVT::i16)
14793 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
14794
14795 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
14796 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
14797 }
14798
14799 break;
14800 }
14801   case ISD::EXTRACT_VECTOR_ELT:
14802 return performExtractVectorEltCombine(N, DCI);
14803   case ISD::INSERT_VECTOR_ELT:
14804 return performInsertVectorEltCombine(N, DCI);
14805 case ISD::FP_ROUND:
14806 return performFPRoundCombine(N, DCI);
14807 case ISD::LOAD: {
14808 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
14809 return Widened;
14810 [[fallthrough]];
14811 }
14812 default: {
14813 if (!DCI.isBeforeLegalize()) {
14814 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
14815 return performMemSDNodeCombine(MemNode, DCI);
14816 }
14817
14818 break;
14819 }
14820 }
14821
14822   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
14823}
14824
14825/// Helper function for adjustWritemask
14826static unsigned SubIdx2Lane(unsigned Idx) {
14827 switch (Idx) {
14828 default: return ~0u;
14829 case AMDGPU::sub0: return 0;
14830 case AMDGPU::sub1: return 1;
14831 case AMDGPU::sub2: return 2;
14832 case AMDGPU::sub3: return 3;
14833 case AMDGPU::sub4: return 4; // Possible with TFE/LWE
14834 }
14835}
14836
14837/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
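/// A rough example (illustrative): with dmask = 0b1011 (components X, Y and W
/// written to lanes 0, 1 and 2), if only the extracts of sub0 and sub2 are used
/// then the dmask is shrunk to 0b1001 and the surviving users are renumbered to
/// sub0/sub1 of the narrower result.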
14838SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
14839 SelectionDAG &DAG) const {
14840 unsigned Opcode = Node->getMachineOpcode();
14841
14842 // Subtract 1 because the vdata output is not a MachineSDNode operand.
14843 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
14844 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
14845 return Node; // not implemented for D16
14846
14847 SDNode *Users[5] = { nullptr };
14848 unsigned Lane = 0;
14849 unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
14850 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
14851 unsigned NewDmask = 0;
14852 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
14853 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
14854 bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
14855 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
14856 ? true
14857 : false;
14858 unsigned TFCLane = 0;
14859 bool HasChain = Node->getNumValues() > 1;
14860
14861 if (OldDmask == 0) {
14862 // These are folded out, but on the chance it happens don't assert.
14863 return Node;
14864 }
14865
14866 unsigned OldBitsSet = llvm::popcount(OldDmask);
14867 // Work out which is the TFE/LWE lane if that is enabled.
14868 if (UsesTFC) {
14869 TFCLane = OldBitsSet;
14870 }
14871
14872 // Try to figure out the used register components
14873 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
14874 I != E; ++I) {
14875
14876 // Don't look at users of the chain.
14877 if (I.getUse().getResNo() != 0)
14878 continue;
14879
14880 // Abort if we can't understand the usage
14881 if (!I->isMachineOpcode() ||
14882 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
14883 return Node;
14884
14885 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
14886 // Note that subregs are packed, i.e. Lane==0 is the first bit set
14887 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
14888 // set, etc.
14889 Lane = SubIdx2Lane(I->getConstantOperandVal(1));
14890 if (Lane == ~0u)
14891 return Node;
14892
14893 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
14894 if (UsesTFC && Lane == TFCLane) {
14895 Users[Lane] = *I;
14896 } else {
14897 // Set which texture component corresponds to the lane.
14898 unsigned Comp;
14899 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
14900 Comp = llvm::countr_zero(Dmask);
14901 Dmask &= ~(1 << Comp);
14902 }
14903
14904 // Abort if we have more than one user per component.
14905 if (Users[Lane])
14906 return Node;
14907
14908 Users[Lane] = *I;
14909 NewDmask |= 1 << Comp;
14910 }
14911 }
14912
14913 // Don't allow 0 dmask, as hardware assumes one channel enabled.
14914 bool NoChannels = !NewDmask;
14915 if (NoChannels) {
14916 if (!UsesTFC) {
14917 // No uses of the result and not using TFC. Then do nothing.
14918 return Node;
14919 }
14920 // If the original dmask has one channel - then nothing to do
14921 if (OldBitsSet == 1)
14922 return Node;
14923 // Use an arbitrary dmask - required for the instruction to work
14924 NewDmask = 1;
14925 }
14926 // Abort if there's no change
14927 if (NewDmask == OldDmask)
14928 return Node;
14929
14930 unsigned BitsSet = llvm::popcount(NewDmask);
14931
14932 // Check for TFE or LWE - increase the number of channels by one to account
14933 // for the extra return value.
14934 // This will need adjustment for D16 if that is also included in
14935 // adjustWritemask (this function), but at present D16 is excluded.
14936 unsigned NewChannels = BitsSet + UsesTFC;
14937
14938 int NewOpcode =
14939 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
14940 assert(NewOpcode != -1 &&
14941 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
14942 "failed to find equivalent MIMG op");
14943
14944 // Adjust the writemask in the node
14945   SmallVector<SDValue, 12> Ops;
14946 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
14947 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
14948 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
14949
14950 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
14951
14952 MVT ResultVT = NewChannels == 1 ?
14953 SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
14954 NewChannels == 5 ? 8 : NewChannels);
14955 SDVTList NewVTList = HasChain ?
14956 DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
14957
14958
14959 MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
14960 NewVTList, Ops);
14961
14962 if (HasChain) {
14963 // Update chain.
14964 DAG.setNodeMemRefs(NewNode, Node->memoperands());
14965 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
14966 }
14967
14968 if (NewChannels == 1) {
14969 assert(Node->hasNUsesOfValue(1, 0));
14970 SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
14971 SDLoc(Node), Users[Lane]->getValueType(0),
14972 SDValue(NewNode, 0));
14973 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
14974 return nullptr;
14975 }
14976
14977 // Update the users of the node with the new indices
14978 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
14979 SDNode *User = Users[i];
14980 if (!User) {
14981 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
14982 // Users[0] is still nullptr because channel 0 doesn't really have a use.
14983 if (i || !NoChannels)
14984 continue;
14985 } else {
14986 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
14987 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
14988 if (NewUser != User) {
14989 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
14990 DAG.RemoveDeadNode(User);
14991 }
14992 }
14993
14994 switch (Idx) {
14995 default: break;
14996 case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
14997 case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
14998 case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
14999 case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
15000 }
15001 }
15002
15003 DAG.RemoveDeadNode(Node);
15004 return nullptr;
15005}
15006
15007 static bool isFrameIndexOp(SDValue Op) {
15008 if (Op.getOpcode() == ISD::AssertZext)
15009 Op = Op.getOperand(0);
15010
15011 return isa<FrameIndexSDNode>(Op);
15012}
15013
15014/// Legalize target independent instructions (e.g. INSERT_SUBREG)
15015/// with frame index operands.
15016/// LLVM assumes that inputs to these instructions are registers.
15017 SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
15018 SelectionDAG &DAG) const {
15019 if (Node->getOpcode() == ISD::CopyToReg) {
15020 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
15021 SDValue SrcVal = Node->getOperand(2);
15022
15023 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
15024 // to try understanding copies to physical registers.
15025 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
15026 SDLoc SL(Node);
15027 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
15028 SDValue VReg = DAG.getRegister(
15029 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
15030
15031 SDNode *Glued = Node->getGluedNode();
15032 SDValue ToVReg
15033 = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
15034 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
15035 SDValue ToResultReg
15036 = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
15037 VReg, ToVReg.getValue(1));
15038 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
15039 DAG.RemoveDeadNode(Node);
15040 return ToResultReg.getNode();
15041 }
15042 }
15043
15044 SmallVector<SDValue, 8> Ops;
15045 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
15046 if (!isFrameIndexOp(Node->getOperand(i))) {
15047 Ops.push_back(Node->getOperand(i));
15048 continue;
15049 }
15050
15051 SDLoc DL(Node);
15052 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
15053 Node->getOperand(i).getValueType(),
15054 Node->getOperand(i)), 0));
15055 }
15056
15057 return DAG.UpdateNodeOperands(Node, Ops);
15058}
15059
15060/// Fold the instructions after selecting them.
15061/// Returns null if users were already updated.
15062 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
15063 SelectionDAG &DAG) const {
15064 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15065 unsigned Opcode = Node->getMachineOpcode();
15066
15067 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
15068 !TII->isGather4(Opcode) &&
15069 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
15070 return adjustWritemask(Node, DAG);
15071 }
15072
15073 if (Opcode == AMDGPU::INSERT_SUBREG ||
15074 Opcode == AMDGPU::REG_SEQUENCE) {
15075 legalizeTargetIndependentNode(Node, DAG);
15076 return Node;
15077 }
15078
15079 switch (Opcode) {
15080 case AMDGPU::V_DIV_SCALE_F32_e64:
15081 case AMDGPU::V_DIV_SCALE_F64_e64: {
15082 // Satisfy the operand register constraint when one of the inputs is
15083 // undefined. Ordinarily each undef value will have its own implicit_def of
15084 // a vreg, so force these to use a single register.
15085 SDValue Src0 = Node->getOperand(1);
15086 SDValue Src1 = Node->getOperand(3);
15087 SDValue Src2 = Node->getOperand(5);
15088
15089 if ((Src0.isMachineOpcode() &&
15090 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
15091 (Src0 == Src1 || Src0 == Src2))
15092 break;
15093
15094 MVT VT = Src0.getValueType().getSimpleVT();
15095 const TargetRegisterClass *RC =
15096 getRegClassFor(VT, Src0.getNode()->isDivergent());
15097
15098 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
15099 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
15100
15101 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
15102 UndefReg, Src0, SDValue());
15103
15104 // src0 must be the same register as src1 or src2, even if the value is
15105 // undefined, so make sure we don't violate this constraint.
15106 if (Src0.isMachineOpcode() &&
15107 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
15108 if (Src1.isMachineOpcode() &&
15109 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15110 Src0 = Src1;
15111 else if (Src2.isMachineOpcode() &&
15112 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15113 Src0 = Src2;
15114 else {
15115 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
15116 Src0 = UndefReg;
15117 Src1 = UndefReg;
15118 }
15119 } else
15120 break;
15121
15122 SmallVector<SDValue, 9> Ops(Node->op_begin(), Node->op_end());
15123 Ops[1] = Src0;
15124 Ops[3] = Src1;
15125 Ops[5] = Src2;
15126 Ops.push_back(ImpDef.getValue(1));
15127 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
15128 }
15129 default:
15130 break;
15131 }
15132
15133 return Node;
15134}
15135
15136// Any MIMG instructions that use tfe or lwe require an initialization of the
15137// result register that will be written in the case of a memory access failure.
15138// The required code is also added to tie this init code to the result of the
15139// img instruction.
15142 const SIRegisterInfo &TRI = TII->getRegisterInfo();
15143 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
15144 MachineBasicBlock &MBB = *MI.getParent();
15145
15146 int DstIdx =
15147 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
15148 unsigned InitIdx = 0;
15149
15150 if (TII->isImage(MI)) {
15151 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
15152 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
15153 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
15154
15155 if (!TFE && !LWE) // intersect_ray
15156 return;
15157
15158 unsigned TFEVal = TFE ? TFE->getImm() : 0;
15159 unsigned LWEVal = LWE ? LWE->getImm() : 0;
15160 unsigned D16Val = D16 ? D16->getImm() : 0;
15161
15162 if (!TFEVal && !LWEVal)
15163 return;
15164
15165 // At least one of TFE or LWE is non-zero.
15166 // We have to insert a suitable initialization of the result value and
15167 // tie this to the dest of the image instruction.
15168
15169 // Calculate which dword we have to initialize to 0.
15170 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
15171
15172 // check that dmask operand is found.
15173 assert(MO_Dmask && "Expected dmask operand in instruction");
15174
15175 unsigned dmask = MO_Dmask->getImm();
15176 // Determine the number of active lanes taking into account the
15177 // Gather4 special case
15178 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
15179
15180 bool Packed = !Subtarget->hasUnpackedD16VMem();
15181
15182 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
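 // For example, dmask == 0xf gives ActiveLanes == 4, so InitIdx == 5 for
 // unpacked results and ((4 + 1) >> 1) + 1 == 3 for packed D16 results.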
15183
15184 // Abandon attempt if the dst size isn't large enough
15185 // - this is in fact an error but this is picked up elsewhere and
15186 // reported correctly.
15187 uint32_t DstSize =
15188 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15189 if (DstSize < InitIdx)
15190 return;
15191 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
15192 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15193 } else {
15194 return;
15195 }
15196
15197 const DebugLoc &DL = MI.getDebugLoc();
15198
15199 // Create a register for the initialization value.
15200 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
15201 unsigned NewDst = 0; // Final initialized value will be in here
15202
15203 // If PRTStrictNull feature is enabled (the default) then initialize
15204 // all the result registers to 0, otherwise just the error indication
15205 // register (VGPRn+1)
15206 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
15207 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
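 // For example, with InitIdx == 5 and PRTStrictNull enabled, dwords 0..4 are
 // zero-initialized below; without it only dword 4 (the TFE/LWE status dword)
 // is initialized.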
15208
15209 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
15210 for (; SizeLeft; SizeLeft--, CurrIdx++) {
15211 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
15212 // Initialize dword
15213 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
15214 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
15215 .addImm(0);
15216 // Insert into the super-reg
15217 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
15218 .addReg(PrevDst)
15219 .addReg(SubReg)
15221
15222 PrevDst = NewDst;
15223 }
15224
15225 // Add as an implicit operand
15226 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
15227
15228 // Tie the just added implicit operand to the dst
15229 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
15230}
15231
15232/// Assign the register class depending on the number of
15233/// bits set in the writemask
15235 SDNode *Node) const {
15237
15238 MachineFunction *MF = MI.getParent()->getParent();
15241
15242 if (TII->isVOP3(MI.getOpcode())) {
15243 // Make sure constant bus requirements are respected.
15244 TII->legalizeOperandsVOP3(MRI, MI);
15245
15246 // Prefer VGPRs over AGPRs in mAI instructions where possible.
15247 // This saves a chain-copy of registers and better balances register
15248 // use between vgpr and agpr, as agpr tuples tend to be big.
15249 if (!MI.getDesc().operands().empty()) {
15250 unsigned Opc = MI.getOpcode();
15251 bool HasAGPRs = Info->mayNeedAGPRs();
15252 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15253 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
15254 for (auto I :
15255 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
15256 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
15257 if (I == -1)
15258 break;
15259 if ((I == Src2Idx) && (HasAGPRs))
15260 break;
15261 MachineOperand &Op = MI.getOperand(I);
15262 if (!Op.isReg() || !Op.getReg().isVirtual())
15263 continue;
15264 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
15265 if (!TRI->hasAGPRs(RC))
15266 continue;
15267 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
15268 if (!Src || !Src->isCopy() ||
15269 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
15270 continue;
15271 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
15272 // All uses of agpr64 and agpr32 can also accept vgpr except for
15273 // v_accvgpr_read, but we do not produce agpr reads during selection,
15274 // so no use checks are needed.
15275 MRI.setRegClass(Op.getReg(), NewRC);
15276 }
15277
15278 if (!HasAGPRs)
15279 return;
15280
15281 // Resolve the rest of AV operands to AGPRs.
15282 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
15283 if (Src2->isReg() && Src2->getReg().isVirtual()) {
15284 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
15285 if (TRI->isVectorSuperClass(RC)) {
15286 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
15287 MRI.setRegClass(Src2->getReg(), NewRC);
15288 if (Src2->isTied())
15289 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
15290 }
15291 }
15292 }
15293 }
15294
15295 return;
15296 }
15297
15298 if (TII->isImage(MI))
15299 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
15300}
15301
15303 uint64_t Val) {
15304 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
15305 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
15306}
15307
15309 const SDLoc &DL,
15310 SDValue Ptr) const {
15312
15313 // Build the half of the subregister with the constants before building the
15314 // full 128-bit register. If we are building multiple resource descriptors,
15315 // this will allow CSEing of the 2-component register.
15316 const SDValue Ops0[] = {
15317 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
15318 buildSMovImm32(DAG, DL, 0),
15319 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15320 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
15321 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
15322 };
15323
15324 SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
15325 MVT::v2i32, Ops0), 0);
15326
15327 // Combine the constants and the pointer.
15328 const SDValue Ops1[] = {
15329 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15330 Ptr,
15331 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
15332 SubRegHi,
15333 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
15334 };
15335
15336 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
15337}
15338
15339/// Return a resource descriptor with the 'Add TID' bit enabled
15340/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
15341/// of the resource descriptor) to create an offset, which is added to
15342/// the resource pointer.
15344 SDValue Ptr, uint32_t RsrcDword1,
15345 uint64_t RsrcDword2And3) const {
15346 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
15347 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
15348 if (RsrcDword1) {
15349 PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
15350 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
15351 0);
15352 }
15353
15354 SDValue DataLo = buildSMovImm32(DAG, DL,
15355 RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
15356 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
15357
15358 const SDValue Ops[] = {
15359 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15360 PtrLo,
15361 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15362 PtrHi,
15363 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
15364 DataLo,
15365 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
15366 DataHi,
15367 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
15368 };
15369
15370 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
15371}
15372
15373//===----------------------------------------------------------------------===//
15374// SI Inline Assembly Support
15375//===----------------------------------------------------------------------===//
15376
15377std::pair<unsigned, const TargetRegisterClass *>
15379 StringRef Constraint,
15380 MVT VT) const {
15381 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
15382
15383 const TargetRegisterClass *RC = nullptr;
15384 if (Constraint.size() == 1) {
15385 const unsigned BitWidth = VT.getSizeInBits();
15386 switch (Constraint[0]) {
15387 default:
15388 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15389 case 's':
15390 case 'r':
15391 switch (BitWidth) {
15392 case 16:
15393 RC = &AMDGPU::SReg_32RegClass;
15394 break;
15395 case 64:
15396 RC = &AMDGPU::SGPR_64RegClass;
15397 break;
15398 default:
15400 if (!RC)
15401 return std::pair(0U, nullptr);
15402 break;
15403 }
15404 break;
15405 case 'v':
15406 switch (BitWidth) {
15407 case 16:
15408 RC = &AMDGPU::VGPR_32RegClass;
15409 break;
15410 default:
15411 RC = TRI->getVGPRClassForBitWidth(BitWidth);
15412 if (!RC)
15413 return std::pair(0U, nullptr);
15414 break;
15415 }
15416 break;
15417 case 'a':
15418 if (!Subtarget->hasMAIInsts())
15419 break;
15420 switch (BitWidth) {
15421 case 16:
15422 RC = &AMDGPU::AGPR_32RegClass;
15423 break;
15424 default:
15425 RC = TRI->getAGPRClassForBitWidth(BitWidth);
15426 if (!RC)
15427 return std::pair(0U, nullptr);
15428 break;
15429 }
15430 break;
15431 }
15432 // We actually support i128, i16 and f16 as inline parameters
15433 // even if they are not reported as legal
15434 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
15435 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
15436 return std::pair(0U, RC);
15437 }
15438
15439 if (Constraint.starts_with("{") && Constraint.ends_with("}")) {
15440 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
15441 if (RegName.consume_front("v")) {
15442 RC = &AMDGPU::VGPR_32RegClass;
15443 } else if (RegName.consume_front("s")) {
15444 RC = &AMDGPU::SGPR_32RegClass;
15445 } else if (RegName.consume_front("a")) {
15446 RC = &AMDGPU::AGPR_32RegClass;
15447 }
15448
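 // At this point RegName holds the index part of a physical register
 // constraint, e.g. "5" for "{v5}" or "[8:9]" for "{s[8:9]}" (a 64-bit pair
 // starting at s8).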
15449 if (RC) {
15450 uint32_t Idx;
15451 if (RegName.consume_front("[")) {
15452 uint32_t End;
15453 bool Failed = RegName.consumeInteger(10, Idx);
15454 Failed |= !RegName.consume_front(":");
15455 Failed |= RegName.consumeInteger(10, End);
15456 Failed |= !RegName.consume_back("]");
15457 if (!Failed) {
15458 uint32_t Width = (End - Idx + 1) * 32;
15459 MCRegister Reg = RC->getRegister(Idx);
15461 RC = TRI->getVGPRClassForBitWidth(Width);
15462 else if (SIRegisterInfo::isSGPRClass(RC))
15463 RC = TRI->getSGPRClassForBitWidth(Width);
15464 else if (SIRegisterInfo::isAGPRClass(RC))
15465 RC = TRI->getAGPRClassForBitWidth(Width);
15466 if (RC) {
15467 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15468 return std::pair(Reg, RC);
15469 }
15470 }
15471 } else {
15472 bool Failed = RegName.getAsInteger(10, Idx);
15473 if (!Failed && Idx < RC->getNumRegs())
15474 return std::pair(RC->getRegister(Idx), RC);
15475 }
15476 }
15477 }
15478
15479 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15480 if (Ret.first)
15481 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
15482
15483 return Ret;
15484}
15485
15486static bool isImmConstraint(StringRef Constraint) {
15487 if (Constraint.size() == 1) {
15488 switch (Constraint[0]) {
15489 default: break;
15490 case 'I':
15491 case 'J':
15492 case 'A':
15493 case 'B':
15494 case 'C':
15495 return true;
15496 }
15497 } else if (Constraint == "DA" ||
15498 Constraint == "DB") {
15499 return true;
15500 }
15501 return false;
15502}
15503
15506 if (Constraint.size() == 1) {
15507 switch (Constraint[0]) {
15508 default: break;
15509 case 's':
15510 case 'v':
15511 case 'a':
15512 return C_RegisterClass;
15513 }
15514 }
15515 if (isImmConstraint(Constraint)) {
15516 return C_Other;
15517 }
15518 return TargetLowering::getConstraintType(Constraint);
15519}
15520
15521static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
15522 if (Size < 64) {
15523 Val = Val & maskTrailingOnes<uint64_t>(Size);
15524 }
15525 return Val;
15526}
15527
15529 StringRef Constraint,
15530 std::vector<SDValue> &Ops,
15531 SelectionDAG &DAG) const {
15532 if (isImmConstraint(Constraint)) {
15533 uint64_t Val;
15534 if (getAsmOperandConstVal(Op, Val) &&
15535 checkAsmConstraintVal(Op, Constraint, Val)) {
15536 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
15537 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
15538 }
15539 } else {
15540 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
15541 }
15542}
15543
15545 unsigned Size = Op.getScalarValueSizeInBits();
15546 if (Size > 64)
15547 return false;
15548
15549 if (Size == 16 && !Subtarget->has16BitInsts())
15550 return false;
15551
15552 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
15553 Val = C->getSExtValue();
15554 return true;
15555 }
15556 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
15557 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15558 return true;
15559 }
15560 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
15561 if (Size != 16 || Op.getNumOperands() != 2)
15562 return false;
15563 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
15564 return false;
15565 if (ConstantSDNode *C = V->getConstantSplatNode()) {
15566 Val = C->getSExtValue();
15567 return true;
15568 }
15569 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
15570 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15571 return true;
15572 }
15573 }
15574
15575 return false;
15576}
15577
15579 uint64_t Val) const {
15580 if (Constraint.size() == 1) {
15581 switch (Constraint[0]) {
15582 case 'I':
15584 case 'J':
15585 return isInt<16>(Val);
15586 case 'A':
15587 return checkAsmConstraintValA(Op, Val);
15588 case 'B':
15589 return isInt<32>(Val);
15590 case 'C':
15591 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
15593 default:
15594 break;
15595 }
15596 } else if (Constraint.size() == 2) {
15597 if (Constraint == "DA") {
15598 int64_t HiBits = static_cast<int32_t>(Val >> 32);
15599 int64_t LoBits = static_cast<int32_t>(Val);
15600 return checkAsmConstraintValA(Op, HiBits, 32) &&
15601 checkAsmConstraintValA(Op, LoBits, 32);
15602 }
15603 if (Constraint == "DB") {
15604 return true;
15605 }
15606 }
15607 llvm_unreachable("Invalid asm constraint");
15608}
15609
15611 unsigned MaxSize) const {
15612 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
15613 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
15614 if (Size == 16) {
15615 MVT VT = Op.getSimpleValueType();
15616 switch (VT.SimpleTy) {
15617 default:
15618 return false;
15619 case MVT::i16:
15620 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
15621 case MVT::f16:
15622 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
15623 case MVT::bf16:
15624 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
15625 case MVT::v2i16:
15626 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
15627 case MVT::v2f16:
15628 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
15629 case MVT::v2bf16:
15630 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
15631 }
15632 }
15633 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
15634 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
15635 return true;
15636 return false;
15637}
15638
15639static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
15640 switch (UnalignedClassID) {
15641 case AMDGPU::VReg_64RegClassID:
15642 return AMDGPU::VReg_64_Align2RegClassID;
15643 case AMDGPU::VReg_96RegClassID:
15644 return AMDGPU::VReg_96_Align2RegClassID;
15645 case AMDGPU::VReg_128RegClassID:
15646 return AMDGPU::VReg_128_Align2RegClassID;
15647 case AMDGPU::VReg_160RegClassID:
15648 return AMDGPU::VReg_160_Align2RegClassID;
15649 case AMDGPU::VReg_192RegClassID:
15650 return AMDGPU::VReg_192_Align2RegClassID;
15651 case AMDGPU::VReg_224RegClassID:
15652 return AMDGPU::VReg_224_Align2RegClassID;
15653 case AMDGPU::VReg_256RegClassID:
15654 return AMDGPU::VReg_256_Align2RegClassID;
15655 case AMDGPU::VReg_288RegClassID:
15656 return AMDGPU::VReg_288_Align2RegClassID;
15657 case AMDGPU::VReg_320RegClassID:
15658 return AMDGPU::VReg_320_Align2RegClassID;
15659 case AMDGPU::VReg_352RegClassID:
15660 return AMDGPU::VReg_352_Align2RegClassID;
15661 case AMDGPU::VReg_384RegClassID:
15662 return AMDGPU::VReg_384_Align2RegClassID;
15663 case AMDGPU::VReg_512RegClassID:
15664 return AMDGPU::VReg_512_Align2RegClassID;
15665 case AMDGPU::VReg_1024RegClassID:
15666 return AMDGPU::VReg_1024_Align2RegClassID;
15667 case AMDGPU::AReg_64RegClassID:
15668 return AMDGPU::AReg_64_Align2RegClassID;
15669 case AMDGPU::AReg_96RegClassID:
15670 return AMDGPU::AReg_96_Align2RegClassID;
15671 case AMDGPU::AReg_128RegClassID:
15672 return AMDGPU::AReg_128_Align2RegClassID;
15673 case AMDGPU::AReg_160RegClassID:
15674 return AMDGPU::AReg_160_Align2RegClassID;
15675 case AMDGPU::AReg_192RegClassID:
15676 return AMDGPU::AReg_192_Align2RegClassID;
15677 case AMDGPU::AReg_256RegClassID:
15678 return AMDGPU::AReg_256_Align2RegClassID;
15679 case AMDGPU::AReg_512RegClassID:
15680 return AMDGPU::AReg_512_Align2RegClassID;
15681 case AMDGPU::AReg_1024RegClassID:
15682 return AMDGPU::AReg_1024_Align2RegClassID;
15683 default:
15684 return -1;
15685 }
15686}
15687
15688// Figure out which registers should be reserved for stack access. Only after
15689// the function is legalized do we know all of the non-spill stack objects or if
15690// calls are present.
15694 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
15695 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15696 const SIInstrInfo *TII = ST.getInstrInfo();
15697
15698 if (Info->isEntryFunction()) {
15699 // Callable functions have fixed registers used for stack access.
15701 }
15702
15703 // TODO: Move this logic to getReservedRegs()
15704 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
15705 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
15706 Register SReg = ST.isWave32()
15707 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
15708 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
15709 &AMDGPU::SGPR_64RegClass);
15710 Info->setSGPRForEXECCopy(SReg);
15711
15712 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
15713 Info->getStackPtrOffsetReg()));
15714 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
15715 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
15716
15717 // We need to worry about replacing the default register with itself in case
15718 // of MIR testcases missing the MFI.
15719 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
15720 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
15721
15722 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
15723 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
15724
15725 Info->limitOccupancy(MF);
15726
15727 if (ST.isWave32() && !MF.empty()) {
15728 for (auto &MBB : MF) {
15729 for (auto &MI : MBB) {
15730 TII->fixImplicitOperands(MI);
15731 }
15732 }
15733 }
15734
15735 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
15736 // classes if required. Ideally the register class constraints would differ
15737 // per-subtarget, but there's no easy way to achieve that right now. This is
15738 // not a problem for VGPRs because the correctly aligned VGPR class is implied
15739 // from using them as the register class for legal types.
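 // For example, a virtual register constrained to AReg_128 is retyped to
 // AReg_128_Align2 via getAlignedAGPRClassID() above.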
15740 if (ST.needsAlignedVGPRs()) {
15741 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
15742 const Register Reg = Register::index2VirtReg(I);
15743 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
15744 if (!RC)
15745 continue;
15746 int NewClassID = getAlignedAGPRClassID(RC->getID());
15747 if (NewClassID != -1)
15748 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
15749 }
15750 }
15751
15753}
15754
15756 KnownBits &Known,
15757 const APInt &DemandedElts,
15758 const SelectionDAG &DAG,
15759 unsigned Depth) const {
15760 Known.resetAll();
15761 unsigned Opc = Op.getOpcode();
15762 switch (Opc) {
15764 unsigned IID = Op.getConstantOperandVal(0);
15765 switch (IID) {
15766 case Intrinsic::amdgcn_mbcnt_lo:
15767 case Intrinsic::amdgcn_mbcnt_hi: {
15768 const GCNSubtarget &ST =
15770 // These return at most the (wavefront size - 1) + src1
15771 // As long as src1 is an immediate we can calc known bits
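 // For example, on wave64 (wavefront size log2 == 6) with src1 known to fit
 // in 8 bits, MaxActiveBits becomes 8 + 1 == 9, so the upper 23 bits of an
 // i32 result are known to be zero.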
15772 KnownBits Src1Known = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
15773 unsigned Src1ValBits = Src1Known.countMaxActiveBits();
15774 unsigned MaxActiveBits = std::max(Src1ValBits, ST.getWavefrontSizeLog2());
15775 // Cater for potential carry
15776 MaxActiveBits += Src1ValBits ? 1 : 0;
15777 unsigned Size = Op.getValueType().getSizeInBits();
15778 if (MaxActiveBits < Size)
15779 Known.Zero.setHighBits(Size - MaxActiveBits);
15780 return;
15781 }
15782 }
15783 break;
15784 }
15785 }
15787 Op, Known, DemandedElts, DAG, Depth);
15788}
15789
15791 const int FI, KnownBits &Known, const MachineFunction &MF) const {
15793
15794 // Set the high bits to zero based on the maximum allowed scratch size per
15795 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
15796 // calculation won't overflow, so assume the sign bit is never set.
15797 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
15798}
15799
15801 KnownBits &Known, unsigned Dim) {
15802 unsigned MaxValue =
15803 ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
15804 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
15805}
15806
15808 GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
15809 const MachineRegisterInfo &MRI, unsigned Depth) const {
15810 const MachineInstr *MI = MRI.getVRegDef(R);
15811 switch (MI->getOpcode()) {
15812 case AMDGPU::G_INTRINSIC:
15813 case AMDGPU::G_INTRINSIC_CONVERGENT: {
15814 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
15815 case Intrinsic::amdgcn_workitem_id_x:
15816 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
15817 break;
15818 case Intrinsic::amdgcn_workitem_id_y:
15819 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
15820 break;
15821 case Intrinsic::amdgcn_workitem_id_z:
15822 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
15823 break;
15824 case Intrinsic::amdgcn_mbcnt_lo:
15825 case Intrinsic::amdgcn_mbcnt_hi: {
15826 // These return at most the wavefront size - 1.
15827 unsigned Size = MRI.getType(R).getSizeInBits();
15828 Known.Zero.setHighBits(Size - getSubtarget()->getWavefrontSizeLog2());
15829 break;
15830 }
15831 case Intrinsic::amdgcn_groupstaticsize: {
15832 // We can report everything over the maximum size as 0. We can't report
15833 // based on the actual size because we don't know if it's accurate or not
15834 // at any given point.
15835 Known.Zero.setHighBits(
15836 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
15837 break;
15838 }
15839 }
15840 break;
15841 }
15842 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
15843 Known.Zero.setHighBits(24);
15844 break;
15845 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
15846 Known.Zero.setHighBits(16);
15847 break;
15848 case AMDGPU::G_AMDGPU_SMED3:
15849 case AMDGPU::G_AMDGPU_UMED3: {
15850 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
15851
15852 KnownBits Known2;
15853 KB.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
15854 if (Known2.isUnknown())
15855 break;
15856
15857 KnownBits Known1;
15858 KB.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
15859 if (Known1.isUnknown())
15860 break;
15861
15862 KnownBits Known0;
15863 KB.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
15864 if (Known0.isUnknown())
15865 break;
15866
15867 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
15868 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
15869 Known.One = Known0.One & Known1.One & Known2.One;
15870 break;
15871 }
15872 }
15873}
15874
15877 unsigned Depth) const {
15878 const MachineInstr *MI = MRI.getVRegDef(R);
15879 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
15880 // FIXME: Can this move to generic code? What about the case where the call
15881 // site specifies a lower alignment?
15882 Intrinsic::ID IID = GI->getIntrinsicID();
15884 AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
15885 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
15886 return *RetAlign;
15887 }
15888 return Align(1);
15889}
15890
15893 const Align CacheLineAlign = Align(64);
15894
15895 // Pre-GFX10 targets did not benefit from loop alignment.
15896 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
15897 getSubtarget()->hasInstFwdPrefetchBug())
15898 return PrefAlign;
15899
15900 // On GFX10 the I$ consists of 4 x 64-byte cache lines.
15901 // By default the prefetcher keeps one cache line behind and reads two ahead.
15902 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
15903 // behind and one ahead.
15904 // Therefore we can benefit from aligning loop headers if the loop fits in 192 bytes.
15905 // If the loop fits in 64 bytes it always spans no more than two cache lines and
15906 // does not need an alignment.
15907 // Otherwise, if the loop is at most 128 bytes, we do not need to modify the prefetch;
15908 // if it is at most 192 bytes, we need two lines behind.
15909
15911 const MachineBasicBlock *Header = ML->getHeader();
15912 if (Header->getAlignment() != PrefAlign)
15913 return Header->getAlignment(); // Already processed.
15914
15915 unsigned LoopSize = 0;
15916 for (const MachineBasicBlock *MBB : ML->blocks()) {
15917 // If an inner loop block is aligned, assume on average half of the alignment
15918 // size is added as nops.
15919 if (MBB != Header)
15920 LoopSize += MBB->getAlignment().value() / 2;
15921
15922 for (const MachineInstr &MI : *MBB) {
15923 LoopSize += TII->getInstSizeInBytes(MI);
15924 if (LoopSize > 192)
15925 return PrefAlign;
15926 }
15927 }
15928
15929 if (LoopSize <= 64)
15930 return PrefAlign;
15931
15932 if (LoopSize <= 128)
15933 return CacheLineAlign;
15934
15935 // If any of the parent loops is surrounded by prefetch instructions, do not
15936 // insert new ones for the inner loop, which would reset the parent's settings.
15937 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
15938 if (MachineBasicBlock *Exit = P->getExitBlock()) {
15939 auto I = Exit->getFirstNonDebugInstr();
15940 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
15941 return CacheLineAlign;
15942 }
15943 }
15944
15945 MachineBasicBlock *Pre = ML->getLoopPreheader();
15946 MachineBasicBlock *Exit = ML->getExitBlock();
15947
15948 if (Pre && Exit) {
15949 auto PreTerm = Pre->getFirstTerminator();
15950 if (PreTerm == Pre->begin() ||
15951 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
15952 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
15953 .addImm(1); // prefetch 2 lines behind PC
15954
15955 auto ExitHead = Exit->getFirstNonDebugInstr();
15956 if (ExitHead == Exit->end() ||
15957 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
15958 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
15959 .addImm(2); // prefetch 1 line behind PC
15960 }
15961
15962 return CacheLineAlign;
15963}
15964
15966static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
15967 assert(N->getOpcode() == ISD::CopyFromReg);
15968 do {
15969 // Follow the chain until we find an INLINEASM node.
15970 N = N->getOperand(0).getNode();
15971 if (N->getOpcode() == ISD::INLINEASM ||
15972 N->getOpcode() == ISD::INLINEASM_BR)
15973 return true;
15974 } while (N->getOpcode() == ISD::CopyFromReg);
15975 return false;
15976}
15977
15980 UniformityInfo *UA) const {
15981 switch (N->getOpcode()) {
15982 case ISD::CopyFromReg: {
15983 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
15984 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
15985 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15986 Register Reg = R->getReg();
15987
15988 // FIXME: Why does this need to consider isLiveIn?
15989 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
15990 return !TRI->isSGPRReg(MRI, Reg);
15991
15992 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
15993 return UA->isDivergent(V);
15994
15996 return !TRI->isSGPRReg(MRI, Reg);
15997 }
15998 case ISD::LOAD: {
15999 const LoadSDNode *L = cast<LoadSDNode>(N);
16000 unsigned AS = L->getAddressSpace();
16001 // A flat load may access private memory.
16003 }
16004 case ISD::CALLSEQ_END:
16005 return true;
16007 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
16009 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
16031 // Target-specific read-modify-write atomics are sources of divergence.
16032 return true;
16033 default:
16034 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
16035 // Generic read-modify-write atomics are sources of divergence.
16036 return A->readMem() && A->writeMem();
16037 }
16038 return false;
16039 }
16040}
16041
16043 EVT VT) const {
16044 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
16045 case MVT::f32:
16047 case MVT::f64:
16048 case MVT::f16:
16050 default:
16051 return false;
16052 }
16053}
16054
16056 LLT Ty, const MachineFunction &MF) const {
16057 switch (Ty.getScalarSizeInBits()) {
16058 case 32:
16059 return !denormalModeIsFlushAllF32(MF);
16060 case 64:
16061 case 16:
16062 return !denormalModeIsFlushAllF64F16(MF);
16063 default:
16064 return false;
16065 }
16066}
16067
16069 const SelectionDAG &DAG,
16070 bool SNaN,
16071 unsigned Depth) const {
16072 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
16073 const MachineFunction &MF = DAG.getMachineFunction();
16075
16076 if (Info->getMode().DX10Clamp)
16077 return true; // Clamped to 0.
16078 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
16079 }
16080
16082 SNaN, Depth);
16083}
16084
16085#if 0
16086// FIXME: This should be checked before unsafe fp atomics are enabled
16087// Global FP atomic instructions have a hardcoded FP mode and do not support
16088// FP32 denormals, and only support v2f16 denormals.
16089static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
16091 auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
16092 if (&Flt == &APFloat::IEEEsingle())
16093 return DenormMode == DenormalMode::getPreserveSign();
16094 return DenormMode == DenormalMode::getIEEE();
16095}
16096#endif
16097
16098// The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
16099// floating point atomic instructions. May generate more efficient code,
16100// but may not respect rounding and denormal modes, and may give incorrect
16101// results for certain memory destinations.
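// The attribute is normally attached by the frontend (e.g. clang's
// -munsafe-fp-atomics option is one way to request it). With
//   attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" }
// on the function, the helper below returns false, i.e. unsafe FP atomics
// are allowed.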
16102 bool unsafeFPAtomicsDisabled(Function *F) {
16103 return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
16104 "true";
16105}
16106
16107 static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
16108 LLVMContext &Ctx = RMW->getContext();
16109 SmallVector<StringRef> SSNs;
16110 Ctx.getSyncScopeNames(SSNs);
16111 StringRef MemScope = SSNs[RMW->getSyncScopeID()].empty()
16112 ? "system"
16113 : SSNs[RMW->getSyncScopeID()];
16114
16115 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
16116 << "Hardware instruction generated for atomic "
16117 << RMW->getOperationName(RMW->getOperation())
16118 << " operation at memory scope " << MemScope;
16119}
16120
16121 TargetLowering::AtomicExpansionKind
16122 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
16123 unsigned AS = RMW->getPointerAddressSpace();
16124 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
16126
16127 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
16129 ORE.emit([=]() {
16130 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
16131 });
16132 return Kind;
16133 };
16134
16135 auto SSID = RMW->getSyncScopeID();
16136 bool HasSystemScope =
16137 SSID == SyncScope::System ||
16138 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
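 // For example, an atomicrmw with no explicit syncscope (SyncScope::System) or
 // with syncscope("one-as") counts as system scope here, while
 // syncscope("agent") or syncscope("workgroup") does not.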
16139
16140 switch (RMW->getOperation()) {
16141 case AtomicRMWInst::Sub:
16142 case AtomicRMWInst::Or:
16143 case AtomicRMWInst::Xor: {
16144 // Atomic sub/or/xor do not work over PCI express, but atomic add
16145 // does. InstCombine transforms these with 0 to or, so undo that.
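 // For example, 'atomicrmw or ptr %p, i32 0' on a flat or global pointer at
 // system scope is sent down the expansion path and later rewritten back to
 // 'atomicrmw add ptr %p, i32 0' (see the matching assert in the expansion
 // hook further below).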
16146 if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
16147 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
16148 ConstVal && ConstVal->isNullValue())
16150 }
16151
16152 break;
16153 }
16154 case AtomicRMWInst::FAdd: {
16155 Type *Ty = RMW->getType();
16156
16157 // TODO: Handle REGION_ADDRESS
16158 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16159 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
16160 // is fixed to round-to-nearest-even.
16161 //
16162 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
16163 // round-to-nearest-even.
16164 //
16165 // We ignore the rounding mode problem, even in strictfp. The C++ standard
16166 // suggests it is OK if the floating-point mode may not match the calling
16167 // thread.
16168 if (Ty->isFloatTy()) {
16171 }
16172
16173 if (Ty->isDoubleTy()) {
16174 // Ignores denormal mode, but we don't consider flushing mandatory.
16177 }
16178
16179 // TODO: Handle v2f16/v2bf16 cases for gfx940
16181 }
16182
16186
16187 // TODO: gfx940 supports v2f16 and v2bf16
16188 if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
16190
16193
16194 // Always expand system scope fp atomics.
16195 if (HasSystemScope)
16197
16198 // global and flat atomic fadd f64: gfx90a, gfx940.
16199 if (Subtarget->hasGFX90AInsts() && Ty->isDoubleTy())
16200 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16201
16202 if (AS != AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16203 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
16204 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16205 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16206 // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16207 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16208 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16209 }
16210
16211 // flat atomic fadd f32: gfx940, gfx11+.
16212 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16213 if (Subtarget->hasFlatAtomicFaddF32Inst())
16214 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16215
16216 // If it is in the flat address space and the type is float, we will try to
16217 // expand it if the target supports both global and LDS atomic fadd. The
16218 // reason we need that is that the expansion emits an address space check:
16219 // if the address is in the global address space we emit a global atomic
16220 // fadd, and if it is in the shared address space we emit an LDS atomic fadd.
16221 if (Subtarget->hasLDSFPAtomicAddF32()) {
16222 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16224 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16226 }
16227 }
16228
16230 }
16233 case AtomicRMWInst::Min:
16234 case AtomicRMWInst::Max:
16236 case AtomicRMWInst::UMax: {
16239 if (RMW->getType()->isFloatTy() &&
16242
16243 // Always expand system scope min/max atomics.
16244 if (HasSystemScope)
16246 }
16247 break;
16248 }
16249 default:
16250 break;
16251 }
16252
16254}
16255
16261}
16262
16265 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16268}
16269
16275}
16276
16277const TargetRegisterClass *
16278SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
16280 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16281 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16282 return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
16283 : &AMDGPU::SReg_32RegClass;
16284 if (!TRI->isSGPRClass(RC) && !isDivergent)
16285 return TRI->getEquivalentSGPRClass(RC);
16286 else if (TRI->isSGPRClass(RC) && isDivergent)
16287 return TRI->getEquivalentVGPRClass(RC);
16288
16289 return RC;
16290}
16291
16292// FIXME: This is a workaround for DivergenceAnalysis not understanding always
16293// uniform values (as produced by the mask results of control flow intrinsics)
16294// used outside of divergent blocks. The phi users need to also be treated as
16295// always uniform.
16296//
16297// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
16298static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
16299 unsigned WaveSize) {
16300 // FIXME: We assume we never cast the mask results of a control flow
16301 // intrinsic.
16302 // Early exit if the type won't be consistent as a compile time hack.
16303 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
16304 if (!IT || IT->getBitWidth() != WaveSize)
16305 return false;
16306
16307 if (!isa<Instruction>(V))
16308 return false;
16309 if (!Visited.insert(V).second)
16310 return false;
16311 bool Result = false;
16312 for (const auto *U : V->users()) {
16313 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
16314 if (V == U->getOperand(1)) {
16315 switch (Intrinsic->getIntrinsicID()) {
16316 default:
16317 Result = false;
16318 break;
16319 case Intrinsic::amdgcn_if_break:
16320 case Intrinsic::amdgcn_if:
16321 case Intrinsic::amdgcn_else:
16322 Result = true;
16323 break;
16324 }
16325 }
16326 if (V == U->getOperand(0)) {
16327 switch (Intrinsic->getIntrinsicID()) {
16328 default:
16329 Result = false;
16330 break;
16331 case Intrinsic::amdgcn_end_cf:
16332 case Intrinsic::amdgcn_loop:
16333 Result = true;
16334 break;
16335 }
16336 }
16337 } else {
16338 Result = hasCFUser(U, Visited, WaveSize);
16339 }
16340 if (Result)
16341 break;
16342 }
16343 return Result;
16344}
16345
16347 const Value *V) const {
16348 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
16349 if (CI->isInlineAsm()) {
16350 // FIXME: This cannot give a correct answer. This should only trigger in
16351 // the case where inline asm returns mixed SGPR and VGPR results, used
16352 // outside the defining block. We don't have a specific result to
16353 // consider, so this assumes if any value is SGPR, the overall register
16354 // also needs to be SGPR.
16355 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
16357 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
16358 for (auto &TC : TargetConstraints) {
16359 if (TC.Type == InlineAsm::isOutput) {
16362 SIRI, TC.ConstraintCode, TC.ConstraintVT).second;
16363 if (RC && SIRI->isSGPRClass(RC))
16364 return true;
16365 }
16366 }
16367 }
16368 }
16370 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
16371}
16372
16374 SDNode::use_iterator I = N->use_begin(), E = N->use_end();
16375 for (; I != E; ++I) {
16376 if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) {
16377 if (getBasePtrIndex(M) == I.getOperandNo())
16378 return true;
16379 }
16380 }
16381 return false;
16382}
16383
16385 SDValue N1) const {
16386 if (!N0.hasOneUse())
16387 return false;
16388 // Take care of the opportunity to keep N0 uniform
16389 if (N0->isDivergent() || !N1->isDivergent())
16390 return true;
16391 // Check if we have a good chance to form the memory access pattern with the
16392 // base and offset
16393 return (DAG.isBaseWithConstantOffset(N0) &&
16394 hasMemSDNodeUser(*N0->use_begin()));
16395}
16396
16398 Register N0, Register N1) const {
16399 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
16400}
16401
16404 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
16406 if (I.getMetadata("amdgpu.noclobber"))
16407 Flags |= MONoClobber;
16408 if (I.getMetadata("amdgpu.last.use"))
16409 Flags |= MOLastUse;
16410 return Flags;
16411}
16412
16414 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
16415 const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const {
16416 if (User->getOpcode() != ISD::CopyToReg)
16417 return false;
16418 if (!Def->isMachineOpcode())
16419 return false;
16420 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
16421 if (!MDef)
16422 return false;
16423
16424 unsigned ResNo = User->getOperand(Op).getResNo();
16425 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
16426 return false;
16427 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
16428 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
16429 PhysReg = AMDGPU::SCC;
16430 const TargetRegisterClass *RC =
16431 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
16432 Cost = RC->getCopyCost();
16433 return true;
16434 }
16435 return false;
16436}
16437
16440
16443 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16444 assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
16445 "this cannot be replaced with add");
16447 return;
16448 }
16449
16450 assert(Subtarget->hasAtomicFaddInsts() &&
16451 "target should have atomic fadd instructions");
16452 assert(AI->getType()->isFloatTy() &&
16454 "generic atomicrmw expansion only supports FP32 operand in flat "
16455 "address space");
16456 assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
16457
16458 // Given: atomicrmw fadd ptr %addr, float %val ordering
16459 //
16460 // With this expansion we produce the following code:
16461 // [...]
16462 // br label %atomicrmw.check.shared
16463 //
16464 // atomicrmw.check.shared:
16465 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
16466 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
16467 //
16468 // atomicrmw.shared:
16469 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
16470 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
16471 // float %val ordering
16472 // br label %atomicrmw.phi
16473 //
16474 // atomicrmw.check.private:
16475 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
16476 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
16477 //
16478 // atomicrmw.private:
16479 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
16480 // %loaded.private = load float, ptr addrspace(5) %cast.private
16481 // %val.new = fadd float %loaded.private, %val
16482 // store float %val.new, ptr addrspace(5) %cast.private
16483 // br label %atomicrmw.phi
16484 //
16485 // atomicrmw.global:
16486 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
16487 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
16488 // float %val ordering
16489 // br label %atomicrmw.phi
16490 //
16491 // atomicrmw.phi:
16492 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
16493 // [ %loaded.private, %atomicrmw.private ],
16494 // [ %loaded.global, %atomicrmw.global ]
16495 // br label %atomicrmw.end
16496 //
16497 // atomicrmw.end:
16498 // [...]
16499
16500 IRBuilder<> Builder(AI);
16501 LLVMContext &Ctx = Builder.getContext();
16502
16503 BasicBlock *BB = Builder.GetInsertBlock();
16504 Function *F = BB->getParent();
16505 BasicBlock *ExitBB =
16506 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
16507 BasicBlock *CheckSharedBB =
16508 BasicBlock::Create(Ctx, "atomicrmw.check.shared", F, ExitBB);
16509 BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
16510 BasicBlock *CheckPrivateBB =
16511 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
16512 BasicBlock *PrivateBB =
16513 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
16514 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
16515 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
16516
16517 Value *Val = AI->getValOperand();
16518 Type *ValTy = Val->getType();
16519 Value *Addr = AI->getPointerOperand();
16520
16521 auto CreateNewAtomicRMW = [AI](IRBuilder<> &Builder, Value *Addr,
16522 Value *Val) -> Value * {
16523 AtomicRMWInst *OldVal =
16524 Builder.CreateAtomicRMW(AI->getOperation(), Addr, Val, AI->getAlign(),
16525 AI->getOrdering(), AI->getSyncScopeID());
16527 AI->getAllMetadata(MDs);
16528 for (auto &P : MDs)
16529 OldVal->setMetadata(P.first, P.second);
16530 return OldVal;
16531 };
16532
16533 std::prev(BB->end())->eraseFromParent();
16534 Builder.SetInsertPoint(BB);
16535 Builder.CreateBr(CheckSharedBB);
16536
16537 Builder.SetInsertPoint(CheckSharedBB);
16538 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
16539 {Addr}, nullptr, "is.shared");
16540 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
16541
16542 Builder.SetInsertPoint(SharedBB);
16543 Value *CastToLocal = Builder.CreateAddrSpaceCast(
16545 Value *LoadedShared = CreateNewAtomicRMW(Builder, CastToLocal, Val);
16546 Builder.CreateBr(PhiBB);
16547
16548 Builder.SetInsertPoint(CheckPrivateBB);
16549 CallInst *IsPrivate = Builder.CreateIntrinsic(
16550 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
16551 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
16552
16553 Builder.SetInsertPoint(PrivateBB);
16554 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
16556 Value *LoadedPrivate =
16557 Builder.CreateLoad(ValTy, CastToPrivate, "loaded.private");
16558 Value *NewVal = Builder.CreateFAdd(LoadedPrivate, Val, "val.new");
16559 Builder.CreateStore(NewVal, CastToPrivate);
16560 Builder.CreateBr(PhiBB);
16561
16562 Builder.SetInsertPoint(GlobalBB);
16563 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
16565 Value *LoadedGlobal = CreateNewAtomicRMW(Builder, CastToGlobal, Val);
16566 Builder.CreateBr(PhiBB);
16567
16568 Builder.SetInsertPoint(PhiBB);
16569 PHINode *Loaded = Builder.CreatePHI(ValTy, 3, "loaded.phi");
16570 Loaded->addIncoming(LoadedShared, SharedBB);
16571 Loaded->addIncoming(LoadedPrivate, PrivateBB);
16572 Loaded->addIncoming(LoadedGlobal, GlobalBB);
16573 Builder.CreateBr(ExitBB);
16574
16575 AI->replaceAllUsesWith(Loaded);
16576 AI->eraseFromParent();
16577}
16578
16579LoadInst *
16580 SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
16581 IRBuilder<> Builder(AI);
16582 auto Order = AI->getOrdering();
16583
16584 // The optimization removes the store aspect of the atomicrmw. Therefore, the
16585 // cache must be flushed if the atomic ordering had release semantics. This does
16586 // not necessarily require a fence; a release fence just happens to do that flush.
16587 // Avoid replacing an atomicrmw that has release semantics.
16588 if (isReleaseOrStronger(Order))
16589 return nullptr;
16590
16591 LoadInst *LI = Builder.CreateAlignedLoad(
16592 AI->getType(), AI->getPointerOperand(), AI->getAlign());
16593 LI->setAtomic(Order, AI->getSyncScopeID());
16594 LI->copyMetadata(*AI);
16595 LI->takeName(AI);
16596 AI->replaceAllUsesWith(LI);
16597 AI->eraseFromParent();
16598 return LI;
16599}
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getIdxEn(SDValue VIndex)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
LLVM IR instance of the generic uniformity analysis.
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool hasMadMacF32Insts() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns whether Op is known never to be a NaN; if SNaN is true, whether Op is known never to be a signaling NaN.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition: APFloat.h:988
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition: APFloat.cpp:5191
bool isNegative() const
Definition: APFloat.h:1295
APInt bitcastToAPInt() const
Definition: APFloat.h:1210
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition: APFloat.h:1006
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:966
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition: APFloat.h:957
bool isInfinity() const
Definition: APFloat.h:1292
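The APFloat entries above are the constant-folding helpers this listing refers to. As a minimal, hedged sketch (not code from this file; values chosen purely for illustration), the factories and queries compose like this:

  #include "llvm/ADT/APFloat.h"
  using namespace llvm;

  APFloat QNaN = APFloat::getQNaN(APFloat::IEEEsingle());
  APFloat NegInf = APFloat::getInf(APFloat::IEEEdouble(), /*Negative=*/true);
  APInt RawBits = QNaN.bitcastToAPInt();     // raw IEEE-754 bit pattern
  bool IsNeg = NegInf.isNegative();          // true
  bool IsInf = NegInf.isInfinity();          // true

  // Narrow the largest f64 to f32, noting whether precision is lost.
  bool LosesInfo = false;
  APFloat Big = APFloat::getLargest(APFloat::IEEEdouble());
  Big.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &LosesInfo);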
Class for arbitrary precision integers.
Definition: APInt.h:76
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1370
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:236
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition: APInt.h:444
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1596
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:274
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1215
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1199
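Similarly, a small hedged sketch of the APInt bit-manipulation helpers listed above (the concrete widths and bit ranges are illustrative only):

  #include "llvm/ADT/APInt.h"
  using namespace llvm;

  APInt HighByte = APInt::getHighBitsSet(32, 8);    // 0xFF000000
  unsigned TrailingZeros = HighByte.countr_zero();  // 24
  APInt MidBits = APInt::getBitsSet(32, 4, 12);     // bits [4, 12) set
  APInt V(32, 0);
  V.setHighBits(16);                                // top half set
  bool UnsignedGE = HighByte.uge(MidBits);          // true (unsigned compare)
  bool SignedGE = HighByte.sge(MidBits);            // false (0xFF000000 is negative)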
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:539
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:684
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:748
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:867
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:760
@ Add
*p = old + v
Definition: Instructions.h:764
@ FAdd
*p = old + v
Definition: Instructions.h:785
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:778
@ Or
*p = old | v
Definition: Instructions.h:772
@ Sub
*p = old - v
Definition: Instructions.h:766
@ Xor
*p = old ^ v
Definition: Instructions.h:774
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:776
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:782
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:796
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:780
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:792
Value * getPointerOperand()
Definition: Instructions.h:910
void setOperation(BinOp Operation)
Definition: Instructions.h:861
BinOp getOperation() const
Definition: Instructions.h:845
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
Definition: Instructions.h:901
Value * getValOperand()
Definition: Instructions.h:914
static StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
Definition: Instructions.h:887
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:918
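The AtomicRMWInst operations and accessors above are what the atomic-expansion hooks in this listing inspect. A hedged, self-contained sketch follows; the helper name and the operands (a float pointer Ptr and a float increment Inc) are assumptions, not code from this file:

  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Emit an atomic fadd at the end of BB and query it back.
  static AtomicRMWInst *emitAtomicFAdd(BasicBlock *BB, Value *Ptr, Value *Inc) {
    IRBuilder<> B(BB);
    AtomicRMWInst *RMW =
        B.CreateAtomicRMW(AtomicRMWInst::FAdd, Ptr, Inc, MaybeAlign(4),
                          AtomicOrdering::SequentiallyConsistent);
    AtomicRMWInst::BinOp Op = RMW->getOperation();        // AtomicRMWInst::FAdd
    StringRef Name = AtomicRMWInst::getOperationName(Op); // "fadd"
    (void)Name;
    return RMW;
  }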
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
iterator end()
Definition: BasicBlock.h:443
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:199
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:570
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:206
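splitBasicBlock above is the IR-level analogue of the machine-level block splitting done by the helpers listed earlier. A hedged sketch (the wrapper name and block label are placeholders):

  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Instruction.h"
  using namespace llvm;

  // Everything from SplitPt to the end of BB moves into the returned block,
  // and BB is terminated with an unconditional branch to it.
  static BasicBlock *splitAtInstruction(BasicBlock *BB, Instruction *SplitPt) {
    return BB->splitBasicBlock(SplitPt->getIterator(), "split.tail");
  }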
BitVector & set()
Definition: BitVector.h:351
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
Definition: ByteProvider.h:30
static ByteProvider getConstantZero()
Definition: ByteProvider.h:73
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
Definition: ByteProvider.h:66
std::optional< ISelOp > Src
Definition: ByteProvider.h:57
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool isMemLoc() const
int64_t getLocMemOffset() const
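The CCState and CCValAssign entries above drive argument and return-value assignment. A hedged sketch of the usual pattern, assuming ArgLocs was filled in by one of the Analyze* calls with a target CCAssignFn (the helper name is illustrative):

  #include "llvm/CodeGen/CallingConvLower.h"
  using namespace llvm;

  static void walkArgLocations(const SmallVectorImpl<CCValAssign> &ArgLocs) {
    for (const CCValAssign &VA : ArgLocs) {
      if (VA.isRegLoc()) {
        Register Reg = VA.getLocReg();       // argument lives in a register
        (void)Reg;
      } else if (VA.isMemLoc()) {
        int64_t Off = VA.getLocMemOffset();  // argument lives at a stack offset
        (void)Off;
      }
    }
  }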
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1742
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
Definition: InstrTypes.h:1828
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1687
unsigned arg_size() const
Definition: InstrTypes.h:1685
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:993
@ ICMP_NE
not equal
Definition: InstrTypes.h:1015
bool isSigned() const
Definition: InstrTypes.h:1265
bool isFPPredicate() const
Definition: InstrTypes.h:1122
bool isIntPredicate() const
Definition: InstrTypes.h:1123
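A quick hedged illustration of the predicate queries listed above, using the static Predicate overloads rather than the member forms:

  #include "llvm/IR/InstrTypes.h"
  using namespace llvm;

  CmpInst::Predicate Pred = CmpInst::ICMP_NE;
  bool IsInt = CmpInst::isIntPredicate(Pred);  // true
  bool IsFP = CmpInst::isFPPredicate(Pred);    // false
  bool IsSgn = CmpInst::isSigned(Pred);        // false: NE ignores signedness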
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition: Constants.h:80
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:205
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition: Constant.h:41
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
Definition: Constants.cpp:90
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:865
bool isBigEndian() const
Definition: DataLayout.h:239
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowering::isSDNodeSourceOfDivergence to get the Value corresponding to the given Vreg.
Class to represent function types.
Definition: DerivedTypes.h:103
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:135
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:202
iterator_range< arg_iterator > args()
Definition: Function.h:842
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:264
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:356
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition: Function.cpp:742
bool hasPrefetch() const
Definition: GCNSubtarget.h:895
bool hasD16Images() const
Definition: GCNSubtarget.h:690
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:468
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
Definition: GCNSubtarget.h:459
bool hasDot7Insts() const
Definition: GCNSubtarget.h:789
bool hasApertureRegs() const
Definition: GCNSubtarget.h:588
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:618
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasDLInsts() const
Definition: GCNSubtarget.h:759
bool hasBCNT(unsigned Size) const
Definition: GCNSubtarget.h:402
bool hasMAIInsts() const
Definition: GCNSubtarget.h:809
bool hasMultiDwordFlatScratchAddressing() const
Definition: GCNSubtarget.h:670
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
Definition: GCNSubtarget.h:518
bool hasPrivEnabledTrap2NopBug() const
bool hasUnalignedDSAccessEnabled() const
Definition: GCNSubtarget.h:576
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:257
bool hasDot1Insts() const
Definition: GCNSubtarget.h:765
bool hasAtomicFaddRtnInsts() const
Definition: GCNSubtarget.h:831
Align getStackAlignment() const
Definition: GCNSubtarget.h:908
bool hasScalarSubwordLoads() const
Definition: GCNSubtarget.h:446
bool enableFlatScratch() const
Definition: GCNSubtarget.h:643
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
Definition: GCNSubtarget.h:614
bool supportsGetDoorbellID() const
Definition: GCNSubtarget.h:452
bool hasFlatAtomicFaddF32Inst() const
Definition: GCNSubtarget.h:847
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:269
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasMad64_32() const
Definition: GCNSubtarget.h:735
bool useDS128() const
Definition: GCNSubtarget.h:528
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
Definition: GCNSubtarget.h:448
const SIFrameLowering * getFrameLowering() const override
Definition: GCNSubtarget.h:261
bool hasUnalignedScratchAccess() const
Definition: GCNSubtarget.h:580
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
Definition: GCNSubtarget.h:418
bool hasIntClamp() const
Definition: GCNSubtarget.h:348
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
Definition: GCNSubtarget.h:999
bool hasGFX940Insts() const
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:368
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:592
bool hasLDSFPAtomicAddF64() const
Definition: GCNSubtarget.h:969
bool hasFlatGlobalInsts() const
Definition: GCNSubtarget.h:622
bool getScalarizeGlobalBehavior() const
Definition: GCNSubtarget.h:921
bool hasScalarSMulU64() const
Definition: GCNSubtarget.h:724
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
Definition: GCNSubtarget.h:327
bool hasShaderCyclesHiLoRegisters() const
Definition: GCNSubtarget.h:875
bool hasFFBL() const
Definition: GCNSubtarget.h:406
bool hasNSAEncoding() const
bool hasSMemRealTime() const
Definition: GCNSubtarget.h:940
bool usePRTStrictNull() const
Definition: GCNSubtarget.h:550
bool hasMed3_16() const
Definition: GCNSubtarget.h:414
bool hasMovrel() const
Definition: GCNSubtarget.h:944
bool hasBFI() const
Definition: GCNSubtarget.h:394
bool hasUnalignedBufferAccessEnabled() const
Definition: GCNSubtarget.h:568
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:335
bool hasImageGather4D16Bug() const
bool supportsMinMaxDenormModes() const
Definition: GCNSubtarget.h:513
bool hasFFBH() const
Definition: GCNSubtarget.h:410
bool hasAtomicFaddInsts() const
Definition: GCNSubtarget.h:827
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicFaddNoRtnInsts() const
Definition: GCNSubtarget.h:833
bool hasScalarDwordx3Loads() const
Definition: GCNSubtarget.h:958
bool hasLDSFPAtomicAddF32() const
Definition: GCNSubtarget.h:968
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
Definition: GCNSubtarget.h:538
bool hasDot8Insts() const
Definition: GCNSubtarget.h:793
bool hasDS96AndDS128() const
Definition: GCNSubtarget.h:533
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:522
Generation getGeneration() const
Definition: GCNSubtarget.h:308
bool hasScalarAddSub64() const
Definition: GCNSubtarget.h:722
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:726
bool hasAddr64() const
Definition: GCNSubtarget.h:372
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
Definition: GCNSubtarget.h:422
bool hasPackedTID() const
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:718
bool hasFractBug() const
Definition: GCNSubtarget.h:386
bool hasGDS() const
bool hasBFE() const
Definition: GCNSubtarget.h:390
bool hasGWSAutoReplay() const
Definition: GCNSubtarget.h:705
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
bool hasImplicitBufferPtr() const
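The long run of subtarget feature predicates above is how target-specific lowering decisions get gated. A hedged sketch; the helper name and the particular feature combination are illustrative, not taken from this file:

  #include "GCNSubtarget.h"
  using namespace llvm;

  // Prefer a packed/mixed-precision path only when the subtarget has the
  // required instructions.
  static bool preferPackedF16Math(const GCNSubtarget &ST) {
    return ST.has16BitInsts() && ST.hasVOP3PInsts();
  }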
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
Definition: GlobalValue.h:510
unsigned getAddressSpace() const
Definition: GlobalValue.h:204
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:655
Type * getValueType() const
Definition: GlobalValue.h:295
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1807
Value * CreateFAdd(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1533
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:175
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:174
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2397
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1120
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1790
LLVMContext & getContext() const
Definition: IRBuilder.h:176
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1803
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1854
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1114
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:180
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2132
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2666
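The IRBuilder methods above are what the IR-level expansion paths use to stitch new instructions and control flow into a function. A hedged sketch that exercises CreateCondBr, CreateBr, SetInsertPoint, and CreatePHI; all names are placeholders, and Entry is assumed to have no terminator yet:

  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Function.h"
  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  // Build a small diamond: Entry branches on Cond, both arms fall through to
  // a join block that merges TrueVal/FalseVal with a PHI.
  static PHINode *emitSelectLikeCFG(BasicBlock *Entry, Value *Cond,
                                    Value *TrueVal, Value *FalseVal) {
    LLVMContext &Ctx = Entry->getContext();
    Function *F = Entry->getParent();
    BasicBlock *TrueBB = BasicBlock::Create(Ctx, "then", F);
    BasicBlock *FalseBB = BasicBlock::Create(Ctx, "else", F);
    BasicBlock *JoinBB = BasicBlock::Create(Ctx, "join", F);

    IRBuilder<> B(Entry);
    B.CreateCondBr(Cond, TrueBB, FalseBB);

    B.SetInsertPoint(TrueBB);
    B.CreateBr(JoinBB);
    B.SetInsertPoint(FalseBB);
    B.CreateBr(JoinBB);

    B.SetInsertPoint(JoinBB);
    PHINode *Phi = B.CreatePHI(TrueVal->getType(), 2, "merged");
    Phi->addIncoming(TrueVal, TrueBB);
    Phi->addIncoming(FalseVal, FalseBB);
    return Phi;
  }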
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
Definition: Instruction.h:341
const BasicBlock * getParent() const
Definition: Instruction.h:152
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:87
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1635
void getAllMetadata(SmallVectorImpl< std::pair< unsigned, MDNode * > > &MDs) const
Get all metadata attached to this Instruction.
Definition: Instruction.h:377
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
Definition: DerivedTypes.h:40
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:54
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:267
constexpr bool isScalar() const
Definition: LowLevelType.h:146
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
Definition: LowLevelType.h:57
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:193
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
Definition: LowLevelType.h:221
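A hedged sketch of the low-level type (LLT) helpers listed above, as used on the GlobalISel side; the include path shown is the LLVM 19 location and may differ in other releases:

  #include "llvm/CodeGenTypes/LowLevelType.h"
  using namespace llvm;

  LLT S32 = LLT::scalar(32);           // plain 32-bit scalar
  LLT P1 = LLT::pointer(1, 64);        // 64-bit pointer in address space 1
  bool IsScalar = S32.isScalar();      // true
  TypeSize Bits = P1.getSizeInBits();  // 64 bits
  LLT S16 = S32.changeElementSize(16); // s16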
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
void getSyncScopeNames(SmallVectorImpl< StringRef > &SSNs) const
getSyncScopeNames - Populates client supplied SmallVector with synchronization scope names registered...
An instruction for reading from memory.
Definition: Instructions.h:184
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:286
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:266
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
bool isCompare() const
Return true if this instruction is a comparison.
Definition: MCInstrDesc.h:341
bool hasImplicitDefOfPhysReg(unsigned Reg, const MCRegisterInfo *MRI=nullptr) const
Return true if this instruction implicitly defines the specified physical register.
Definition: MCInstrDesc.cpp:32
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
Metadata node.
Definition: Metadata.h:1067
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:585
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
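The MVT queries above appear throughout type legalization. A hedged sketch of how they compose; the include path is the LLVM 19 location and may differ in other releases:

  #include "llvm/CodeGenTypes/MachineValueType.h"
  using namespace llvm;

  MVT V4F32 = MVT::getVectorVT(MVT::f32, 4);
  bool IsVec = V4F32.isVector();       // true
  MVT EltTy = V4F32.getScalarType();   // MVT::f32
  TypeSize Sz = V4F32.getSizeInBits(); // 128 bits
  MVT I16 = MVT::getIntegerVT(16);     // MVT::i16
  bool Narrower = I16.bitsLT(MVT::i32); // true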
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Align getAlignment() const
Return alignment of the basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
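The MachineInstrBuilder chain above is the standard way custom inserters emit machine instructions. A hedged sketch that keeps the opcode abstract by passing in the MCInstrDesc, so it stays target-neutral; the helper name and the immediate are illustrative:

  #include "llvm/CodeGen/MachineBasicBlock.h"
  #include "llvm/CodeGen/MachineInstrBuilder.h"
  using namespace llvm;

  // Append "DstReg = <MovDesc> 42" at the end of MBB.
  static void emitMoveImm(MachineBasicBlock &MBB, const DebugLoc &DL,
                          const MCInstrDesc &MovDesc, Register DstReg) {
    BuildMI(&MBB, DL, MovDesc, DstReg)
        .addImm(42);
  }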
Representation of each machine instruction.
Definition: MachineInstr.h:69
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:568
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
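The MachineMemOperand flags above describe the memory reference attached to a machine instruction. A hedged sketch that allocates one for a 32-bit invariant load from a fixed stack slot; FI is assumed to be a valid frame index and the helper name is illustrative:

  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/CodeGen/MachineMemOperand.h"
  using namespace llvm;

  static MachineMemOperand *makeInvariantLoadMMO(MachineFunction &MF, int FI) {
    MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
    return MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
        LLT::scalar(32), Align(4));
  }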
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition: ModRef.h:198
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition: ModRef.h:192
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition: ModRef.h:195
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1827
Register getReg() const
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition: Register.h:84
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
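A tiny hedged illustration of the Register helpers above:

  #include "llvm/CodeGen/Register.h"
  using namespace llvm;

  Register R = Register::index2VirtReg(0); // first virtual register number
  bool IsPhys = R.isPhysical();            // false: it is virtual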
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
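The SDNode/SDValue accessors above are the bread and butter of the DAG combines declared earlier in this listing. A hedged sketch of a typical pattern check; the helper name and the single-use restriction are illustrative choices, not this file's logic:

  #include "llvm/CodeGen/SelectionDAGNodes.h"
  using namespace llvm;

  // Match a single-use "add x, constant" and return the immediate.
  static bool matchAddOfConstant(SDValue Val, uint64_t &ImmOut) {
    if (Val.getOpcode() != ISD::ADD || !Val.hasOneUse())
      return false;
    SDValue RHS = Val.getOperand(1);
    if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
      ImmOut = C->getZExtValue();
      return true;
    }
    return false;
  }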
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node can be combined with others to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isMemOpUniform(const SDNode *N) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, ...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion using a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns true if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
bool getAddrModeArguments(IntrinsicInst *, SmallVectorImpl< Value * > &, Type *&) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:225
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:722
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
Definition: SelectionDAG.h:954
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:551
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
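A minimal sketch of building such a cast inside a custom lowering hook, assuming DAG, DL, and a 64-bit flat pointer FlatPtr are in scope; the address-space values are the AMDGPUAS enumerators listed further down this page:

  SDValue GlobalPtr = DAG.getAddrSpaceCast(DL, MVT::i64, FlatPtr,
                                           AMDGPUAS::FLAT_ADDRESS,
                                           AMDGPUAS::GLOBAL_ADDRESS);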
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const Pass * getPass() const
Definition: SelectionDAG.h:470
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
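A common use, sketched under the assumption that a custom lowering produced a value ResultVal and an output chain OutChain that must be returned as the node's two results:

  SDValue Ops[] = {ResultVal, OutChain};
  return DAG.getMergeValues(Ops, DL);  // result 0 = value, result 1 = chain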
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
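For example, a hedged sketch comparing two i32 values LHS and RHS for unsigned less-than, producing an i1 result (DAG and DL assumed in scope):

  SDValue Cmp = DAG.getSetCC(DL, MVT::i1, LHS, RHS, ISD::SETULT);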
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
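A minimal sketch of a stack load, assuming DAG, DL, and Chain are in scope, FI is a hypothetical frame index, and the stack pointer type is 32-bit; the load produces both a value (result 0) and an output chain (result 1):

  SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
  SDValue Ld = DAG.getLoad(MVT::i32, DL, Chain, Ptr,
                           MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
  SDValue Value    = Ld;              // loaded value
  SDValue OutChain = Ld.getValue(1);  // chain to thread into later memory operations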
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:478
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:828
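A sketch building a v4i32 vector from four i32 SDValues A, B, C, and D that are assumed to be in scope:

  SDValue Elts[] = {A, B, C, D};
  SDValue Vec = DAG.getBuildVector(MVT::v4i32, DL, Elts);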
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:472
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
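For instance, merging the chains of two independent memory operations (LoadChain and StoreChain assumed in scope) so that later nodes depend on both:

  SmallVector<SDValue, 2> Chains = {LoadChain, StoreChain};
  SDValue Merged = DAG.getTokenFactor(DL, Chains);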
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
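A few hedged examples of materializing constants (getConstantFP and getTargetConstant are the related helpers documented elsewhere on this page):

  SDValue Zero  = DAG.getConstant(0, DL, MVT::i32);
  SDValue OneF  = DAG.getConstantFP(1.0, DL, MVT::f32);
  SDValue Imm42 = DAG.getTargetConstant(42, DL, MVT::i32);  // target constant: kept as a literal operand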
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
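A minimal sketch, assuming Chain, an i32 value Val, and a pointer Ptr with known 4-byte alignment are in scope; the pointer info is left generic here:

  SDValue St = DAG.getStore(Chain, DL, Val, Ptr, MachinePointerInfo(), Align(4));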
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node that starts a new call frame, in which InSize bytes are set up inside ...
SDValue getRegister(unsigned Reg, EVT VT)
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:473
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:773
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
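For example, a signed max of two i32 values LHS and RHS, expressed as a compare-and-select in one call (sketch):

  SDValue Max = DAG.getSelectCC(DL, LHS, RHS, LHS, RHS, ISD::SETGT);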
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:676
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:469
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:799
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:845
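E.g., a v4f32 splat of +0.0, sketched with DAG and DL assumed in scope:

  SDValue ZeroSplat =
      DAG.getSplatBuildVector(MVT::v4f32, DL, DAG.getConstantFP(0.0, DL, MVT::f32));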
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
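Together with getSExtOrTrunc and getAnyExtOrTrunc above, this is the usual way to coerce an integer operand to a required width without checking its current width first. A sketch, with Idx assumed to be an integer SDValue of unknown width:

  SDValue Idx32 = DAG.getZExtOrTrunc(Idx, DL, MVT::i32);  // zero-extend or truncate as needed
  SDValue Idx64 = DAG.getSExtOrTrunc(Idx, DL, MVT::i64);  // sign-extend or truncate as needed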
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
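A sketch of the two queries side by side, assuming Op is a 32-bit SDValue; the mask form asks the same question as inspecting the KnownBits directly:

  KnownBits Known = DAG.computeKnownBits(Op);
  bool UpperHalfZeroA = Known.countMinLeadingZeros() >= 16;
  bool UpperHalfZeroB = DAG.MaskedValueIsZero(Op, APInt::getHighBitsSet(32, 16));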
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
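For example, addressing a field 8 bytes past a base pointer BasePtr (assumed in scope), letting the helper pick the pointer-arithmetic flags:

  SDValue PtrPlus8 = DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(8));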
LLVMContext * getContext() const
Definition: SelectionDAG.h:485
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:560
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:554
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
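E.g., splitting a 64-bit value Val64 into its low and high 32-bit halves (sketch; GetSplitDestVTs above can compute the two VTs when splitting vectors):

  auto [LoHalf, HiHalf] = DAG.SplitScalar(Val64, DL, MVT::i32, MVT::i32);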
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
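A short sketch of the typical pattern for accumulating SDValue operands with the members listed above (ExtraOps is an assumed existing range of SDValues):

  SmallVector<SDValue, 8> Ops;
  Ops.push_back(Chain);                          // first operand
  Ops.append(ExtraOps.begin(), ExtraOps.end());  // bulk-append the rest
  assert(!Ops.empty() && "expected at least the chain");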
An instruction for storing to memory.
Definition: Instructions.h:317
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:845
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:257
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition: StringRef.h:131
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:269
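A small sketch of these StringRef queries on a hypothetical intrinsic name:

  StringRef Name = "llvm.amdgcn.workitem.id.x";      // hypothetical example string
  bool IsAMDGCN = Name.starts_with("llvm.amdgcn.");  // true
  bool EndsInX  = Name.ends_with(".x");              // true
  size_t Len    = Name.size();                       // length in characters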
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
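The usual shape of a StringSwitch, sketched here as a hypothetical mapping from a constraint string (a StringRef named Constraint, assumed in scope) to a small integer code:

  unsigned Kind = StringSwitch<unsigned>(Constraint)
                      .Case("v", 0)   // hypothetical: VGPR constraint
                      .Case("s", 1)   // hypothetical: SGPR constraint
                      .Default(~0u);  // not recognized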
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:382
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:342
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:154
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:302
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:157
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition: Type.h:246
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:140
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
iterator_range< use_iterator > uses()
Definition: Value.h:376
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
constexpr bool isZero() const
Definition: TypeSize.h:156
self_iterator getIterator()
Definition: ilist_node.h:109
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: Lint.cpp:86
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:415
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:422
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ MaxID
The highest possible ID. Must be some 2^k - 1.
Definition: CallingConv.h:271
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:751
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:237
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1129
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:724
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ SET_FPENV
Sets the current floating-point environment.
Definition: ISDOpcodes.h:1005
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:251
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1276
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:560
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:715
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1278
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1248
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1279
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:488
@ FMAXNUM_IEEE
Definition: ISDOpcodes.h:986
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:240
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1038
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:784
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:484
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:199
@ RETURNADDR
Definition: ISDOpcodes.h:95
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:1261
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:791
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:544
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:391
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:914
@ FPTRUNC_ROUND
Definition: ISDOpcodes.h:481
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1274
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:904
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:230
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1275
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:940
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1412
@ ATOMIC_LOAD_FADD
Definition: ISDOpcodes.h:1281
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:886
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:775
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:621
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
Definition: ISDOpcodes.h:1195
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1054
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:723
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1228
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:995
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:931
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1084
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1277
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:501
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition: ISDOpcodes.h:508
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:350
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:728
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1244
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:212
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:223
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:209
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
Definition: ISDOpcodes.h:881
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:652
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1023
@ GET_FPENV
Gets the current floating-point environment.
Definition: ISDOpcodes.h:1000
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:706
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:601
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1272
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:574
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:985
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:536
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:203
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:781
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1218
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:743
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1255
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1280
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:972
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:332
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1048
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:799
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:675
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:889
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:304
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1104
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:923
@ ATOMIC_LOAD_UDEC_WRAP
Definition: ISDOpcodes.h:1286
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1270
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:466
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:991
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1271
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:837
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1189
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:471
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:681
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1215
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:184
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:525
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1269
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:945
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:870
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:415
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:908
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1101
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:787
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1077
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:764
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ ATOMIC_LOAD_UINC_WRAP
Definition: ISDOpcodes.h:1285
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:494
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:341
@ AssertZext
Definition: ISDOpcodes.h:62
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:192
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:516
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1535
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1515
StringRef getName(ID id)
Return the LLVM name for an intrinsic, such as "llvm.ppc.altivec.lvx".
Definition: Function.cpp:1027
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Offsets
Offsets in bytes from the start of the input buffer.
Definition: SIInstrInfo.h:1542
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:57
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
constexpr double inv_pi
Definition: MathExtras.h:38
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Offset
Definition: DWP.cpp:456
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition: Analysis.cpp:233
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for an N-bit signed integer.
Definition: MathExtras.h:228
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:428
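A few of these MathExtras helpers in isolation (the values in the comments follow directly from the definitions above):

  unsigned NumDWords = divideCeil(96, 32);     // 3: 96 bits need three 32-bit words
  int64_t  MaxS16    = maxIntN(16);            // 32767
  int      OnesSet   = llvm::popcount(0xF0u);  // 4 set bits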
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
@ Done
Definition: Threading.h:61
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2073
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:372
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:269
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:41
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant, stopping at the first 1.
Definition: bit.h:281
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
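A common pairing of these two helpers, turning a power-of-2 element count into a shift amount (sketch):

  unsigned NumElts = 8;                    // assumed element count
  if (isPowerOf2_32(NumElts)) {
    unsigned ShiftAmt = Log2_32(NumElts);  // 3: index * NumElts becomes index << 3
    (void)ShiftAmt;
  }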
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:138
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition: Analysis.cpp:199
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:143
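For example:

  uint64_t V  = 0x1234567890ABCDEFULL;
  uint32_t Hi = Hi_32(V);  // 0x12345678
  uint32_t Lo = Lo_32(V);  // 0x90ABCDEF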
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition: DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
DWARFExpression::Operation Op
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
Definition: VE.h:375
int64_t minIntN(int64_t N)
Gets the minimum value for an N-bit signed integer.
Definition: MathExtras.h:219
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
@ DS_Warning
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:45
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t that is less than or equal to Value and equal to Skew mod Align.
Definition: MathExtras.h:439
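A short sketch of the alignment helpers above:

  Align A(16);
  uint64_t Padded  = alignTo(100, A);        // 112: next multiple of 16
  uint64_t Rounded = alignDown(100, 16);     // 96: previous multiple of 16
  Align    Shared  = commonAlignment(A, 8);  // Align(8): alignment known at offset 8 from a 16-aligned base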
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
SDValue SrcOp
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:249
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:230
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:247
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition: SCCPSolver.h:41
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition: ValueTypes.h:34
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
bool bitsLT(EVT VT) const
Return true if this has fewer bits than VT.
Definition: ValueTypes.h:290
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:233
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:455
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:628
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
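A sketch of constructing extended value types with these factories (Ctx is the LLVMContext obtained from the DAG):

  LLVMContext &Ctx = *DAG.getContext();
  EVT I48   = EVT::getIntegerVT(Ctx, 48);          // non-simple 48-bit integer type
  EVT V3F32 = EVT::getVectorVT(Ctx, MVT::f32, 3);  // 3 x f32
  TypeSize Bits = V3F32.getSizeInBits();           // 96 bits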
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:246
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:202
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:151
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
unsigned getOrigArgIndex() const
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:63
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:71
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:292
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:244
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals