#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "si-lower"

    "amdgpu-disable-loop-alignment",
    cl::desc("Do not align and prefetch loops"),

    "amdgpu-use-divergent-register-indexing",
    cl::desc("Use indirect register addressing for divergent indexes"),
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
      return AMDGPU::SGPR0 + Reg;
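// Operation legality setup in the SITargetLowering constructor: each
// brace-initialized MVT list below names the set of value types handed to one
// setOperationAction call.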
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},

                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},

                     {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);

                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},

                     {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
                      MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
                      MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},

                     {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
                      MVT::v3i16, MVT::v4i16, MVT::Other},

                     {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
  for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
                 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
                 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
                 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
                 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
                 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
                 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
                 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
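// Operations on vectors with 64-bit elements are promoted to the equivalent
// vectors of 32-bit elements; each loop below handles one such pair of types.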
  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {

  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {

  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {

  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {

  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
                     {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},

                     {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},

                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);

                     {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},

                     {MVT::f32, MVT::f64}, Legal);
    for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
                   MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
                   MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {

                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

    for (MVT VT : {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
                   MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {

                       {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
                        MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},

    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})

    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
                     {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},

                     {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
                      MVT::v32f16, MVT::v32bf16},

                     {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);

                     {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

                     {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
                      MVT::v2i16, MVT::v2f16, MVT::i128, MVT::i8},

                     {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16,
                      MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16,
                      MVT::i16, MVT::i8, MVT::i128},

                     {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16,
                      MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16,
                                           EVT DestVT, EVT SrcVT) const {

                                           LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&

    return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
    return VT.isInteger() ? MVT::i32 : MVT::f32;

    return (NumElts + 1) / 2;

    return NumElts * ((Size + 31) / 32);
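// getVectorTypeBreakdownForCallingConv: decide how a vector argument or return
// value is split into registers (RegisterVT) and intermediate values for the
// AMDGPU calling conventions, e.g. packing 16-bit elements in pairs.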
    EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {

    if (ScalarVT == MVT::bf16) {
      RegisterVT = MVT::i32;
      IntermediateVT = MVT::v2bf16;
      RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
      IntermediateVT = RegisterVT;

    NumIntermediates = (NumElts + 1) / 2;
    return NumIntermediates;

    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts;
    return NumIntermediates;

    if (Size < 16 && Subtarget->has16BitInsts()) {
      RegisterVT = MVT::i16;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

    RegisterVT = MVT::i32;
    IntermediateVT = ScalarVT;
    NumIntermediates = NumElts;
    return NumIntermediates;

    RegisterVT = MVT::i32;
    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts * ((Size + 31) / 32);
    return NumIntermediates;

      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
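// Helpers that compute the in-memory type of a buffer/image intrinsic result
// from its IR type, clamped to the number of lanes (dwords) actually used.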
  assert(MaxNumLanes != 0);

  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());

  auto *ST = dyn_cast<StructType>(Ty);
  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));

      DL.getPointerSizeInBits(AS) == 192)

       DL.getPointerSizeInBits(AS) == 160) ||
       DL.getPointerSizeInBits(AS) == 192))
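// getTgtMemIntrinsic: describe the memory access performed by a target
// intrinsic (value type, pointer operand, read/write flags) so the generic
// lowering can attach a MachineMemOperand to it.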
                                          unsigned IntrID) const {
  if (CI.hasMetadata(LLVMContext::MD_invariant_load))

  if (RsrcIntr->IsImage)

    if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
      Info.ptrVal = RsrcArg;

    unsigned MaxNumLanes = 4;

    if (RsrcIntr->IsImage) {

    if (RsrcIntr->IsImage) {
      unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();

  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {

  case Intrinsic::amdgcn_buffer_atomic_fadd: {
    if (!Vol || !Vol->isZero())

  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.ptrVal = nullptr;
1300 case Intrinsic::amdgcn_ds_append:
1301 case Intrinsic::amdgcn_ds_consume: {
1314 case Intrinsic::amdgcn_global_atomic_csub: {
1324 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1334 case Intrinsic::amdgcn_global_atomic_fadd:
1335 case Intrinsic::amdgcn_global_atomic_fmin:
1336 case Intrinsic::amdgcn_global_atomic_fmax:
1337 case Intrinsic::amdgcn_global_atomic_fmin_num:
1338 case Intrinsic::amdgcn_global_atomic_fmax_num:
1339 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1340 case Intrinsic::amdgcn_flat_atomic_fadd:
1341 case Intrinsic::amdgcn_flat_atomic_fmin:
1342 case Intrinsic::amdgcn_flat_atomic_fmax:
1343 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1344 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1345 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1346 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1347 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
1358 case Intrinsic::amdgcn_global_load_tr_b64:
1359 case Intrinsic::amdgcn_global_load_tr_b128: {
1367 case Intrinsic::amdgcn_ds_gws_init:
1368 case Intrinsic::amdgcn_ds_gws_barrier:
1369 case Intrinsic::amdgcn_ds_gws_sema_v:
1370 case Intrinsic::amdgcn_ds_gws_sema_br:
1371 case Intrinsic::amdgcn_ds_gws_sema_p:
1372 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
    Info.memVT = MVT::i32;

    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)

  case Intrinsic::amdgcn_global_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();

  case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
    Info.memVT = MVT::i32;

  case Intrinsic::amdgcn_addrspacecast_nonnull: {
    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();
                                            Type *&AccessTy) const {
1444 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1445 case Intrinsic::amdgcn_ds_append:
1446 case Intrinsic::amdgcn_ds_consume:
1447 case Intrinsic::amdgcn_ds_fadd:
1448 case Intrinsic::amdgcn_ds_fmax:
1449 case Intrinsic::amdgcn_ds_fmin:
1450 case Intrinsic::amdgcn_ds_ordered_add:
1451 case Intrinsic::amdgcn_ds_ordered_swap:
1452 case Intrinsic::amdgcn_flat_atomic_fadd:
1453 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
1454 case Intrinsic::amdgcn_flat_atomic_fmax:
1455 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1456 case Intrinsic::amdgcn_flat_atomic_fmin:
1457 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1458 case Intrinsic::amdgcn_global_atomic_csub:
1459 case Intrinsic::amdgcn_global_atomic_fadd:
1460 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1461 case Intrinsic::amdgcn_global_atomic_fmax:
1462 case Intrinsic::amdgcn_global_atomic_fmax_num:
1463 case Intrinsic::amdgcn_global_atomic_fmin:
1464 case Intrinsic::amdgcn_global_atomic_fmin_num:
1465 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1466 case Intrinsic::amdgcn_global_load_tr_b64:
1467 case Intrinsic::amdgcn_global_load_tr_b128:
1470 case Intrinsic::amdgcn_global_load_lds:
bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,

    return AM.BaseOffs == 0 && AM.Scale == 0;

  return AM.Scale == 0 &&
             AM.BaseOffs, AddrSpace, FlatVariant));

    return isLegalMUBUFAddressingMode(AM);

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
  if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))

  if (AM.HasBaseReg) {

    return isLegalMUBUFAddressingMode(AM);

  if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)

             : isLegalMUBUFAddressingMode(AM);
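// allowsMisalignedMemoryAccessesImpl: decide whether an under-aligned access
// of a given size and address space is legal, and report a nonzero relative
// speed in *IsFast when the access is considered fast.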
    unsigned Size, unsigned AddrSpace, Align Alignment,

      Alignment < RequiredAlignment)

      RequiredAlignment = Align(4);

        *IsFast = (Alignment >= RequiredAlignment) ? 64
                  : (Alignment < Align(4))         ? 32

        *IsFast = (Alignment >= RequiredAlignment) ? 96
                  : (Alignment < Align(4))         ? 32

      RequiredAlignment = Align(8);

        *IsFast = (Alignment >= RequiredAlignment) ? 128
                  : (Alignment < Align(4))         ? 32

      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;

    return Alignment >= RequiredAlignment ||

    bool AlignedBy4 = Alignment >= Align(4);
      *IsFast = AlignedBy4;

    return AlignedBy4 ||

    bool AlignedBy4 = Alignment >= Align(4);
      *IsFast = AlignedBy4;

  return Alignment >= Align(4) ||

  return Size >= 32 && Alignment >= Align(4);

                                                    unsigned *IsFast) const {
                                            Alignment, Flags, IsFast);
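// getOptimalMemOpType: for memcpy/memset expansion, prefer 16-byte (v4i32)
// accesses when the destination is at least 4-byte aligned, then 8-byte
// (v2i32), otherwise fall back to the default.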
  if (Op.size() >= 16 &&

  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))

  const MemSDNode *MemNode = cast<MemSDNode>(N);

                                           unsigned DestAS) const {
  return TM.isNoopAddrSpaceCast(SrcAS, DestAS);

  const MemSDNode *MemNode = cast<MemSDNode>(N);

                                               unsigned Index) const {
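// Kernel argument lowering: lowerKernArgParameterPtr forms a pointer into the
// kernarg segment at a given offset, and lowerKernargMemParameter emits the
// dword-aligned load plus the conversion back to the argument's original type.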
  std::tie(InputPtrReg, RC, ArgTy) =

      MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

                                         const SDLoc &SL) const {

                                         const SDLoc &SL) const {
  std::optional<uint32_t> KnownSize =
  if (KnownSize.has_value())

    Val = getFPExtOrFPRound(DAG, Val, SL, VT);

SDValue SITargetLowering::lowerKernargMemParameter(
    int64_t OffsetDiff = Offset - AlignDownOffset;
    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);

  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);

      ExtType, SL, VA.getLocVT(), Chain, FIN,
    Reg = &WorkGroupIDX;
    RC = &AMDGPU::SReg_32RegClass;
    Reg = &WorkGroupIDY;
    RC = &AMDGPU::SReg_32RegClass;
    Reg = &WorkGroupIDZ;
    RC = &AMDGPU::SReg_32RegClass;

  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
           "vector type argument should have been split");

    bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
             "unexpected vector split in ps argument type");

      Info->markPSInputAllocated(PSInputNum);
      Info->markPSInputEnabled(PSInputNum);
2224 if (
Info.hasWorkItemIDX()) {
2230 Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2234 if (
Info.hasWorkItemIDY()) {
2240 unsigned Reg = AMDGPU::VGPR1;
2248 if (
Info.hasWorkItemIDZ()) {
2254 unsigned Reg = AMDGPU::VGPR2;
2274 if (RegIdx == ArgVGPRs.
size()) {
2281 unsigned Reg = ArgVGPRs[RegIdx];
2283 assert(Reg != AMDGPU::NoRegister);
2293 unsigned NumArgRegs) {
2296 if (RegIdx == ArgSGPRs.
size())
2299 unsigned Reg = ArgSGPRs[RegIdx];
2301 assert(Reg != AMDGPU::NoRegister);
2315 assert(Reg != AMDGPU::NoRegister);
2341 const unsigned Mask = 0x3ff;
2344 if (
Info.hasWorkItemIDX()) {
2346 Info.setWorkItemIDX(Arg);
2349 if (
Info.hasWorkItemIDY()) {
2351 Info.setWorkItemIDY(Arg);
2354 if (
Info.hasWorkItemIDZ())
2366 const unsigned Mask = 0x3ff;
2391 if (
Info.hasImplicitArgPtr())
2399 if (
Info.hasWorkGroupIDX())
2402 if (
Info.hasWorkGroupIDY())
2405 if (
Info.hasWorkGroupIDZ())
2408 if (
Info.hasLDSKernelId())
  MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);

  MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);

  MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);

  MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);

  MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);

  MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
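// Kernarg preloading: walk the explicit kernel arguments marked "inreg" and
// try to assign them to consecutive user SGPRs; the sequence stops at the
// first argument that no longer fits in the available registers.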
  unsigned LastExplicitArgOffset =
  bool InPreloadSequence = true;
  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())

    int ArgIdx = Arg.getArgNo();
    if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
                               (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))

    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];
      unsigned ArgOffset = ArgLoc.getLocMemOffset();
      unsigned NumAllocSGPRs =
          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;

      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);

      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
      if (PaddingSGPRs + NumAllocSGPRs + 1 >
        InPreloadSequence = false;

          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);

      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {

      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
  if (Info.hasLDSKernelId()) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

                                           bool IsShader) const {
2563 assert(!HasArchitectedSGPRs &&
"Unhandled feature for the subtarget");
2565 unsigned CurrentUserSGPRs =
Info.getNumUserSGPRs();
2569 unsigned NumRequiredSystemSGPRs =
Info.hasWorkGroupIDX() +
2570 Info.hasWorkGroupIDY() +
2571 Info.hasWorkGroupIDZ() +
2572 Info.hasWorkGroupInfo();
2573 for (
unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2575 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2580 if (!HasArchitectedSGPRs) {
2581 if (
Info.hasWorkGroupIDX()) {
2583 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2587 if (
Info.hasWorkGroupIDY()) {
2589 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2593 if (
Info.hasWorkGroupIDZ()) {
2595 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2600 if (
Info.hasWorkGroupInfo()) {
2602 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2606 if (
Info.hasPrivateSegmentWaveByteOffset()) {
2608 unsigned PrivateSegmentWaveByteOffsetReg;
2611 PrivateSegmentWaveByteOffsetReg =
2612 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2616 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2618 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2621 PrivateSegmentWaveByteOffsetReg =
Info.addPrivateSegmentWaveByteOffset();
2623 MF.
addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2624 CCInfo.
AllocateReg(PrivateSegmentWaveByteOffsetReg);
2628 Info.getNumPreloadedSGPRs() >= 16);
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);

    HasStackObjects = true;

  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
      Info.setScratchRSrcReg(ReservedBufferReg);

    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
        if (!MRI.isLiveIn(Reg)) {
          Info.setStackPtrOffsetReg(Reg);

  if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)

  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);

  return !Info->isEntryFunction();
2735 const MCPhysReg *IStart =
TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2744 if (AMDGPU::SReg_64RegClass.
contains(*
I))
2745 RC = &AMDGPU::SGPR_64RegClass;
2746 else if (AMDGPU::SReg_32RegClass.
contains(*
I))
2747 RC = &AMDGPU::SGPR_32RegClass;
2753 Entry->addLiveIn(*
I);
2758 for (
auto *Exit : Exits)
2760 TII->get(TargetOpcode::COPY), *
I)
2778 Fn,
"unsupported non-compute shaders with HSA",
DL.getDebugLoc());
2797 !
Info->hasLDSKernelId() && !
Info->hasWorkItemIDX() &&
2798 !
Info->hasWorkItemIDY() && !
Info->hasWorkItemIDZ());
2806 !
Info->hasWorkGroupIDZ());
2825 if ((
Info->getPSInputAddr() & 0x7F) == 0 ||
2826 ((
Info->getPSInputAddr() & 0xF) == 0 &&
Info->isPSInputAllocated(11))) {
2829 Info->markPSInputAllocated(0);
2830 Info->markPSInputEnabled(0);
2841 unsigned PsInputBits =
Info->getPSInputAddr() &
Info->getPSInputEnable();
2842 if ((PsInputBits & 0x7F) == 0 ||
2843 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2846 }
else if (IsKernel) {
2849 Splits.
append(Ins.begin(), Ins.end());
2862 }
else if (!IsGraphics) {
2887 for (
unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2897 if (IsEntryFunc && VA.
isMemLoc()) {
2920 if (Arg.
isOrigArg() &&
Info->getArgInfo().PreloadKernArgs.count(i)) {
2924 int64_t OffsetDiff =
Offset - AlignDownOffset;
2931 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2942 NewArg = convertArgType(DAG, VT, MemVT,
DL, ArgVal,
2943 Ins[i].Flags.isSExt(), &Ins[i]);
2951 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
2954 if (PreloadRegs.
size() == 1) {
2955 Register VReg =
MRI.getLiveInVirtReg(PreloadRegs[0]);
2960 TRI->getRegSizeInBits(*RC)));
2968 for (
auto Reg : PreloadRegs) {
2975 PreloadRegs.size()),
2984 NewArg = convertArgType(DAG, VT, MemVT,
DL, CMemVT,
2985 Ins[i].Flags.isSExt(), &Ins[i]);
2990 lowerKernargMemParameter(DAG, VT, MemVT,
DL, Chain,
Offset,
2991 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
2996 dyn_cast<PointerType>(FType->
getParamType(Ins[i].getOrigArgIndex()));
3009 }
else if (!IsEntryFunc && VA.
isMemLoc()) {
3010 SDValue Val = lowerStackParameter(DAG, VA,
DL, Chain, Arg);
3021 if (AMDGPU::VGPR_32RegClass.
contains(Reg))
3022 RC = &AMDGPU::VGPR_32RegClass;
3023 else if (AMDGPU::SGPR_32RegClass.
contains(Reg))
3024 RC = &AMDGPU::SGPR_32RegClass;
3077 auto &ArgUsageInfo =
3082 Info->setBytesInStackArgArea(StackArgSize);
3084 return Chains.
empty() ? Chain :
3108 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3109 for (
unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3110 if (CCInfo.
isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3133 bool IsWaveEnd =
Info->returnsVoid() && IsShader;
3151 for (
unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.
size();
I != E;
3152 ++
I, ++RealRVLocIdx) {
3156 SDValue Arg = OutVals[RealRVLocIdx];
3184 if (!
Info->isEntryFunction()) {
3190 if (AMDGPU::SReg_64RegClass.
contains(*
I))
3192 else if (AMDGPU::SReg_32RegClass.
contains(*
I))
3208 return DAG.
getNode(Opc,
DL, MVT::Other, RetOps);
3225 for (
unsigned i = 0; i != RVLocs.
size(); ++i) {
3291 auto &ArgUsageInfo =
3293 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3322 std::tie(OutgoingArg, ArgRC, ArgTy) =
3330 std::tie(IncomingArg, IncomingArgRC, Ty) =
3332 assert(IncomingArgRC == ArgRC);
3335 EVT ArgVT =
TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3343 InputReg = getImplicitArgPtr(DAG,
DL);
3345 std::optional<uint32_t> Id =
3347 if (Id.has_value()) {
3359 RegsToPass.emplace_back(OutgoingArg->
getRegister(), InputReg);
3363 unsigned SpecialArgOffset =
3377 std::tie(OutgoingArg, ArgRC, Ty) =
3380 std::tie(OutgoingArg, ArgRC, Ty) =
3383 std::tie(OutgoingArg, ArgRC, Ty) =
3398 const bool NeedWorkItemIDX = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-x");
3399 const bool NeedWorkItemIDY = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-y");
3400 const bool NeedWorkItemIDZ = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-z");
3417 InputReg = InputReg.
getNode() ?
3426 InputReg = InputReg.
getNode() ?
3430 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3431 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3441 IncomingArgX ? *IncomingArgX :
3442 IncomingArgY ? *IncomingArgY :
3443 *IncomingArgZ, ~0u);
3450 RegsToPass.emplace_back(OutgoingArg->
getRegister(), InputReg);
3491 if (Callee->isDivergent())
3498 const uint32_t *CallerPreserved =
TRI->getCallPreservedMask(MF, CallerCC);
3502 if (!CallerPreserved)
3505 bool CCMatch = CallerCC == CalleeCC;
3518 if (Arg.hasByValAttr())
3532 const uint32_t *CalleePreserved =
TRI->getCallPreservedMask(MF, CalleeCC);
3533 if (!
TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3542 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3576 if (IsChainCallConv) {
3580 RequestedExec = CLI.
Args.back();
3581 assert(RequestedExec.
Node &&
"No node for EXEC");
3586 assert(CLI.
Outs.back().OrigArgIndex == 2 &&
"Unexpected last arg");
3587 CLI.
Outs.pop_back();
3591 assert(CLI.
Outs.back().OrigArgIndex == 2 &&
"Exec wasn't split up");
3592 CLI.
Outs.pop_back();
3597 "Haven't popped all the pieces of the EXEC mask");
3608 bool IsSibCall =
false;
3613 for (
unsigned I = 0, E = CLI.
Ins.size();
I != E; ++
I)
3622 "unsupported call to variadic function ");
3630 "unsupported required tail call to function ");
3635 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3639 "site marked musttail or on llvm.amdgcn.cs.chain");
3646 if (!TailCallOpt && IsTailCall)
3691 if (!IsSibCall || IsChainCallConv) {
3698 RegsToPass.emplace_back(IsChainCallConv
3699 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3700 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3707 MVT PtrVT = MVT::i32;
3710 for (
unsigned i = 0, e = ArgLocs.
size(); i != e; ++i) {
3738 RegsToPass.push_back(std::pair(VA.
getLocReg(), Arg));
3746 int32_t
Offset = LocMemOffset;
3753 unsigned OpSize = Flags.isByVal() ?
3759 ? Flags.getNonZeroByValAlign()
3786 if (Outs[i].Flags.isByVal()) {
3788 DAG.
getConstant(Outs[i].Flags.getByValSize(),
DL, MVT::i32);
3791 Outs[i].Flags.getNonZeroByValAlign(),
3799 DAG.
getStore(Chain,
DL, Arg, DstAddr, DstInfo, Alignment);
3805 if (!MemOpChains.
empty())
3811 for (
auto &RegToPass : RegsToPass) {
3813 RegToPass.second, InGlue);
3822 if (IsTailCall && !IsSibCall) {
3827 std::vector<SDValue> Ops;
3828 Ops.push_back(Chain);
3829 Ops.push_back(Callee);
3846 if (IsChainCallConv)
3847 Ops.push_back(RequestedExec.
Node);
3851 for (
auto &RegToPass : RegsToPass) {
3853 RegToPass.second.getValueType()));
3858 const uint32_t *Mask =
TRI->getCallPreservedMask(MF, CallConv);
3859 assert(Mask &&
"Missing call preserved mask for calling convention");
3869 MVT::Glue, GlueOps),
3874 Ops.push_back(InGlue);
3893 return DAG.
getNode(OPC,
DL, NodeTys, Ops);
3898 Chain = Call.getValue(0);
3899 InGlue = Call.getValue(1);
3901 uint64_t CalleePopBytes = NumBytes;
3920 EVT VT =
Op.getValueType();
3935 MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
3946 Tmp1 = DAG.
getNode(Opc, dl, VT, SP, ScaledSize);
3947 if (Alignment && *Alignment > StackAlign) {
3968 if (isa<ConstantSDNode>(
Size))
3975 if (
Op.getValueType() != MVT::i32)
3994 assert(
Op.getValueType() == MVT::i32);
4003 Op.getOperand(0), IntrinID, GetRoundBothImm);
4037 SDValue RoundModeTimesNumBits =
4057 TableEntry, EnumOffset);
4071 if (
auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4073 static_cast<uint32_t>(ConstMode->getZExtValue()),
4085 if (UseReducedTable) {
4091 SDValue RoundModeTimesNumBits =
4111 SDValue RoundModeTimesNumBits =
4120 NewMode = TruncTable;
4129 ReadFirstLaneID, NewMode);
4142 IntrinID, RoundBothImm, NewMode);
4148 if (
Op->isDivergent())
4151 switch (cast<MemSDNode>(
Op)->getAddressSpace()) {
4167 SDValue Src =
Op.getOperand(IsStrict ? 1 : 0);
4168 EVT SrcVT = Src.getValueType();
4177 EVT DstVT =
Op.getValueType();
4186 if (
Op.getValueType() != MVT::i64)
4200 Op.getOperand(0), IntrinID, ModeHwRegImm);
4202 Op.getOperand(0), IntrinID, TrapHwRegImm);
4216 if (
Op.getOperand(1).getValueType() != MVT::i64)
4228 ReadFirstLaneID, NewModeReg);
4230 ReadFirstLaneID, NewTrapReg);
4232 unsigned ModeHwReg =
4235 unsigned TrapHwReg =
4243 IntrinID, ModeHwRegImm, NewModeReg);
4246 IntrinID, TrapHwRegImm, NewTrapReg);
                        .Case("m0", AMDGPU::M0)
                        .Case("exec", AMDGPU::EXEC)
                        .Case("exec_lo", AMDGPU::EXEC_LO)
                        .Case("exec_hi", AMDGPU::EXEC_HI)
                        .Case("flat_scratch", AMDGPU::FLAT_SCR)
                        .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
                        .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)

  if (Reg == AMDGPU::NoRegister) {

  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:

  case AMDGPU::FLAT_SCR:

  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
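// Machinery for indirect (variable-index) vector element access: a divergent
// index is made uniform with a waterfall loop that splits the block, handles
// the lanes sharing the first active index, and repeats until EXEC is empty.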
static std::pair<MachineBasicBlock *, MachineBasicBlock *>

  auto Next = std::next(I);

  return std::pair(LoopBB, RemainderBB);

  auto I = MI.getIterator();
  auto E = std::next(I);

  Src->setIsKill(false);

  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
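// emitLoadM0FromVGPRLoop: one waterfall iteration. Read the first active
// lane's index with V_READFIRSTLANE, compare it against every lane, run the
// indexed access with EXEC restricted to the matching lanes, then xor those
// lanes out of EXEC and loop.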
                                 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
                                 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,

  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
  Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)

  BuildMI(LoopBB, I, DL,
          TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
                                 : AMDGPU::S_AND_SAVEEXEC_B64),

  MRI.setSimpleHint(NewExec, CondReg);

  if (UseGPRIdxMode) {
      SGPRIdxReg = CurrentIdxReg;

      SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)

      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)

      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)

  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  BuildMI(LoopBB, I, DL,
          TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
                                 : AMDGPU::S_XOR_B64_term),
          Exec)
                                 unsigned InitResultReg, unsigned PhiReg, int Offset,
                                 bool UseGPRIdxMode, Register &SGPRIdxReg) {

  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

                                    InitResultReg, DstReg, PhiReg, TmpExec,
                                    Offset, UseGPRIdxMode, SGPRIdxReg);
4555static std::pair<unsigned, int>
4560 int NumElts =
TRI.getRegSizeInBits(*SuperRC) / 32;
4565 return std::pair(AMDGPU::sub0,
Offset);
4579 assert(
Idx->getReg() != AMDGPU::NoRegister);
4600 return Idx->getReg();
4602 Register Tmp =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4619 Register SrcReg =
TII->getNamedOperand(
MI, AMDGPU::OpName::src)->getReg();
4620 int Offset =
TII->getNamedOperand(
MI, AMDGPU::OpName::offset)->getImm();
4629 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4632 if (
TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4636 if (UseGPRIdxMode) {
4643 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
true);
4656 MI.eraseFromParent();
4665 Register PhiReg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4666 Register InitReg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4672 UseGPRIdxMode, SGPRIdxReg);
4676 if (UseGPRIdxMode) {
4678 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
true);
4680 BuildMI(*LoopBB, InsPt,
DL, GPRIDXDesc, Dst)
4685 BuildMI(*LoopBB, InsPt,
DL,
TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4690 MI.eraseFromParent();
4707 int Offset =
TII->getNamedOperand(
MI, AMDGPU::OpName::offset)->getImm();
4718 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4720 if (
Idx->getReg() == AMDGPU::NoRegister) {
4731 MI.eraseFromParent();
4736 if (
TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4740 if (UseGPRIdxMode) {
4744 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
false);
4753 const MCInstrDesc &MovRelDesc =
TII->getIndirectRegWriteMovRelPseudo(
4754 TRI.getRegSizeInBits(*VecRC), 32,
false);
4760 MI.eraseFromParent();
4770 Register PhiReg =
MRI.createVirtualRegister(VecRC);
4774 UseGPRIdxMode, SGPRIdxReg);
4777 if (UseGPRIdxMode) {
4779 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
false);
4781 BuildMI(*LoopBB, InsPt,
DL, GPRIDXDesc, Dst)
4787 const MCInstrDesc &MovRelDesc =
TII->getIndirectRegWriteMovRelPseudo(
4788 TRI.getRegSizeInBits(*VecRC), 32,
false);
4789 BuildMI(*LoopBB, InsPt,
DL, MovRelDesc, Dst)
4795 MI.eraseFromParent();
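// Wave reduction pseudos (e.g. WAVE_REDUCE_UMIN/UMAX) are expanded into a
// loop: S_FF1 picks the next active lane from the EXEC mask, V_READLANE reads
// that lane's value, the scalar min/max folds it into the accumulator, and
// S_BITSET0 clears the lane's bit until the mask is empty.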
  bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));

  Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
  Register InitalValReg = MRI.createVirtualRegister(DstRegClass);

  Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
  Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
  Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);

  Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
  Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);

  bool IsWave32 = ST.isWave32();
  unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

      (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;

  BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)

  I = ComputeLoop->end();
  BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
  BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
      .addReg(TmpSReg->getOperand(0).getReg())

  unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
  auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
                 .addReg(ActiveBits->getOperand(0).getReg());
  auto LaneValue = BuildMI(*ComputeLoop, I, DL,
                           TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
                       .addReg(FF1->getOperand(0).getReg());
  auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
                            .addReg(LaneValue->getOperand(0).getReg());

  unsigned BITSETOpc =
      IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
  auto NewActiveBits =
      BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
          .addReg(FF1->getOperand(0).getReg())
          .addReg(ActiveBits->getOperand(0).getReg());

  Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
      .addMBB(ComputeLoop);
  ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
      .addMBB(ComputeLoop);

  unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
      .addReg(NewActiveBits->getOperand(0).getReg())
  BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))

  MI.eraseFromParent();
4920 switch (
MI.getOpcode()) {
4921 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
4923 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
4925 case AMDGPU::S_UADDO_PSEUDO:
4926 case AMDGPU::S_USUBO_PSEUDO: {
4933 unsigned Opc = (
MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
4935 : AMDGPU::S_SUB_I32;
4942 MI.eraseFromParent();
4945 case AMDGPU::S_ADD_U64_PSEUDO:
4946 case AMDGPU::S_SUB_U64_PSEUDO: {
4955 bool IsAdd = (
MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
4957 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
4965 Register DestSub0 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4966 Register DestSub1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4969 MI,
MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
4971 MI,
MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
4974 MI,
MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
4976 MI,
MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
4978 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
4979 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
4992 MI.eraseFromParent();
4995 case AMDGPU::V_ADD_U64_PSEUDO:
4996 case AMDGPU::V_SUB_U64_PSEUDO: {
5002 bool IsAdd = (
MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5008 if (IsAdd && ST.hasLshlAddB64()) {
5014 TII->legalizeOperands(*
Add);
5015 MI.eraseFromParent();
5019 const auto *CarryRC =
TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5021 Register DestSub0 =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5022 Register DestSub1 =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5024 Register CarryReg =
MRI.createVirtualRegister(CarryRC);
5025 Register DeadCarryReg =
MRI.createVirtualRegister(CarryRC);
5029 : &AMDGPU::VReg_64RegClass;
5032 : &AMDGPU::VReg_64RegClass;
5035 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5037 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5040 MI,
MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5042 MI,
MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5045 MI,
MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5047 MI,
MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5049 unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5056 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5070 TII->legalizeOperands(*LoHalf);
5071 TII->legalizeOperands(*HiHalf);
5072 MI.eraseFromParent();
  case AMDGPU::S_ADD_CO_PSEUDO:
  case AMDGPU::S_SUB_CO_PSEUDO: {
    unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
                       ? AMDGPU::S_ADDC_U32
                       : AMDGPU::S_SUBB_U32;
      Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
      Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
    Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)

    unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
    assert(WaveSize == 64 || WaveSize == 32);

    if (WaveSize == 64) {
      if (ST.hasScalarCompareEq64()) {
            TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
            MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
            MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
        Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)

        (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;

    MI.eraseFromParent();
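// Further pseudo expansions: SI_INIT_M0 copies its operand into M0,
// GET_GROUPSTATICSIZE materializes the LDS size as an immediate, and
// GET_SHADERCYCLESHILO reads the 64-bit cycle counter with a hi/lo/hi
// sequence so a carry between the two halves can be detected.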
5156 case AMDGPU::SI_INIT_M0: {
5158 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5159 .
add(
MI.getOperand(0));
5160 MI.eraseFromParent();
5163 case AMDGPU::GET_GROUPSTATICSIZE: {
5168 .
add(
MI.getOperand(0))
5170 MI.eraseFromParent();
5173 case AMDGPU::GET_SHADERCYCLESHILO: {
5187 using namespace AMDGPU::Hwreg;
5188 Register RegHi1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5190 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5191 Register RegLo1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5193 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
5194 Register RegHi2 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5196 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5200 Register RegLo =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5205 .
add(
MI.getOperand(0))
5210 MI.eraseFromParent();
5213 case AMDGPU::SI_INDIRECT_SRC_V1:
5214 case AMDGPU::SI_INDIRECT_SRC_V2:
5215 case AMDGPU::SI_INDIRECT_SRC_V4:
5216 case AMDGPU::SI_INDIRECT_SRC_V8:
5217 case AMDGPU::SI_INDIRECT_SRC_V9:
5218 case AMDGPU::SI_INDIRECT_SRC_V10:
5219 case AMDGPU::SI_INDIRECT_SRC_V11:
5220 case AMDGPU::SI_INDIRECT_SRC_V12:
5221 case AMDGPU::SI_INDIRECT_SRC_V16:
5222 case AMDGPU::SI_INDIRECT_SRC_V32:
5224 case AMDGPU::SI_INDIRECT_DST_V1:
5225 case AMDGPU::SI_INDIRECT_DST_V2:
5226 case AMDGPU::SI_INDIRECT_DST_V4:
5227 case AMDGPU::SI_INDIRECT_DST_V8:
5228 case AMDGPU::SI_INDIRECT_DST_V9:
5229 case AMDGPU::SI_INDIRECT_DST_V10:
5230 case AMDGPU::SI_INDIRECT_DST_V11:
5231 case AMDGPU::SI_INDIRECT_DST_V12:
5232 case AMDGPU::SI_INDIRECT_DST_V16:
5233 case AMDGPU::SI_INDIRECT_DST_V32:
5235 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5236 case AMDGPU::SI_KILL_I1_PSEUDO:
5238 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5247 Register SrcCond =
MI.getOperand(3).getReg();
5249 Register DstLo =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5250 Register DstHi =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5251 const auto *CondRC =
TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5252 Register SrcCondCopy =
MRI.createVirtualRegister(CondRC);
5256 : &AMDGPU::VReg_64RegClass;
5259 : &AMDGPU::VReg_64RegClass;
5262 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5264 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5267 MI,
MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5269 MI,
MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5272 MI,
MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5274 MI,
MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5296 MI.eraseFromParent();
5299 case AMDGPU::SI_BR_UNDEF: {
5303 .
add(
MI.getOperand(0));
5305 MI.eraseFromParent();
5308 case AMDGPU::ADJCALLSTACKUP:
5309 case AMDGPU::ADJCALLSTACKDOWN: {
5316 case AMDGPU::SI_CALL_ISEL: {
5320 unsigned ReturnAddrReg =
TII->getRegisterInfo().getReturnAddressReg(*MF);
5323 MIB =
BuildMI(*BB,
MI,
DL,
TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5329 MI.eraseFromParent();
5332 case AMDGPU::V_ADD_CO_U32_e32:
5333 case AMDGPU::V_SUB_CO_U32_e32:
5334 case AMDGPU::V_SUBREV_CO_U32_e32: {
5337 unsigned Opc =
MI.getOpcode();
5339 bool NeedClampOperand =
false;
5340 if (
TII->pseudoToMCOpcode(Opc) == -1) {
5342 NeedClampOperand =
true;
5346 if (
TII->isVOP3(*
I)) {
5351 I.add(
MI.getOperand(1))
5352 .add(
MI.getOperand(2));
5353 if (NeedClampOperand)
5356 TII->legalizeOperands(*
I);
5358 MI.eraseFromParent();
5361 case AMDGPU::V_ADDC_U32_e32:
5362 case AMDGPU::V_SUBB_U32_e32:
5363 case AMDGPU::V_SUBBREV_U32_e32:
5366 TII->legalizeOperands(
MI);
5368 case AMDGPU::DS_GWS_INIT:
5369 case AMDGPU::DS_GWS_SEMA_BR:
5370 case AMDGPU::DS_GWS_BARRIER:
5371 TII->enforceOperandRCAlignment(
MI, AMDGPU::OpName::data0);
5373 case AMDGPU::DS_GWS_SEMA_V:
5374 case AMDGPU::DS_GWS_SEMA_P:
5375 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5383 case AMDGPU::S_SETREG_B32: {
5398 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5399 const unsigned SetMask = WidthMask <<
Offset;
5402 unsigned SetDenormOp = 0;
5403 unsigned SetRoundOp = 0;
5411 SetRoundOp = AMDGPU::S_ROUND_MODE;
5412 SetDenormOp = AMDGPU::S_DENORM_MODE;
5414 SetRoundOp = AMDGPU::S_ROUND_MODE;
5416 SetDenormOp = AMDGPU::S_DENORM_MODE;
5419 if (SetRoundOp || SetDenormOp) {
5422 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5423 unsigned ImmVal = Def->getOperand(1).getImm();
5437 MI.eraseFromParent();
5446 MI.setDesc(
TII->get(AMDGPU::S_SETREG_B32_mode));
5450 case AMDGPU::S_INVERSE_BALLOT_U32:
5451 case AMDGPU::S_INVERSE_BALLOT_U64: {
5456 const Register DstReg =
MI.getOperand(0).getReg();
5457 Register MaskReg =
MI.getOperand(1).getReg();
5459 const bool IsVALU =
TRI->isVectorRegister(
MRI, MaskReg);
5462 MaskReg =
TII->readlaneVGPRToSGPR(MaskReg,
MI,
MRI);
5466 MI.eraseFromParent();
5469 case AMDGPU::ENDPGM_TRAP: {
5472 MI.setDesc(
TII->get(AMDGPU::S_ENDPGM));
5490 MI.eraseFromParent();
5493 case AMDGPU::SIMULATED_TRAP: {
5497 TII->insertSimulatedTrap(
MRI, *BB,
MI,
MI.getDebugLoc());
5498 MI.eraseFromParent();
5535 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5622 EVT VT =
N->getValueType(0);
5626 if (VT == MVT::f16) {
5642 unsigned Opc =
Op.getOpcode();
5643 EVT VT =
Op.getValueType();
5644 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5645 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5646 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5647 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5665 unsigned Opc =
Op.getOpcode();
5666 EVT VT =
Op.getValueType();
5667 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5668 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5669 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5670 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5689 unsigned Opc =
Op.getOpcode();
5690 EVT VT =
Op.getValueType();
5691 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
5692 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
5693 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5694 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
5695 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
5696 VT == MVT::v32bf16);
5702 : std::pair(Op0, Op0);
  switch (Op.getOpcode()) {
    assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    EVT VT = Op.getValueType();
      return lowerFSQRTF32(Op, DAG);
      return lowerFSQRTF64(Op, DAG);
    return LowerTrig(Op, DAG);
    return LowerGlobalAddress(MFI, Op, DAG);
    return lowerINSERT_SUBVECTOR(Op, DAG);
    return lowerINSERT_VECTOR_ELT(Op, DAG);
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
    return lowerVECTOR_SHUFFLE(Op, DAG);
    return lowerSCALAR_TO_VECTOR(Op, DAG);
    return lowerBUILD_VECTOR(Op, DAG);
    return lowerFP_ROUND(Op, DAG);
    if (Op.getOperand(0)->getValueType(0) != MVT::f32)
    int RoundMode = Op.getConstantOperandVal(1);
    return DAG.getNode(Opc, DL, Op.getNode()->getVTList(), Op->getOperand(0));
    return lowerTRAP(Op, DAG);
    return lowerDEBUGTRAP(Op, DAG);
    return lowerFMINNUM_FMAXNUM(Op, DAG);
    return lowerFLDEXP(Op, DAG);
    return lowerMUL(Op, DAG);
    return lowerXMULO(Op, DAG);
    return lowerXMUL_LOHI(Op, DAG);

  EVT FittingLoadVT = LoadVT;
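// adjustLoadValueType: widen the result type of a d16 / small-vector memory
// intrinsic to an equivalent legal type for the load itself and convert the
// loaded value back to the requested type afterwards.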
5902SDValue SITargetLowering::adjustLoadValueType(
unsigned Opcode,
5906 bool IsIntrinsic)
const {
5910 EVT LoadVT =
M->getValueType(0);
5912 EVT EquivLoadVT = LoadVT;
5931 VTList, Ops,
M->getMemoryVT(),
5932 M->getMemOperand());
5943 EVT LoadVT =
M->getValueType(0);
5949 assert(
M->getNumValues() == 2 ||
M->getNumValues() == 3);
5950 bool IsTFE =
M->getNumValues() == 3;
5969 return handleByteShortBufferLoads(DAG, LoadVT,
DL, Ops,
M->getMemOperand());
5972 return getMemIntrinsicNode(Opc,
DL,
M->getVTList(), Ops, IntVT,
5973 M->getMemOperand(), DAG);
5978 SDValue MemNode = getMemIntrinsicNode(Opc,
DL, VTList, Ops, CastVT,
5979 M->getMemOperand(), DAG);
5987 EVT VT =
N->getValueType(0);
5988 unsigned CondCode =
N->getConstantOperandVal(3);
5999 EVT CmpVT =
LHS.getValueType();
6000 if (CmpVT == MVT::i16 && !TLI.
isTypeLegal(MVT::i16)) {
6021 EVT VT =
N->getValueType(0);
6023 unsigned CondCode =
N->getConstantOperandVal(3);
6032 if (CmpVT == MVT::f16 && !TLI.
isTypeLegal(CmpVT)) {
6050 EVT VT =
N->getValueType(0);
6057 Src.getOperand(1), Src.getOperand(2));
6068 Exec = AMDGPU::EXEC_LO;
6070 Exec = AMDGPU::EXEC;
6088 switch (
N->getOpcode()) {
6100 unsigned IID =
N->getConstantOperandVal(0);
6102 case Intrinsic::amdgcn_make_buffer_rsrc:
6103 Results.push_back(lowerPointerAsRsrcIntrin(
N, DAG));
6105 case Intrinsic::amdgcn_cvt_pkrtz: {
6114 case Intrinsic::amdgcn_cvt_pknorm_i16:
6115 case Intrinsic::amdgcn_cvt_pknorm_u16:
6116 case Intrinsic::amdgcn_cvt_pk_i16:
6117 case Intrinsic::amdgcn_cvt_pk_u16: {
6123 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6125 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6127 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6132 EVT VT =
N->getValueType(0);
6141 case Intrinsic::amdgcn_s_buffer_load: {
6153 EVT VT =
Op.getValueType();
6154 assert(VT == MVT::i8 &&
"Expected 8-bit s_buffer_load intrinsics.\n");
6166 if (!
Offset->isDivergent()) {
6185 LoadVal = handleByteShortBufferLoads(DAG, VT,
DL, Ops, MMO);
6197 for (
unsigned I = 0;
I < Res.getNumOperands();
I++) {
6198 Results.push_back(Res.getOperand(
I));
6202 Results.push_back(Res.getValue(1));
6211 EVT VT =
N->getValueType(0);
6216 EVT SelectVT = NewVT;
6217 if (NewVT.
bitsLT(MVT::i32)) {
6220 SelectVT = MVT::i32;
6226 if (NewVT != SelectVT)
6232 if (
N->getValueType(0) != MVT::v2f16)
6245 if (
N->getValueType(0) != MVT::v2f16)
6258 if (
N->getValueType(0) != MVT::f16)
6276 if (
I.getUse().get() !=
Value)
6279 if (
I->getOpcode() == Opcode)
6285unsigned SITargetLowering::isCFIntrinsic(
const SDNode *
Intr)
const {
6287 switch (
Intr->getConstantOperandVal(1)) {
6288 case Intrinsic::amdgcn_if:
6290 case Intrinsic::amdgcn_else:
6292 case Intrinsic::amdgcn_loop:
6294 case Intrinsic::amdgcn_end_cf:
6342 SDNode *
Intr = BRCOND.getOperand(1).getNode();
6355 assert(BR &&
"brcond missing unconditional branch user");
6356 Target = BR->getOperand(1);
6359 unsigned CFNode = isCFIntrinsic(
Intr);
6378 Ops.
append(
Intr->op_begin() + (HaveChain ? 2 : 1),
Intr->op_end());
6408 for (
unsigned i = 1, e =
Intr->getNumValues() - 1; i != e; ++i) {
6425 Intr->getOperand(0));
6432 MVT VT =
Op.getSimpleValueType();
6435 if (
Op.getConstantOperandVal(0) != 0)
6441 if (
Info->isEntryFunction())
6459 return Op.getValueType().bitsLE(VT) ?
6466 assert(
Op.getValueType() == MVT::f16 &&
6467 "Do not know how to custom lower FP_ROUND for non-f16 type");
6470 EVT SrcVT = Src.getValueType();
6471 if (SrcVT != MVT::f64)
6487 EVT VT =
Op.getValueType();
6490 bool IsIEEEMode =
Info->getMode().IEEE;
6499 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
6507 EVT VT =
Op.getValueType();
6511 EVT ExpVT =
Exp.getValueType();
6512 if (ExpVT == MVT::i16)
6533 {
Op.getOperand(0),
Op.getOperand(1), TruncExp});
6541 EVT VT =
Op.getValueType();
6547 assert(VT == MVT::i64 &&
"The following code is a special for s_mul_u64");
6574 if (
Op->isDivergent())
6587 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
6589 DAG.
getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
6592 if (Op0SignBits >= 33 && Op1SignBits >= 33)
6594 DAG.
getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
6600 EVT VT =
Op.getValueType();
6607 const APInt &
C = RHSC->getAPIntValue();
6609 if (
C.isPowerOf2()) {
6611 bool UseArithShift =
isSigned && !
C.isMinSignedValue();
6616 SL, VT, Result, ShiftAmt),
6636 if (
Op->isDivergent()) {
6653 return lowerTrapEndpgm(
Op, DAG);
6656 lowerTrapHsaQueuePtr(
Op, DAG);
6659SDValue SITargetLowering::lowerTrapEndpgm(
6667 const SDLoc &
DL,
Align Alignment, ImplicitParameter Param)
const {
6677SDValue SITargetLowering::lowerTrapHsaQueuePtr(
6687 loadImplicitKernelArgument(DAG, MVT::i64, SL,
Align(8),
QUEUE_PTR);
6693 if (UserSGPR == AMDGPU::NoRegister) {
6718SDValue SITargetLowering::lowerTrapHsa(
6744 "debugtrap handler not supported",
6760SDValue SITargetLowering::getSegmentAperture(
unsigned AS,
const SDLoc &
DL,
6764 ? AMDGPU::SRC_SHARED_BASE
6765 : AMDGPU::SRC_PRIVATE_BASE;
6788 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
6797 return loadImplicitKernelArgument(DAG, MVT::i32,
DL,
Align(4), Param);
6803 if (UserSGPR == AMDGPU::NoRegister) {
6810 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
6833 if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
6834 isa<BasicBlockSDNode>(Val))
6837 if (
auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
6838 return ConstVal->getSExtValue() !=
TM.getNullPointerValue(AddrSpace);
6852 unsigned DestAS, SrcAS;
6854 bool IsNonNull =
false;
6855 if (
const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(
Op)) {
6856 SrcAS = ASC->getSrcAddressSpace();
6857 Src = ASC->getOperand(0);
6858 DestAS = ASC->getDestAddressSpace();
6861 Op.getConstantOperandVal(0) ==
6862 Intrinsic::amdgcn_addrspacecast_nonnull);
6863 Src =
Op->getOperand(1);
6864 SrcAS =
Op->getConstantOperandVal(2);
6865 DestAS =
Op->getConstantOperandVal(3);
6880 unsigned NullVal =
TM.getNullPointerValue(DestAS);
6894 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
6902 unsigned NullVal =
TM.getNullPointerValue(SrcAS);
6914 Op.getValueType() == MVT::i64) {
6923 Src.getValueType() == MVT::i64)
6947 EVT InsVT =
Ins.getValueType();
6950 unsigned IdxVal =
Idx->getAsZExtVal();
6955 assert(InsNumElts % 2 == 0 &&
"expect legal vector types");
6960 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
6962 MVT::i32, InsNumElts / 2);
6967 for (
unsigned I = 0;
I != InsNumElts / 2; ++
I) {
6969 if (InsNumElts == 2) {
6982 for (
unsigned I = 0;
I != InsNumElts; ++
I) {
7004 auto KIdx = dyn_cast<ConstantSDNode>(
Idx);
7005 if (NumElts == 4 && EltSize == 16 && KIdx) {
7016 unsigned Idx = KIdx->getZExtValue();
7017 bool InsertLo =
Idx < 2;
7019 InsertLo ? LoVec : HiVec,
7034 if (isa<ConstantSDNode>(
Idx))
7040 assert(VecSize <= 64 &&
"Expected target vector size to be <= 64 bits");
7046 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
7062 DAG.
getNOT(SL, BFM, IntVT), BCVec);
7074 EVT ResultVT =
Op.getValueType();
7087 if (
SDValue Combined = performExtractVectorEltCombine(
Op.getNode(), DCI))
7090 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
7095 if (VecSize == 128) {
7103 }
else if (VecSize == 256) {
7106 for (
unsigned P = 0;
P < 4; ++
P) {
7112 Parts[0], Parts[1]));
7114 Parts[2], Parts[3]));
7120 for (
unsigned P = 0;
P < 8; ++
P) {
7127 Parts[0], Parts[1], Parts[2], Parts[3]));
7130 Parts[4], Parts[5],Parts[6], Parts[7]));
7133 EVT IdxVT =
Idx.getValueType();
7150 Src = DAG.
getBitcast(Src.getValueType().changeTypeToInteger(), Src);
7165 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
7175 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7181 EVT ResultVT =
Op.getValueType();
7184 EVT PackVT = ResultVT.
isInteger() ? MVT::v2i16 : MVT::v2f16;
7186 int SrcNumElts =
Op.getOperand(0).getValueType().getVectorNumElements();
7202 int VecIdx =
Idx < SrcNumElts ? 0 : 1;
7203 int EltIdx =
Idx < SrcNumElts ?
Idx :
Idx - SrcNumElts;
7211 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
7212 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
7213 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
7214 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
7233 EVT ResultVT =
Op.getValueType();
  EVT VT = Op.getValueType();

  if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
      VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
                       {CastLo, CastHi});

  if (VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16) {
      for (unsigned P = 0; P < 4; ++P)
        Parts[P].push_back(Op.getOperand(I + P * E));
    for (unsigned P = 0; P < 4; ++P) {

  if (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v32bf16) {
      for (unsigned P = 0; P < 8; ++P)
        Parts[P].push_back(Op.getOperand(I + P * E));
    for (unsigned P = 0; P < 8; ++P) {

  assert(VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16);

  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
7407 EVT PtrVT =
Op.getValueType();
7423 assert(PtrVT == MVT::i32 &&
"32-bit pointer is expected.");
7496 SDValue Param = lowerKernargMemParameter(
7506 "non-hsa intrinsic with hsa target",
7515 "intrinsic not supported on subtarget",
7525 unsigned NumElts = Elts.
size();
7527 if (NumElts <= 12) {
7536 for (
unsigned i = 0; i < Elts.
size(); ++i) {
7542 for (
unsigned i = Elts.
size(); i < NumElts; ++i)
7543 VecElts[i] = DAG.
getUNDEF(MVT::f32);
7552 EVT SrcVT = Src.getValueType();
7573 bool Unpacked,
bool IsD16,
int DMaskPop,
7574 int NumVDataDwords,
bool IsAtomicPacked16Bit,
7577 EVT ReqRetVT = ResultTypes[0];
7579 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
7580 ? (ReqRetNumElts + 1) / 2
7583 int MaskPopDwords = (!IsD16 || (IsD16 && Unpacked)) ?
7584 DMaskPop : (DMaskPop + 1) / 2;
7586 MVT DataDwordVT = NumDataDwords == 1 ?
7589 MVT MaskPopVT = MaskPopDwords == 1 ?
7595 if (DMaskPop > 0 &&
Data.getValueType() != MaskPopVT) {
7606 if (DataDwordVT.
isVector() && !IsAtomicPacked16Bit)
7608 NumDataDwords - MaskPopDwords);
7613 EVT LegalReqRetVT = ReqRetVT;
7615 if (!
Data.getValueType().isInteger())
7617 Data.getValueType().changeTypeToInteger(),
Data);
7638 if (Result->getNumValues() == 1)
7645 SDValue *LWE,
bool &IsTexFail) {
7646 auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.
getNode());
7665 unsigned DimIdx,
unsigned EndIdx,
7666 unsigned NumGradients) {
7668 for (
unsigned I = DimIdx;
I < EndIdx;
I++) {
7676 if (((
I + 1) >= EndIdx) ||
7677 ((NumGradients / 2) % 2 == 1 && (
I == DimIdx + (NumGradients / 2) - 1 ||
7678 I == DimIdx + NumGradients - 1))) {
7679 if (
Addr.getValueType() != MVT::i16)
7700 unsigned IntrOpcode =
Intr->BaseOpcode;
7712 bool AdjustRetType =
false;
7713 bool IsAtomicPacked16Bit =
false;
7716 const unsigned ArgOffset = WithChain ? 2 : 1;
7719 unsigned DMaskLanes = 0;
7721 if (BaseOpcode->Atomic) {
7722 VData =
Op.getOperand(2);
7724 IsAtomicPacked16Bit =
7725 (
Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7726 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7729 if (BaseOpcode->AtomicX2) {
7736 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
7737 DMask = Is64Bit ? 0xf : 0x3;
7738 NumVDataDwords = Is64Bit ? 4 : 2;
7740 DMask = Is64Bit ? 0x3 : 0x1;
7741 NumVDataDwords = Is64Bit ? 2 : 1;
7744 DMask =
Op->getConstantOperandVal(ArgOffset +
Intr->DMaskIndex);
7747 if (BaseOpcode->Store) {
7748 VData =
Op.getOperand(2);
7756 VData = handleD16VData(VData, DAG,
true);
7773 (!LoadVT.
isVector() && DMaskLanes > 1))
7781 NumVDataDwords = (DMaskLanes + 1) / 2;
7783 NumVDataDwords = DMaskLanes;
7785 AdjustRetType =
true;
7789 unsigned VAddrEnd = ArgOffset +
Intr->VAddrEnd;
7794 Op.getOperand(ArgOffset +
Intr->GradientStart).getSimpleValueType();
7796 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
7797 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
7799 VAddrVT =
Op.getOperand(ArgOffset +
Intr->CoordStart).getSimpleValueType();
7801 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
7802 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
7805 for (
unsigned I =
Intr->VAddrStart; I < Intr->GradientStart;
I++) {
7806 if (IsA16 && (
Op.getOperand(ArgOffset +
I).getValueType() == MVT::f16)) {
7807 assert(
I ==
Intr->BiasIndex &&
"Got unexpected 16-bit extra argument");
7812 {
Op.getOperand(ArgOffset +
I), DAG.
getUNDEF(MVT::f16)});
7816 "Bias needs to be converted to 16 bit in A16 mode");
7821 if (BaseOpcode->Gradients && !
ST->hasG16() && (IsA16 != IsG16)) {
7825 dbgs() <<
"Failed to lower image intrinsic: 16 bit addresses "
7826 "require 16 bit args for both gradients and addresses");
7831 if (!
ST->hasA16()) {
7832 LLVM_DEBUG(
dbgs() <<
"Failed to lower image intrinsic: Target does not "
7833 "support 16 bit addresses\n");
7843 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
7847 IntrOpcode = G16MappingInfo->G16;
7855 ArgOffset + Intr->GradientStart,
7856 ArgOffset + Intr->CoordStart, Intr->NumGradients);
7858 for (unsigned I = ArgOffset + Intr->GradientStart;
7859 I < ArgOffset + Intr->CoordStart; I++)
7866 ArgOffset + Intr->CoordStart, VAddrEnd,
7870 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
7888 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
7889 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
7890 const bool UseNSA = ST->hasNSAEncoding() &&
7891 VAddrs.size() >= ST->getNSAThreshold(MF) &&
7892 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
7893 const bool UsePartialNSA =
7894 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
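// Self-contained restatement of the NSA decision above, assuming the same
// meaning of the subtarget queries (encoding support, threshold, maximum
// NSA size, partial-NSA support). With partial NSA the first NSAMaxSize - 1
// addresses stay separate and the rest are passed as one contiguous tail,
// which is what the drop_front below implements.
struct NSADecisionSketch { bool UseNSA; bool UsePartialNSA; };
static NSADecisionSketch decideNSASketch(unsigned NumVAddrs, bool HasNSA,
                                         bool HasPartialNSA,
                                         unsigned Threshold,
                                         unsigned NSAMaxSize) {
  bool UseNSA = HasNSA && NumVAddrs >= Threshold &&
                (NumVAddrs <= NSAMaxSize || HasPartialNSA);
  bool UsePartialNSA = UseNSA && HasPartialNSA && NumVAddrs > NSAMaxSize;
  return {UseNSA, UsePartialNSA};
}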
7897 if (UsePartialNSA) {
7899 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
7908 if (!BaseOpcode->Sampler) {
7912 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
7914 Unorm = UnormConst ? True : False;
7919 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
7920 bool IsTexFail = false;
7921 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
7932 NumVDataDwords += 1;
7933 AdjustRetType = true;
7938 if (AdjustRetType) {
7940 if (DMaskLanes == 0 && !BaseOpcode->Store) {
7943 if (isa<MemSDNode>(Op))
7948 EVT NewVT = NumVDataDwords > 1 ?
7952 ResultTypes[0] = NewVT;
7953 if (ResultTypes.size() == 3) {
7957 ResultTypes.erase(&ResultTypes[1]);
7961 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
7962 if (BaseOpcode->Atomic)
7969 if (BaseOpcode->Store || BaseOpcode->Atomic)
7971 if (UsePartialNSA) {
7980 if (BaseOpcode->Sampler)
7985 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
7989 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
7997 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8001 if (BaseOpcode->HasD16)
8003 if (isa<MemSDNode>(
Op))
8006 int NumVAddrDwords =
8012 NumVDataDwords, NumVAddrDwords);
8013 }
else if (IsGFX11Plus) {
8015 UseNSA ? AMDGPU::MIMGEncGfx11NSA
8016 : AMDGPU::MIMGEncGfx11Default,
8017 NumVDataDwords, NumVAddrDwords);
8018 }
else if (IsGFX10Plus) {
8020 UseNSA ? AMDGPU::MIMGEncGfx10NSA
8021 : AMDGPU::MIMGEncGfx10Default,
8022 NumVDataDwords, NumVAddrDwords);
8026 NumVDataDwords, NumVAddrDwords);
8029 "requested image instruction is not supported on this GPU");
8034 NumVDataDwords, NumVAddrDwords);
8037 NumVDataDwords, NumVAddrDwords);
8043 if (
auto MemOp = dyn_cast<MemSDNode>(
Op)) {
8048 if (BaseOpcode->AtomicX2) {
8053 if (BaseOpcode->Store)
8057 NumVDataDwords, IsAtomicPacked16Bit,
DL);
8075 if (!
Offset->isDivergent()) {
8120 return handleByteShortBufferLoads(DAG, VT,
DL, Ops, MMO);
8124 unsigned NumLoads = 1;
8130 if (NumElts == 8 || NumElts == 16) {
8131 NumLoads = NumElts / 4;
8139 setBufferOffsets(Offset, DAG, &Ops[3],
8140 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
8143 for (unsigned i = 0; i < NumLoads; ++i) {
8149 if (NumElts == 8 || NumElts == 16)
8196 EVT VT = Op.getValueType();
8198 unsigned IntrinsicID = Op.getConstantOperandVal(0);
8202 switch (IntrinsicID) {
8203 case Intrinsic::amdgcn_implicit_buffer_ptr: {
8206 return getPreloadedValue(DAG, *MFI, VT,
8209 case Intrinsic::amdgcn_dispatch_ptr:
8210 case Intrinsic::amdgcn_queue_ptr: {
8213 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
8219 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
8221 return getPreloadedValue(DAG, *MFI, VT, RegID);
8223 case Intrinsic::amdgcn_implicitarg_ptr: {
8225 return getImplicitArgPtr(DAG,
DL);
8226 return getPreloadedValue(DAG, *MFI, VT,
8229 case Intrinsic::amdgcn_kernarg_segment_ptr: {
8235 return getPreloadedValue(DAG, *MFI, VT,
8238 case Intrinsic::amdgcn_dispatch_id: {
8241 case Intrinsic::amdgcn_rcp:
8243 case Intrinsic::amdgcn_rsq:
8245 case Intrinsic::amdgcn_rsq_legacy:
8249 case Intrinsic::amdgcn_rcp_legacy:
8253 case Intrinsic::amdgcn_rsq_clamp: {
8267 case Intrinsic::r600_read_ngroups_x:
8271 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8274 case Intrinsic::r600_read_ngroups_y:
8278 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8281 case Intrinsic::r600_read_ngroups_z:
8285 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8288 case Intrinsic::r600_read_global_size_x:
8292 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8295 case Intrinsic::r600_read_global_size_y:
8299 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8302 case Intrinsic::r600_read_global_size_z:
8306 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8309 case Intrinsic::r600_read_local_size_x:
8313 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
8315 case Intrinsic::r600_read_local_size_y:
8319 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
8321 case Intrinsic::r600_read_local_size_z:
8325 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
8327 case Intrinsic::amdgcn_workgroup_id_x:
8328 return getPreloadedValue(DAG, *MFI, VT,
8330 case Intrinsic::amdgcn_workgroup_id_y:
8331 return getPreloadedValue(DAG, *MFI, VT,
8333 case Intrinsic::amdgcn_workgroup_id_z:
8334 return getPreloadedValue(DAG, *MFI, VT,
8336 case Intrinsic::amdgcn_wave_id:
8337 return lowerWaveID(DAG,
Op);
8338 case Intrinsic::amdgcn_lds_kernel_id: {
8340 return getLDSKernelId(DAG,
DL);
8341 return getPreloadedValue(DAG, *MFI, VT,
8344 case Intrinsic::amdgcn_workitem_id_x:
8345 return lowerWorkitemID(DAG,
Op, 0, MFI->getArgInfo().WorkItemIDX);
8346 case Intrinsic::amdgcn_workitem_id_y:
8347 return lowerWorkitemID(DAG,
Op, 1, MFI->getArgInfo().WorkItemIDY);
8348 case Intrinsic::amdgcn_workitem_id_z:
8349 return lowerWorkitemID(DAG,
Op, 2, MFI->getArgInfo().WorkItemIDZ);
8350 case Intrinsic::amdgcn_wavefrontsize:
8353 case Intrinsic::amdgcn_s_buffer_load: {
8354 unsigned CPol = Op.getConstantOperandVal(3);
8361 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8364 case Intrinsic::amdgcn_fdiv_fast:
8365 return lowerFDIV_FAST(Op, DAG);
8366 case Intrinsic::amdgcn_sin:
8369 case Intrinsic::amdgcn_cos:
8372 case Intrinsic::amdgcn_mul_u24:
8374 case Intrinsic::amdgcn_mul_i24:
8377 case Intrinsic::amdgcn_log_clamp: {
8383 case Intrinsic::amdgcn_fract:
8386 case Intrinsic::amdgcn_class:
8388 Op.getOperand(1),
Op.getOperand(2));
8389 case Intrinsic::amdgcn_div_fmas:
8391 Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3),
8394 case Intrinsic::amdgcn_div_fixup:
8396 Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
8398 case Intrinsic::amdgcn_div_scale: {
8411 SDValue Src0 =
Param->isAllOnes() ? Numerator : Denominator;
8414 Denominator, Numerator);
8416 case Intrinsic::amdgcn_icmp: {
8418 if (
Op.getOperand(1).getValueType() == MVT::i1 &&
8419 Op.getConstantOperandVal(2) == 0 &&
8424 case Intrinsic::amdgcn_fcmp: {
8427 case Intrinsic::amdgcn_ballot:
8429 case Intrinsic::amdgcn_fmed3:
8431 Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
8432 case Intrinsic::amdgcn_fdot2:
8434 Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3),
8436 case Intrinsic::amdgcn_fmul_legacy:
8438 Op.getOperand(1),
Op.getOperand(2));
8439 case Intrinsic::amdgcn_sffbh:
8441 case Intrinsic::amdgcn_sbfe:
8443 Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
8444 case Intrinsic::amdgcn_ubfe:
8446 Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
8447 case Intrinsic::amdgcn_cvt_pkrtz:
8448 case Intrinsic::amdgcn_cvt_pknorm_i16:
8449 case Intrinsic::amdgcn_cvt_pknorm_u16:
8450 case Intrinsic::amdgcn_cvt_pk_i16:
8451 case Intrinsic::amdgcn_cvt_pk_u16: {
8453 EVT VT =
Op.getValueType();
8456 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8458 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8460 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8462 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8468 return DAG.
getNode(Opcode,
DL, VT,
Op.getOperand(1),
Op.getOperand(2));
8471 Op.getOperand(1),
Op.getOperand(2));
8474 case Intrinsic::amdgcn_fmad_ftz:
8476 Op.getOperand(2),
Op.getOperand(3));
8478 case Intrinsic::amdgcn_if_break:
8480 Op->getOperand(1),
Op->getOperand(2)), 0);
8482 case Intrinsic::amdgcn_groupstaticsize: {
8494 case Intrinsic::amdgcn_is_shared:
8495 case Intrinsic::amdgcn_is_private: {
8497 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ?
8499 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
8507 case Intrinsic::amdgcn_perm:
8509 Op.getOperand(2),
Op.getOperand(3));
8510 case Intrinsic::amdgcn_reloc_constant: {
8514 auto RelocSymbol = cast<GlobalVariable>(
8520 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8521 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8522 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8523 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8524 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8525 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8526 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8527 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8528 if (
Op.getOperand(4).getValueType() == MVT::i32)
8534 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
8535 Op.getOperand(3), IndexKeyi32);
8537 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8538 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8539 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8540 if (
Op.getOperand(6).getValueType() == MVT::i32)
8546 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8547 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
8548 IndexKeyi32, Op.getOperand(7)});
8550 case Intrinsic::amdgcn_addrspacecast_nonnull:
8551 return lowerADDRSPACECAST(
Op, DAG);
8555 return lowerImage(
Op, ImageDimIntr, DAG,
false);
8566 return DAG.
getRegister(AMDGPU::SGPR_NULL, MVT::i32);
8572 unsigned NewOpcode)
const {
8576 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
8577 auto Offsets = splitBufferOffsets(
Op.getOperand(4), DAG);
8591 auto *
M = cast<MemSDNode>(
Op);
8595 M->getMemOperand());
8606 unsigned NewOpcode)
const {
8610 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
8611 auto Offsets = splitBufferOffsets(
Op.getOperand(5), DAG);
8625 auto *
M = cast<MemSDNode>(
Op);
8629 M->getMemOperand());
8634 unsigned IntrID =
Op.getConstantOperandVal(1);
8638 case Intrinsic::amdgcn_ds_ordered_add:
8639 case Intrinsic::amdgcn_ds_ordered_swap: {
8644 unsigned IndexOperand = M->getConstantOperandVal(7);
8645 unsigned WaveRelease = M->getConstantOperandVal(8);
8646 unsigned WaveDone = M->getConstantOperandVal(9);
8648 unsigned OrderedCountIndex = IndexOperand & 0x3f;
8649 IndexOperand &= ~0x3f;
8650 unsigned CountDw = 0;
8653 CountDw = (IndexOperand >> 24) & 0xf;
8654 IndexOperand &= ~(0xf << 24);
8656 if (CountDw < 1 || CountDw > 4) {
8658 "ds_ordered_count: dword count must be between 1 and 4");
8665 if (WaveDone && !WaveRelease)
8668 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
8669 unsigned ShaderType =
8671 unsigned Offset0 = OrderedCountIndex << 2;
8672 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
8675 Offset1 |= (CountDw - 1) << 6;
8678 Offset1 |= ShaderType << 2;
8680 unsigned Offset = Offset0 | (Offset1 << 8);
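// Illustrative sketch of the immediate packing above, mirroring the same
// shifts: offset0 carries the ordered-count index (scaled by 4) and offset1
// carries wave_release, wave_done, the add/swap select, plus the dword
// count and shader type on subtargets that encode them. The two HasXXXField
// flags stand in for the subtarget checks and are assumptions of this sketch.
static unsigned packDSOrderedOffsetSketch(unsigned OrderedCountIndex,
                                          unsigned WaveRelease,
                                          unsigned WaveDone,
                                          unsigned Instruction,
                                          unsigned ShaderType,
                                          unsigned CountDw,
                                          bool HasCountField,
                                          bool HasShaderTypeField) {
  unsigned Offset0 = OrderedCountIndex << 2;
  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
  if (HasCountField)
    Offset1 |= (CountDw - 1) << 6;
  if (HasShaderTypeField)
    Offset1 |= ShaderType << 2;
  return Offset0 | (Offset1 << 8);
}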
8689 M->getVTList(), Ops,
M->getMemoryVT(),
8690 M->getMemOperand());
8692 case Intrinsic::amdgcn_ds_fadd: {
8696 case Intrinsic::amdgcn_ds_fadd:
8702 M->getOperand(0),
M->getOperand(2),
M->getOperand(3),
8703 M->getMemOperand());
8705 case Intrinsic::amdgcn_ds_fmin:
8706 case Intrinsic::amdgcn_ds_fmax: {
8710 case Intrinsic::amdgcn_ds_fmin:
8713 case Intrinsic::amdgcn_ds_fmax:
8726 M->getMemoryVT(),
M->getMemOperand());
8728 case Intrinsic::amdgcn_buffer_load:
8729 case Intrinsic::amdgcn_buffer_load_format: {
8730 unsigned Glc =
Op.getConstantOperandVal(5);
8731 unsigned Slc =
Op.getConstantOperandVal(6);
8743 setBufferOffsets(
Op.getOperand(4), DAG, &Ops[3]);
8745 unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
8748 EVT VT =
Op.getValueType();
8750 auto *
M = cast<MemSDNode>(
Op);
8751 EVT LoadVT =
Op.getValueType();
8759 return handleByteShortBufferLoads(DAG, LoadVT,
DL, Ops,
8760 M->getMemOperand());
8762 return getMemIntrinsicNode(Opc,
DL,
Op->getVTList(), Ops, IntVT,
8763 M->getMemOperand(), DAG);
8765 case Intrinsic::amdgcn_raw_buffer_load:
8766 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8767 case Intrinsic::amdgcn_raw_buffer_load_format:
8768 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
8769 const bool IsFormat =
8770 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
8771 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
8773 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
8774 auto Offsets = splitBufferOffsets(
Op.getOperand(3), DAG);
8787 auto *
M = cast<MemSDNode>(
Op);
8788 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
8790 case Intrinsic::amdgcn_struct_buffer_load:
8791 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8792 case Intrinsic::amdgcn_struct_buffer_load_format:
8793 case Intrinsic::amdgcn_struct_ptr_buffer_load_format: {
8794 const bool IsFormat =
8795 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
8796 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
8798 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
8799 auto Offsets = splitBufferOffsets(
Op.getOperand(4), DAG);
8812 return lowerIntrinsicLoad(cast<MemSDNode>(
Op), IsFormat, DAG, Ops);
8814 case Intrinsic::amdgcn_tbuffer_load: {
8816 EVT LoadVT =
Op.getValueType();
8819 unsigned Dfmt =
Op.getConstantOperandVal(7);
8820 unsigned Nfmt =
Op.getConstantOperandVal(8);
8821 unsigned Glc =
Op.getConstantOperandVal(9);
8822 unsigned Slc =
Op.getConstantOperandVal(10);
8840 Op->getVTList(), Ops, LoadVT,
M->getMemOperand(),
8843 case Intrinsic::amdgcn_raw_tbuffer_load:
8844 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
8846 EVT LoadVT =
Op.getValueType();
8847 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
8848 auto Offsets = splitBufferOffsets(
Op.getOperand(3), DAG);
8867 Op->getVTList(), Ops, LoadVT,
M->getMemOperand(),
8870 case Intrinsic::amdgcn_struct_tbuffer_load:
8871 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
8873 EVT LoadVT =
Op.getValueType();
8874 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
8875 auto Offsets = splitBufferOffsets(
Op.getOperand(4), DAG);
8894 Op->getVTList(), Ops, LoadVT,
M->getMemOperand(),
8897 case Intrinsic::amdgcn_buffer_atomic_swap:
8898 case Intrinsic::amdgcn_buffer_atomic_add:
8899 case Intrinsic::amdgcn_buffer_atomic_sub:
8900 case Intrinsic::amdgcn_buffer_atomic_csub:
8901 case Intrinsic::amdgcn_buffer_atomic_smin:
8902 case Intrinsic::amdgcn_buffer_atomic_umin:
8903 case Intrinsic::amdgcn_buffer_atomic_smax:
8904 case Intrinsic::amdgcn_buffer_atomic_umax:
8905 case Intrinsic::amdgcn_buffer_atomic_and:
8906 case Intrinsic::amdgcn_buffer_atomic_or:
8907 case Intrinsic::amdgcn_buffer_atomic_xor:
8908 case Intrinsic::amdgcn_buffer_atomic_fadd: {
8909 unsigned Slc =
Op.getConstantOperandVal(6);
8922 setBufferOffsets(
Op.getOperand(5), DAG, &Ops[4]);
8924 EVT VT =
Op.getValueType();
8926 auto *
M = cast<MemSDNode>(
Op);
8927 unsigned Opcode = 0;
8930 case Intrinsic::amdgcn_buffer_atomic_swap:
8933 case Intrinsic::amdgcn_buffer_atomic_add:
8936 case Intrinsic::amdgcn_buffer_atomic_sub:
8939 case Intrinsic::amdgcn_buffer_atomic_csub:
8942 case Intrinsic::amdgcn_buffer_atomic_smin:
8945 case Intrinsic::amdgcn_buffer_atomic_umin:
8948 case Intrinsic::amdgcn_buffer_atomic_smax:
8951 case Intrinsic::amdgcn_buffer_atomic_umax:
8954 case Intrinsic::amdgcn_buffer_atomic_and:
8957 case Intrinsic::amdgcn_buffer_atomic_or:
8960 case Intrinsic::amdgcn_buffer_atomic_xor:
8963 case Intrinsic::amdgcn_buffer_atomic_fadd:
8971 M->getMemOperand());
8973 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8974 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8976 case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
8977 return lowerRawBufferAtomicIntrin(
Op, DAG,
8979 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8980 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8982 case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
8983 return lowerStructBufferAtomicIntrin(
Op, DAG,
8985 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8986 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8988 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8989 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8991 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8992 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8994 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8995 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8997 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8998 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9000 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9001 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9003 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9004 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9006 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9007 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9009 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9010 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9012 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9013 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9015 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9016 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9018 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9019 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9021 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9022 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9024 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9025 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9027 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9028 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9030 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9031 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9033 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9034 return lowerRawBufferAtomicIntrin(
Op, DAG,
9036 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9037 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9038 return lowerStructBufferAtomicIntrin(
Op, DAG,
9040 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9041 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9043 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9044 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9046 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9047 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9048 return lowerStructBufferAtomicIntrin(
Op, DAG,
9050 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9051 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9052 return lowerStructBufferAtomicIntrin(
Op, DAG,
9054 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9055 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9056 return lowerStructBufferAtomicIntrin(
Op, DAG,
9058 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9059 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9060 return lowerStructBufferAtomicIntrin(
Op, DAG,
9062 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9063 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9065 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9066 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9068 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9069 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9071 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9072 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9074 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9075 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9077 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9078 return lowerStructBufferAtomicIntrin(
Op, DAG,
9081 case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
9082 unsigned Slc =
Op.getConstantOperandVal(7);
9096 setBufferOffsets(
Op.getOperand(6), DAG, &Ops[5]);
9098 EVT VT =
Op.getValueType();
9099 auto *
M = cast<MemSDNode>(
Op);
9102 Op->getVTList(), Ops, VT,
M->getMemOperand());
9104 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9105 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9106 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(4), DAG);
9107 auto Offsets = splitBufferOffsets(
Op.getOperand(5), DAG);
9121 EVT VT =
Op.getValueType();
9122 auto *
M = cast<MemSDNode>(
Op);
9125 Op->getVTList(), Ops, VT,
M->getMemOperand());
9127 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9128 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9129 SDValue Rsrc = bufferRsrcPtrToVector(
Op->getOperand(4), DAG);
9130 auto Offsets = splitBufferOffsets(
Op.getOperand(6), DAG);
9144 EVT VT =
Op.getValueType();
9145 auto *
M = cast<MemSDNode>(
Op);
9148 Op->getVTList(), Ops, VT,
M->getMemOperand());
9150 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9152 SDValue NodePtr =
M->getOperand(2);
9153 SDValue RayExtent =
M->getOperand(3);
9154 SDValue RayOrigin =
M->getOperand(4);
9156 SDValue RayInvDir =
M->getOperand(6);
9174 const unsigned NumVDataDwords = 4;
9175 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9176 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
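// Address sizing recap for the intersect-ray operands (illustrative; the
// per-field breakdown is an assumption consistent with the totals above):
//   node_ptr: 1 dword (or 2 for the 64-bit form), ray_extent: 1 dword,
//   ray_origin: 3 dwords, ray_dir + ray_inv_dir: 6 dwords as f32 or
//   3 dwords when packed as f16 (a16).
static unsigned bvhVAddrDwordsSketch(bool IsA16, bool Is64) {
  return IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
}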
9180 const unsigned BaseOpcodes[2][2] = {
9181 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9182 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9183 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9187 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9188 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9189 : AMDGPU::MIMGEncGfx10NSA,
9190 NumVDataDwords, NumVAddrDwords);
9194 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9195 : AMDGPU::MIMGEncGfx10Default,
9196 NumVDataDwords, NumVAddrDwords);
9202 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
9205 if (Lanes[0].getValueSizeInBits() == 32) {
9206 for (unsigned I = 0; I < 3; ++I)
9213 { Lanes[0], Lanes[1] })));
9220 { Elt0, Lanes[0] })));
9224 { Lanes[1], Lanes[2] })));
9229 if (UseNSA && IsGFX11Plus) {
9237 for (unsigned I = 0; I < 3; ++I) {
9240 {DirLanes[I], InvDirLanes[I]})));
9255 packLanes(RayOrigin, true);
9256 packLanes(RayDir, true);
9257 packLanes(RayInvDir, false);
9262 if (NumVAddrDwords > 12) {
9282 case Intrinsic::amdgcn_global_atomic_fmin:
9283 case Intrinsic::amdgcn_global_atomic_fmax:
9284 case Intrinsic::amdgcn_global_atomic_fmin_num:
9285 case Intrinsic::amdgcn_global_atomic_fmax_num:
9286 case Intrinsic::amdgcn_flat_atomic_fmin:
9287 case Intrinsic::amdgcn_flat_atomic_fmax:
9288 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9289 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9296 unsigned Opcode = 0;
9298 case Intrinsic::amdgcn_global_atomic_fmin:
9299 case Intrinsic::amdgcn_global_atomic_fmin_num:
9300 case Intrinsic::amdgcn_flat_atomic_fmin:
9301 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9305 case Intrinsic::amdgcn_global_atomic_fmax:
9306 case Intrinsic::amdgcn_global_atomic_fmax_num:
9307 case Intrinsic::amdgcn_flat_atomic_fmax:
9308 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9316 M->getVTList(), Ops,
M->getMemoryVT(),
9317 M->getMemOperand());
9319 case Intrinsic::amdgcn_s_get_barrier_state: {
9323 bool IsInlinableBarID =
false;
9326 if (isa<ConstantSDNode>(
Op->getOperand(2))) {
9327 BarID = cast<ConstantSDNode>(
Op->getOperand(2))->getSExtValue();
9331 if (IsInlinableBarID) {
9332 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9336 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9348 return lowerImage(
Op, ImageDimIntr, DAG,
true);
9356SDValue SITargetLowering::getMemIntrinsicNode(
unsigned Opcode,
const SDLoc &
DL,
9366 bool IsTFE = VTList.
NumVTs == 3;
9369 unsigned NumOpDWords = NumValueDWords + 1;
9374 SDValue Op = getMemIntrinsicNode(Opcode,
DL, OpDWordsVTList, Ops,
9375 OpDWordsVT, OpDWordsMMO, DAG);
9390 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9396 WidenedMemVT, WidenedMMO);
9406 bool ImageStore)
const {
9441 for (
unsigned I = 0;
I < Elts.
size() / 2;
I += 1) {
9447 if ((NumElements % 2) == 1) {
9449 unsigned I = Elts.
size() / 2;
9465 if (NumElements == 3) {
9486 unsigned IntrinsicID =
Op.getConstantOperandVal(1);
9489 switch (IntrinsicID) {
9490 case Intrinsic::amdgcn_exp_compr: {
9494 "intrinsic not supported on subtarget",
DL.getDebugLoc());
9517 unsigned Opc =
Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9520 case Intrinsic::amdgcn_s_barrier: {
9523 unsigned WGSize =
ST.getFlatWorkGroupSizes(MF.
getFunction()).second;
9524 if (WGSize <=
ST.getWavefrontSize())
9526 Op.getOperand(0)), 0);
9530 if (
ST.hasSplitBarriers()) {
9535 MVT::Other, K,
Op.getOperand(0)),
9546 case Intrinsic::amdgcn_tbuffer_store: {
9550 VData = handleD16VData(VData, DAG);
9551 unsigned Dfmt =
Op.getConstantOperandVal(8);
9552 unsigned Nfmt =
Op.getConstantOperandVal(9);
9553 unsigned Glc =
Op.getConstantOperandVal(10);
9554 unsigned Slc =
Op.getConstantOperandVal(11);
9572 M->getMemoryVT(),
M->getMemOperand());
9575 case Intrinsic::amdgcn_struct_tbuffer_store:
9576 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9580 VData = handleD16VData(VData, DAG);
9581 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9582 auto Offsets = splitBufferOffsets(
Op.getOperand(5), DAG);
9600 M->getMemoryVT(),
M->getMemOperand());
9603 case Intrinsic::amdgcn_raw_tbuffer_store:
9604 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9608 VData = handleD16VData(VData, DAG);
9609 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9610 auto Offsets = splitBufferOffsets(
Op.getOperand(4), DAG);
9628 M->getMemoryVT(),
M->getMemOperand());
9631 case Intrinsic::amdgcn_buffer_store:
9632 case Intrinsic::amdgcn_buffer_store_format: {
9636 VData = handleD16VData(VData, DAG);
9637 unsigned Glc =
Op.getConstantOperandVal(6);
9638 unsigned Slc =
Op.getConstantOperandVal(7);
9651 setBufferOffsets(
Op.getOperand(5), DAG, &Ops[4]);
9653 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
9660 if (VDataType == MVT::i8 || VDataType == MVT::i16)
9661 return handleByteShortBufferStores(DAG, VDataType,
DL, Ops, M);
9664 M->getMemoryVT(),
M->getMemOperand());
9667 case Intrinsic::amdgcn_raw_buffer_store:
9668 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9669 case Intrinsic::amdgcn_raw_buffer_store_format:
9670 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9671 const bool IsFormat =
9672 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9673 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9680 VData = handleD16VData(VData, DAG);
9690 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9691 auto Offsets = splitBufferOffsets(
Op.getOperand(4), DAG);
9711 return handleByteShortBufferStores(DAG, VDataVT,
DL, Ops, M);
9714 M->getMemoryVT(),
M->getMemOperand());
9717 case Intrinsic::amdgcn_struct_buffer_store:
9718 case Intrinsic::amdgcn_struct_ptr_buffer_store:
9719 case Intrinsic::amdgcn_struct_buffer_store_format:
9720 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9721 const bool IsFormat =
9722 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9723 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9731 VData = handleD16VData(VData, DAG);
9741 auto Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9742 auto Offsets = splitBufferOffsets(
Op.getOperand(5), DAG);
9763 return handleByteShortBufferStores(DAG, VDataType,
DL, Ops, M);
9766 M->getMemoryVT(),
M->getMemOperand());
9768 case Intrinsic::amdgcn_raw_buffer_load_lds:
9769 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9770 case Intrinsic::amdgcn_struct_buffer_load_lds:
9771 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9775 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9776 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9777 unsigned OpOffset = HasVIndex ? 1 : 0;
9778 SDValue VOffset = Op.getOperand(5 + OpOffset);
9780 unsigned Size = Op->getConstantOperandVal(4);
9786 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9787 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9788 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9789 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9792 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9793 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9794 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9795 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9798 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9799 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9800 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9801 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
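// The opcode ladder above keys on two things: the load width (1, 2 or 4
// bytes) picks the UBYTE / USHORT / DWORD family, and the presence of a
// VGPR index and/or VGPR offset picks the addressing form within it.
// Minimal sketch of that second choice; the enum is hypothetical.
enum class BufferLdsAddrMode { Bothen, Idxen, Offen, Offset };
static BufferLdsAddrMode selectBufferLoadLdsMode(bool HasVIndex,
                                                 bool HasVOffset) {
  if (HasVIndex)
    return HasVOffset ? BufferLdsAddrMode::Bothen : BufferLdsAddrMode::Idxen;
  return HasVOffset ? BufferLdsAddrMode::Offen : BufferLdsAddrMode::Offset;
}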
9809 if (HasVIndex && HasVOffset)
9815 else if (HasVOffset)
9818 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9822 unsigned Aux =
Op.getConstantOperandVal(8 + OpOffset);
9830 auto *
M = cast<MemSDNode>(
Op);
9857 case Intrinsic::amdgcn_global_load_lds: {
9859 unsigned Size =
Op->getConstantOperandVal(4);
9864 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
9867 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
9870 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
9874 auto *
M = cast<MemSDNode>(
Op);
9887 if (
LHS->isDivergent())
9891 RHS.getOperand(0).getValueType() == MVT::i32) {
9894 VOffset =
RHS.getOperand(0);
9899 if (!
Addr->isDivergent()) {
9915 LoadPtrI.
Offset =
Op->getConstantOperandVal(5);
9935 case Intrinsic::amdgcn_end_cf:
9937 Op->getOperand(2), Chain), 0);
9938 case Intrinsic::amdgcn_s_barrier_init:
9939 case Intrinsic::amdgcn_s_barrier_join:
9940 case Intrinsic::amdgcn_s_wakeup_barrier: {
9945 bool IsInlinableBarID =
false;
9948 if (isa<ConstantSDNode>(BarOp)) {
9949 BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue();
9953 if (IsInlinableBarID) {
9954 switch (IntrinsicID) {
9957 case Intrinsic::amdgcn_s_barrier_init:
9958 Opc = AMDGPU::S_BARRIER_INIT_IMM;
9960 case Intrinsic::amdgcn_s_barrier_join:
9961 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
9963 case Intrinsic::amdgcn_s_wakeup_barrier:
9964 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
9971 switch (IntrinsicID) {
9974 case Intrinsic::amdgcn_s_barrier_init:
9975 Opc = AMDGPU::S_BARRIER_INIT_M0;
9977 case Intrinsic::amdgcn_s_barrier_join:
9978 Opc = AMDGPU::S_BARRIER_JOIN_M0;
9980 case Intrinsic::amdgcn_s_wakeup_barrier:
9981 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
9986 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) {
9992 if (!IsInlinableBarID) {
9997 Op.getOperand(2), M0Val),
10001 }
else if (!IsInlinableBarID) {
10011 return lowerImage(
Op, ImageDimIntr, DAG,
true);
10024std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
10031 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
10034 C1 = cast<ConstantSDNode>(N0.getOperand(1));
10048 unsigned Overflow = ImmOffset & ~MaxImm;
10049 ImmOffset -= Overflow;
10050 if ((int32_t)Overflow < 0) {
10051 Overflow += ImmOffset;
10056 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
10060 SDValue Ops[] = { N0, OverflowVal };
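// Standalone restatement of the immediate/overflow split above, assuming
// MaxImm is the largest immediate offset the instruction can encode (so
// ~MaxImm masks out the part that must travel in the register operand).
// Illustrative only.
static void splitImmOffsetSketch(uint32_t Combined, uint32_t MaxImm,
                                 uint32_t &ImmOffset, uint32_t &Overflow) {
  ImmOffset = Combined;
  Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  // If the overflow does not fit a non-negative i32, fold everything into
  // the register part, mirroring the branch above.
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }
}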
10075void SITargetLowering::setBufferOffsets(
SDValue CombinedOffset,
10077 Align Alignment)
const {
10080 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
10083 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10094 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10096 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
10113SDValue SITargetLowering::bufferRsrcPtrToVector(
SDValue MaybePointer,
10116 return MaybePointer;
10132 SDValue NumRecords =
Op->getOperand(3);
10135 auto [LowHalf, HighHalf] = DAG.
SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
10138 std::optional<uint32_t> ConstStride = std::nullopt;
10139 if (
auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
10140 ConstStride = ConstNode->getZExtValue();
10142 SDValue NewHighHalf = Masked;
10143 if (!ConstStride || *ConstStride != 0) {
10146 ShiftedStride = DAG.
getConstant(*ConstStride << 16, Loc, MVT::i32);
10153 NewHighHalf = DAG.
getNode(
ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
10157 NewHighHalf, NumRecords, Flags);
10164SITargetLowering::handleByteShortBufferLoads(
SelectionDAG &DAG,
EVT LoadVT,
10185 if (VDataType == MVT::f16)
10189 Ops[1] = BufferStoreExt;
10194 M->getMemOperand());
10219SDValue SITargetLowering::widenLoad(
LoadSDNode *Ld, DAGCombinerInfo &DCI)
const {
10235 if ((MemVT.
isSimple() && !DCI.isAfterLegalizeDAG()) ||
10242 "unexpected vector extload");
10255 "unexpected fp extload");
10273 DCI.AddToWorklist(Cvt.
getNode());
10278 DCI.AddToWorklist(Cvt.
getNode());
10289 if (
Info.isEntryFunction())
10290 return Info.getUserSGPRInfo().hasFlatScratchInit();
10298 EVT MemVT =
Load->getMemoryVT();
10311 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10314 BasePtr, RealMemVT, MMO);
10344 assert(
Op.getValueType().getVectorElementType() == MVT::i32 &&
10345 "Custom lowering for non-i32 vectors hasn't been implemented.");
10348 unsigned AS =
Load->getAddressSpace();
10367 if (!
Op->isDivergent() && Alignment >=
Align(4) && NumElements < 32) {
10384 Alignment >=
Align(4) && NumElements < 32) {
10399 if (NumElements > 4)
10419 if (NumElements > 2)
10424 if (NumElements > 4)
10436 auto Flags =
Load->getMemOperand()->getFlags();
10438 Load->getAlign(), Flags, &
Fast) &&
10447 MemVT, *
Load->getMemOperand())) {
10457 EVT VT =
Op.getValueType();
10494 EVT VT =
Op.getValueType();
10497 bool AllowInaccurateRcp =
Flags.hasApproximateFuncs() ||
10504 if (!AllowInaccurateRcp && VT != MVT::f16)
10507 if (CLHS->isExactlyValue(1.0)) {
10524 if (CLHS->isExactlyValue(-1.0)) {
10533 if (!AllowInaccurateRcp && (VT != MVT::f16 || !
Flags.hasAllowReciprocal()))
10547 EVT VT =
Op.getValueType();
10550 bool AllowInaccurateDiv =
Flags.hasApproximateFuncs() ||
10552 if (!AllowInaccurateDiv)
10573 return DAG.
getNode(Opcode, SL, VT,
A,
B, Flags);
10586 return DAG.
getNode(Opcode, SL, VTList,
10595 return DAG.
getNode(Opcode, SL, VT, {
A,
B,
C}, Flags);
10608 return DAG.
getNode(Opcode, SL, VTList,
10614 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
10615 return FastLowered;
10642 const APFloat K0Val(0x1p+96f);
10645 const APFloat K1Val(0x1p-32f);
10672 assert(ST->hasDenormModeInst() &&
"Requires S_DENORM_MODE");
10673 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
10674 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
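// The denorm-mode value built above packs the single-precision mode in bits
// [1:0] and the double/half-precision mode in bits [3:2], so only the FP32
// field changes while the function's FP64/FP16 setting is preserved. Tiny
// illustrative helper:
static uint32_t packDenormModeSketch(uint32_t SPDenormMode,
                                     uint32_t DPDenormMode) {
  return SPDenormMode | (DPDenormMode << 2);
}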
10679 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
10680 return FastLowered;
10687 Flags.setNoFPExcept(
true);
10704 DenominatorScaled, Flags);
10706 DenominatorScaled, Flags);
10708 using namespace AMDGPU::Hwreg;
10709 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
10717 const bool HasDynamicDenormals =
10723 if (!PreservesDenormals) {
10731 if (HasDynamicDenormals) {
10735 SavedDenormMode =
SDValue(GetReg, 0);
10743 const SDValue EnableDenormValue =
10752 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
10753 {EnableDenormValue, BitField, Glue});
10766 ApproxRcp, One, NegDivScale0, Flags);
10769 ApproxRcp, Fma0, Flags);
10772 Fma1, Fma1, Flags);
10775 NumeratorScaled,
Mul, Flags);
10778 Fma2, Fma1,
Mul, Fma2, Flags);
10781 NumeratorScaled, Fma3, Flags);
10783 if (!PreservesDenormals) {
10790 Fma4.getValue(1), DisableDenormValue,
10793 assert(HasDynamicDenormals == (bool)SavedDenormMode);
10794 const SDValue DisableDenormValue =
10795 HasDynamicDenormals
10800 AMDGPU::S_SETREG_B32, SL, MVT::Other,
10811 {Fma4, Fma1, Fma3, Scale},
Flags);
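// Scalar restatement of the fma chain above (a sketch; operand roles are
// inferred from the node names: n/d are the scaled numerator/denominator
// and r is the V_RCP seed). The DIV_FMAS/DIV_FIXUP steps and the
// denormal-mode toggling are not modelled. Uses std::fma from <cmath>.
static float refineDivideSketch(float n, float d, float r) {
  float e0 = std::fma(-d, r, 1.0f); // Fma0: error of the reciprocal seed
  float r1 = std::fma(e0, r, r);    // Fma1: refined reciprocal
  float q  = n * r1;                // Mul:  first quotient estimate
  float e1 = std::fma(-d, q, n);    // Fma2: residual of the quotient
  float q1 = std::fma(e1, r1, q);   // Fma3: refined quotient
  float e2 = std::fma(-d, q1, n);   // Fma4: residual for the DIV_FMAS above
  (void)e2;                         // combined with r1/q1 by the hardware
  return q1;
}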
10817 if (
SDValue FastLowered = lowerFastUnsafeFDIV64(
Op, DAG))
10818 return FastLowered;
10846 NegDivScale0,
Mul, DivScale1);
10878 Fma4, Fma3,
Mul, Scale);
10884 EVT VT =
Op.getValueType();
10886 if (VT == MVT::f32)
10887 return LowerFDIV32(
Op, DAG);
10889 if (VT == MVT::f64)
10890 return LowerFDIV64(
Op, DAG);
10892 if (VT == MVT::f16)
10893 return LowerFDIV16(
Op, DAG);
10902 EVT ResultExpVT =
Op->getValueType(1);
10903 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
10933 if (VT == MVT::i1) {
10936 Store->getBasePtr(), MVT::i1,
Store->getMemOperand());
10940 Store->getValue().getValueType().getScalarType() == MVT::i32);
10942 unsigned AS =
Store->getAddressSpace();
10961 if (NumElements > 4)
10968 VT, *
Store->getMemOperand()))
10977 if (NumElements > 2)
10981 if (NumElements > 4 ||
10990 auto Flags =
Store->getMemOperand()->getFlags();
11025 MVT VT =
Op.getValueType().getSimpleVT();
11194 EVT VT =
Op.getValueType();
11211 switch (
Op.getOpcode()) {
11237 EVT VT =
Op.getValueType();
11253 DAGCombinerInfo &DCI)
const {
11254 EVT VT =
N->getValueType(0);
11256 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11263 EVT SrcVT = Src.getValueType();
11269 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11272 DCI.AddToWorklist(Cvt.
getNode());
11275 if (ScalarVT != MVT::f32) {
11287 DAGCombinerInfo &DCI)
const {
11288 SDValue MagnitudeOp =
N->getOperand(0);
11289 SDValue SignOp =
N->getOperand(1);
11347 unsigned AddrSpace,
11349 DAGCombinerInfo &DCI)
const {
11379 AM.HasBaseReg =
true;
11380 AM.BaseOffs =
Offset.getSExtValue();
11385 EVT VT =
N->getValueType(0);
11391 Flags.setNoUnsignedWrap(
N->getFlags().hasNoUnsignedWrap() &&
11402 switch (
N->getOpcode()) {
11413 DAGCombinerInfo &DCI)
const {
11422 SDValue NewPtr = performSHLPtrCombine(
Ptr.getNode(),
N->getAddressSpace(),
11423 N->getMemoryVT(), DCI);
11427 NewOps[PtrIdx] = NewPtr;
11436 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11437 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11446SDValue SITargetLowering::splitBinaryBitConstantOp(
11447 DAGCombinerInfo &DCI,
11469 if (V.getValueType() != MVT::i1)
11471 switch (V.getOpcode()) {
11490 if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
11491 if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
11492 if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
11493 if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
11494 uint32_t NonZeroByteMask = ~ZeroByteMask;
11495 if ((NonZeroByteMask & C) != NonZeroByteMask)
11508 assert(V.getValueSizeInBits() == 32);
11510 if (V.getNumOperands() != 2)
11519 switch (V.getOpcode()) {
11524 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
11529 return (0x03020100 & ~ConstMask) | ConstMask;
11536 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
11542 return uint32_t(0x0c0c0c0c03020100ull >> C);
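// v_perm selector reference for the constants above (illustrative): each
// byte of the mask selects one source byte 0-7, and 0x0c selects a constant
// zero byte, so 0x03020100 is the identity select of the low dword and
// 0x0c0c0c0c selects all zeroes. The two shift cases derive their masks by
// byte-shifting the identity pattern and filling vacated bytes with 0x0c;
// the helpers below take the shift in bytes (the code above uses the bit
// shift directly, after checking it is a multiple of 8).
static uint32_t shlPermMaskSketch(unsigned ByteShift) {
  return uint32_t((0x030201000c0c0c0cull << (8 * ByteShift)) >> 32);
}
static uint32_t srlPermMaskSketch(unsigned ByteShift) {
  return uint32_t(0x0c0c0c0c03020100ull >> (8 * ByteShift));
}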
11549 DAGCombinerInfo &DCI)
const {
11550 if (DCI.isBeforeLegalize())
11554 EVT VT =
N->getValueType(0);
11560 if (VT == MVT::i64 && CRHS) {
11566 if (CRHS && VT == MVT::i32) {
11575 if (
auto *CShift = dyn_cast<ConstantSDNode>(
LHS->getOperand(1))) {
11576 unsigned Shift = CShift->getZExtValue();
11578 unsigned Offset = NB + Shift;
11579 if ((
Offset & (Bits - 1)) == 0) {
11582 LHS->getOperand(0),
11597 isa<ConstantSDNode>(
LHS.getOperand(2))) {
11603 Sel = (
LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
11618 if (
Y.getOpcode() !=
ISD::FABS ||
Y.getOperand(0) !=
X ||
11623 if (
X !=
LHS.getOperand(1))
11661 (
RHS.getOperand(0) ==
LHS.getOperand(0) &&
11662 LHS.getOperand(0) ==
LHS.getOperand(1))) {
11665 Mask->getZExtValue() & ~OrdMask :
11666 Mask->getZExtValue() & OrdMask;
11674 if (VT == MVT::i32 &&
11687 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
11690 if (LHSMask != ~0u && RHSMask != ~0u) {
11693 if (LHSMask > RHSMask) {
11700 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11701 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11704 if (!(LHSUsedLanes & RHSUsedLanes) &&
11707 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
11714 for (unsigned I = 0; I < 32; I += 8) {
11716 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
11717 Mask &= (0x0c << I) & 0xffffffff;
11726 LHS.getOperand(0),
RHS.getOperand(0),
11775static const std::optional<ByteProvider<SDValue>>
11777 unsigned Depth = 0) {
11780 return std::nullopt;
11782 if (
Op.getValueSizeInBits() < 8)
11783 return std::nullopt;
11785 if (
Op.getValueType().isVector())
11788 switch (
Op->getOpcode()) {
11799 auto *VTSign = cast<VTSDNode>(
Op->getOperand(1));
11800 NarrowVT = VTSign->getVT();
11803 return std::nullopt;
11806 if (SrcIndex >= NarrowByteWidth)
11807 return std::nullopt;
11813 auto ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
11815 return std::nullopt;
11817 uint64_t BitShift = ShiftOp->getZExtValue();
11819 if (BitShift % 8 != 0)
11820 return std::nullopt;
11822 SrcIndex += BitShift / 8;
11840static const std::optional<ByteProvider<SDValue>>
11842 unsigned StartingIndex = 0) {
11846 return std::nullopt;
11848 unsigned BitWidth =
Op.getScalarValueSizeInBits();
11850 return std::nullopt;
11852 return std::nullopt;
11854 bool IsVec =
Op.getValueType().isVector();
11855 switch (
Op.getOpcode()) {
11858 return std::nullopt;
11863 return std::nullopt;
11867 return std::nullopt;
11870 if (!
LHS->isConstantZero() && !
RHS->isConstantZero())
11871 return std::nullopt;
11872 if (!
LHS ||
LHS->isConstantZero())
11874 if (!
RHS ||
RHS->isConstantZero())
11876 return std::nullopt;
11881 return std::nullopt;
11883 auto BitMaskOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
11885 return std::nullopt;
11887 uint32_t BitMask = BitMaskOp->getZExtValue();
11891 if ((IndexMask & BitMask) != IndexMask) {
11894 if (IndexMask & BitMask)
11895 return std::nullopt;
11904 return std::nullopt;
11907 auto ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(2));
11908 if (!ShiftOp ||
Op.getValueType().isVector())
11909 return std::nullopt;
11911 uint64_t BitsProvided =
Op.getValueSizeInBits();
11912 if (BitsProvided % 8 != 0)
11913 return std::nullopt;
11915 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
11917 return std::nullopt;
11919 uint64_t ConcatSizeInBytes = BitsProvided / 4;
11920 uint64_t ByteShift = BitShift / 8;
11922 uint64_t NewIndex = (
Index + ByteShift) % ConcatSizeInBytes;
11923 uint64_t BytesProvided = BitsProvided / 8;
11924 SDValue NextOp =
Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
11925 NewIndex %= BytesProvided;
11932 return std::nullopt;
11934 auto ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
11936 return std::nullopt;
11938 uint64_t BitShift = ShiftOp->getZExtValue();
11940 return std::nullopt;
11942 auto BitsProvided =
Op.getScalarValueSizeInBits();
11943 if (BitsProvided % 8 != 0)
11944 return std::nullopt;
11946 uint64_t BytesProvided = BitsProvided / 8;
11947 uint64_t ByteShift = BitShift / 8;
11952 return BytesProvided - ByteShift >
Index
11960 return std::nullopt;
11962 auto ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
11964 return std::nullopt;
11966 uint64_t BitShift = ShiftOp->getZExtValue();
11967 if (BitShift % 8 != 0)
11968 return std::nullopt;
11969 uint64_t ByteShift = BitShift / 8;
11975 return Index < ByteShift
11978 Depth + 1, StartingIndex);
11987 return std::nullopt;
11994 auto *VTSign = cast<VTSDNode>(
Op->getOperand(1));
11995 NarrowBitWidth = VTSign->getVT().getSizeInBits();
11997 if (NarrowBitWidth % 8 != 0)
11998 return std::nullopt;
11999 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12001 if (
Index >= NarrowByteWidth)
12003 ? std::optional<ByteProvider<SDValue>>(
12011 return std::nullopt;
12015 if (NarrowByteWidth >=
Index) {
12020 return std::nullopt;
12027 return std::nullopt;
12031 auto L = cast<LoadSDNode>(
Op.getNode());
12033 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
12034 if (NarrowBitWidth % 8 != 0)
12035 return std::nullopt;
12036 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12041 if (
Index >= NarrowByteWidth) {
12043 ? std::optional<ByteProvider<SDValue>>(
12048 if (NarrowByteWidth >
Index) {
12052 return std::nullopt;
12057 return std::nullopt;
12060 Depth + 1, StartingIndex);
12064 auto IdxOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12066 return std::nullopt;
12067 auto VecIdx = IdxOp->getZExtValue();
12068 auto ScalarSize =
Op.getScalarValueSizeInBits();
12069 if (ScalarSize != 32) {
12070 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 +
Index;
12074 StartingIndex,
Index);
12079 return std::nullopt;
12081 auto PermMask = dyn_cast<ConstantSDNode>(
Op->getOperand(2));
12083 return std::nullopt;
12086 (PermMask->getZExtValue() & (0xFF << (
Index * 8))) >> (
Index * 8);
12087 if (IdxMask > 0x07 && IdxMask != 0x0c)
12088 return std::nullopt;
12090 auto NextOp =
Op.getOperand(IdxMask > 0x03 ? 0 : 1);
12091 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12093 return IdxMask != 0x0c ?
calculateSrcByte(NextOp, StartingIndex, NextIndex)
12099 return std::nullopt;
12114 return !OpVT.
isVector() && OpVT.getSizeInBits() == 16;
12118 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12121 auto MemVT = L->getMemoryVT();
12124 return L->getMemoryVT().getSizeInBits() == 16;
12134 int Low8 = Mask & 0xff;
12135 int Hi8 = (Mask & 0xff00) >> 8;
12137 assert(Low8 < 8 && Hi8 < 8);
12139 bool IsConsecutive = (Hi8 - Low8 == 1);
12144 bool Is16Aligned = !(Low8 % 2);
12146 return IsConsecutive && Is16Aligned;
12154 int Low16 = PermMask & 0xffff;
12155 int Hi16 = (PermMask & 0xffff0000) >> 16;
12165 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12167 if (!OtherOpIs16Bit)
12175 unsigned DWordOffset) {
12178 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12180 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12185 if (Src.getValueType().isVector()) {
12186 auto ScalarTySize = Src.getScalarValueSizeInBits();
12187 auto ScalarTy = Src.getValueType().getScalarType();
12188 if (ScalarTySize == 32) {
12192 if (ScalarTySize > 32) {
12195 DAG.
getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12196 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12203 assert(ScalarTySize < 32);
12204 auto NumElements =
TypeSize / ScalarTySize;
12205 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12206 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12207 auto NumElementsIn32 = 32 / ScalarTySize;
12208 auto NumAvailElements = DWordOffset < Trunc32Elements
12210 : NumElements - NormalizedTrunc;
12223 auto ShiftVal = 32 * DWordOffset;
12231 [[maybe_unused]]
EVT VT =
N->getValueType(0);
12236 for (int i = 0; i < 4; i++) {
12238 std::optional<ByteProvider<SDValue>> P =
12241 if (!P || P->isConstantZero())
12246 if (PermNodes.size() != 4)
12249 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12250 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12252 for (size_t i = 0; i < PermNodes.size(); i++) {
12253 auto PermOp = PermNodes[i];
12256 int SrcByteAdjust = 4;
12260 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
12261 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12263 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12264 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12268 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12269 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12272 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12274 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12277 SDValue Op = *PermNodes[FirstSrc.first].Src;
12279 assert(
Op.getValueSizeInBits() == 32);
12283 int Low16 = PermMask & 0xffff;
12284 int Hi16 = (PermMask & 0xffff0000) >> 16;
12286 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
12287 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
12290 if (WellFormedLow && WellFormedHi)
12294 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src :
Op;
12303 assert(
Op.getValueType().isByteSized() &&
12321 DAGCombinerInfo &DCI)
const {
12326 EVT VT =
N->getValueType(0);
12327 if (VT == MVT::i1) {
12332 if (Src !=
RHS.getOperand(0))
12337 if (!CLHS || !CRHS)
12341 static const uint32_t MaxMask = 0x3ff;
12355 isa<ConstantSDNode>(
LHS.getOperand(2))) {
12360 Sel |=
LHS.getConstantOperandVal(2);
12369 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12373 auto usesCombinedOperand = [](
SDNode *OrUse) {
12376 !OrUse->getValueType(0).isVector())
12380 for (
auto VUse : OrUse->uses()) {
12381 if (!VUse->getValueType(0).isVector())
12388 if (VUse->getOpcode() == VectorwiseOp)
12394 if (!
any_of(
N->uses(), usesCombinedOperand))
12400 if (LHSMask != ~0u && RHSMask != ~0u) {
12403 if (LHSMask > RHSMask) {
12410 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12411 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12414 if (!(LHSUsedLanes & RHSUsedLanes) &&
12417 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12419 LHSMask &= ~RHSUsedLanes;
12420 RHSMask &= ~LHSUsedLanes;
12422 LHSMask |= LHSUsedLanes & 0x04040404;
12428 LHS.getOperand(0),
RHS.getOperand(0),
12432 if (LHSMask == ~0u || RHSMask == ~0u) {
12438 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12453 if (SrcVT == MVT::i32) {
12459 DCI.AddToWorklist(LowOr.
getNode());
12460 DCI.AddToWorklist(HiBits.
getNode());
12468 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(
N->getOperand(1));
12472 N->getOperand(0), CRHS))
12480 DAGCombinerInfo &DCI)
const {
12481 if (
SDValue RV = reassociateScalarOps(
N, DCI.DAG))
12490 EVT VT =
N->getValueType(0);
12491 if (CRHS && VT == MVT::i64) {
12513 LHS->getOperand(0), FNegLHS, FNegRHS);
12522 DAGCombinerInfo &DCI)
const {
12527 EVT VT =
N->getValueType(0);
12528 if (VT != MVT::i32)
12532 if (Src.getValueType() != MVT::i16)
12539SITargetLowering::performSignExtendInRegCombine(
SDNode *
N,
12540 DAGCombinerInfo &DCI)
const {
12542 auto *VTSign = cast<VTSDNode>(
N->getOperand(1));
12547 VTSign->getVT() == MVT::i8) ||
12549 VTSign->getVT() == MVT::i16))) {
12551 "s_buffer_load_{u8, i8} are supported "
12552 "in GFX12 (or newer) architectures.");
12553 EVT VT = Src.getValueType();
12558 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12564 auto *
M = cast<MemSDNode>(Src);
12565 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12566 Opc,
DL, ResList, Ops,
M->getMemoryVT(),
M->getMemOperand());
12570 VTSign->getVT() == MVT::i8) ||
12572 VTSign->getVT() == MVT::i16)) &&
12574 auto *
M = cast<MemSDNode>(Src);
12586 SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
12587 Src.getOperand(0).getValueType());
12590 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc,
SDLoc(
N),
12592 Ops,
M->getMemoryVT(),
12593 M->getMemOperand());
12594 return DCI.DAG.getMergeValues({BufferLoadSignExt,
12601 DAGCombinerInfo &DCI)
const {
12609 if (
N->getOperand(0).isUndef())
12616 DAGCombinerInfo &DCI)
const {
12617 EVT VT =
N->getValueType(0);
12621 return DCI.DAG.getConstantFP(
12644 unsigned Opcode =
Op.getOpcode();
12648 if (
auto *CFP = dyn_cast<ConstantFPSDNode>(
Op)) {
12649 const auto &
F = CFP->getValueAPF();
12650 if (
F.isNaN() &&
F.isSignaling())
12652 if (!
F.isDenormal())
12715 if (
Op.getValueType() == MVT::i32) {
12720 if (
auto *
RHS = dyn_cast<ConstantSDNode>(
Op.getOperand(1))) {
12721 if (
RHS->getZExtValue() == 0xffff0000) {
12731 return Op.getValueType().getScalarType() != MVT::f16;
12799 if (
Op.getValueType() == MVT::i16) {
12810 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
12812 switch (IntrinsicID) {
12813 case Intrinsic::amdgcn_cvt_pkrtz:
12814 case Intrinsic::amdgcn_cubeid:
12815 case Intrinsic::amdgcn_frexp_mant:
12816 case Intrinsic::amdgcn_fdot2:
12817 case Intrinsic::amdgcn_rcp:
12818 case Intrinsic::amdgcn_rsq:
12819 case Intrinsic::amdgcn_rsq_clamp:
12820 case Intrinsic::amdgcn_rcp_legacy:
12821 case Intrinsic::amdgcn_rsq_legacy:
12822 case Intrinsic::amdgcn_trig_preop:
12823 case Intrinsic::amdgcn_log:
12824 case Intrinsic::amdgcn_exp2:
12825 case Intrinsic::amdgcn_sqrt:
12846 unsigned Opcode =
MI->getOpcode();
12848 if (Opcode == AMDGPU::G_FCANONICALIZE)
12851 std::optional<FPValueAndVReg> FCR;
12854 if (FCR->Value.isSignaling())
12856 if (!FCR->Value.isDenormal())
12867 case AMDGPU::G_FADD:
12868 case AMDGPU::G_FSUB:
12869 case AMDGPU::G_FMUL:
12870 case AMDGPU::G_FCEIL:
12871 case AMDGPU::G_FFLOOR:
12872 case AMDGPU::G_FRINT:
12873 case AMDGPU::G_FNEARBYINT:
12874 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
12875 case AMDGPU::G_INTRINSIC_TRUNC:
12876 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
12877 case AMDGPU::G_FMA:
12878 case AMDGPU::G_FMAD:
12879 case AMDGPU::G_FSQRT:
12880 case AMDGPU::G_FDIV:
12881 case AMDGPU::G_FREM:
12882 case AMDGPU::G_FPOW:
12883 case AMDGPU::G_FPEXT:
12884 case AMDGPU::G_FLOG:
12885 case AMDGPU::G_FLOG2:
12886 case AMDGPU::G_FLOG10:
12887 case AMDGPU::G_FPTRUNC:
12888 case AMDGPU::G_AMDGPU_RCP_IFLAG:
12889 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
12890 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
12891 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
12892 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
12894 case AMDGPU::G_FNEG:
12895 case AMDGPU::G_FABS:
12896 case AMDGPU::G_FCOPYSIGN:
12898 case AMDGPU::G_FMINNUM:
12899 case AMDGPU::G_FMAXNUM:
12900 case AMDGPU::G_FMINNUM_IEEE:
12901 case AMDGPU::G_FMAXNUM_IEEE:
12902 case AMDGPU::G_FMINIMUM:
12903 case AMDGPU::G_FMAXIMUM: {
12911 case AMDGPU::G_BUILD_VECTOR:
12916 case AMDGPU::G_INTRINSIC:
12917 case AMDGPU::G_INTRINSIC_CONVERGENT:
12919 case Intrinsic::amdgcn_fmul_legacy:
12920 case Intrinsic::amdgcn_fmad_ftz:
12921 case Intrinsic::amdgcn_sqrt:
12922 case Intrinsic::amdgcn_fmed3:
12923 case Intrinsic::amdgcn_sin:
12924 case Intrinsic::amdgcn_cos:
12925 case Intrinsic::amdgcn_log:
12926 case Intrinsic::amdgcn_exp2:
12927 case Intrinsic::amdgcn_log_clamp:
12928 case Intrinsic::amdgcn_rcp:
12929 case Intrinsic::amdgcn_rcp_legacy:
12930 case Intrinsic::amdgcn_rsq:
12931 case Intrinsic::amdgcn_rsq_clamp:
12932 case Intrinsic::amdgcn_rsq_legacy:
12933 case Intrinsic::amdgcn_div_scale:
12934 case Intrinsic::amdgcn_div_fmas:
12935 case Intrinsic::amdgcn_div_fixup:
12936 case Intrinsic::amdgcn_fract:
12937 case Intrinsic::amdgcn_cvt_pkrtz:
12938 case Intrinsic::amdgcn_cubeid:
12939 case Intrinsic::amdgcn_cubema:
12940 case Intrinsic::amdgcn_cubesc:
12941 case Intrinsic::amdgcn_cubetc:
12942 case Intrinsic::amdgcn_frexp_mant:
12943 case Intrinsic::amdgcn_fdot2:
12944 case Intrinsic::amdgcn_trig_preop:
12959SDValue SITargetLowering::getCanonicalConstantFP(
12962 if (
C.isDenormal()) {
12976 if (
C.isSignaling()) {
12995 return Op.isUndef() || isa<ConstantFPSDNode>(
Op);
12998SDValue SITargetLowering::performFCanonicalizeCombine(
13000 DAGCombinerInfo &DCI)
const {
13003 EVT VT =
N->getValueType(0);
13012 EVT VT =
N->getValueType(0);
13013 return getCanonicalConstantFP(DAG,
SDLoc(
N), VT, CFP->getValueAPF());
13029 EVT EltVT =
Lo.getValueType();
13032 for (
unsigned I = 0;
I != 2; ++
I) {
13035 NewElts[
I] = getCanonicalConstantFP(DAG, SL, EltVT,
13036 CFP->getValueAPF());
13037 }
else if (
Op.isUndef()) {
13049 if (isa<ConstantFPSDNode>(NewElts[1]))
13050 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
13055 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
13106 if (!MinK || !MaxK)
13119 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->
hasMed3_16()))
13120 return DAG.
getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
13162 if (
Info->getMode().DX10Clamp) {
13171 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->
hasMed3_16())) {
13193 DAGCombinerInfo &DCI)
const {
13196 EVT VT =
N->getValueType(0);
13197 unsigned Opc =
N->getOpcode();
13206 (VT == MVT::i32 || VT == MVT::f32 ||
13207 ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->
hasMin3Max3_16()))) {
13214 N->getValueType(0),
13227 N->getValueType(0),
13237 if (
SDValue Med3 = performIntMed3ImmCombine(
13242 if (
SDValue Med3 = performIntMed3ImmCombine(
13248 if (
SDValue Med3 = performIntMed3ImmCombine(
13253 if (
SDValue Med3 = performIntMed3ImmCombine(
13263 (VT == MVT::f32 || VT == MVT::f64 ||
13267 if (
SDValue Res = performFPMed3ImmCombine(DAG,
SDLoc(
N), Op0, Op1))
13278 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13279 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
13288 DAGCombinerInfo &DCI)
const {
13289 EVT VT =
N->getValueType(0);
13312 if (
Info->getMode().DX10Clamp) {
13315 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13318 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
13321 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13332 DAGCombinerInfo &DCI)
const {
13336 return DCI.DAG.getUNDEF(
N->getValueType(0));
13344 bool IsDivergentIdx,
13349 unsigned VecSize = EltSize * NumElem;
13352 if (VecSize <= 64 && EltSize < 32)
13361 if (IsDivergentIdx)
13365 unsigned NumInsts = NumElem +
13366 ((EltSize + 31) / 32) * NumElem ;
13371 return NumInsts <= 16;
13375 return NumInsts <= 15;
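// Sketch of the cost model above, assuming the same accounting: expanding a
// dynamic extract needs roughly one compare/select per element plus one
// 32-bit move per dword of each element, and is only profitable while that
// stays inside the small per-target budget (16, or 15 on some targets).
static bool shouldExpandDynamicExtractSketch(unsigned EltSizeBits,
                                             unsigned NumElem,
                                             unsigned Budget) {
  unsigned NumInsts = NumElem + ((EltSizeBits + 31) / 32) * NumElem;
  return NumInsts <= Budget;
}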
13380 if (isa<ConstantSDNode>(
Idx))
13393SDValue SITargetLowering::performExtractVectorEltCombine(
13394 SDNode *
N, DAGCombinerInfo &DCI)
const {
13400 EVT ResVT =
N->getValueType(0);
13419 if (Vec.
hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13447 DCI.AddToWorklist(Elt0.
getNode());
13448 DCI.AddToWorklist(Elt1.
getNode());
13470 if (!DCI.isBeforeLegalize())
13476 auto *
Idx = dyn_cast<ConstantSDNode>(
N->getOperand(1));
13477 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.
isByteSized() &&
13478 VecSize > 32 && VecSize % 32 == 0 &&
Idx) {
13481 unsigned BitIndex =
Idx->getZExtValue() * VecEltSize;
13482 unsigned EltIdx = BitIndex / 32;
13483 unsigned LeftoverBitIdx = BitIndex % 32;
13487 DCI.AddToWorklist(Cast.
getNode());
13491 DCI.AddToWorklist(Elt.
getNode());
13494 DCI.AddToWorklist(Srl.
getNode());
13498 DCI.AddToWorklist(Trunc.
getNode());
13500 if (VecEltVT == ResVT) {
13512SITargetLowering::performInsertVectorEltCombine(SDNode *N,
13513 DAGCombinerInfo &DCI) const {
13527 EVT IdxVT = Idx.getValueType();
13544 Src.getOperand(0).getValueType() == MVT::f16) {
13545 return Src.getOperand(0);
13548 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13549 APFloat Val = CFP->getValueAPF();
13550 bool LosesInfo = true;
13560 DAGCombinerInfo &DCI) const {
13562 "combine only useful on gfx8");
13564 SDValue TruncSrc = N->getOperand(0);
13565 EVT VT = N->getValueType(0);
13566 if (VT != MVT::f16)
13604unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
13606 const SDNode *N1) const {
13611 if (((VT == MVT::f32 &&
13613 (VT == MVT::f16 && Subtarget->hasMadF16() &&
13633 EVT VT = N->getValueType(0);
13634 if (VT != MVT::i32 && VT != MVT::i64)
13640 unsigned Opc = N->getOpcode();
13663 return DAG.getNode(Opc, SL, VT, Add1, Op2);
13685 DAGCombinerInfo &DCI) const {
13689 EVT VT = N->getValueType(0);
13699 if (!N->isDivergent() && Subtarget->hasSMulHi())
13703 if (NumBits <= 32 || NumBits > 64)
13715 unsigned NumUsers = 0;
13740 bool MulSignedLo = false;
13741 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
13750 if (VT != MVT::i64) {
13773 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
13775 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
13777 std::tie(AccumLo, AccumHi) = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
13779 if (!MulLHSUnsigned32) {
13786 if (!MulRHSUnsigned32) {
13797 if (VT != MVT::i64)
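getMad64_32 above builds the 32x32->64 multiply-add node; the arithmetic identity it relies on, as a hedged scalar model:

#include <cstdint>

// When both multiplicands fit in 32 bits, a 64-bit multiply-accumulate can be
// a single MAD_U64_U32 / MAD_I64_I32 instead of a full 64-bit multiply.
static uint64_t mad64_32(uint32_t A, uint32_t B, uint64_t Accum) {
  return uint64_t(A) * uint64_t(B) + Accum;
}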
13804static std::optional<ByteProvider<SDValue>>
13807 if (!Byte0 || Byte0->isConstantZero()) {
13808 return std::nullopt;
13811 if (Byte1 && !Byte1->isConstantZero()) {
13812 return std::nullopt;
13818 unsigned FirstCs = First & 0x0c0c0c0c;
13819 unsigned SecondCs = Second & 0x0c0c0c0c;
13820 unsigned FirstNoCs = First & ~0x0c0c0c0c;
13821 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
13823 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
13824 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
13825 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
13826 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
13828 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
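Joining the fragments above, addPermMasks combines two v_perm byte-select masks in which the byte code 0x0c stands for "produce zero". A hedged reconstruction of its shape:

// Each byte position must select zero in at least one of the two masks (the
// asserts above); the merged mask takes the real selector where one exists
// and keeps 0x0c only where both operands select zero.
static unsigned addPermMasks(unsigned First, unsigned Second) {
  unsigned FirstCs    = First  &  0x0c0c0c0c;
  unsigned SecondCs   = Second &  0x0c0c0c0c;
  unsigned FirstNoCs  = First  & ~0x0c0c0c0c;
  unsigned SecondNoCs = Second & ~0x0c0c0c0c;
  return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
}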
13852 for (int BPI = 0; BPI < 2; BPI++) {
13855 BPP = {Src1, Src0};
13857 unsigned ZeroMask = 0x0c0c0c0c;
13858 unsigned FMask = 0xFF << (8 * (3 - Step));
13860 unsigned FirstMask =
13861 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13862 unsigned SecondMask =
13863 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13867 int FirstGroup = -1;
13868 for (int I = 0; I < 2; I++) {
13870 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
13871 return IterElt.SrcOp == *BPP.first.Src &&
13872 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
13882 if (FirstGroup != -1) {
13884 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
13885 return IterElt.SrcOp == *BPP.second.Src &&
13886 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
13892 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
13900 unsigned ZeroMask = 0x0c0c0c0c;
13901 unsigned FMask = 0xFF << (8 * (3 - Step));
13905 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13909 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13920 if (Srcs.size() == 1) {
13921 auto Elt = Srcs.begin();
13925 if (Elt->PermMask == 0x3020100)
13932 auto FirstElt = Srcs.begin();
13933 auto SecondElt = std::next(FirstElt);
13940 auto FirstMask = FirstElt->PermMask;
13941 auto SecondMask = SecondElt->PermMask;
13943 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
13944 unsigned FirstPlusFour = FirstMask | 0x04040404;
13947 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
13959 FirstElt = std::next(SecondElt);
13960 if (FirstElt == Srcs.end())
13963 SecondElt = std::next(FirstElt);
13966 if (SecondElt == Srcs.end()) {
13972 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
13978 return Perms.size() == 2
13984 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
13985 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
13986 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
13987 EntryMask += ZeroMask;
13992 auto Opcode = Op.getOpcode();
13998static std::optional<bool>
14009 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
14012 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
14014 assert(!(S0IsUnsigned && S0IsSigned));
14015 assert(!(S1IsUnsigned && S1IsSigned));
14023 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
14029 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14030 return std::nullopt;
14042 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14043 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14048 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14054 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14055 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14056 return std::nullopt;
14062 DAGCombinerInfo &DCI) const {
14064 EVT VT = N->getValueType(0);
14071 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14076 if (SDValue V = reassociateScalarOps(N, DAG)) {
14083 std::optional<bool> IsSigned;
14089 int ChainLength = 0;
14090 for (int I = 0; I < 4; I++) {
14091 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
14094 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14097 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14102 TempNode->getOperand(MulIdx), *Src0, *Src1,
14103 TempNode->getOperand(MulIdx)->getOperand(0),
14104 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14108 IsSigned = *IterIsSigned;
14109 if (*IterIsSigned != *IsSigned)
14112 auto AddIdx = 1 - MulIdx;
14115 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14116 Src2s.push_back(TempNode->getOperand(AddIdx));
14126 TempNode->getOperand(AddIdx), *Src0, *Src1,
14127 TempNode->getOperand(AddIdx)->getOperand(0),
14128 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14132 if (*IterIsSigned != *IsSigned)
14136 ChainLength = I + 2;
14140 TempNode = TempNode->getOperand(AddIdx);
14142 ChainLength = I + 1;
14143 if (TempNode->getNumOperands() < 2)
14145 LHS = TempNode->getOperand(0);
14146 RHS = TempNode->getOperand(1);
14149 if (ChainLength < 2)
14155 if (ChainLength < 4) {
14165 bool UseOriginalSrc = false;
14166 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
14167 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14168 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14169 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
14171 auto Src0Mask = Src0s.begin()->PermMask;
14172 SrcBytes.push_back(Src0Mask & 0xFF000000);
14173 bool UniqueEntries = true;
14174 for (auto I = 1; I < 4; I++) {
14175 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14178 UniqueEntries = false;
14184 if (UniqueEntries) {
14185 UseOriginalSrc = true;
14187 auto FirstElt = Src0s.begin();
14191 auto SecondElt = Src1s.begin();
14193 SecondElt->DWordOffset);
14202 if (!UseOriginalSrc) {
14209 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14212 : Intrinsic::amdgcn_udot4,
14222 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14227 unsigned Opc = LHS.getOpcode();
14232 Opc = RHS.getOpcode();
14238 auto Cond = RHS.getOperand(0);
14246 return DAG.getNode(Opc, SL, VTList, Args);
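The chain walk above is looking for four byte-wise multiplies accumulated into one 32-bit sum, which is what llvm.amdgcn.sdot4 / llvm.amdgcn.udot4 compute. A scalar reference model (signed case shown; illustrative only):

#include <cstdint>

static int32_t sdot4Reference(const int8_t A[4], const int8_t B[4],
                              int32_t Accum) {
  for (int I = 0; I < 4; ++I)
    Accum += int32_t(A[I]) * int32_t(B[I]);
  return Accum;
}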
14260 DAGCombinerInfo &DCI) const {
14262 EVT VT = N->getValueType(0);
14264 if (VT != MVT::i32)
14273 unsigned Opc = RHS.getOpcode();
14279 auto Cond = RHS.getOperand(0);
14287 return DAG.getNode(Opc, SL, VTList, Args);
14301SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
14302 DAGCombinerInfo &DCI) const {
14304 if (N->getValueType(0) != MVT::i32)
14315 unsigned LHSOpc = LHS.getOpcode();
14316 unsigned Opc = N->getOpcode();
14326 DAGCombinerInfo &DCI) const {
14331 EVT VT = N->getValueType(0);
14343 if (A == LHS.getOperand(1)) {
14344 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14345 if (FusedOp != 0) {
14347 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
14355 if (A == RHS.getOperand(1)) {
14356 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14357 if (FusedOp != 0) {
14359 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
14368 DAGCombinerInfo &DCI) const {
14374 EVT VT = N->getValueType(0);
14387 if (A == LHS.getOperand(1)) {
14388 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14393 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
14402 if (A == RHS.getOperand(1)) {
14403 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14406 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
14415 DAGCombinerInfo &DCI) const {
14418 EVT VT = N->getValueType(0);
14432 bool IsNegative = false;
14433 if (CLHS->isExactlyValue(1.0) ||
14434 (IsNegative = CLHS->isExactlyValue(-1.0))) {
14450 DAGCombinerInfo &DCI) const {
14452 EVT VT = N->getValueType(0);
14474 (N->getFlags().hasAllowContract() &&
14475 FMA->getFlags().hasAllowContract())) {
14509 if (Vec1 == Vec2 || Vec3 == Vec4)
14515 if ((Vec1 == Vec3 && Vec2 == Vec4) ||
14516 (Vec1 == Vec4 && Vec2 == Vec3)) {
14525 DAGCombinerInfo &DCI) const {
14531 EVT VT = LHS.getValueType();
14534 auto CRHS = dyn_cast<ConstantSDNode>(RHS);
14536 CRHS = dyn_cast<ConstantSDNode>(LHS);
14560 return LHS.getOperand(0);
14566 isa<ConstantSDNode>(LHS.getOperand(1)) &&
14567 isa<ConstantSDNode>(LHS.getOperand(2)) &&
14568 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
14575 const APInt &CT = LHS.getConstantOperandAPInt(1);
14576 const APInt &CF = LHS.getConstantOperandAPInt(2);
14584 return LHS.getOperand(0);
14588 if (VT != MVT::f32 && VT != MVT::f64 &&
14621 DAGCombinerInfo &DCI) const {
14639 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
14643 unsigned ShiftOffset = 8 * Offset;
14645 ShiftOffset -= C->getZExtValue();
14647 ShiftOffset += C->getZExtValue();
14649 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
14651 MVT::f32, Shifted);
14662 DCI.AddToWorklist(N);
14669 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
14675 DAGCombinerInfo &DCI) const {
14685 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
14688 APFloat One(F.getSemantics(), "1.0");
14690 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
14700 switch (N->getOpcode()) {
14702 return performAddCombine(N, DCI);
14704 return performSubCombine(N, DCI);
14707 return performAddCarrySubCarryCombine(N, DCI);
14709 return performFAddCombine(N, DCI);
14711 return performFSubCombine(N, DCI);
14713 return performFDivCombine(N, DCI);
14715 return performSetCCCombine(N, DCI);
14728 return performMinMaxCombine(N, DCI);
14730 return performFMACombine(N, DCI);
14732 return performAndCombine(N, DCI);
14734 return performOrCombine(N, DCI);
14737 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
14738 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14744 return performXorCombine(N, DCI);
14746 return performZeroExtendCombine(N, DCI);
14748 return performSignExtendInRegCombine(N, DCI);
14750 return performClassCombine(N, DCI);
14752 return performFCanonicalizeCombine(N, DCI);
14754 return performRcpCombine(N, DCI);
14769 return performUCharToFloatCombine(N, DCI);
14771 return performFCopySignCombine(N, DCI);
14776 return performCvtF32UByteNCombine(N, DCI);
14778 return performFMed3Combine(N, DCI);
14780 return performCvtPkRTZCombine(N, DCI);
14782 return performClampCombine(N, DCI);
14785 EVT VT = N->getValueType(0);
14788 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
14791 EVT EltVT = Src.getValueType();
14792 if (EltVT != MVT::i16)
14802 return performExtractVectorEltCombine(N, DCI);
14804 return performInsertVectorEltCombine(N, DCI);
14806 return performFPRoundCombine(N, DCI);
14808 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
14814 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
14815 return performMemSDNodeCombine(MemNode, DCI);
14828 default: return ~0u;
14829 case AMDGPU::sub0: return 0;
14830 case AMDGPU::sub1: return 1;
14831 case AMDGPU::sub2: return 2;
14832 case AMDGPU::sub3: return 3;
14833 case AMDGPU::sub4: return 4;
14840 unsigned Opcode = Node->getMachineOpcode();
14844 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
14850 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
14851 unsigned NewDmask = 0;
14854 bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
14855 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
14858 unsigned TFCLane = 0;
14859 bool HasChain = Node->getNumValues() > 1;
14861 if (OldDmask == 0) {
14869 TFCLane = OldBitsSet;
14877 if (I.getUse().getResNo() != 0)
14881 if (!I->isMachineOpcode() ||
14882 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
14894 if (UsesTFC && Lane == TFCLane) {
14899 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
14901 Dmask &= ~(1 << Comp);
14909 NewDmask |= 1 << Comp;
14914 bool NoChannels = !NewDmask;
14921 if (OldBitsSet == 1)
14927 if (NewDmask == OldDmask)
14936 unsigned NewChannels = BitsSet + UsesTFC;
14940 assert(NewOpcode != -1 &&
14941 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
14942 "failed to find equivalent MIMG op");
14950 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
14952 MVT ResultVT = NewChannels == 1 ?
14954 NewChannels == 5 ? 8 : NewChannels);
14968 if (NewChannels == 1) {
14978 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
14983 if (i || !NoChannels)
14988 if (NewUser != User) {
14996 case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
14997 case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
14998 case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
14999 case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
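adjustWritemask, whose fragments appear above, shrinks an image instruction's dmask to the channels that are actually read and renumbers the surviving lanes. A hedged sketch of the dmask recomputation (illustrative, not the in-tree code):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/bit.h"

// For each lane still extracted by a user, walk the old dmask to find the
// hardware channel that lane corresponded to, and keep only those channels.
static unsigned recomputeDmask(unsigned OldDmask,
                               llvm::ArrayRef<unsigned> UsedLanes) {
  unsigned NewDmask = 0;
  for (unsigned Lane : UsedLanes) {
    unsigned Dmask = OldDmask, Comp = 0;
    for (unsigned I = 0; I <= Lane && Dmask != 0; ++I) {
      Comp = llvm::countr_zero(Dmask);   // next enabled channel
      Dmask &= ~(1u << Comp);
    }
    NewDmask |= 1u << Comp;
  }
  return NewDmask;
}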
15009 Op = Op.getOperand(0);
15011 return isa<FrameIndexSDNode>(Op);
15020 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
15021 SDValue SrcVal = Node->getOperand(2);
15029 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
15031 SDNode *Glued = Node->getGluedNode();
15033 = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
15040 return ToResultReg.getNode();
15045 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
15053 Node->getOperand(i).getValueType(),
15054 Node->getOperand(i)), 0));
15065 unsigned Opcode = Node->getMachineOpcode();
15067 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
15068 !TII->isGather4(Opcode) &&
15070 return adjustWritemask(Node, DAG);
15073 if (Opcode == AMDGPU::INSERT_SUBREG ||
15074 Opcode == AMDGPU::REG_SEQUENCE) {
15080 case AMDGPU::V_DIV_SCALE_F32_e64:
15081 case AMDGPU::V_DIV_SCALE_F64_e64: {
15085 SDValue Src0 = Node->getOperand(1);
15086 SDValue Src1 = Node->getOperand(3);
15087 SDValue Src2 = Node->getOperand(5);
15091 (Src0 == Src1 || Src0 == Src2))
15148 unsigned InitIdx = 0;
15150 if (TII->isImage(MI)) {
15158 unsigned TFEVal = TFE ? TFE->getImm() : 0;
15159 unsigned LWEVal = LWE ? LWE->getImm() : 0;
15160 unsigned D16Val = D16 ? D16->getImm() : 0;
15162 if (!TFEVal && !LWEVal)
15173 assert(MO_Dmask && "Expected dmask operand in instruction");
15175 unsigned dmask = MO_Dmask->getImm();
15182 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
15188 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15189 if (DstSize < InitIdx)
15192 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15200 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
15201 unsigned NewDst = 0;
15210 for (; SizeLeft; SizeLeft--, CurrIdx++) {
15211 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
15229 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
15242 if (TII->isVOP3(MI.getOpcode())) {
15244 TII->legalizeOperandsVOP3(MRI, MI);
15249 if (!MI.getDesc().operands().empty()) {
15250 unsigned Opc = MI.getOpcode();
15251 bool HasAGPRs = Info->mayNeedAGPRs();
15259 if ((I == Src2Idx) && (HasAGPRs))
15262 if (!Op.isReg() || !Op.getReg().isVirtual())
15264 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
15265 if (!TRI->hasAGPRs(RC))
15267 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
15268 if (!Src || !Src->isCopy() ||
15269 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
15271 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
15275 MRI.setRegClass(Op.getReg(), NewRC);
15282 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
15283 if (Src2->isReg() && Src2->getReg().isVirtual()) {
15284 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
15285 if (TRI->isVectorSuperClass(RC)) {
15286 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
15287 MRI.setRegClass(Src2->getReg(), NewRC);
15288 if (Src2->isTied())
15289 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
15298 if (TII->isImage(MI))
15299 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
15325 MVT::v2i32, Ops0), 0);
15355 RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
15377std::pair<unsigned, const TargetRegisterClass *>
15384 if (Constraint.size() == 1) {
15386 switch (Constraint[0]) {
15393 RC = &AMDGPU::SReg_32RegClass;
15396 RC = &AMDGPU::SGPR_64RegClass;
15401 return std::pair(0U, nullptr);
15408 RC = &AMDGPU::VGPR_32RegClass;
15413 return std::pair(0U, nullptr);
15422 RC = &AMDGPU::AGPR_32RegClass;
15427 return std::pair(0U, nullptr);
15436 return std::pair(0U, RC);
15441 if (RegName.consume_front("v")) {
15442 RC = &AMDGPU::VGPR_32RegClass;
15443 } else if (RegName.consume_front("s")) {
15444 RC = &AMDGPU::SGPR_32RegClass;
15445 } else if (RegName.consume_front("a")) {
15446 RC = &AMDGPU::AGPR_32RegClass;
15451 if (RegName.consume_front("[")) {
15461 RC = TRI->getVGPRClassForBitWidth(Width);
15463 RC = TRI->getSGPRClassForBitWidth(Width);
15465 RC = TRI->getAGPRClassForBitWidth(Width);
15467 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15468 return std::pair(Reg, RC);
15473 if (!Failed && Idx < RC->getNumRegs())
15481 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
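The constraint handling above is what backs the usual AMDGPU inline-asm constraints: 'v' selects a 32-bit VGPR class, 's' an SGPR class, 'a' an AGPR class, and an explicit name such as "{v[0:3]}" is parsed into a register tuple. A hypothetical user-level example (the function and asm string are illustrative, not from this file):

// Compiled for amdgcn, this requests one VGPR input and one VGPR output
// through the 'v' constraint handled above.
static float addOneViaAsm(float X) {
  float R;
  __asm__("v_add_f32_e32 %0, 1.0, %1" : "=v"(R) : "v"(X));
  return R;
}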
15487 if (Constraint.size() == 1) {
15488 switch (Constraint[0]) {
15497 } else if (Constraint == "DA" ||
15498 Constraint == "DB") {
15506 if (Constraint.size() == 1) {
15507 switch (Constraint[0]) {
15523 Val = Val & maskTrailingOnes<uint64_t>(Size);
15530 std::vector<SDValue> &Ops,
15545 unsigned Size = Op.getScalarValueSizeInBits();
15553 Val = C->getSExtValue();
15557 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15563 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
15566 Val = C->getSExtValue();
15570 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15580 if (Constraint.size() == 1) {
15581 switch (Constraint[0]) {
15585 return isInt<16>(Val);
15589 return isInt<32>(Val);
15596 } else if (Constraint.size() == 2) {
15597 if (Constraint == "DA") {
15598 int64_t HiBits = static_cast<int32_t>(Val >> 32);
15599 int64_t LoBits = static_cast<int32_t>(Val);
15603 if (Constraint == "DB") {
15611 unsigned MaxSize) const {
15612 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
15615 MVT VT = Op.getSimpleValueType();
15640 switch (UnalignedClassID) {
15641 case AMDGPU::VReg_64RegClassID:
15642 return AMDGPU::VReg_64_Align2RegClassID;
15643 case AMDGPU::VReg_96RegClassID:
15644 return AMDGPU::VReg_96_Align2RegClassID;
15645 case AMDGPU::VReg_128RegClassID:
15646 return AMDGPU::VReg_128_Align2RegClassID;
15647 case AMDGPU::VReg_160RegClassID:
15648 return AMDGPU::VReg_160_Align2RegClassID;
15649 case AMDGPU::VReg_192RegClassID:
15650 return AMDGPU::VReg_192_Align2RegClassID;
15651 case AMDGPU::VReg_224RegClassID:
15652 return AMDGPU::VReg_224_Align2RegClassID;
15653 case AMDGPU::VReg_256RegClassID:
15654 return AMDGPU::VReg_256_Align2RegClassID;
15655 case AMDGPU::VReg_288RegClassID:
15656 return AMDGPU::VReg_288_Align2RegClassID;
15657 case AMDGPU::VReg_320RegClassID:
15658 return AMDGPU::VReg_320_Align2RegClassID;
15659 case AMDGPU::VReg_352RegClassID:
15660 return AMDGPU::VReg_352_Align2RegClassID;
15661 case AMDGPU::VReg_384RegClassID:
15662 return AMDGPU::VReg_384_Align2RegClassID;
15663 case AMDGPU::VReg_512RegClassID:
15664 return AMDGPU::VReg_512_Align2RegClassID;
15665 case AMDGPU::VReg_1024RegClassID:
15666 return AMDGPU::VReg_1024_Align2RegClassID;
15667 case AMDGPU::AReg_64RegClassID:
15668 return AMDGPU::AReg_64_Align2RegClassID;
15669 case AMDGPU::AReg_96RegClassID:
15670 return AMDGPU::AReg_96_Align2RegClassID;
15671 case AMDGPU::AReg_128RegClassID:
15672 return AMDGPU::AReg_128_Align2RegClassID;
15673 case AMDGPU::AReg_160RegClassID:
15674 return AMDGPU::AReg_160_Align2RegClassID;
15675 case AMDGPU::AReg_192RegClassID:
15676 return AMDGPU::AReg_192_Align2RegClassID;
15677 case AMDGPU::AReg_256RegClassID:
15678 return AMDGPU::AReg_256_Align2RegClassID;
15679 case AMDGPU::AReg_512RegClassID:
15680 return AMDGPU::AReg_512_Align2RegClassID;
15681 case AMDGPU::AReg_1024RegClassID:
15682 return AMDGPU::AReg_1024_Align2RegClassID;
15698 if (Info->isEntryFunction()) {
15705 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
15707 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
15708 : TRI->getAlignedHighSGPRForRC(MF, 2,
15709 &AMDGPU::SGPR_64RegClass);
15710 Info->setSGPRForEXECCopy(SReg);
15713 Info->getStackPtrOffsetReg()));
15714 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
15715 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
15719 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
15720 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
15722 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
15723 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
15725 Info->limitOccupancy(MF);
15727 if (ST.isWave32() && !MF.empty()) {
15728 for (auto &MBB : MF) {
15729 for (auto &MI : MBB) {
15730 TII->fixImplicitOperands(MI);
15740 if (ST.needsAlignedVGPRs()) {
15741 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
15747 if (NewClassID != -1)
15748 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
15757 const APInt &DemandedElts,
15759 unsigned Depth) const {
15761 unsigned Opc = Op.getOpcode();
15764 unsigned IID = Op.getConstantOperandVal(0);
15766 case Intrinsic::amdgcn_mbcnt_lo:
15767 case Intrinsic::amdgcn_mbcnt_hi: {
15774 unsigned MaxActiveBits = std::max(Src1ValBits, ST.getWavefrontSizeLog2());
15776 MaxActiveBits += Src1ValBits ? 1 : 0;
15777 unsigned Size = Op.getValueType().getSizeInBits();
15778 if (MaxActiveBits < Size)
15787 Op, Known, DemandedElts, DAG, Depth);
15802 unsigned MaxValue =
15811 switch (MI->getOpcode()) {
15812 case AMDGPU::G_INTRINSIC:
15813 case AMDGPU::G_INTRINSIC_CONVERGENT: {
15815 case Intrinsic::amdgcn_workitem_id_x:
15818 case Intrinsic::amdgcn_workitem_id_y:
15821 case Intrinsic::amdgcn_workitem_id_z:
15824 case Intrinsic::amdgcn_mbcnt_lo:
15825 case Intrinsic::amdgcn_mbcnt_hi: {
15827 unsigned Size = MRI.getType(R).getSizeInBits();
15831 case Intrinsic::amdgcn_groupstaticsize: {
15842 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
15845 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
15848 case AMDGPU::G_AMDGPU_SMED3:
15849 case AMDGPU::G_AMDGPU_UMED3: {
15850 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
15877 unsigned Depth) const {
15879 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
15885 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
15912 if (Header->getAlignment() != PrefAlign)
15913 return Header->getAlignment();
15915 unsigned LoopSize = 0;
15923 LoopSize += TII->getInstSizeInBytes(MI);
15924 if (LoopSize > 192)
15929 if (LoopSize <= 64)
15932 if (LoopSize <= 128)
15933 return CacheLineAlign;
15939 auto I = Exit->getFirstNonDebugInstr();
15940 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
15941 return CacheLineAlign;
15950 if (PreTerm == Pre->begin() ||
15951 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
15955 auto ExitHead = Exit->getFirstNonDebugInstr();
15956 if (ExitHead == Exit->end() ||
15957 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
15962 return CacheLineAlign;
15970 N = N->getOperand(0).getNode();
15981 switch (N->getOpcode()) {
15989 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
15990 return !TRI->isSGPRReg(MRI, Reg);
15996 return !TRI->isSGPRReg(MRI, Reg);
16000 unsigned AS = L->getAddressSpace();
16034 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
16036 return A->readMem() && A->writeMem();
16071 unsigned Depth) const {
16076 if (Info->getMode().DX10Clamp)
16089static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
16103 return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
16116 << "Hardware instruction generated for atomic "
16118 << " operation at memory scope " << MemScope;
16136 bool HasSystemScope =
16195 if (HasSystemScope)
16244 if (HasSystemScope)
16281 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16283 : &AMDGPU::SReg_32RegClass;
16284 if (!TRI->isSGPRClass(RC) && !isDivergent)
16285 return TRI->getEquivalentSGPRClass(RC);
16286 else if (TRI->isSGPRClass(RC) && isDivergent)
16287 return TRI->getEquivalentVGPRClass(RC);
16299 unsigned WaveSize) {
16304 if (!IT || IT->getBitWidth() != WaveSize)
16307 if (!isa<Instruction>(V))
16309 if (!Visited.insert(V).second)
16311 bool Result = false;
16312 for (const auto *U : V->users()) {
16313 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
16314 if (V == U->getOperand(1)) {
16315 switch (Intrinsic->getIntrinsicID()) {
16319 case Intrinsic::amdgcn_if_break:
16320 case Intrinsic::amdgcn_if:
16321 case Intrinsic::amdgcn_else:
16326 if (V == U->getOperand(0)) {
16327 switch (Intrinsic->getIntrinsicID()) {
16331 case Intrinsic::amdgcn_end_cf:
16332 case Intrinsic::amdgcn_loop:
16338 Result = hasCFUser(U, Visited, WaveSize);
16347 const Value *V) const {
16348 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
16349 if (CI->isInlineAsm()) {
16358 for (auto &TC : TargetConstraints) {
16362 SIRI, TC.ConstraintCode, TC.ConstraintVT).second;
16375 for (; I != E; ++I) {
16376 if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) {
16399 return MRI.hasOneNonDBGUse(N0);
16406 if (I.getMetadata("amdgpu.noclobber"))
16408 if (I.getMetadata("amdgpu.last.use"))
16418 if (!Def->isMachineOpcode())
16429 PhysReg = AMDGPU::SCC;
16431 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
16445 "this cannot be replaced with add");
16451 "target should have atomic fadd instructions");
16454 "generic atomicrmw expansion only supports FP32 operand in flat "
16528 for (auto &P : MDs)
16539 {Addr}, nullptr, "is.shared");
16540 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
16545 Value *LoadedShared = CreateNewAtomicRMW(Builder, CastToLocal, Val);
16550 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
16556 Value *LoadedPrivate =
16557 Builder.CreateLoad(ValTy, CastToPrivate, "loaded.private");
16565 Value *LoadedGlobal = CreateNewAtomicRMW(Builder, CastToGlobal, Val);
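The builder calls above carve a flat atomicrmw into three address-space cases. A hedged model of the control flow that ends up executing (the two atomicFAdd* helpers are hypothetical stand-ins for the cloned atomicrmw instructions, and only the fadd case is shown):

float atomicFAddShared(float *Ptr, float Val);   // hypothetical helper
float atomicFAddGlobal(float *Ptr, float Val);   // hypothetical helper

// is.shared -> native LDS atomic; is.private -> plain load/add/store on
// scratch; otherwise fall through to the global-memory atomic.
static float expandedFlatAtomicFAdd(float *Ptr, float Val) {
  if (__builtin_amdgcn_is_shared(Ptr))
    return atomicFAddShared(Ptr, Val);
  if (__builtin_amdgcn_is_private(Ptr)) {
    float Old = *Ptr;
    *Ptr = Old + Val;
    return Old;
  }
  return atomicFAddGlobal(Ptr, Val);
}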
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
#define LLVM_ATTRIBUTE_UNUSED
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
iv Induction Variable Users
static const unsigned MaxDepth
Contains matchers for matching SSA Machine Instructions.
unsigned const TargetRegisterInfo * TRI
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
const char LLVMTargetMachineRef TM
const SmallVectorImpl< MachineOperand > & Cond
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes)
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
bool unsafeFPAtomicsDisabled(Function *F)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getIdxEn(SDValue VIndex)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
static constexpr int Concat[]
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
bool hasMadMacF32Insts() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
This class represents an incoming formal argument to a Function.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Min
*p = old <signed v ? old : v
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
LLVM Basic Block Representation.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
bool isFPPredicate() const
bool isIntPredicate() const
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Diagnostic information for unsupported feature in backend.
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowerinInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
iterator_range< arg_iterator > args()
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
bool hasD16Images() const
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
bool hasDot7Insts() const
bool hasApertureRegs() const
bool hasFlatInstOffsets() const
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasBCNT(unsigned Size) const
bool hasMultiDwordFlatScratchAddressing() const
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
bool hasPrivEnabledTrap2NopBug() const
bool hasUnalignedDSAccessEnabled() const
const SIInstrInfo * getInstrInfo() const override
bool hasDot1Insts() const
bool hasAtomicFaddRtnInsts() const
Align getStackAlignment() const
bool hasScalarSubwordLoads() const
bool enableFlatScratch() const
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
bool supportsGetDoorbellID() const
bool hasFlatAtomicFaddF32Inst() const
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
const SIFrameLowering * getFrameLowering() const override
bool hasUnalignedScratchAccess() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
bool hasGFX940Insts() const
bool hasFullRate64Ops() const
bool isTrapHandlerEnabled() const
bool hasLDSFPAtomicAddF64() const
bool hasFlatGlobalInsts() const
bool getScalarizeGlobalBehavior() const
bool hasScalarSMulU64() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
bool hasNSAEncoding() const
bool hasSMemRealTime() const
bool usePRTStrictNull() const
bool hasUnalignedBufferAccessEnabled() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasImageGather4D16Bug() const
bool supportsMinMaxDenormModes() const
bool hasAtomicFaddInsts() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicFaddNoRtnInsts() const
bool hasScalarDwordx3Loads() const
bool hasLDSFPAtomicAddF32() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
bool hasDot8Insts() const
bool hasDS96AndDS128() const
bool useFlatForGlobal() const
Generation getGeneration() const
bool hasScalarAddSub64() const
bool hasUnpackedD16VMem() const
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
bool hasPackedTID() const
bool hasAddNoCarry() const
bool hasGWSAutoReplay() const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
int64_t getOffset() const
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Value * CreateFAdd(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
BasicBlock::iterator GetInsertPoint() const
BasicBlock * GetInsertBlock() const
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
LLVMContext & getContext() const
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
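A minimal sketch of how the IRBuilder helpers listed above compose; the helper name expandFAddInPlace and the float-typed pointer are hypothetical, not part of SITargetLowering:

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Illustrative only: append a plain load/fadd/store sequence to a block.
static void expandFAddInPlace(BasicBlock *BB, Value *Ptr, Value *Inc) {
  IRBuilder<> B(BB->getContext());
  B.SetInsertPoint(BB);                           // append to the end of BB
  Type *F32 = Type::getFloatTy(BB->getContext());
  Value *Old = B.CreateAlignedLoad(F32, Ptr, Align(4), "old");
  Value *Sum = B.CreateFAdd(Old, Inc, "sum");
  B.CreateStore(Sum, Ptr);
  // An atomic variant would instead use something like:
  //   B.CreateAtomicRMW(AtomicRMWInst::FAdd, Ptr, Inc, Align(4),
  //                     AtomicOrdering::SequentiallyConsistent);
}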
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
const BasicBlock * getParent() const
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
void getAllMetadata(SmallVectorImpl< std::pair< unsigned, MDNode * > > &MDs) const
Get all metadata attached to this Instruction.
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
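A small illustrative use of the LLT helpers above (the header path for LowLevelType.h varies between LLVM releases; the values in the comments are what these calls return for the chosen types):

#include "llvm/CodeGenTypes/LowLevelType.h"  // "llvm/CodeGen/LowLevelType.h" in older trees
using namespace llvm;

void lltExamples() {
  LLT S32 = LLT::scalar(32);            // 32-bit "bag of bits"
  LLT P1  = LLT::pointer(1, 64);        // 64-bit pointer in address space 1
  LLT S16 = S32.changeElementSize(16);  // scalar stays scalar, now 16 bits wide
  (void)S32.isScalar();                 // true
  (void)S16.getSizeInBits();            // TypeSize of 16 bits
  (void)P1.getSizeInBits();             // TypeSize of 64 bits
}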
This is an important class for using LLVM in a threaded context.
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
void getSyncScopeNames(SmallVectorImpl< StringRef > &SSNs) const
getSyncScopeNames - Populates client supplied SmallVector with synchronization scope names registered...
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
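For illustration, getOrInsertSyncScopeID combined with LoadInst::setAtomic; the scope name "agent" and the helper name are only examples, not the file's actual code:

#include "llvm/IR/Instructions.h"
using namespace llvm;

// Illustrative only: promote a plain load to a monotonic atomic load in a
// named synchronization scope.
static void makeLoadAtomicInScope(LoadInst &LI) {
  SyncScope::ID SSID = LI.getContext().getOrInsertSyncScopeID("agent");
  LI.setAtomic(AtomicOrdering::Monotonic, SSID);
}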
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
bool isCompare() const
Return true if this instruction is a comparison.
bool hasImplicitDefOfPhysReg(unsigned Reg, const MCRegisterInfo *MRI=nullptr) const
Return true if this instruction implicitly defines the specified physical register.
Wrapper class representing physical registers. Should be passed by value.
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
bool bitsLT(MVT VT) const
Return true if this has fewer bits than VT.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
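A few of the MVT queries above in use; the values in the comments are what these calls return for the chosen types:

#include "llvm/CodeGenTypes/MachineValueType.h"  // "llvm/Support/MachineValueType.h" in older trees
using namespace llvm;

void mvtExamples() {
  MVT V4I16 = MVT::getVectorVT(MVT::i16, 4);      // MVT::v4i16
  (void)V4I16.isVector();                         // true
  (void)V4I16.getVectorNumElements();             // 4
  (void)V4I16.getScalarType();                    // MVT::i16
  (void)V4I16.getSizeInBits();                    // 64 bits
  (void)V4I16.getStoreSize();                     // 8 bytes
  (void)MVT::getIntegerVT(64).bitsLT(MVT::i128);  // true: i64 has fewer bits than i128
}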
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor blocks to refer to this block instead of FromMBB.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitInst.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before 'Where'.
Align getAlignment() const
Return alignment of the basic block.
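A schematic of the block-splitting pattern these MachineBasicBlock hooks support in a custom inserter; this is a generic sketch (the helper name splitAroundMI is hypothetical), not SITargetLowering's actual EmitInstrWithCustomInserter code:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include <iterator>
using namespace llvm;

static MachineBasicBlock *splitAroundMI(MachineInstr &MI, MachineBasicBlock *BB) {
  MachineFunction *MF = BB->getParent();
  const BasicBlock *IRBB = BB->getBasicBlock();

  // New blocks: one for the expanded body, one for the fall-through tail.
  MachineBasicBlock *BodyMBB = MF->CreateMachineBasicBlock(IRBB);
  MachineBasicBlock *TailMBB = MF->CreateMachineBasicBlock(IRBB);
  MachineFunction::iterator It = BB->getIterator();
  ++It;
  MF->insert(It, BodyMBB);
  MF->insert(It, TailMBB);

  // Everything after MI moves to the tail (range-taking splice overload),
  // and the tail inherits BB's successors.
  MachineBasicBlock::iterator MII(&MI);
  TailMBB->splice(TailMBB->begin(), BB, std::next(MII), BB->end());
  TailMBB->transferSuccessorsAndUpdatePHIs(BB);

  // BB falls through to the body; the body falls through to the tail.
  BB->addSuccessor(BodyMBB);
  BodyMBB->addSuccessor(TailMBB);

  MI.eraseFromParent();
  return TailMBB;
}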
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do so.
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
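For illustration, the getMachineMemOperand overload listed earlier combined with the flags above; the pointer info and size are placeholders:

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
using namespace llvm;

// Illustrative only: describe a 4-byte, invariant, dereferenceable load.
static MachineMemOperand *makeInvariantLoadMMO(MachineFunction &MF) {
  return MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LLT::scalar(32), Align(4));
}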
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
A Module instance is used to store all the information related to an LLVM module.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool hasWorkGroupIDZ() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node N can be combined into an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isMemOpUniform(const SDNode *N) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion using a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp/select instructions.
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
bool getAddrModeArguments(IntrinsicInst *, SmallVectorImpl< Value * > &, Type *&) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const Pass * getPass() const
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
SDValue getRegister(unsigned Reg, EVT VT)
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
const TargetMachine & getTarget() const
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
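As a sketch of how these SelectionDAG helpers typically compose during custom lowering, here is a generic split of a binary vector operation into two halves; this is conceptually what splitBinaryVectorOp does, but it is illustrative rather than the actual implementation:

#include "llvm/CodeGen/SelectionDAG.h"
#include <tuple>
using namespace llvm;

static SDValue splitBinaryVectorOpSketch(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  EVT LoVT, HiVT;
  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);

  // Split both operands with EXTRACT_SUBVECTOR into low/high halves.
  SDValue Lo0, Hi0, Lo1, Hi1;
  std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
  std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);

  // Re-emit the operation on each half, then concatenate the results.
  SDValue Lo = DAG.getNode(Op.getOpcode(), DL, LoVT, Lo0, Lo1);
  SDValue Hi = DAG.getNode(Op.getOpcode(), DL, HiVT, Hi0, Hi1);
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}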
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
iterator insert(iterator I, T &&Elt)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
StringRef - Represent a constant reference to a string, i.e.
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
constexpr size_t size() const
size - Get the string size.
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
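StringSwitch is the usual way such constraint strings get classified; a hypothetical example follows (the mapping is illustrative, not SITargetLowering's actual getConstraintType logic):

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
using namespace llvm;

enum class RegBank { SGPR, VGPR, AGPR, Unknown };

static RegBank classifyConstraint(StringRef C) {
  return StringSwitch<RegBank>(C)
      .Case("s", RegBank::SGPR)
      .Case("v", RegBank::VGPR)
      .Case("a", RegBank::AGPR)
      .Default(RegBank::Unknown);
}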
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
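The setup hooks above are normally called from a target's lowering constructor; a minimal sketch with a hypothetical target (register classes are commented out because they are target-specific, and the chosen actions are placeholders):

#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

class MyTargetLowering : public TargetLowering {
public:
  explicit MyTargetLowering(const TargetMachine &TM) : TargetLowering(TM) {
    // addRegisterClass(MVT::i32, &MyTarget::GPR32RegClass);  // target-specific

    // Per-operation, per-type legality decisions.
    setOperationAction(ISD::ADD, MVT::i64, Custom);
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setTruncStoreAction(MVT::i64, MVT::i32, Expand);

    // Boolean results are 0/1; schedule for register pressure.
    setBooleanContents(ZeroOrOneBooleanContent);
    setSchedulingPreference(Sched::RegPressure);

    // computeRegisterProperties(TRI);  // after all register classes are added
  }
};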
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command line.
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
A Use represents the edge between a Value definition and its users.
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< use_iterator > uses()
void takeName(Value *V)
Transfer the name from V to this value.
constexpr bool isZero() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ BUFFER_ATOMIC_COND_SUB_U32
@ TBUFFER_LOAD_FORMAT_D16
@ BUFFER_ATOMIC_FADD_BF16
@ TBUFFER_STORE_FORMAT_D16
@ BUFFER_STORE_FORMAT_D16
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
@ TowardZeroF32_TowardNegativeF64
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
@ C
The default llvm calling convention, compatible with C.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SET_ROUNDING
Set rounding mode.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ GET_ROUNDING
Returns the current rounding mode: -1 = Undefined, 0 = Round to 0, 1 = Round to nearest, ties to even, 2 = Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
@ SMULO
SMULO/UMULO - Same as [SU]ADDO/[SU]SUBO, but for multiplication: returns the product and a flag indicating overflow.
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
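A minimal usage sketch (assuming getSetCCSwappedOperands is visible via llvm/CodeGen/ISDOpcodes.h, as in recent trees): commuting the operands of a compare requires the swapped condition code.

// Illustrative only: swapping compare operands swaps the condition code.
#include "llvm/CodeGen/ISDOpcodes.h"
#include <cassert>

static void setCCSwapExample() {
  using namespace llvm;
  // (X < Y) holds exactly when (Y > X) holds, so SETLT swaps to SETGT.
  assert(ISD::getSetCCSwappedOperands(ISD::SETLT) == ISD::SETGT);
  // Unsigned compares swap the same way: SETULE <-> SETUGE.
  assert(ISD::getSetCCSwappedOperands(ISD::SETULE) == ISD::SETUGE);
}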
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
StringRef getName(ID id)
Return the LLVM name for an intrinsic, such as "llvm.ppc.altivec.lvx".
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ System
Synchronized with respect to all concurrently executing threads.
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
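A hedged sketch of the IR-to-ISD predicate mapping (assuming the declaration in llvm/CodeGen/Analysis.h):

// Illustrative only: IR integer predicates map directly onto ISD condition codes.
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/IR/Instructions.h"
#include <cassert>

static void icmpCondCodeExample() {
  using namespace llvm;
  assert(getICmpCondCode(ICmpInst::ICMP_SLT) == ISD::SETLT);   // signed <
  assert(getICmpCondCode(ICmpInst::ICMP_UGT) == ISD::SETUGT);  // unsigned >
}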
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
int popcount(T Value) noexcept
Count the number of set bits in a value.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
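Hedged worked examples for the integer helpers above (maxIntN, popcount, divideCeil), assuming their usual homes in llvm/Support/MathExtras.h and llvm/ADT/bit.h:

// Illustrative values only.
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

static void integerHelperExamples() {
  assert(llvm::maxIntN(8) == 127);       // largest signed 8-bit value
  assert(llvm::popcount(0xF0u) == 4);    // four bits set
  assert(llvm::divideCeil(10, 3) == 4);  // ceil(10 / 3)
}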
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
testing::Matcher< const detail::ErrorHolder & > Failed()
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
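Hedged worked examples for the bit-level helpers listed above (bit_width, PowerOf2Ceil, countr_zero, isShiftedMask_64), assuming llvm/ADT/bit.h and llvm/Support/MathExtras.h:

// Illustrative values only.
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

static void bitHelperExamples() {
  assert(llvm::bit_width(16u) == 5);        // 16 occupies bits 0..4
  assert(llvm::PowerOf2Ceil(17) == 32);     // next power of two >= 17
  assert(llvm::countr_zero(0x8u) == 3);     // three trailing zero bits
  assert(llvm::isShiftedMask_64(0x0FF0));   // one contiguous run of ones
  assert(!llvm::isShiftedMask_64(0x0F0F));  // two runs, so not a shifted mask
}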
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant bit, stopping at the first 1.
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
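Hedged worked examples for Log2_32, countl_zero, isPowerOf2_32 and the Hi_32/Lo_32 pair (Lo_32 is listed a few entries below), assuming llvm/Support/MathExtras.h and llvm/ADT/bit.h:

// Illustrative values only.
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

static void logAndSplitExamples() {
  assert(llvm::Log2_32(32) == 5);                // floor(log2(32))
  assert(llvm::countl_zero(uint32_t(1)) == 31);  // 31 leading zeros in a 32-bit 1
  assert(llvm::isPowerOf2_32(64) && !llvm::isPowerOf2_32(0));
  uint64_t Wide = 0x0123456789ABCDEFULL;
  assert(llvm::Hi_32(Wide) == 0x01234567u);      // upper 32 bits
  assert(llvm::Lo_32(Wide) == 0x89ABCDEFu);      // lower 32 bits
}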
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
DWARFExpression::Operation Op
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
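A minimal sketch of the range wrappers referenced above (drop_begin, append_range, any_of, find_if, is_contained), assuming llvm/ADT/STLExtras.h:

// Illustrative only.
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>

static void rangeHelperExamples() {
  llvm::SmallVector<int, 4> Vals = {1, 2, 3, 4};
  assert(llvm::is_contained(Vals, 3));
  assert(llvm::any_of(Vals, [](int V) { return V > 3; }));
  assert(*llvm::find_if(Vals, [](int V) { return V % 2 == 0; }) == 2);
  int Sum = 0;
  for (int V : llvm::drop_begin(Vals))  // visits 2, 3, 4
    Sum += V;
  assert(Sum == 9);
  llvm::SmallVector<int, 8> Dest;
  llvm::append_range(Dest, Vals);       // Dest now holds 1, 2, 3, 4
  assert(Dest.size() == 4);
}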
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value that is Skew mod Align.
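Hedged worked examples for the alignment helpers above (alignTo, commonAlignment, alignDown), assuming llvm/Support/Alignment.h and llvm/Support/MathExtras.h:

// Illustrative values only.
#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

static void alignmentExamples() {
  assert(llvm::alignTo(10, llvm::Align(8)) == 16);  // round up to an 8-byte boundary
  assert(llvm::alignDown(10, 8) == 8);              // round down to an 8-byte boundary
  // A 16-byte aligned base plus an offset of 4 is only 4-byte aligned overall.
  assert(llvm::commonAlignment(llvm::Align(16), 4) == llvm::Align(4));
}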
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static const fltSemantics & IEEEsingle() LLVM_READNONE
static constexpr roundingMode rmNearestTiesToEven
static const fltSemantics & IEEEhalf() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector of NumElements elements, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has fewer bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
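A hedged sketch of the EVT queries listed above (getVectorVT, isVector, getVectorNumElements, getVectorElementType, getSizeInBits, getScalarType), assuming llvm/CodeGen/ValueTypes.h and an available LLVMContext:

// Illustrative only.
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

static void evtExamples(llvm::LLVMContext &Ctx) {
  llvm::EVT V4I32 = llvm::EVT::getVectorVT(Ctx, llvm::MVT::i32, 4);  // v4i32
  assert(V4I32.isVector() && V4I32.isInteger());
  assert(V4I32.getVectorNumElements() == 4);
  assert(V4I32.getVectorElementType() == llvm::MVT::i32);
  assert(V4I32.getSizeInBits().getFixedValue() == 128);
  assert(V4I32.getScalarType().isScalarInteger());
}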
bool isInteger() const
Return true if this is an integer or a vector integer type.
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
bool isUnknown() const
Returns true if we don't know any bits.
void resetAll()
Resets the known state of all bits.
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const