#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "si-lower"

static cl::opt<bool> DisableLoopAlignment(
    "amdgpu-disable-loop-alignment",
    cl::desc("Do not align and prefetch loops"), cl::init(false));

static cl::opt<bool> UseDivergentRegisterIndexing(
    "amdgpu-use-divergent-register-indexing", cl::Hidden,
    cl::desc("Use indirect register addressing for divergent indexes"),
    cl::init(false));

// Find the first SGPR not already allocated by the calling convention.
static unsigned findFirstFreeSGPR(CCState &CCInfo) {
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg))
      return AMDGPU::SGPR0 + Reg;
  }
  llvm_unreachable("Cannot allocate sgpr");
}
  setOperationAction(ISD::LOAD,
                     {MVT::v2i32,  MVT::v3i32,  MVT::v4i32,  MVT::v5i32,
                      MVT::v6i32,  MVT::v7i32,  MVT::v8i32,  MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1,     MVT::v32i32},
                     Custom);

  setOperationAction(ISD::STORE,
                     {MVT::v2i32,  MVT::v3i32,  MVT::v4i32,  MVT::v5i32,
                      MVT::v6i32,  MVT::v7i32,  MVT::v8i32,  MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1,     MVT::v32i32},
                     Custom);

  setOperationAction(ISD::SELECT_CC,
                     {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
  setOperationAction(ISD::TRUNCATE,
                     {MVT::v2i32,  MVT::v3i32,  MVT::v4i32,  MVT::v5i32,
                      MVT::v6i32,  MVT::v7i32,  MVT::v8i32,  MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
                     Expand);
  setOperationAction(ISD::FP_ROUND,
                     {MVT::v2f32,  MVT::v3f32,  MVT::v4f32,  MVT::v5f32,
                      MVT::v6f32,  MVT::v7f32,  MVT::v8f32,  MVT::v9f32,
                      MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
                     Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG,
                     {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
                      MVT::v3i16, MVT::v4i16, MVT::Other},
                     Custom);

  setOperationAction(ISD::BR_CC,
                     {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
  for (MVT VT :
       {MVT::v8i32,   MVT::v8f32,  MVT::v9i32,  MVT::v9f32,  MVT::v10i32,
        MVT::v10f32,  MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
        MVT::v16i32,  MVT::v16f32, MVT::v2i64,  MVT::v2f64,  MVT::v4i16,
        MVT::v4f16,   MVT::v4bf16, MVT::v3i64,  MVT::v3f64,  MVT::v6i32,
        MVT::v6f32,   MVT::v4i64,  MVT::v4f64,  MVT::v8i64,  MVT::v8f64,
        MVT::v8i16,   MVT::v8f16,  MVT::v8bf16, MVT::v16i16, MVT::v16f16,
        MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
        MVT::v32i16,  MVT::v32f16, MVT::v32bf16}) {
  // Each of these loops promotes the vector-manipulation operations on the
  // 64-bit element type to the equivalent 32-bit element type; the loop
  // bodies are elided in this excerpt.
  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) { /* ... */ }

  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) { /* ... */ }

  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) { /* ... */ }

  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) { /* ... */ }

  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) { /* ... */ }
  setOperationAction(ISD::VECTOR_SHUFFLE,
                     {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
                     Expand);

  // Avoid stack access for these.
  setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT},
                     {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
                     Custom);

  // Deal with vec3 vector operations when widened to vec4.
  setOperationAction(ISD::INSERT_SUBVECTOR,
                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);

  // Deal with vec5/6/7 vector operations when widened to vec8.
  setOperationAction(ISD::INSERT_SUBVECTOR,
                     {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
                     Custom);

  setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
                     {MVT::f32, MVT::f64}, Legal);
    for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
                   MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                   MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
                   MVT::v32f16}) { /* ... */ }
    setOperationAction(/* ops elided in this excerpt */,
                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
                       Custom);
    setOperationAction(/* ops elided in this excerpt */,
                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
                       Custom);

    for (MVT VT : {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
                   MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16,
                   MVT::v32bf16}) { /* ... */ }

    setOperationAction(/* ops elided in this excerpt */,
                       {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
                        MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
                       Custom);

    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
      // Split vector operations.
      setOperationAction(/* integer ops elided */, VT, Custom);

    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
      // Split vector operations.
      setOperationAction(/* FP ops elided */, VT, Custom);

    setOperationAction(/* ops elided in this excerpt */,
                       {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
                       Custom);

    setOperationAction(/* ops elided in this excerpt */,
                       {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8,
                        MVT::v4i8, MVT::v8i8, MVT::v8i16, MVT::v8f16,
                        MVT::v8bf16, MVT::v16i16, MVT::v16f16, MVT::v16bf16,
                        MVT::v32i16, MVT::v32f16, MVT::v32bf16},
                       Custom);

  setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE},
                     {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);

  setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
                     {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
                     Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN,
                     {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
                      MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
                      MVT::i128, MVT::i8},
                     Custom);

  setOperationAction(ISD::INTRINSIC_W_CHAIN,
                     {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
                      MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
                      MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
                      MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
                     Custom);

  setOperationAction(ISD::INTRINSIC_VOID,
                     {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
                      MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
                      MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
                     Custom);

static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
                                       EVT DestVT, EVT SrcVT) const {
  // ...
}

bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
                                       LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
         DestTy.getScalarSizeInBits() == 32 &&
         SrcTy.getScalarSizeInBits() == 16 /* ... */;
}
    // Packed 16-bit vector pieces travel in 32-bit registers: bf16 halves as
    // i32, f16 halves as v2f16.
    return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
// ...
    return VT.isInteger() ? MVT::i32 : MVT::f32;
// ...
      // Two 16-bit elements per 32-bit register.
      return (NumElts + 1) / 2;
// ...
      // One register per 32-bit chunk of each element.
      return NumElts * ((Size + 31) / 32);
unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {
  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
    unsigned NumElts = VT.getVectorNumElements();
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();
    if (Size == 16 && Subtarget->has16BitInsts()) {
      if (ScalarVT == MVT::bf16) {
        RegisterVT = MVT::i32;
        IntermediateVT = MVT::v2bf16;
      } else {
        RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
        IntermediateVT = RegisterVT;
      }
      NumIntermediates = (NumElts + 1) / 2;
      return NumIntermediates;
    }

    if (Size == 32) {
      RegisterVT = ScalarVT.getSimpleVT();
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size < 16 && Subtarget->has16BitInsts()) {
      // FIXME: Should probably form v2i16 pieces.
      RegisterVT = MVT::i16;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size != 16 && Size <= 32) {
      RegisterVT = MVT::i32;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size > 32) {
      RegisterVT = MVT::i32;
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts * ((Size + 31) / 32);
      return NumIntermediates;
    }
  }

  return TargetLowering::getVectorTypeBreakdownForCallingConv(
      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
                                 const DataLayout &DL, Type *Ty,
                                 unsigned MaxNumLanes) {
  assert(MaxNumLanes != 0);

  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
    // ...
  }
  // ...
}

// Peek through TFE struct returns to only use the data size.
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
                                   const DataLayout &DL, Type *Ty,
                                   unsigned MaxNumLanes) {
  auto *ST = dyn_cast<StructType>(Ty);
  if (!ST)
    return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);

  // TFE intrinsics return an aggregate of the data and an i32 status word.
  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));
  return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
}

// Buffer fat pointers (160 bits) and buffer strided pointers (192 bits) get
// special MVTs since there is no scalar MVT of those widths.
  if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
      DL.getPointerSizeInBits(AS) == 192)
    // ...
// ...
  if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
       DL.getPointerSizeInBits(AS) == 160) ||
      (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
       DL.getPointerSizeInBits(AS) == 192))
    // ...
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                          const CallInst &CI,
                                          MachineFunction &MF,
                                          unsigned IntrID) const {
  if (CI.hasMetadata(LLVMContext::MD_invariant_load))
    Info.flags |= MachineMemOperand::MOInvariant;
  // ...

  if (RsrcIntr->IsImage)
    Info.align.reset();

  if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
    // ...
    Info.ptrVal = RsrcArg;
  }
  // ...

  if (RsrcIntr->IsImage) {
    unsigned MaxNumLanes = 4;
    // ...
  }
  // ...
    Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
                                         CI.getType(),
                                         std::numeric_limits<unsigned>::max());
  // ...

  if (RsrcIntr->IsImage) {
    unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
    // ...
  }
  // ...

  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
    // ...
  }
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    // ...
  }
  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    // ...
    Info.ptrVal = nullptr;
    // ...
  }
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
    // ...
  }
  case Intrinsic::amdgcn_global_atomic_csub: {
    // ...
  }
  case Intrinsic::amdgcn_image_bvh_intersect_ray: {
    // ...
  }
  case Intrinsic::amdgcn_global_atomic_fadd:
  case Intrinsic::amdgcn_global_atomic_fmin:
  case Intrinsic::amdgcn_global_atomic_fmax:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
    // ...
  }
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128: {
    // ...
  }
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
    // ...
    Info.memVT = MVT::i32;
    // ...
    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
      Info.flags |= MachineMemOperand::MOLoad;
    // ...
  }
  case Intrinsic::amdgcn_global_load_lds: {
    // ...
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
    // ...
  }
  case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
    // ...
    Info.memVT = MVT::i32;
    // ...
  }
  case Intrinsic::amdgcn_addrspacecast_nonnull: {
    // ...
    unsigned SrcAS =
        I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();
    // ...
  }
bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
                                            SmallVectorImpl<Value *> &Ops,
                                            Type *&AccessTy) const {
  Value *Ptr = nullptr;
  switch (II->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_csub:
  case Intrinsic::amdgcn_global_atomic_fadd:
  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_global_atomic_fmax:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fmin:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
    Ptr = II->getArgOperand(0);
    break;
  case Intrinsic::amdgcn_global_load_lds:
    Ptr = II->getArgOperand(1);
    break;
  default:
    return false;
  }
  Ops.push_back(Ptr);
  AccessTy = II->getType();
  return true;
}
bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
                                                 unsigned AddrSpace) const {
  // ...
  return AM.Scale == 0 &&
         (AM.BaseOffs == 0 ||
          Subtarget->getInstrInfo()->isLegalFLATOffset(
              AM.BaseOffs, AddrSpace, FlatVariant));
}
// ...
    return isLegalMUBUFAddressingMode(AM);

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
  // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
  // additionally can do r + r + i with addr64. 32-bit has more addressing
  // mode options.
  if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
    return false;
  // ...
  if (AM.HasBaseReg) {
    // ...
  }
  // ...
}
// ...
    return isLegalMUBUFAddressingMode(AM);
// ...
  if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
    // ...
// ...
           : isLegalMUBUFAddressingMode(AM);
bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
    unsigned Size, unsigned AddrSpace, Align Alignment,
    MachineMemOperand::Flags Flags, unsigned *IsFast) const {
  // ...
  if (/* LDS access with misaligned-LDS hardware bug */ Size > 32 &&
      Alignment < RequiredAlignment)
    return false;
  // ...
  switch (Size) {
  case 64:
    // ds_read2/write2_b32 require a 4-byte aligned base.
    RequiredAlignment = Align(4);
    // ...
    if (IsFast)
      *IsFast = (Alignment >= RequiredAlignment) ? 64
                : (Alignment < Align(4))         ? 32
                                                 : 1;
    // ...
  case 96:
    // ...
    if (IsFast)
      *IsFast = (Alignment >= RequiredAlignment) ? 96
                : (Alignment < Align(4))         ? 32
                                                 : 1;
    // ...
  case 128:
    // ds_read2/write2_b64 require an 8-byte aligned base.
    RequiredAlignment = Align(8);
    // ...
    if (IsFast)
      *IsFast = (Alignment >= RequiredAlignment) ? 128
                : (Alignment < Align(4))         ? 32
                                                 : 1;
    // ...
  default:
    // ...
    if (IsFast)
      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;

    return Alignment >= RequiredAlignment ||
           Subtarget->hasUnalignedDSAccessEnabled();
  }
  // ...
  {
    bool AlignedBy4 = Alignment >= Align(4);
    if (IsFast)
      *IsFast = AlignedBy4;

    return AlignedBy4 ||
           Subtarget->enableFlatScratch() /* ... */;
  }
  // ...
  {
    bool AlignedBy4 = Alignment >= Align(4);
    if (IsFast)
      *IsFast = AlignedBy4;
    // ...
  }
  // ...
    return Alignment >= Align(4) ||
           Subtarget->hasUnalignedBufferAccessEnabled();
  // ...
  return Size >= 32 && Alignment >= Align(4);
}

bool SITargetLowering::allowsMisalignedMemoryAccesses(
    EVT VT, unsigned AddrSpace, Align Alignment,
    MachineMemOperand::Flags Flags, unsigned *IsFast) const {
  return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
                                            Alignment, Flags, IsFast);
}
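// Note that on this target the IsFast out-parameter of these hooks is an
// unsigned rather than a bool: it encodes the widest access width in bits
// (32/64/96/128/Size above) that is fast at the given alignment, with 0
// meaning the access is not fast at all.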
EVT SITargetLowering::getOptimalMemOpType(
    const MemOp &Op, const AttributeList &FuncAttributes) const {
  // FIXME: Should account for address space here.
  if (Op.size() >= 16 &&
      Op.isDstAligned(Align(4))) // XXX: Should only do for global
    return MVT::v4i32;

  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
    return MVT::v2i32;

  // Use the default.
  return MVT::Other;
}
// ...
  const MemSDNode *MemNode = cast<MemSDNode>(N);
  // ...

bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
                                           unsigned DestAS) const {
  // ...
  return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
}
// ...
  const MemSDNode *MemNode = cast<MemSDNode>(N);
  // ...

bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                               unsigned Index) const {
  // ...
}
const {
1963 std::tie(InputPtrReg, RC, ArgTy) =
1973 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1979 const SDLoc &SL)
const {
1986 const SDLoc &SL)
const {
1989 std::optional<uint32_t> KnownSize =
1991 if (KnownSize.has_value())
2018 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
SDValue SITargetLowering::lowerKernargMemParameter(
    SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
    uint64_t Offset, Align Alignment, bool Signed,
    const ISD::InputArg *Arg) const {
  // ...
  int64_t OffsetDiff = Offset - AlignDownOffset;
  // ...
  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
  // ...
    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
  // ...
  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
  // ...
}

// lowerStackParameter:
  // ...
      ExtType, SL, VA.getLocVT(), Chain, FIN,
      // ...
    Reg = &WorkGroupIDX;
    RC = &AMDGPU::SReg_32RegClass;
    // ...
    Reg = &WorkGroupIDY;
    RC = &AMDGPU::SReg_32RegClass;
    // ...
    Reg = &WorkGroupIDZ;
    RC = &AMDGPU::SReg_32RegClass;
    // ...

  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
    // ...
           "vector type argument should have been split");
    // ...
    bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
    // ...
             "unexpected vector split in ps argument type");
    // ...
      Info->markPSInputAllocated(PSInputNum);
      // ...
        Info->markPSInputEnabled(PSInputNum);
    // ...
  }
  if (Info.hasWorkItemIDX()) {
    // ...
        Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
    // ...
  }

  if (Info.hasWorkItemIDY()) {
    // ...
    unsigned Reg = AMDGPU::VGPR1;
    // ...
  }

  if (Info.hasWorkItemIDZ()) {
    // ...
    unsigned Reg = AMDGPU::VGPR2;
    // ...
  }
// ...
  if (RegIdx == ArgVGPRs.size()) {
    // ...
  }

  unsigned Reg = ArgVGPRs[RegIdx];
  // ...
  assert(Reg != AMDGPU::NoRegister);
// ...
                             unsigned NumArgRegs) {
  // ...
  if (RegIdx == ArgSGPRs.size())
    // ...
  unsigned Reg = ArgSGPRs[RegIdx];
  // ...
  assert(Reg != AMDGPU::NoRegister);
// ...
  assert(Reg != AMDGPU::NoRegister);
  const unsigned Mask = 0x3ff;
  // ...
  if (Info.hasWorkItemIDX()) {
    // ...
    Info.setWorkItemIDX(Arg);
  }
  if (Info.hasWorkItemIDY()) {
    // ...
    Info.setWorkItemIDY(Arg);
  }
  if (Info.hasWorkItemIDZ())
    // ...
// ...
  const unsigned Mask = 0x3ff;
// ...
  if (Info.hasImplicitArgPtr())
    // ...
  if (Info.hasWorkGroupIDX())
    // ...
  if (Info.hasWorkGroupIDY())
    // ...
  if (Info.hasWorkGroupIDZ())
    // ...
  if (Info.hasLDSKernelId())
    // ...
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    // ...
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    // ...
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    // ...
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    // ...
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    // ...
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    // ...
    MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
  unsigned LastExplicitArgOffset =
      // ...
  bool InPreloadSequence = true;
  unsigned InIdx = 0;
  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())
      break;

    int ArgIdx = Arg.getArgNo();
    // Don't preload non-original args or parts not in the current preload
    // sequence.
    if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
                               (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
      break;

    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
         InIdx++) {
      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];
      // ...
      unsigned ArgOffset = ArgLoc.getLocMemOffset();
      // ...
      unsigned NumAllocSGPRs =
          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;

      // Arg is preloaded into the previous SGPR.
      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
        continue;
      }

      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
      // Check for free user SGPRs for preloading.
      if (PaddingSGPRs + NumAllocSGPRs + 1 >
          // ...
        InPreloadSequence = false;
        break;
      }

      // Preload this argument.
      const TargetRegisterClass *RC =
          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
      SmallVectorImpl<MCRegister> *PreloadRegs =
          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);

      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {
        // ...
      }

      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
    }
  }
// ...
  if (Info.hasLDSKernelId()) {
    // ...
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
    // ...
  }
void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
                                           MachineFunction &MF,
                                           SIMachineFunctionInfo &Info,
                                           CallingConv::ID CallConv,
                                           bool IsShader) const {
  // ...
    assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");

    unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
    // ...
    unsigned NumRequiredSystemSGPRs =
        Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
        Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
    for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
      // ...
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
      // ...
    }
  // ...

  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {
      // ...
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
      // ...
    }

    if (Info.hasWorkGroupIDY()) {
      // ...
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
      // ...
    }

    if (Info.hasWorkGroupIDZ()) {
      // ...
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
      // ...
    }
  }

  if (Info.hasWorkGroupInfo()) {
    // ...
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
    // ...
  }

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    // Scratch wave offset passed in system SGPR.
    unsigned PrivateSegmentWaveByteOffsetReg;

    if (IsShader) {
      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();

      // This is true if the scratch wave byte offset doesn't have a fixed
      // location.
      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
      }
    } else
      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
  }

  assert(/* ... */ Info.getNumPreloadedSGPRs() >= 16);
}
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);
  // ...
    HasStackObjects = true;
  // ...
  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
      // ...
      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
    } else {
      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
      // ...
      Info.setScratchRSrcReg(ReservedBufferReg);
    }
  }
  // ...
  if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
    Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
    // ...
  }
  // ...
  for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
    if (!MRI.isLiveIn(Reg)) {
      Info.setStackPtrOffsetReg(Reg);
      // ...
    }
  }
  // ...
  if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
    // ...
  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);
  }
// ...
  return !Info->isEntryFunction();
  const MCPhysReg *IStart =
      TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  // ...
    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;
    // ...
    Entry->addLiveIn(*I);
    // ...
    for (auto *Exit : Exits)
      // ...
              TII->get(TargetOpcode::COPY), *I)
      // ...
// ...
        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
    assert(/* ... */
           !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
           !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
  // ...
      assert(/* ... */ !Info->hasWorkGroupIDZ());
  // ...
    // At least one interpolation mode must be enabled or else the GPU will
    // hang.
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 &&
         Info->isPSInputAllocated(11))) {
      // ...
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);
    }
    // ...
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
        // ...
  } else if (IsKernel) {
    // ...
  } else {
    Splits.append(Ins.begin(), Ins.end());
  }
  // ...
  } else if (!IsGraphics) {
    // ...
  }
  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
    // ...
    if (IsEntryFunc && VA.isMemLoc()) {
      // ...
      // Preloaded kernel arguments come in directly via user SGPRs.
      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
        // ...
        int64_t OffsetDiff = Offset - AlignDownOffset;
        // ...
        Register Reg =
            Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
        // ...
        NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
                                Ins[i].Flags.isSExt(), &Ins[i]);
        // ...
        const SmallVectorImpl<MCRegister> &PreloadRegs =
            Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
        // ...
        if (PreloadRegs.size() == 1) {
          Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
          // ...
              TRI->getRegSizeInBits(*RC)));
          // ...
        } else {
          // ...
          for (auto Reg : PreloadRegs) {
            // ...
          }
          // ...
              PreloadRegs.size()),
          // ...
        }
        // ...
        NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
                                Ins[i].Flags.isSExt(), &Ins[i]);
      } else {
        NewArg =
            lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
                                     Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
      }
      // ...
          dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
      // ...
    }

    if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
      // ...
    }
    // ...
    if (AMDGPU::VGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::VGPR_32RegClass;
    else if (AMDGPU::SGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::SGPR_32RegClass;
    // ...
  }
  // ...
  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain
                        : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
  // ...
    unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
    for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
      if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
        // ...
// ...
  bool IsWaveEnd = Info->returnsVoid() && IsShader;
  // ...
  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {
    // ...
    SDValue Arg = OutVals[RealRVLocIdx];
    // ...
  }
  // ...
  if (!Info->isEntryFunction()) {
    // ...
      if (AMDGPU::SReg_64RegClass.contains(*I))
        // ...
      else if (AMDGPU::SReg_32RegClass.contains(*I))
        // ...
  }
  // ...
  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
    auto &ArgUsageInfo =
        // ...
    CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
  // ...

    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(InputID);
    // ...
    std::tie(IncomingArg, IncomingArgRC, Ty) =
        CallerArgInfo.getPreloadedValue(InputID);
    assert(IncomingArgRC == ArgRC);
    // ...
    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
    // ...
      InputReg = getImplicitArgPtr(DAG, DL);
    // ...
      std::optional<uint32_t> Id =
          // ...
      if (Id.has_value()) {
        // ...
      }
    // ...
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
    // ...
      unsigned SpecialArgOffset =
          // ...

  std::tie(OutgoingArg, ArgRC, Ty) =
      CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC, Ty) =
        CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC, Ty) =
        CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  // ...

  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
  // ...
    // OR the packed work-item ID pieces into a single register.
    InputReg = InputReg.getNode() ? /* merge ID Y */ : /* ... */;
  // ...
    InputReg = InputReg.getNode() ? /* merge ID Z */ : /* ... */;

  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
      // ...
    } else {
      // Work-item IDs are already packed, so any one of the present incoming
      // arguments carries all the required fields.
      //     IncomingArgX ? *IncomingArgX :
      //     IncomingArgY ? *IncomingArgY :
      //                    *IncomingArgZ, ~0u
      // ...
    }
  }
  // ...
    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
  if (Callee->isDivergent())
    // ...
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  // ...
  if (!CallerPreserved)
    // ...
  bool CCMatch = CallerCC == CalleeCC;
  // ...
    if (Arg.hasByValAttr())
      // ...
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
      // ...
  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
  // ...

  if (IsChainCallConv) {
    // ...
    RequestedExec = CLI.Args.back();
    assert(RequestedExec.Node && "No node for EXEC");
    // ...
    assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
    CLI.Outs.pop_back();
    // ...
      assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
      CLI.Outs.pop_back();
    // ...
           "Haven't popped all the pieces of the EXEC mask");
  }

  bool IsSibCall = false;
  // ...
        "unsupported call to variadic function ");
  // ...
        "unsupported required tail call to function ");
  // ...
        Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
  // ...
        "site marked musttail or on llvm.amdgcn.cs.chain");
  // ...
  if (!TailCallOpt && IsTailCall)
    // ...
3678 if (!TailCallOpt && IsTailCall)
3723 if (!IsSibCall || IsChainCallConv) {
3730 RegsToPass.emplace_back(IsChainCallConv
3731 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3732 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3739 MVT PtrVT = MVT::i32;
3742 for (
unsigned i = 0, e = ArgLocs.
size(); i != e; ++i) {
3770 RegsToPass.push_back(std::pair(VA.
getLocReg(), Arg));
3778 int32_t
Offset = LocMemOffset;
3785 unsigned OpSize = Flags.isByVal() ?
3791 ? Flags.getNonZeroByValAlign()
3818 if (Outs[i].Flags.isByVal()) {
3820 DAG.
getConstant(Outs[i].Flags.getByValSize(),
DL, MVT::i32);
3823 Outs[i].Flags.getNonZeroByValAlign(),
3825 nullptr, std::nullopt, DstInfo,
3831 DAG.
getStore(Chain,
DL, Arg, DstAddr, DstInfo, Alignment);
3837 if (!MemOpChains.
empty())
3843 for (
auto &RegToPass : RegsToPass) {
3845 RegToPass.second, InGlue);
3854 if (IsTailCall && !IsSibCall) {
3859 std::vector<SDValue> Ops;
3860 Ops.push_back(Chain);
3861 Ops.push_back(Callee);
3878 if (IsChainCallConv)
3879 Ops.push_back(RequestedExec.
Node);
3883 for (
auto &RegToPass : RegsToPass) {
3885 RegToPass.second.getValueType()));
3890 const uint32_t *Mask =
TRI->getCallPreservedMask(MF, CallConv);
3891 assert(Mask &&
"Missing call preserved mask for calling convention");
3901 MVT::Glue, GlueOps),
3906 Ops.push_back(InGlue);
3925 return DAG.
getNode(OPC,
DL, NodeTys, Ops);
3930 Chain = Call.getValue(0);
3931 InGlue = Call.getValue(1);
3933 uint64_t CalleePopBytes = NumBytes;
  EVT VT = Op.getValueType();
  // ...
  MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
  // ...
  Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize);
  if (Alignment && *Alignment > StackAlign) {
    // ...
  }
  // ...
  if (isa<ConstantSDNode>(Size))
    // ...
// ...
  if (Op.getValueType() != MVT::i32)
    // ...
// ...
  assert(Op.getValueType() == MVT::i32);
  // ...
      Op.getOperand(0), IntrinID, GetRoundBothImm);
  // ...
  SDValue RoundModeTimesNumBits =
      // ...
      TableEntry, EnumOffset);
  // ...
  if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
    // ...
        static_cast<uint32_t>(ConstMode->getZExtValue()),
        // ...
  }
  // ...
  if (UseReducedTable) {
    // ...
    SDValue RoundModeTimesNumBits =
        // ...
  } else {
    // ...
    SDValue RoundModeTimesNumBits =
        // ...
    NewMode = TruncTable;
    // ...
        ReadFirstLaneID, NewMode);
  }
  // ...
      IntrinID, RoundBothImm, NewMode);
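// The GET_ROUNDING / SET_ROUNDING lowerings above read and write the rounding
// bits of the hardware MODE register via the s_getreg/s_setreg intrinsics.
// Because the hardware keeps separate f32 and f16/f64 rounding fields, the
// code translates between the ISD rounding enum and the hardware encoding
// with a packed lookup table indexed by (mode * 4) bits, visible here as
// RoundModeTimesNumBits and TableEntry.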
SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
  if (Op->isDivergent())
    return SDValue();

  switch (cast<MemSDNode>(Op)->getAddressSpace()) {
  // ...
  }
  // ...
}

// lowerFP_ROUND / FP extension handling:
  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = Src.getValueType();
  // ...
  EVT DstVT = Op.getValueType();
  // ...

// GET_FPENV:
  if (Op.getValueType() != MVT::i64)
    // ...
      Op.getOperand(0), IntrinID, ModeHwRegImm);
  // ...
      Op.getOperand(0), IntrinID, TrapHwRegImm);
  // ...

// SET_FPENV:
  if (Op.getOperand(1).getValueType() != MVT::i64)
    // ...
      ReadFirstLaneID, NewModeReg);
  // ...
      ReadFirstLaneID, NewTrapReg);

  unsigned ModeHwReg =
      // ...
  unsigned TrapHwReg =
      // ...
      IntrinID, ModeHwRegImm, NewModeReg);
  // ...
      IntrinID, TrapHwRegImm, NewTrapReg);

// getRegisterByName:
  Register Reg = StringSwitch<Register>(RegName)
                     .Case("m0", AMDGPU::M0)
                     .Case("exec", AMDGPU::EXEC)
                     .Case("exec_lo", AMDGPU::EXEC_LO)
                     .Case("exec_hi", AMDGPU::EXEC_HI)
                     .Case("flat_scratch", AMDGPU::FLAT_SCR)
                     .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
                     .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
                     // ...

  if (Reg == AMDGPU::NoRegister) {
    // ...
  }
  // ...
  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:
    // ...
  case AMDGPU::FLAT_SCR:
    // ...

// splitKillBlock:
  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));

static std::pair<MachineBasicBlock *, MachineBasicBlock *>
splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
  // ...
  auto Next = std::next(I);
  // ...
  return std::pair(LoopBB, RemainderBB);
}
  auto I = MI.getIterator();
  auto E = std::next(I);
  // ...
  Src->setIsKill(false);
  // ...
  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  // ...
  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
      // ...
                       unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
                       unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
                       Register &SGPRIdxReg) {
  // ...
  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
  Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);
  // ...
  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
      // ...

  // Read the next variant <- also loop target.
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
      // ...

  // Compare the just read M0 value to all possible Idx values.
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
      // ...

  // Update EXEC, save the original EXEC value to VCC.
  BuildMI(LoopBB, I, DL,
          TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
                                 : AMDGPU::S_AND_SAVEEXEC_B64),
          NewExec)
      // ...

  MRI.setSimpleHint(NewExec, CondReg);

  if (UseGPRIdxMode) {
    if (Offset == 0) {
      SGPRIdxReg = CurrentIdxReg;
    } else {
      SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
          // ...
    }
  } else {
    // Move index from VCC into M0.
    if (Offset == 0) {
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
          // ...
    } else {
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
          // ...
    }
  }

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  // ...
  BuildMI(LoopBB, I, DL,
          TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
                                 : AMDGPU::S_XOR_B64_term),
          Exec)
      // ...
}

// loadM0FromVGPR:
  // ...
  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  // ...
  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  // ...
  auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode, SGPRIdxReg);
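// The two helpers above implement the classic "waterfall loop" used when a
// possibly-divergent VGPR value is needed in M0 or an SGPR index: read the
// first active lane's value with V_READFIRSTLANE_B32, compare it against all
// lanes with V_CMP_EQ_U32, run the body for the matching lanes under
// S_AND_SAVEEXEC, then XOR those lanes out of EXEC and loop until EXEC is
// empty.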
// Return the subregister index and adjusted offset used to access element
// Offset of the given vector register class.
static std::pair<unsigned, int>
computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
                            const TargetRegisterClass *SuperRC, int Offset) {
  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;

  // Skip out of bounds offsets, or else we would end up using an undefined
  // register.
  if (Offset >= NumElts || Offset < 0)
    return std::pair(AMDGPU::sub0, Offset);
  // ...
}

// getIndirectSGPRIdx:
  assert(Idx->getReg() != AMDGPU::NoRegister);
  // ...
    return Idx->getReg();

  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  // ...
static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
                                          MachineBasicBlock &MBB,
                                          const GCNSubtarget &ST) {
  // ...
  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  // ...
  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  // Check for a SGPR index.
  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    // ...
    if (UseGPRIdxMode) {
      // ...
      const MCInstrDesc &GPRIDXDesc =
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
      // ...
    }
    // ...
    MI.eraseFromParent();
    return &MBB;
  }

  // Control flow needs to be inserted if indexing with a VGPR.
  // ...
  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  // ...
                              UseGPRIdxMode, SGPRIdxReg);
  // ...
  if (UseGPRIdxMode) {
    const MCInstrDesc &GPRIDXDesc =
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
        // ...
  } else {
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
        // ...
  }

  MI.eraseFromParent();
  // ...
}
static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
                                          MachineBasicBlock &MBB,
                                          const GCNSubtarget &ST) {
  // ...
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  // ...
  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (Idx->getReg() == AMDGPU::NoRegister) {
    // ...
    MI.eraseFromParent();
    return &MBB;
  }

  // Check for a SGPR index.
  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    // ...
    if (UseGPRIdxMode) {
      // ...
      const MCInstrDesc &GPRIDXDesc =
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
      // ...
    } else {
      const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
          TRI.getRegSizeInBits(*VecRC), 32, false);
      // ...
    }
    MI.eraseFromParent();
    return &MBB;
  }

  // Control flow needs to be inserted if indexing with a VGPR.
  // ...
  Register PhiReg = MRI.createVirtualRegister(VecRC);
  // ...
                              UseGPRIdxMode, SGPRIdxReg);
  // ...
  if (UseGPRIdxMode) {
    const MCInstrDesc &GPRIDXDesc =
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
        // ...
  } else {
    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
        TRI.getRegSizeInBits(*VecRC), 32, false);
    BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
        // ...
  }

  MI.eraseFromParent();
  // ...
}
  bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
  // ...
    Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
    Register InitalValReg = MRI.createVirtualRegister(DstRegClass);

    Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
    Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
    Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);

    Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
    Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);

    bool IsWave32 = ST.isWave32();
    unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

    // Create initial values of induction variable from Exec, Accumulator and
    // insert branch instr to newly created ComputeBlock.
    uint32_t InitalValue =
        (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
    // ...
    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
        .addImm(InitalValue);
    // ...

    // Start constructing ComputeLoop.
    I = ComputeLoop->end();
    auto Accumulator =
        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
            // ...
    auto ActiveBits =
        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
            .addReg(TmpSReg->getOperand(0).getReg())
            // ...

    // Perform the computations.
    unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
    auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
                   .addReg(ActiveBits->getOperand(0).getReg());
    auto LaneValue = BuildMI(*ComputeLoop, I, DL,
                             TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
                         // ...
                         .addReg(FF1->getOperand(0).getReg());
    auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
                              // ...
                              .addReg(LaneValue->getOperand(0).getReg());

    // Manipulate the iterator to get the next active lane.
    unsigned BITSETOpc =
        IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
    auto NewActiveBits =
        BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
            .addReg(FF1->getOperand(0).getReg())
            .addReg(ActiveBits->getOperand(0).getReg());

    // Add phi nodes.
    Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
        .addMBB(ComputeLoop);
    ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
        .addMBB(ComputeLoop);

    // Create the branch.
    unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
    BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
        .addReg(NewActiveBits->getOperand(0).getReg())
        .addImm(0);
    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
        .addMBB(ComputeLoop);
  // ...
  MI.eraseFromParent();
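// The reduction loop above walks the bits of EXEC: S_FF1 finds the next
// active lane, V_READLANE_B32 fetches that lane's value, the scalar min/max
// (Opc) folds it into the accumulator, and S_BITSET0 clears the lane's bit,
// looping until no active lanes remain.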
  switch (MI.getOpcode()) {
  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
  case AMDGPU::S_UADDO_PSEUDO:
  case AMDGPU::S_USUBO_PSEUDO: {
    // ...
    unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
                       ? AMDGPU::S_ADD_I32
                       : AMDGPU::S_SUB_I32;
    // ...
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO: {
    // ...
    bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
    // Targets with a native 64-bit scalar add/sub can use it directly.
    unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
    // ...
    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
    MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
    MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
    MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
    // ...
    MI.eraseFromParent();
    return BB;
  }
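  // Without a native 64-bit scalar add/sub, the pseudo expands into a
  // low-half S_ADD_U32/S_SUB_U32 followed by a high-half
  // S_ADDC_U32/S_SUBB_U32 that consumes the carry produced in SCC.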
  case AMDGPU::V_ADD_U64_PSEUDO:
  case AMDGPU::V_SUB_U64_PSEUDO: {
    // ...
    bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
    // ...
    if (IsAdd && ST.hasLshlAddB64()) {
      // ...
      TII->legalizeOperands(*Add);
      MI.eraseFromParent();
      return BB;
    }

    const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    Register CarryReg = MRI.createVirtualRegister(CarryRC);
    Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);

    const TargetRegisterClass *Src0RC = Src0.isReg()
                                            ? MRI.getRegClass(Src0.getReg())
                                            : &AMDGPU::VReg_64RegClass;
    const TargetRegisterClass *Src1RC = Src1.isReg()
                                            ? MRI.getRegClass(Src1.getReg())
                                            : &AMDGPU::VReg_64RegClass;

    const TargetRegisterClass *Src0SubRC =
        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
    const TargetRegisterClass *Src1SubRC =
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);

    MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
    MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
    MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
    MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);

    unsigned LoOpc =
        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    // ...
    unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    // ...
    TII->legalizeOperands(*LoHalf);
    TII->legalizeOperands(*HiHalf);
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::S_ADD_CO_PSEUDO:
  case AMDGPU::S_SUB_CO_PSEUDO: {
    // ...
    unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
                       ? AMDGPU::S_ADDC_U32
                       : AMDGPU::S_SUBB_U32;
    // Move any VGPR operands into SGPRs with V_READFIRSTLANE_B32 first.
    // ...
      Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
          // ...
    // ...
      Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
          // ...
    // ...
    Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    // ...
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
          // ...

    // ...
    unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
    assert(WaveSize == 64 || WaveSize == 32);

    if (WaveSize == 64) {
      if (ST.hasScalarCompareEq64()) {
        // ...
      } else {
        const TargetRegisterClass *SubRC =
            TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
        MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
            MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
        MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
            MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
        Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
            // ...
      }
    }
    // ...
        (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
    // ...
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::SI_INIT_M0: {
    BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
            TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .add(MI.getOperand(0));
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::GET_GROUPSTATICSIZE: {
    // ...
        .add(MI.getOperand(0))
        // ...
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::GET_SHADERCYCLESHILO: {
    // ...
    using namespace AMDGPU::Hwreg;
    Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    // ...
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    // ...
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
    Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    // ...
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    // ...
    Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    // ...
        .add(MI.getOperand(0))
        // ...
    MI.eraseFromParent();
    return BB;
  }
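  // GET_SHADERCYCLESHILO reads the 64-bit cycle counter with the usual
  // hi/lo/hi sequence: if the two SHADER_CYCLES_HI reads agree, no wraparound
  // occurred and hi2:lo is the answer; otherwise the counter rolled over and
  // hi2:0 is still a value the counter held at some point during the
  // sequence.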
  case AMDGPU::SI_INDIRECT_SRC_V1:
  case AMDGPU::SI_INDIRECT_SRC_V2:
  case AMDGPU::SI_INDIRECT_SRC_V4:
  case AMDGPU::SI_INDIRECT_SRC_V8:
  case AMDGPU::SI_INDIRECT_SRC_V9:
  case AMDGPU::SI_INDIRECT_SRC_V10:
  case AMDGPU::SI_INDIRECT_SRC_V11:
  case AMDGPU::SI_INDIRECT_SRC_V12:
  case AMDGPU::SI_INDIRECT_SRC_V16:
  case AMDGPU::SI_INDIRECT_SRC_V32:
    return emitIndirectSrc(MI, *BB, *getSubtarget());
  case AMDGPU::SI_INDIRECT_DST_V1:
  case AMDGPU::SI_INDIRECT_DST_V2:
  case AMDGPU::SI_INDIRECT_DST_V4:
  case AMDGPU::SI_INDIRECT_DST_V8:
  case AMDGPU::SI_INDIRECT_DST_V9:
  case AMDGPU::SI_INDIRECT_DST_V10:
  case AMDGPU::SI_INDIRECT_DST_V11:
  case AMDGPU::SI_INDIRECT_DST_V12:
  case AMDGPU::SI_INDIRECT_DST_V16:
  case AMDGPU::SI_INDIRECT_DST_V32:
    return emitIndirectDst(MI, *BB, *getSubtarget());
  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
  case AMDGPU::SI_KILL_I1_PSEUDO:
    // ...
  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    Register SrcCond = MI.getOperand(3).getReg();

    Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
    Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
    // ...
    const TargetRegisterClass *Src0RC = Src0.isReg()
                                            ? MRI.getRegClass(Src0.getReg())
                                            : &AMDGPU::VReg_64RegClass;
    const TargetRegisterClass *Src1RC = Src1.isReg()
                                            ? MRI.getRegClass(Src1.getReg())
                                            : &AMDGPU::VReg_64RegClass;

    const TargetRegisterClass *Src0SubRC =
        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
    const TargetRegisterClass *Src1SubRC =
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);

    MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
    MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
    MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
    MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
    // ...
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::SI_BR_UNDEF: {
    // ...
        .add(MI.getOperand(0));
    // ...
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::ADJCALLSTACKUP:
  case AMDGPU::ADJCALLSTACKDOWN: {
    // ...
  }
  case AMDGPU::SI_CALL_ISEL: {
    // ...
    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
    // ...
    MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
    // ...
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::V_ADD_CO_U32_e32:
  case AMDGPU::V_SUB_CO_U32_e32:
  case AMDGPU::V_SUBREV_CO_U32_e32: {
    // ...
    unsigned Opc = MI.getOpcode();

    bool NeedClampOperand = false;
    if (TII->pseudoToMCOpcode(Opc) == -1) {
      // ...
      NeedClampOperand = true;
    }
    // ...
    if (TII->isVOP3(*I)) {
      // ...
    }
    I.add(MI.getOperand(1))
        .add(MI.getOperand(2));
    if (NeedClampOperand)
      // ...
    TII->legalizeOperands(*I);
    // ...
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::V_ADDC_U32_e32:
  case AMDGPU::V_SUBB_U32_e32:
  case AMDGPU::V_SUBBREV_U32_e32:
    // ...
    TII->legalizeOperands(MI);
    return BB;
  case AMDGPU::DS_GWS_INIT:
  case AMDGPU::DS_GWS_SEMA_BR:
  case AMDGPU::DS_GWS_BARRIER:
    TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
    [[fallthrough]];
  case AMDGPU::DS_GWS_SEMA_V:
  case AMDGPU::DS_GWS_SEMA_P:
  case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
    // ...
  case AMDGPU::S_SETREG_B32: {
    // ...
    const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
    const unsigned SetMask = WidthMask << Offset;
    // ...
    unsigned SetDenormOp = 0;
    unsigned SetRoundOp = 0;
    // ...
      SetRoundOp = AMDGPU::S_ROUND_MODE;
      SetDenormOp = AMDGPU::S_DENORM_MODE;
    // ...
      SetRoundOp = AMDGPU::S_ROUND_MODE;
    // ...
      SetDenormOp = AMDGPU::S_DENORM_MODE;
    // ...
    if (SetRoundOp || SetDenormOp) {
      // ...
      if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
        unsigned ImmVal = Def->getOperand(1).getImm();
        // ...
        MI.eraseFromParent();
        // ...
      }
    }
    // ...
    MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
    return BB;
  }
  case AMDGPU::S_INVERSE_BALLOT_U32:
  case AMDGPU::S_INVERSE_BALLOT_U64:
    // ...
    MI.setDesc(TII->get(AMDGPU::COPY));
    return BB;
  case AMDGPU::ENDPGM_TRAP: {
    // ...
    MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
    // ...
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::SIMULATED_TRAP: {
    // ...
    TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
    MI.eraseFromParent();
    return BB;
  }
  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
// ...
  EVT VT = N->getValueType(0);
  // ...
  if (VT == MVT::f16) {
    // ...
  }
// ...
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
         VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
         VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
// ...
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
         VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
         VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
         VT == MVT::v32bf16);
  // ...
                   : std::pair(Op0, Op0);
  switch (Op.getOpcode()) {
  // ...
    assert((!Result.getNode() ||
            Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
  // ...
  case ISD::FSQRT: {
    EVT VT = Op.getValueType();
    if (VT == MVT::f32)
      return lowerFSQRTF32(Op, DAG);
    if (VT == MVT::f64)
      return lowerFSQRTF64(Op, DAG);
    // ...
  }
  case ISD::FSIN:
  case ISD::FCOS:
    return LowerTrig(Op, DAG);
  // ...
    return LowerGlobalAddress(MFI, Op, DAG);
  // ...
  case ISD::INSERT_SUBVECTOR:
    return lowerINSERT_SUBVECTOR(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:
    return lowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT:
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::VECTOR_SHUFFLE:
    return lowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::SCALAR_TO_VECTOR:
    return lowerSCALAR_TO_VECTOR(Op, DAG);
  case ISD::BUILD_VECTOR:
    return lowerBUILD_VECTOR(Op, DAG);
  case ISD::FP_ROUND:
    return lowerFP_ROUND(Op, DAG);
  case ISD::FPTRUNC_ROUND: {
    // ...
    if (Op.getOperand(0)->getValueType(0) != MVT::f32)
      // ...
    int RoundMode = Op.getConstantOperandVal(1);
    // ...
    return DAG.getNode(Opc, DL, Op.getNode()->getVTList(), Op->getOperand(0));
  }
  case ISD::TRAP:
    return lowerTRAP(Op, DAG);
  case ISD::DEBUGTRAP:
    return lowerDEBUGTRAP(Op, DAG);
  // ...
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
    return lowerFMINNUM_FMAXNUM(Op, DAG);
  case ISD::FLDEXP:
    return lowerFLDEXP(Op, DAG);
  // ...
  case ISD::MUL:
    return lowerMUL(Op, DAG);
  case ISD::SMULO:
  case ISD::UMULO:
    return lowerXMULO(Op, DAG);
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return lowerXMUL_LOHI(Op, DAG);
  // ...
  EVT FittingLoadVT = LoadVT;
SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
                                              SelectionDAG &DAG,
                                              ArrayRef<SDValue> Ops,
                                              bool IsIntrinsic) const {
  // ...
  EVT LoadVT = M->getValueType(0);
  // ...
  EVT EquivLoadVT = LoadVT;
  // ...
                                 VTList, Ops, M->getMemoryVT(),
                                 M->getMemOperand());
  // ...
}

// lowerIntrinsicLoad:
  EVT LoadVT = M->getValueType(0);
  // ...
  assert(M->getNumValues() == 2 || M->getNumValues() == 3);
  bool IsTFE = M->getNumValues() == 3;
  // ...
    return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops,
                                      M->getMemOperand(), /* ... */);
  // ...
    return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
                               M->getMemOperand(), DAG);
  // ...
  SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
                                        M->getMemOperand(), DAG);
  EVT VT = N->getValueType(0);
  unsigned CondCode = N->getConstantOperandVal(3);
  // ...
  EVT CmpVT = LHS.getValueType();
  if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
    // ...
  }
// ...
  EVT VT = N->getValueType(0);
  // ...
  unsigned CondCode = N->getConstantOperandVal(3);
  // ...
  if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
    // ...
  }
// ...
  EVT VT = N->getValueType(0);
  // ...
                     Src.getOperand(1), Src.getOperand(2));
  // ...
    Exec = AMDGPU::EXEC_LO;
  // ...
    Exec = AMDGPU::EXEC;
  EVT VT = N->getValueType(0);
  // ...
  unsigned IID = N->getConstantOperandVal(0);
  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;
  // ...
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16:
      // ...
    case Intrinsic::amdgcn_writelane:
      // ...
    case Intrinsic::amdgcn_readlane:
      // ...
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_permlane64:
      // ...
    if (SDNode *GL = N->getGluedNode()) {
      // ...
      GL = GL->getOperand(0).getNode();
      // ...
    }
  // ...
  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      // ...
    Src1 = N->getOperand(2);
    if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
      Src2 = N->getOperand(3);
  }

  if (ValSize == 32) {
    // ...
    if (IID == Intrinsic::amdgcn_writelane) {
      // ...
    }

    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
    // ...
    return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
  }

  if (ValSize % 32 != 0)
    // ...

  auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
    EVT VT = N->getValueType(0);
    // ...
    unsigned NumOperands = N->getNumOperands();
    // ...
    SDNode *GL = N->getGluedNode();
    // ...
    for (unsigned i = 0; i != NE; ++i) {
      for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
           ++j) {
        SDValue Operand = N->getOperand(j);
        // ...
      }
    }
    // ...
  };
  // ...
      return unrollLaneOp(LaneOp.getNode());
  // ...
    SDValue Src0SubVec, Src1SubVec, Src2SubVec;
    for (unsigned i = 0, EltIdx = 0; i < ValSize / 32; i++) {
      // ...
      if (IID == Intrinsic::amdgcn_writelane)
        // ...
          ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
          : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
      // ...
    }
  // ...
  if (IID == Intrinsic::amdgcn_writelane)
    // ...
  SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
  switch (N->getOpcode()) {
  // ...
    unsigned IID = N->getConstantOperandVal(0);
    switch (IID) {
    case Intrinsic::amdgcn_make_buffer_rsrc:
      Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
      return;
    case Intrinsic::amdgcn_cvt_pkrtz: {
      // ...
    }
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16: {
      // ...
      if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
        // ...
      else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
        // ...
      else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
        // ...
      EVT VT = N->getValueType(0);
      // ...
    }
    case Intrinsic::amdgcn_s_buffer_load: {
      // ...
      EVT VT = Op.getValueType();
      assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
      // ...
      if (!Offset->isDivergent()) {
        // ...
      }
      // ...
      LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
      // ...
    }
    // ...
    for (unsigned I = 0; I < Res.getNumOperands(); I++) {
      Results.push_back(Res.getOperand(I));
      // ...
    }
6404 EVT VT =
N->getValueType(0);
6409 EVT SelectVT = NewVT;
6410 if (NewVT.
bitsLT(MVT::i32)) {
6413 SelectVT = MVT::i32;
6419 if (NewVT != SelectVT)
6425 if (
N->getValueType(0) != MVT::v2f16)
6438 if (
N->getValueType(0) != MVT::v2f16)
6451 if (
N->getValueType(0) != MVT::f16)
6469 if (
I.getUse().get() !=
Value)
6472 if (
I->getOpcode() == Opcode)
6478unsigned SITargetLowering::isCFIntrinsic(
const SDNode *
Intr)
const {
6480 switch (
Intr->getConstantOperandVal(1)) {
6481 case Intrinsic::amdgcn_if:
6483 case Intrinsic::amdgcn_else:
6485 case Intrinsic::amdgcn_loop:
6487 case Intrinsic::amdgcn_end_cf:
  SDNode *Intr = BRCOND.getOperand(1).getNode();
  // ...
    assert(BR && "brcond missing unconditional branch user");
    Target = BR->getOperand(1);
  // ...
  unsigned CFNode = isCFIntrinsic(Intr);
  // ...
  Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
  // ...
  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
    // ...
  }
  // ...
             Intr->getOperand(0));
// ...
  MVT VT = Op.getSimpleValueType();
  // ...
  if (Op.getConstantOperandVal(0) != 0)
    // ...
  if (Info->isEntryFunction())
    // ...
// ...
  return Op.getValueType().bitsLE(VT) ?
      // ...
  assert(Op.getValueType() == MVT::f16 &&
         "Do not know how to custom lower FP_ROUND for non-f16 type");
  // ...
  EVT SrcVT = Src.getValueType();
  if (SrcVT != MVT::f64)
    // ...
// ...
  EVT VT = Op.getValueType();
  // ...
  bool IsIEEEMode = Info->getMode().IEEE;
  // ...
  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
      // ...
// ...
  EVT VT = Op.getValueType();
  // ...
  EVT ExpVT = Exp.getValueType();
  if (ExpVT == MVT::i16)
    // ...
                     {Op.getOperand(0), Op.getOperand(1), TruncExp});
// ...
  EVT VT = Op.getValueType();
  // ...
  assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
  // ...
  if (Op->isDivergent())
    // ...
  if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
    return SDValue(
        DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
  // ...
  if (Op0SignBits >= 33 && Op1SignBits >= 33)
    return SDValue(
        DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
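  // A uniform 64-bit multiply can be narrowed: when known-bits analysis shows
  // both operands have at least 32 leading zeros it becomes
  // S_MUL_U64_U32_PSEUDO, and when sign-bit analysis shows at least 33 sign
  // bits it becomes S_MUL_I64_I32_PSEUDO; either way only a 32x32->64
  // multiply is needed.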
  EVT VT = Op.getValueType();
  // ...
    const APInt &C = RHSC->getAPIntValue();
    // Propagate fast math flags.
    if (C.isPowerOf2()) {
      // ...
      bool UseArithShift = isSigned && !C.isMinSignedValue();
      // ...
                                    SL, VT, Result, ShiftAmt),
      // ...
    }
  // ...
  if (Op->isDivergent()) {
    // ...
  }
6852SDValue SITargetLowering::lowerTrapEndpgm(
6860 const SDLoc &
DL,
Align Alignment, ImplicitParameter Param)
const {
6870SDValue SITargetLowering::lowerTrapHsaQueuePtr(
6880 loadImplicitKernelArgument(DAG, MVT::i64, SL,
Align(8),
QUEUE_PTR);
6886 if (UserSGPR == AMDGPU::NoRegister) {
6911SDValue SITargetLowering::lowerTrapHsa(
6937 "debugtrap handler not supported",
6953SDValue SITargetLowering::getSegmentAperture(
unsigned AS,
const SDLoc &
DL,
6957 ? AMDGPU::SRC_SHARED_BASE
6958 : AMDGPU::SRC_PRIVATE_BASE;
6981 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
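  // The 64-bit SRC_SHARED_BASE / SRC_PRIVATE_BASE registers hold the aperture
  // base in their upper 32 bits, hence the shift-right by 32 above.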
    return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
  // ...
  if (UserSGPR == AMDGPU::NoRegister) {
    // ...
  }
  // ...
      DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
// ...

// isKnownNonNull:
  if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
      isa<BasicBlockSDNode>(Val))
    return true;

  if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
    return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
// ...

// lowerADDRSPACECAST:
  unsigned DestAS, SrcAS;
  // ...
  bool IsNonNull = false;
  if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
    SrcAS = ASC->getSrcAddressSpace();
    Src = ASC->getOperand(0);
    DestAS = ASC->getDestAddressSpace();
  } else {
    assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
           Op.getConstantOperandVal(0) ==
               Intrinsic::amdgcn_addrspacecast_nonnull);
    Src = Op->getOperand(1);
    SrcAS = Op->getConstantOperandVal(2);
    DestAS = Op->getConstantOperandVal(3);
  }
  // ...
    unsigned NullVal = TM.getNullPointerValue(DestAS);
    // ...
    SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
    // ...
    unsigned NullVal = TM.getNullPointerValue(SrcAS);
    // ...
      Op.getValueType() == MVT::i64) {
    // ...
      Src.getValueType() == MVT::i64)
    // ...
7116 Src.getValueType() == MVT::i64)
7140 EVT InsVT =
Ins.getValueType();
7143 unsigned IdxVal =
Idx->getAsZExtVal();
7148 assert(InsNumElts % 2 == 0 &&
"expect legal vector types");
7153 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
7155 MVT::i32, InsNumElts / 2);
7160 for (
unsigned I = 0;
I != InsNumElts / 2; ++
I) {
7162 if (InsNumElts == 2) {
7175 for (
unsigned I = 0;
I != InsNumElts; ++
I) {
7197 auto KIdx = dyn_cast<ConstantSDNode>(
Idx);
7198 if (NumElts == 4 && EltSize == 16 && KIdx) {
7209 unsigned Idx = KIdx->getZExtValue();
7210 bool InsertLo =
Idx < 2;
7212 InsertLo ? LoVec : HiVec,
7227 if (isa<ConstantSDNode>(
Idx))
7233 assert(VecSize <= 64 &&
"Expected target vector size to be <= 64 bits");
7239 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
7255 DAG.
getNOT(SL, BFM, IntVT), BCVec);
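  // A dynamic insertelement into these small vectors avoids going through
  // memory: the vector is bitcast to an integer, a bitfield mask (BFM) of
  // EltSize bits is shifted to the element's position, and the result is
  // (Mask & ScaledElt) | (~Mask & Vec) before casting back.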
  EVT ResultVT = Op.getValueType();
  // ...
  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
    // ...
  if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
    // ...
    if (VecSize == 128) {
      // ...
    } else if (VecSize == 256) {
      // ...
      for (unsigned P = 0; P < 4; ++P) {
        // ...
      }
      // ...
                                Parts[0], Parts[1]));
      // ...
                                Parts[2], Parts[3]));
    } else {
      // ...
      for (unsigned P = 0; P < 8; ++P) {
        // ...
      }
      // ...
                      Parts[0], Parts[1], Parts[2], Parts[3]));
      // ...
                      Parts[4], Parts[5], Parts[6], Parts[7]));
    }
    // ...
  }

  EVT IdxVT = Idx.getValueType();
  // ...
    Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
  // ...
  if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
    // ...
  }
// ...
    return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
// ...

// lowerVECTOR_SHUFFLE:
  EVT ResultVT = Op.getValueType();
  // ...
  EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
  // ...
  int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
  // ...
      int VecIdx = Idx < SrcNumElts ? 0 : 1;
      int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
      // ...
      int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
      int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
      int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
      int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
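      // v2i16/v2f16 shuffles are cheap, so wider 16-bit shuffles are
      // decomposed into two-element chunks: for each output pair, VecIdx
      // selects which input vector the elements come from and EltIdx rebases
      // the shuffle-mask index into that vector.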
// lowerSCALAR_TO_VECTOR:
  EVT ResultVT = Op.getValueType();
  // ...

// lowerBUILD_VECTOR:
  EVT VT = Op.getValueType();
  // ...
  if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
      VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
    // ...
                       {CastLo, CastHi});
    // ...
  }

  if (VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16) {
    // ...
      for (unsigned P = 0; P < 4; ++P)
        Parts[P].push_back(Op.getOperand(I + P * E));
    // ...
    for (unsigned P = 0; P < 4; ++P) {
      // ...
    }
    // ...
  }

  if (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v32bf16) {
    // ...
      for (unsigned P = 0; P < 8; ++P)
        Parts[P].push_back(Op.getOperand(I + P * E));
    // ...
    for (unsigned P = 0; P < 8; ++P) {
      // ...
    }
    // ...
  }

  assert(VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16);
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
// ...
  EVT PtrVT = Op.getValueType();
  // ...
  assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
// ...
    SDValue Param = lowerKernargMemParameter(
        // ...
// ...
      "non-hsa intrinsic with hsa target",
      // ...
      "intrinsic not supported on subtarget",
      // ...

// Build a packed float vector for image address operands.
  unsigned NumElts = Elts.size();
  // ...
  if (NumElts <= 12) {
    // ...
  }
  // ...
  for (unsigned i = 0; i < Elts.size(); ++i) {
    // ...
  }

  for (unsigned i = Elts.size(); i < NumElts; ++i)
    VecElts[i] = DAG.getUNDEF(MVT::f32);
  EVT SrcVT = Src.getValueType();
  // ...

// constructRetValue(..., bool Unpacked, bool IsD16, int DMaskPop,
//                   int NumVDataDwords, bool IsAtomicPacked16Bit, ...):
  EVT ReqRetVT = ResultTypes[0];
  // ...
  int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
                          ? (ReqRetNumElts + 1) / 2
                          : // ...
  int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;

  MVT DataDwordVT = NumDataDwords == 1 ?
      // ...
  MVT MaskPopVT = MaskPopDwords == 1 ?
      // ...
  if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
    // ...
  }
  // ...
  if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
    // ...
                          NumDataDwords - MaskPopDwords);
  // ...
  EVT LegalReqRetVT = ReqRetVT;
  // ...
    if (!Data.getValueType().isInteger())
      // ...
          Data.getValueType().changeTypeToInteger(), Data);
  // ...
  if (Result->getNumValues() == 1)
    // ...

// parseTexFail(..., SDValue *LWE, bool &IsTexFail):
  auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
  // ...
// packImage16bitOpsToDwords(..., unsigned DimIdx, unsigned EndIdx,
//                           unsigned NumGradients):
  for (unsigned I = DimIdx; I < EndIdx; I++) {
    // ...
    if (((I + 1) >= EndIdx) ||
        ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
                                         I == DimIdx + NumGradients - 1))) {
      if (Addr.getValueType() != MVT::i16)
        // ...
    }
  }

// lowerImage:
  unsigned IntrOpcode = Intr->BaseOpcode;
  // ...
  bool AdjustRetType = false;
  bool IsAtomicPacked16Bit = false;
  // ...
  const unsigned ArgOffset = WithChain ? 2 : 1;
  // ...
  unsigned DMaskLanes = 0;

  if (BaseOpcode->Atomic) {
    VData = Op.getOperand(2);

    IsAtomicPacked16Bit =
        (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
         Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
    // ...
    if (BaseOpcode->AtomicX2) {
      // ...
      ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
      DMask = Is64Bit ? 0xf : 0x3;
      NumVDataDwords = Is64Bit ? 4 : 2;
    } else {
      DMask = Is64Bit ? 0x3 : 0x1;
      NumVDataDwords = Is64Bit ? 2 : 1;
    }
  } else {
    DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
    // ...
    if (BaseOpcode->Store) {
      VData = Op.getOperand(2);
      // ...
        VData = handleD16VData(VData, DAG, true);
      // ...
    }
    // ...
        (!LoadVT.isVector() && DMaskLanes > 1))
      // ...
      NumVDataDwords = (DMaskLanes + 1) / 2;
    else
      NumVDataDwords = DMaskLanes;

    AdjustRetType = true;
  }

  unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
  // ...
        Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
    // ...
    MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
    IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;

    VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
    // ...
    MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
    IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
  // ...
  for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
    if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
      assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
      // ...
          {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
      // ...
             "Bias needs to be converted to 16 bit in A16 mode");
      // ...
    }
  }
ST->hasG16() && (IsA16 != IsG16)) {
8017 dbgs() <<
"Failed to lower image intrinsic: 16 bit addresses "
8018 "require 16 bit args for both gradients and addresses");
8023 if (!
ST->hasA16()) {
8024 LLVM_DEBUG(
dbgs() <<
"Failed to lower image intrinsic: Target does not "
8025 "support 16 bit addresses\n");
8035 if (BaseOpcode->Gradients && IsG16 &&
ST->hasG16()) {
8039 IntrOpcode = G16MappingInfo->
G16;
8047 ArgOffset +
Intr->GradientStart,
8048 ArgOffset +
Intr->CoordStart,
Intr->NumGradients);
8050 for (
unsigned I = ArgOffset +
Intr->GradientStart;
8051 I < ArgOffset + Intr->CoordStart;
I++)
8058 ArgOffset +
Intr->CoordStart, VAddrEnd,
8062 for (
unsigned I = ArgOffset +
Intr->CoordStart;
I < VAddrEnd;
I++)
8080 const unsigned NSAMaxSize =
ST->getNSAMaxSize(BaseOpcode->Sampler);
8081 const bool HasPartialNSAEncoding =
ST->hasPartialNSAEncoding();
8082 const bool UseNSA =
ST->hasNSAEncoding() &&
8083 VAddrs.
size() >=
ST->getNSAThreshold(MF) &&
8084 (VAddrs.
size() <= NSAMaxSize || HasPartialNSAEncoding);
8085 const bool UsePartialNSA =
8086 UseNSA && HasPartialNSAEncoding && VAddrs.
size() > NSAMaxSize;
8089 if (UsePartialNSA) {
8091 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
8100 if (!BaseOpcode->Sampler) {
8104 Op.getConstantOperandVal(ArgOffset +
Intr->UnormIndex);
8106 Unorm = UnormConst ? True : False;
8111 SDValue TexFail =
Op.getOperand(ArgOffset +
Intr->TexFailCtrlIndex);
8112 bool IsTexFail =
false;
8113 if (!
parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
8124 NumVDataDwords += 1;
8125 AdjustRetType =
true;
8130 if (AdjustRetType) {
8132 if (DMaskLanes == 0 && !BaseOpcode->Store) {
8135 if (isa<MemSDNode>(
Op))
8140 EVT NewVT = NumVDataDwords > 1 ?
8144 ResultTypes[0] = NewVT;
8145 if (ResultTypes.size() == 3) {
8149 ResultTypes.erase(&ResultTypes[1]);
8153 unsigned CPol =
Op.getConstantOperandVal(ArgOffset +
Intr->CachePolicyIndex);
8154 if (BaseOpcode->Atomic)
8161 if (BaseOpcode->Store || BaseOpcode->Atomic)
8163 if (UsePartialNSA) {
8172 if (BaseOpcode->Sampler)
8177 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8181 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
8189 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8193 if (BaseOpcode->HasD16)
8195 if (isa<MemSDNode>(
Op))
8198 int NumVAddrDwords =
8204 NumVDataDwords, NumVAddrDwords);
8205 }
else if (IsGFX11Plus) {
8207 UseNSA ? AMDGPU::MIMGEncGfx11NSA
8208 : AMDGPU::MIMGEncGfx11Default,
8209 NumVDataDwords, NumVAddrDwords);
8210 }
else if (IsGFX10Plus) {
8212 UseNSA ? AMDGPU::MIMGEncGfx10NSA
8213 : AMDGPU::MIMGEncGfx10Default,
8214 NumVDataDwords, NumVAddrDwords);
8218 NumVDataDwords, NumVAddrDwords);
8221 "requested image instruction is not supported on this GPU");
8226 NumVDataDwords, NumVAddrDwords);
8229 NumVDataDwords, NumVAddrDwords);
8235 if (
auto MemOp = dyn_cast<MemSDNode>(
Op)) {
8240 if (BaseOpcode->AtomicX2) {
8245 if (BaseOpcode->Store)
8249 NumVDataDwords, IsAtomicPacked16Bit,
DL);
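// Addresses are emitted NSA-style (one register per address dword) only when
// the target has NSA encoding, the address count reaches the NSA threshold,
// and it either fits NSAMaxSize or partial NSA is available; with partial NSA
// the trailing addresses are folded back into one contiguous register tuple.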
  if (!Offset->isDivergent()) {
    // ...
      return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
    // ...
  }

  unsigned NumLoads = 1;
  // ...
  if (NumElts == 8 || NumElts == 16) {
    NumLoads = NumElts / 4;
    // ...
  }
  // ...
  setBufferOffsets(Offset, DAG, &Ops[3],
                   NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
  // ...
  for (unsigned i = 0; i < NumLoads; ++i) {
    // ...
  }

  if (NumElts == 8 || NumElts == 16)
    // ...
  EVT VT = Op.getValueType();
  // ...
  unsigned IntrinsicID = Op.getConstantOperandVal(0);
  // ...
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_implicit_buffer_ptr: {
    // ...
    return getPreloadedValue(DAG, *MFI, VT, /* ... */);
  }
  case Intrinsic::amdgcn_dispatch_ptr:
  case Intrinsic::amdgcn_queue_ptr: {
    // ...
          MF.getFunction(), "unsupported hsa intrinsic without hsa target",
    // ...
    auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ? /* ... */;
    return getPreloadedValue(DAG, *MFI, VT, RegID);
  }
  case Intrinsic::amdgcn_implicitarg_ptr: {
    // ...
      return getImplicitArgPtr(DAG, DL);
    return getPreloadedValue(DAG, *MFI, VT, /* ... */);
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr: {
    // ...
    return getPreloadedValue(DAG, *MFI, VT, /* ... */);
  }
  case Intrinsic::amdgcn_dispatch_id: {
    // ...
  }
  case Intrinsic::amdgcn_rcp:
    // ...
  case Intrinsic::amdgcn_rsq:
    // ...
  case Intrinsic::amdgcn_rsq_legacy:
    // ...
  case Intrinsic::amdgcn_rcp_legacy:
    // ...
  case Intrinsic::amdgcn_rsq_clamp: {
    // ...
  }
  case Intrinsic::r600_read_ngroups_x:
    // ...
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    /* ... */);
  case Intrinsic::r600_read_ngroups_y:
    // ...
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    /* ... */);
  case Intrinsic::r600_read_ngroups_z:
    // ...
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    /* ... */);
  case Intrinsic::r600_read_global_size_x:
    // ...
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    /* ... */);
  case Intrinsic::r600_read_global_size_y:
    // ...
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    /* ... */);
  case Intrinsic::r600_read_global_size_z:
    // ...
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    /* ... */);
  case Intrinsic::r600_read_local_size_x:
    // ...
    return lowerImplicitZextParam(DAG, Op, MVT::i16, /* ... */);
  case Intrinsic::r600_read_local_size_y:
    // ...
    return lowerImplicitZextParam(DAG, Op, MVT::i16, /* ... */);
  case Intrinsic::r600_read_local_size_z:
    // ...
    return lowerImplicitZextParam(DAG, Op, MVT::i16, /* ... */);
  case Intrinsic::amdgcn_workgroup_id_x:
    return getPreloadedValue(DAG, *MFI, VT, /* ... */);
  case Intrinsic::amdgcn_workgroup_id_y:
    return getPreloadedValue(DAG, *MFI, VT, /* ... */);
  case Intrinsic::amdgcn_workgroup_id_z:
    return getPreloadedValue(DAG, *MFI, VT, /* ... */);
  case Intrinsic::amdgcn_wave_id:
    return lowerWaveID(DAG, Op);
  case Intrinsic::amdgcn_lds_kernel_id: {
    // ...
      return getLDSKernelId(DAG, DL);
    return getPreloadedValue(DAG, *MFI, VT, /* ... */);
  }
  case Intrinsic::amdgcn_workitem_id_x:
    return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
  case Intrinsic::amdgcn_workitem_id_y:
    return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
  case Intrinsic::amdgcn_workitem_id_z:
    return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
  case Intrinsic::amdgcn_wavefrontsize:
    // ...
  case Intrinsic::amdgcn_s_buffer_load: {
    unsigned CPol = Op.getConstantOperandVal(3);
    // ...
    return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
                        Op.getOperand(3), /* ... */);
  }
  case Intrinsic::amdgcn_fdiv_fast:
    return lowerFDIV_FAST(Op, DAG);
  case Intrinsic::amdgcn_sin:
    // ...
  case Intrinsic::amdgcn_cos:
    // ...
  case Intrinsic::amdgcn_mul_u24:
    // ...
  case Intrinsic::amdgcn_mul_i24:
    // ...
  case Intrinsic::amdgcn_log_clamp: {
    // ...
  }
  case Intrinsic::amdgcn_fract:
    // ...
  case Intrinsic::amdgcn_class:
    // ...
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::amdgcn_div_fmas:
    // ...
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
    // ...
  case Intrinsic::amdgcn_div_fixup:
    // ...
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_div_scale: {
    // ...
    SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
    // ...
                       Denominator, Numerator);
  }
  case Intrinsic::amdgcn_icmp: {
    // ...
    if (Op.getOperand(1).getValueType() == MVT::i1 &&
        Op.getConstantOperandVal(2) == 0 &&
        // ...
  }
  case Intrinsic::amdgcn_fcmp: {
    // ...
  }
  case Intrinsic::amdgcn_ballot:
    // ...
  case Intrinsic::amdgcn_fmed3:
    // ...
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_fdot2:
    // ...
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
    // ...
  case Intrinsic::amdgcn_fmul_legacy:
    // ...
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::amdgcn_sffbh:
    // ...
  case Intrinsic::amdgcn_sbfe:
    // ...
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_ubfe:
    // ...
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_cvt_pkrtz:
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    // ...
    EVT VT = Op.getValueType();
    // ...
    if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
      // ...
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
      // ...
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
      // ...
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
      // ...
    // ...
      return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
    // ...
                       Op.getOperand(1), Op.getOperand(2));
  }
  case Intrinsic::amdgcn_fmad_ftz:
    // ...
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_if_break:
    // ...
                               Op->getOperand(1), Op->getOperand(2)), 0);
  case Intrinsic::amdgcn_groupstaticsize: {
    // ...
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    // ...
    unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ? /* ... */;
    SDValue Aperture = getSegmentAperture(AS, SL, DAG);
    // ...
  }
  case Intrinsic::amdgcn_perm:
    // ...
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_reloc_constant: {
    // ...
    auto RelocSymbol = cast<GlobalVariable>(
        // ...
  }
  case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
    if (Op.getOperand(4).getValueType() == MVT::i32)
      // ...
                       Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
                       Op.getOperand(3), IndexKeyi32);
  }
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
    if (Op.getOperand(6).getValueType() == MVT::i32)
      // ...
                       {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
                        Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
                        IndexKeyi32, Op.getOperand(7)});
  }
  case Intrinsic::amdgcn_addrspacecast_nonnull:
    return lowerADDRSPACECAST(Op, DAG);
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlane64:
    // ...
      return lowerImage(Op, ImageDimIntr, DAG, false);
  }
  // ...
  return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
                                                     unsigned NewOpcode) const {
  // ...
  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
  auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
  // ...
  auto *M = cast<MemSDNode>(Op);
  // ...
                                 M->getMemOperand());
// ...
                                                        unsigned NewOpcode) const {
  // ...
  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
  auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
  // ...
  auto *M = cast<MemSDNode>(Op);
  // ...
                                 M->getMemOperand());
  unsigned IntrID = Op.getConstantOperandVal(1);
  // ...
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    // ...
    unsigned IndexOperand = M->getConstantOperandVal(7);
    unsigned WaveRelease = M->getConstantOperandVal(8);
    unsigned WaveDone = M->getConstantOperandVal(9);

    unsigned OrderedCountIndex = IndexOperand & 0x3f;
    IndexOperand &= ~0x3f;
    unsigned CountDw = 0;
    // ...
      CountDw = (IndexOperand >> 24) & 0xf;
      IndexOperand &= ~(0xf << 24);
      if (CountDw < 1 || CountDw > 4) {
        // ...
            "ds_ordered_count: dword count must be between 1 and 4");
      }
    // ...
    if (WaveDone && !WaveRelease)
      // ...

    unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
    unsigned ShaderType = /* ... */;
    unsigned Offset0 = OrderedCountIndex << 2;
    unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
    // ...
      Offset1 |= (CountDw - 1) << 6;
    // ...
      Offset1 |= ShaderType << 2;

    unsigned Offset = Offset0 | (Offset1 << 8);
    // ...
                                   M->getVTList(), Ops, M->getMemoryVT(),
                                   M->getMemOperand());
  }
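  // The ds_ordered_count offset immediate is assembled from two bytes:
  // Offset0 carries the ordered-count index shifted by 2, and Offset1 packs
  // wave_release (bit 0), wave_done (bit 1), shader type (bits 2-3), the
  // add-vs-swap selector (bit 4) and, where the dword-count field exists,
  // count-1 (bits 6-7). The final immediate is Offset0 | (Offset1 << 8).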
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
    const bool IsFormat =
        IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
        IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;

    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
    // ...
    auto *M = cast<MemSDNode>(Op);
    return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
  }
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format: {
    const bool IsFormat =
        IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
        IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;

    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
    // ...
    return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
  }
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
    // ...
    EVT LoadVT = Op.getValueType();
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
    // ...
                                   Op->getVTList(), Ops, LoadVT,
                                   M->getMemOperand(),
        // ...
  }
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
    // ...
    EVT LoadVT = Op.getValueType();
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
    // ...
                                   Op->getVTList(), Ops, LoadVT,
                                   M->getMemOperand(),
        // ...
  }
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
    // ...
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    // ...
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
    // ...
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
    // ...
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
    // ...
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
    // ...
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
    // ...
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
    // ...
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
    // ...
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
    // ...
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
    // ...
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
    // ...
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
    // ...
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
    // ...
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
    // ...
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
    // ...
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
    // ...
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
    // ...
  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
    return lowerRawBufferAtomicIntrin(Op, DAG,
                                      /* ... */);
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         /* ... */);
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
    // ...
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
    // ...
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         /* ... */);
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         /* ... */);
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         /* ... */);
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         /* ... */);
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
    // ...
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
    // ...
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
    // ...
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
    // ...
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
    // ...
  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         /* ... */);
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
    // ...
    EVT VT = Op.getValueType();
    auto *M = cast<MemSDNode>(Op);
    // ...
                                   Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
    SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
    // ...
    EVT VT = Op.getValueType();
    auto *M = cast<MemSDNode>(Op);
    // ...
                                   Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_image_bvh_intersect_ray: {
    // ...
    SDValue NodePtr = M->getOperand(2);
    SDValue RayExtent = M->getOperand(3);
    SDValue RayOrigin = M->getOperand(4);
    // ...
    SDValue RayInvDir = M->getOperand(6);
    // ...
    const unsigned NumVDataDwords = 4;
    const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
    const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
    // ...
    const unsigned BaseOpcodes[2][2] = {
        {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
        {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
         AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
    // ...
                                     IsGFX12Plus ? AMDGPU::MIMGEncGfx12
                                     : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
                                                 : AMDGPU::MIMGEncGfx10NSA,
                                     NumVDataDwords, NumVAddrDwords);
    // ...
                                     IsGFX11 ? AMDGPU::MIMGEncGfx11Default
                                             : AMDGPU::MIMGEncGfx10Default,
                                     NumVDataDwords, NumVAddrDwords);
    // ...
    auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
      // ...
      if (Lanes[0].getValueSizeInBits() == 32) {
        for (unsigned I = 0; I < 3; ++I)
          // ...
      } else {
        // ...
                                  {Lanes[0], Lanes[1]})));
        // ...
                                  {Elt0, Lanes[0]})));
        // ...
                                  {Lanes[1], Lanes[2]})));
      }
    };

    if (UseNSA && IsGFX11Plus) {
      // ...
      for (unsigned I = 0; I < 3; ++I) {
        // ...
                                  {DirLanes[I], InvDirLanes[I]})));
      }
      // ...
    } else {
      packLanes(RayOrigin, true);
      packLanes(RayDir, true);
      packLanes(RayInvDir, false);
      // ...
      if (NumVAddrDwords > 12) {
        // ...
      }
    }
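    // The node pointer plus ray payload occupy 8 (32-bit node, A16), 9
    // (64-bit, A16), 11 (32-bit, full precision) or 12 (64-bit) VADDR dwords;
    // A16 halves the dword count of the packed origin/dir/inv_dir data.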
  case Intrinsic::amdgcn_global_atomic_fmin:
  case Intrinsic::amdgcn_global_atomic_fmax:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num: {
    // ...
    unsigned Opcode = 0;
    switch (IntrID) {
    case Intrinsic::amdgcn_global_atomic_fmin:
    case Intrinsic::amdgcn_global_atomic_fmin_num:
    case Intrinsic::amdgcn_flat_atomic_fmin:
    case Intrinsic::amdgcn_flat_atomic_fmin_num: {
      // ...
    }
    case Intrinsic::amdgcn_global_atomic_fmax:
    case Intrinsic::amdgcn_global_atomic_fmax_num:
    case Intrinsic::amdgcn_flat_atomic_fmax:
    case Intrinsic::amdgcn_flat_atomic_fmax_num: {
      // ...
    }
    }
    // ...
                                   Ops, M->getMemOperand());
  }
  case Intrinsic::amdgcn_s_get_barrier_state: {
    // ...
    bool IsInlinableBarID = false;
    // ...
    if (isa<ConstantSDNode>(Op->getOperand(2))) {
      BarID = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue();
      // ...
    }
    if (IsInlinableBarID) {
      Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
      // ...
    } else {
      Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
      // ...
    }
    // ...
  }
  // ...
      return lowerImage(Op, ImageDimIntr, DAG, true);
SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
                                              /* ... */
  bool IsTFE = VTList.NumVTs == 3;
  // ...
  unsigned NumOpDWords = NumValueDWords + 1;
  // ...
  SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
                                   OpDWordsVT, OpDWordsMMO, DAG);
  // ...
      (VT == MVT::v3i32 || VT == MVT::v3f32)) {
    // ...
                                 WidenedMemVT, WidenedMMO);
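// With TFE the intrinsic yields three values (data, status, chain) instead of
// two, so one extra result dword is added to the widened node and split back
// off afterwards; v3 results are likewise widened to v4 for the memory op.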
                                           bool ImageStore) const {
  // ...
    for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
      // ...
    }
    if ((NumElements % 2) == 1) {
      // ...
      unsigned I = Elts.size() / 2;
      // ...
    }
    // ...
    if (NumElements == 3) {
      // ...
    }
  unsigned IntrinsicID = Op.getConstantOperandVal(1);
  // ...
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_exp_compr: {
    // ...
          "intrinsic not supported on subtarget", DL.getDebugLoc());
    // ...
    unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
    // ...
  }
  case Intrinsic::amdgcn_s_barrier: {
    // ...
    unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
    if (WGSize <= ST.getWavefrontSize())
      // ...
                             Op.getOperand(0)), 0);
    // ...
    if (ST.hasSplitBarriers()) {
      // ...
                             MVT::Other, K, Op.getOperand(0)),
      // ...
    }
    // ...
  }
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
    // ...
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
    // ...
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
    // ...
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
    // ...
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
        IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
    // ...
      VData = handleD16VData(VData, DAG);
    // ...
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
    // ...
      return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
    // ...
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
        IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
    // ...
      VData = handleD16VData(VData, DAG);
    // ...
    auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
    // ...
      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
    // ...
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
    // ...
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
        IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
    unsigned OpOffset = HasVIndex ? 1 : 0;
    SDValue VOffset = Op.getOperand(5 + OpOffset);
    // ...
    unsigned Size = Op->getConstantOperandVal(4);
    // ...
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
    // ...
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
    // ...
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
    // ...
    if (HasVIndex && HasVOffset)
      // ...
    else if (HasVOffset)
      // ...
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    // ...
    unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
    // ...
    auto *M = cast<MemSDNode>(Op);
    // ...
  }
  case Intrinsic::amdgcn_global_load_lds: {
    // ...
    unsigned Size = Op->getConstantOperandVal(4);
    // ...
      Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
    // ...
      Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
    // ...
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
    // ...
    auto *M = cast<MemSDNode>(Op);
    // ...
    if (LHS->isDivergent())
      // ...
        RHS.getOperand(0).getValueType() == MVT::i32) {
      // ...
      VOffset = RHS.getOperand(0);
    }
    // ...
    if (!Addr->isDivergent()) {
      // ...
    }
    // ...
    LoadPtrI.Offset = Op->getConstantOperandVal(5);
    // ...
  }
  case Intrinsic::amdgcn_end_cf:
    // ...
                             Op->getOperand(2), Chain), 0);
  case Intrinsic::amdgcn_s_barrier_init:
  case Intrinsic::amdgcn_s_barrier_join:
  case Intrinsic::amdgcn_s_wakeup_barrier: {
    // ...
    bool IsInlinableBarID = false;
    // ...
    if (isa<ConstantSDNode>(BarOp)) {
      BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue();
      // ...
    }
    if (IsInlinableBarID) {
      switch (IntrinsicID) {
      // ...
      case Intrinsic::amdgcn_s_barrier_init:
        Opc = AMDGPU::S_BARRIER_INIT_IMM;
        break;
      case Intrinsic::amdgcn_s_barrier_join:
        Opc = AMDGPU::S_BARRIER_JOIN_IMM;
        break;
      case Intrinsic::amdgcn_s_wakeup_barrier:
        Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
        break;
      }
    } else {
      switch (IntrinsicID) {
      // ...
      case Intrinsic::amdgcn_s_barrier_init:
        Opc = AMDGPU::S_BARRIER_INIT_M0;
        break;
      case Intrinsic::amdgcn_s_barrier_join:
        Opc = AMDGPU::S_BARRIER_JOIN_M0;
        break;
      case Intrinsic::amdgcn_s_wakeup_barrier:
        Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
        break;
      }
    }

    if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) {
      // ...
      if (!IsInlinableBarID) {
        // ...
                               Op.getOperand(2), M0Val),
        // ...
      }
    } else if (!IsInlinableBarID) {
      // ...
    }
    // ...
  }
  // ...
      return lowerImage(Op, ImageDimIntr, DAG, true);
std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
    /* ... */
  if ((C1 = dyn_cast<ConstantSDNode>(N0)))
    // ...
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    // ...
  }
  // ...
  SDValue Ops[] = {N0, OverflowVal};
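// The combined byte offset is split between the instruction's immediate field
// and a register operand: bits above the maximum encodable immediate
// (Overflow) are peeled off into the register, and if that split would leave
// a negative overflow the whole immediate is folded back into the register.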
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                        /* ... */
                                        Align Alignment) const {
  // ...
  if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
    // ...
    if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
      // ...
    }
  }
  // ...
    int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
    // ...
        TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
      // ...
    }
SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
                                                /* ... */
    return MaybePointer;
  // ...
  SDValue NumRecords = Op->getOperand(3);
  // ...
  auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
  // ...
  std::optional<uint32_t> ConstStride = std::nullopt;
  if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
    ConstStride = ConstNode->getZExtValue();
  // ...
  if (!ConstStride || *ConstStride != 0) {
    // ...
      ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
    // ...
  }
  // ...
                             NewHighHalf, NumRecords, Flags);
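// The buffer resource descriptor is built from the pointer's low and high
// halves; the stride is shifted left by 16 and merged into the high half,
// with NumRecords and the format/flags word filling the remaining dwords.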
                                                 bool IsTFE) const {
  // ...
  SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
  // ...
  if (VDataType == MVT::f16 || VDataType == MVT::bf16)
    // ...
  Ops[1] = BufferStoreExt;
  // ...
                                 M->getMemOperand());
SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
                                    DAGCombinerInfo &DCI) const {
  // ...
  if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
      // ...
         "unexpected vector extload");
  // ...
         "unexpected fp extload");
  // ...
  DCI.AddToWorklist(Cvt.getNode());
  // ...
  DCI.AddToWorklist(Cvt.getNode());
  // ...
  if (Info.isEntryFunction())
    return Info.getUserSGPRInfo().hasFlatScratchInit();
  EVT MemVT = Load->getMemoryVT();
  // ...
    EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
    // ...
                               BasePtr, RealMemVT, MMO);
  // ...
  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
         "Custom lowering for non-i32 vectors hasn't been implemented.");
  // ...
  unsigned AS = Load->getAddressSpace();
  // ...
    if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
      // ...
    }
  // ...
        Alignment >= Align(4) && NumElements < 32) {
      // ...
    }
  // ...
    if (NumElements > 4)
      // ...
    if (NumElements > 2)
      // ...
    if (NumElements > 4)
      // ...
    auto Flags = Load->getMemOperand()->getFlags();
    // ...
                                       Load->getAlign(), Flags, &Fast) &&
    // ...
                                        MemVT, *Load->getMemOperand())) {
      // ...
    }
Op.getValueType();
10434 EVT VT =
Op.getValueType();
10437 bool AllowInaccurateRcp =
Flags.hasApproximateFuncs() ||
10444 if (!AllowInaccurateRcp && VT != MVT::f16)
10447 if (CLHS->isExactlyValue(1.0)) {
10464 if (CLHS->isExactlyValue(-1.0)) {
10473 if (!AllowInaccurateRcp && (VT != MVT::f16 || !
Flags.hasAllowReciprocal()))
10487 EVT VT =
Op.getValueType();
10490 bool AllowInaccurateDiv =
Flags.hasApproximateFuncs() ||
10492 if (!AllowInaccurateDiv)
10513 return DAG.
getNode(Opcode, SL, VT,
A,
B, Flags);
10526 return DAG.
getNode(Opcode, SL, VTList,
10535 return DAG.
getNode(Opcode, SL, VT, {
A,
B,
C}, Flags);
10548 return DAG.
getNode(Opcode, SL, VTList,
10554 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
10555 return FastLowered;
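// The fast path rewrites x/y as x * rcp(y) when approximate-function or
// reciprocal flags permit, folding 1.0/y and -1.0/y straight into a (negated)
// rcp; otherwise lowering falls through to the correctly rounded expansion.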
  const APFloat K0Val(0x1p+96f);
  // ...
  const APFloat K1Val(0x1p-32f);
  // ...
  assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
  uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
  uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
  // ...
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;
  // ...
  Flags.setNoFPExcept(true);
  // ...
                             DenominatorScaled, Flags);
  // ...
                             DenominatorScaled, Flags);

  using namespace AMDGPU::Hwreg;
  const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
  // ...
  const bool HasDynamicDenormals =
      // ...
  if (!PreservesDenormals) {
    // ...
    if (HasDynamicDenormals) {
      // ...
      SavedDenormMode = SDValue(GetReg, 0);
      // ...
    }
    const SDValue EnableDenormValue =
        // ...
      EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
                                        {EnableDenormValue, BitField, Glue});
    // ...
  }
  // ...
                             ApproxRcp, One, NegDivScale0, Flags);
  // ...
                             ApproxRcp, Fma0, Flags);
  // ...
                             Fma1, Fma1, Flags);
  // ...
                             NumeratorScaled, Mul, Flags);
  // ...
                             Fma2, Fma1, Mul, Fma2, Flags);
  // ...
                             NumeratorScaled, Fma3, Flags);

  if (!PreservesDenormals) {
    // ...
                                          Fma4.getValue(1), DisableDenormValue,
    // ...
    assert(HasDynamicDenormals == (bool)SavedDenormMode);
    const SDValue DisableDenormValue =
        HasDynamicDenormals
            // ...
          AMDGPU::S_SETREG_B32, SL, MVT::Other,
    // ...
  }
  // ...
                             {Fma4, Fma1, Fma3, Scale}, Flags);
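// The f32 division expansion follows the div_scale / rcp / Newton-Raphson
// pattern: Fma0 and Fma1 refine the reciprocal estimate, Fma2 through Fma4
// refine the scaled quotient, and the final div_fmas/div_fixup step over
// {Fma4, Fma1, Fma3, Scale} produces the correctly rounded result. Denormal
// flushing is toggled around the sequence via S_SETREG/S_DENORM_MODE when
// the current FP mode does not already preserve f32 denormals.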
  if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
    return FastLowered;
  // ...
                             NegDivScale0, Mul, DivScale1);
  // ...
                             Fma4, Fma3, Mul, Scale);
  // ...
  EVT VT = Op.getValueType();

  if (VT == MVT::f32)
    return LowerFDIV32(Op, DAG);

  if (VT == MVT::f64)
    return LowerFDIV64(Op, DAG);

  if (VT == MVT::f16)
    return LowerFDIV16(Op, DAG);
  // ...
  EVT ResultExpVT = Op->getValueType(1);
  EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
  if (VT == MVT::i1) {
    // ...
                               Store->getBasePtr(), MVT::i1,
                               Store->getMemOperand());
  }

  assert(Store->getValue().getValueType().getScalarType() == MVT::i32);
  // ...
  unsigned AS = Store->getAddressSpace();
  // ...
    if (NumElements > 4)
      // ...
                                        VT, *Store->getMemOperand()))
      // ...
    if (NumElements > 2)
      // ...
    if (NumElements > 4 ||
        // ...
    auto Flags = Store->getMemOperand()->getFlags();
  // ...
  MVT VT = Op.getValueType().getSimpleVT();
  EVT VT = Op.getValueType();
  // ...
  switch (Op.getOpcode()) {
  // ...
  }
  // ...
  EVT VT = Op.getValueType();
  // ...
                                                     DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  // ...
  if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
    // ...
  EVT SrcVT = Src.getValueType();
  // ...
  if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
    // ...
    DCI.AddToWorklist(Cvt.getNode());
    // ...
    if (ScalarVT != MVT::f32) {
      // ...
    }
  }
  // ...
                                                 DAGCombinerInfo &DCI) const {
  SDValue MagnitudeOp = N->getOperand(0);
  SDValue SignOp = N->getOperand(1);
  // ...
                                          unsigned AddrSpace,
                                          /* ... */
                                          DAGCombinerInfo &DCI) const {
  // ...
  AM.HasBaseReg = true;
  AM.BaseOffs = Offset.getSExtValue();
  // ...
  EVT VT = N->getValueType(0);
  // ...
  Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
                          // ...
  switch (N->getOpcode()) {
  // ...
  }
  // ...
                                                DAGCombinerInfo &DCI) const {
  // ...
  SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
                                        N->getMemoryVT(), DCI);
  // ...
  NewOps[PtrIdx] = NewPtr;
  // ...
  return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
         (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
         // ...
SDValue SITargetLowering::splitBinaryBitConstantOp(
    DAGCombinerInfo &DCI,
    /* ... */
  if (V.getValueType() != MVT::i1)
    // ...
  switch (V.getOpcode()) {
  // ...
  }
  // ...
  if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
  if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
  if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
  if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
  uint32_t NonZeroByteMask = ~ZeroByteMask;
  if ((NonZeroByteMask & C) != NonZeroByteMask)
    // ...
  assert(V.getValueSizeInBits() == 32);

  if (V.getNumOperands() != 2)
    // ...
  switch (V.getOpcode()) {
  // ...
    return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
  // ...
    return (0x03020100 & ~ConstMask) | ConstMask;
  // ...
    return uint32_t((0x030201000c0c0c0cull << C) >> 32);
  // ...
    return uint32_t(0x0c0c0c0c03020100ull >> C);
  }
  // ...
                                           DAGCombinerInfo &DCI) const {
  if (DCI.isBeforeLegalize())
    // ...
  EVT VT = N->getValueType(0);
  // ...
  if (VT == MVT::i64 && CRHS) {
    // ...
  }
  if (CRHS && VT == MVT::i32) {
    // ...
    if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
      unsigned Shift = CShift->getZExtValue();
      // ...
      unsigned Offset = NB + Shift;
      if ((Offset & (Bits - 1)) == 0) {
        // ...
                                  LHS->getOperand(0),
        // ...
      }
    }
    // ...
        isa<ConstantSDNode>(LHS.getOperand(2))) {
      // ...
      Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
      // ...
    }
  }
  // ...
    if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
        // ...
    if (X != LHS.getOperand(1))
      // ...
      (RHS.getOperand(0) == LHS.getOperand(0) &&
       LHS.getOperand(0) == LHS.getOperand(1))) {
    // ...
                       Mask->getZExtValue() & ~OrdMask :
                       Mask->getZExtValue() & OrdMask;
    // ...
  }

  if (VT == MVT::i32 &&
      // ...
      N->isDivergent() &&
      TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
    // ...
    if (LHSMask != ~0u && RHSMask != ~0u) {
      // ...
      if (LHSMask > RHSMask) {
        // ...
      }
      // ...
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      // ...
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // ...
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // ...
        for (unsigned I = 0; I < 32; I += 8) {
          // ...
          if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
            Mask &= (0x0c << I) & 0xffffffff;
        }
        // ...
                           LHS.getOperand(0), RHS.getOperand(0),
        // ...
      }
    }
  }
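// V_PERM_B32 picks each destination byte with a per-byte selector: values
// 0-7 index a byte of the two source registers, while the special selector
// 0x0c yields a constant zero byte, which is why 0x0c0c0c0c appears
// throughout these combines as the "unused/zero byte" mask.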
static const std::optional<ByteProvider<SDValue>>
calculateSrcByte(/* ... */, unsigned Depth = 0) {
  // ...
    return std::nullopt;

  if (Op.getValueSizeInBits() < 8)
    return std::nullopt;

  if (Op.getValueType().isVector())
    // ...
  switch (Op->getOpcode()) {
  // ...
    auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
    NarrowVT = VTSign->getVT();
    // ...
      return std::nullopt;
    // ...
    if (SrcIndex >= NarrowByteWidth)
      return std::nullopt;
    // ...
    auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    // ...
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getZExtValue();
    // ...
    if (BitShift % 8 != 0)
      return std::nullopt;

    SrcIndex += BitShift / 8;
    // ...
  }
}

static const std::optional<ByteProvider<SDValue>>
calculateByteProvider(/* ... */, unsigned StartingIndex = 0) {
  // ...
    return std::nullopt;
  // ...
  unsigned BitWidth = Op.getScalarValueSizeInBits();
  // ...
    return std::nullopt;
  // ...
    return std::nullopt;

  bool IsVec = Op.getValueType().isVector();
  switch (Op.getOpcode()) {
  // ...
      return std::nullopt;
    // ...
      return std::nullopt;
    // ...
      return std::nullopt;
    // ...
    if (!LHS->isConstantZero() && !RHS->isConstantZero())
      return std::nullopt;
    if (!LHS || LHS->isConstantZero())
      // ...
    if (!RHS || RHS->isConstantZero())
      // ...
    return std::nullopt;
  // ...
      return std::nullopt;

    auto BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    // ...
      return std::nullopt;

    uint32_t BitMask = BitMaskOp->getZExtValue();
    // ...
    if ((IndexMask & BitMask) != IndexMask) {
      // ...
      if (IndexMask & BitMask)
        return std::nullopt;
      // ...
    }
    // ...
      return std::nullopt;

    auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
    if (!ShiftOp || Op.getValueType().isVector())
      return std::nullopt;

    uint64_t BitsProvided = Op.getValueSizeInBits();
    if (BitsProvided % 8 != 0)
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
    // ...
      return std::nullopt;

    uint64_t ConcatSizeInBytes = BitsProvided / 4;
    uint64_t ByteShift = BitShift / 8;

    uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
    uint64_t BytesProvided = BitsProvided / 8;
    SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
    NewIndex %= BytesProvided;
    // ...
      return std::nullopt;

    auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    // ...
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getZExtValue();
    // ...
      return std::nullopt;

    auto BitsProvided = Op.getScalarValueSizeInBits();
    if (BitsProvided % 8 != 0)
      return std::nullopt;

    uint64_t BytesProvided = BitsProvided / 8;
    uint64_t ByteShift = BitShift / 8;
    // ...
    return BytesProvided - ByteShift > Index
               ? /* ... */
    // ...
      return std::nullopt;

    auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    // ...
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getZExtValue();
    if (BitShift % 8 != 0)
      return std::nullopt;
    uint64_t ByteShift = BitShift / 8;
    // ...
    return Index < ByteShift
               ? /* ... */
               : calculateByteProvider(/* ... */, Depth + 1, StartingIndex);
  // ...
      return std::nullopt;
    // ...
    auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
    NarrowBitWidth = VTSign->getVT().getSizeInBits();
    // ...
    if (NarrowBitWidth % 8 != 0)
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowBitWidth / 8;

    if (Index >= NarrowByteWidth)
      // ...
          ? std::optional<ByteProvider<SDValue>>(
                // ...
    return std::nullopt;
  // ...
    if (NarrowByteWidth >= Index) {
      // ...
    }
    return std::nullopt;
  // ...
    return std::nullopt;
  // ...
    auto L = cast<LoadSDNode>(Op.getNode());

    unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
    if (NarrowBitWidth % 8 != 0)
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowBitWidth / 8;
    // ...
    if (Index >= NarrowByteWidth) {
      // ...
          ? std::optional<ByteProvider<SDValue>>(
                // ...
    }
    if (NarrowByteWidth > Index) {
      // ...
    }
    return std::nullopt;
  // ...
    return std::nullopt;
  // ...
    return calculateByteProvider(/* ... */, Depth + 1, StartingIndex);
  // ...
    auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    // ...
      return std::nullopt;
    auto VecIdx = IdxOp->getZExtValue();
    auto ScalarSize = Op.getScalarValueSizeInBits();
    if (ScalarSize < 32)
      Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
    // ...
                            StartingIndex, Index);
  // ...
      return std::nullopt;

    auto PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
    // ...
      return std::nullopt;
    // ...
        (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
    if (IdxMask > 0x07 && IdxMask != 0x0c)
      return std::nullopt;

    auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
    auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;

    return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
                           : /* ... */;
  // ...
    return std::nullopt;
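// calculateByteProvider walks the DAG backwards from a 32-bit value and
// reports, for each byte index, which node (and which byte of it) supplies
// that byte, or whether it is a known zero. This analysis feeds the v_perm
// and dot-product combines below.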
  return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
  // ...
  auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
  // ...
  auto MemVT = L->getMemoryVT();
  // ...
  return L->getMemoryVT().getSizeInBits() == 16;
  // ...
  int Low8 = Mask & 0xff;
  int Hi8 = (Mask & 0xff00) >> 8;

  assert(Low8 < 8 && Hi8 < 8);

  bool IsConsecutive = (Hi8 - Low8 == 1);
  // ...
  bool Is16Aligned = !(Low8 % 2);

  return IsConsecutive && Is16Aligned;
  // ...
  int Low16 = PermMask & 0xffff;
  int Hi16 = (PermMask & 0xffff0000) >> 16;
  // ...
  auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
                        // ...
  if (!OtherOpIs16Bit)
    // ...
                                unsigned DWordOffset) {
  // ...
  auto TypeSize = Src.getValueSizeInBits().getFixedValue();
  // ...
  assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
  // ...
  if (Src.getValueType().isVector()) {
    auto ScalarTySize = Src.getScalarValueSizeInBits();
    auto ScalarTy = Src.getValueType().getScalarType();
    if (ScalarTySize == 32) {
      // ...
    }
    if (ScalarTySize > 32) {
      // ...
          DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
      auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
      // ...
    }
    assert(ScalarTySize < 32);
    auto NumElements = TypeSize / ScalarTySize;
    auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
    auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
    auto NumElementsIn32 = 32 / ScalarTySize;
    auto NumAvailElements = DWordOffset < Trunc32Elements
                                ? /* ... */
                                : NumElements - NormalizedTrunc;
    // ...
  }
  // ...
  auto ShiftVal = 32 * DWordOffset;
  // ...
  [[maybe_unused]] EVT VT = N->getValueType(0);
  // ...
  for (int i = 0; i < 4; i++) {
    // ...
    std::optional<ByteProvider<SDValue>> P =
        // ...
    if (!P || P->isConstantZero())
      // ...
  }
  if (PermNodes.size() != 4)
    // ...
  std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
  std::optional<std::pair<unsigned, unsigned>> SecondSrc;
  // ...
  for (size_t i = 0; i < PermNodes.size(); i++) {
    auto PermOp = PermNodes[i];
    // ...
    int SrcByteAdjust = 4;
    // ...
    if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
        ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
      // ...
        if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
            ((PermOp.SrcOffset / 4) != SecondSrc->second))
          // ...
      SecondSrc = {i, PermNodes[i].SrcOffset / 4};
      assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
      // ...
    }
    assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
    // ...
    PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
  }
  // ...
  SDValue Op = *PermNodes[FirstSrc.first].Src;
  // ...
  assert(Op.getValueSizeInBits() == 32);
  // ...
  int Low16 = PermMask & 0xffff;
  int Hi16 = (PermMask & 0xffff0000) >> 16;

  bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
  bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
  // ...
  if (WellFormedLow && WellFormedHi)
    // ...
  SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
  // ...
  assert(Op.getValueType().isByteSized() &&
         // ...
                                         DAGCombinerInfo &DCI) const {
  // ...
  EVT VT = N->getValueType(0);
  if (VT == MVT::i1) {
    // ...
    if (Src != RHS.getOperand(0))
      // ...
    if (!CLHS || !CRHS)
      // ...
    static const uint32_t MaxMask = 0x3ff;
    // ...
  }
  // ...
      isa<ConstantSDNode>(LHS.getOperand(2))) {
    // ...
    Sel |= LHS.getConstantOperandVal(2);
    // ...
  }
  // ...
      N->isDivergent() &&
      TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
    // ...
    auto usesCombinedOperand = [](SDNode *OrUse) {
      // ...
          !OrUse->getValueType(0).isVector())
        // ...
      for (auto VUse : OrUse->uses()) {
        if (!VUse->getValueType(0).isVector())
          // ...
        if (VUse->getOpcode() == VectorwiseOp)
          // ...
      }
      // ...
    };

    if (!any_of(N->uses(), usesCombinedOperand))
      // ...
    if (LHSMask != ~0u && RHSMask != ~0u) {
      // ...
      if (LHSMask > RHSMask) {
        // ...
      }
      // ...
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      // ...
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // ...
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // ...
        LHSMask &= ~RHSUsedLanes;
        RHSMask &= ~LHSUsedLanes;
        // ...
        LHSMask |= LHSUsedLanes & 0x04040404;
        // ...
                           LHS.getOperand(0), RHS.getOperand(0),
        // ...
      }
    }
    if (LHSMask == ~0u || RHSMask == ~0u) {
      // ...
    }
  }
  if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
    // ...
  if (SrcVT == MVT::i32) {
    // ...
    DCI.AddToWorklist(LowOr.getNode());
    DCI.AddToWorklist(HiBits.getNode());
    // ...
  }

  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
  // ...
                                        N->getOperand(0), CRHS))
    // ...
                                          DAGCombinerInfo &DCI) const {
  if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
    // ...
  EVT VT = N->getValueType(0);
  if (CRHS && VT == MVT::i64) {
    // ...
  }
  // ...
                     LHS->getOperand(0), FNegLHS, FNegRHS);
  // ...
                                                 DAGCombinerInfo &DCI) const {
  // ...
  EVT VT = N->getValueType(0);
  if (VT != MVT::i32)
    // ...
  if (Src.getValueType() != MVT::i16)
    // ...
SITargetLowering::performSignExtendInRegCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  // ...
  auto *VTSign = cast<VTSDNode>(N->getOperand(1));
  // ...
       VTSign->getVT() == MVT::i8) ||
      // ...
       VTSign->getVT() == MVT::i16))) {
    // ...
           "s_buffer_load_{u8, i8} are supported "
           "in GFX12 (or newer) architectures.");
    EVT VT = Src.getValueType();
    // ...
    SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
    // ...
    auto *M = cast<MemSDNode>(Src);
    SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
        Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
    // ...
  }
  // ...
       VTSign->getVT() == MVT::i8) ||
      // ...
       VTSign->getVT() == MVT::i16)) &&
      // ...
    auto *M = cast<MemSDNode>(Src);
    // ...
    SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
                                         Src.getOperand(0).getValueType());
    // ...
    SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
        Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
    return DCI.DAG.getMergeValues({BufferLoadSignExt,
                                   // ...
                                            DAGCombinerInfo &DCI) const {
  // ...
  if (N->getOperand(0).isUndef())
    // ...
                                         DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  // ...
    return DCI.DAG.getConstantFP(
        // ...
  unsigned Opcode = Op.getOpcode();
  // ...
  if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
    const auto &F = CFP->getValueAPF();
    if (F.isNaN() && F.isSignaling())
      // ...
    if (!F.isDenormal())
      // ...
  }
  // ...
  if (Op.getValueType() == MVT::i32) {
    // ...
    if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
      if (RHS->getZExtValue() == 0xffff0000) {
        // ...
      }
    }
  }
  // ...
  return Op.getValueType().getScalarType() != MVT::f16;
  // ...
  if (Op.getValueType() == MVT::i16) {
    // ...
  }
  // ...
  unsigned IntrinsicID = Op.getConstantOperandVal(0);
  // ...
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_cvt_pkrtz:
  case Intrinsic::amdgcn_cubeid:
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_fdot2:
  case Intrinsic::amdgcn_rcp:
  case Intrinsic::amdgcn_rsq:
  case Intrinsic::amdgcn_rsq_clamp:
  case Intrinsic::amdgcn_rcp_legacy:
  case Intrinsic::amdgcn_rsq_legacy:
  case Intrinsic::amdgcn_trig_preop:
  case Intrinsic::amdgcn_log:
  case Intrinsic::amdgcn_exp2:
  case Intrinsic::amdgcn_sqrt:
    // ...
  }
  unsigned Opcode = MI->getOpcode();
  // ...
  if (Opcode == AMDGPU::G_FCANONICALIZE)
    // ...
  std::optional<FPValueAndVReg> FCR;
  // ...
    if (FCR->Value.isSignaling())
      // ...
    if (!FCR->Value.isDenormal())
      // ...
  switch (Opcode) {
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FCEIL:
  case AMDGPU::G_FFLOOR:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FSQRT:
  case AMDGPU::G_FDIV:
  case AMDGPU::G_FREM:
  case AMDGPU::G_FPOW:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_FLOG:
  case AMDGPU::G_FLOG2:
  case AMDGPU::G_FLOG10:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
    // ...
  case AMDGPU::G_FNEG:
  case AMDGPU::G_FABS:
  case AMDGPU::G_FCOPYSIGN:
    // ...
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM: {
    // ...
  }
  case AMDGPU::G_BUILD_VECTOR:
    // ...
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT:
    switch (/* ... */) {
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmad_ftz:
    case Intrinsic::amdgcn_sqrt:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_cos:
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_div_scale:
    case Intrinsic::amdgcn_div_fmas:
    case Intrinsic::amdgcn_div_fixup:
    case Intrinsic::amdgcn_fract:
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cubema:
    case Intrinsic::amdgcn_cubesc:
    case Intrinsic::amdgcn_cubetc:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_trig_preop:
      // ...
    }
SDValue SITargetLowering::getCanonicalConstantFP(
    /* ... */
  if (C.isDenormal()) {
    // ...
  }
  if (C.isSignaling()) {
    // ...
  }
  // ...
  return Op.isUndef() || isa<ConstantFPSDNode>(Op);
}

SDValue SITargetLowering::performFCanonicalizeCombine(
    /* ... */
    DAGCombinerInfo &DCI) const {
  // ...
  EVT VT = N->getValueType(0);
  // ...
    EVT VT = N->getValueType(0);
    return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
  // ...
    EVT EltVT = Lo.getValueType();
    // ...
    for (unsigned I = 0; I != 2; ++I) {
      // ...
        NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
                                            CFP->getValueAPF());
      } else if (Op.isUndef()) {
        // ...
      }
    }
    // ...
    if (isa<ConstantFPSDNode>(NewElts[1]))
      NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ? /* ... */;
    // ...
      NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ? /* ... */;
  if (!MinK || !MaxK)
    // ...
  if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
    return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
  // ...
  if (Info->getMode().DX10Clamp) {
    // ...
  }
  if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
    // ...
  }
  // ...
  return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
  // ...
  return (VT == MVT::f32 || VT == MVT::f16) && Subtarget.hasIEEEMinMax3();
  // ...
  return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
                                             DAGCombinerInfo &DCI) const {
  // ...
  EVT VT = N->getValueType(0);
  unsigned Opc = N->getOpcode();
  // ...
                         N->getValueType(0),
      // ...
                         N->getValueType(0),
      // ...
    if (SDValue Med3 = performIntMed3ImmCombine(
            // ...
    if (SDValue Med3 = performIntMed3ImmCombine(
            // ...
    if (SDValue Med3 = performIntMed3ImmCombine(
            // ...
    if (SDValue Med3 = performIntMed3ImmCombine(
            // ...
      (VT == MVT::f32 || VT == MVT::f64 ||
       // ...
    if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
      // ...
  return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
         (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
  // ...
                                            DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  // ...
  if (Info->getMode().DX10Clamp) {
    // ...
    if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
      // ...
    if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
      // ...
    if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
      // ...
  }
  // ...
                                               DAGCombinerInfo &DCI) const {
  // ...
    return DCI.DAG.getUNDEF(N->getValueType(0));
                                                  bool IsDivergentIdx,
                                                  /* ... */
  unsigned VecSize = EltSize * NumElem;
  // ...
  if (VecSize <= 64 && EltSize < 32)
    // ...
  if (IsDivergentIdx)
    // ...
  unsigned NumInsts = NumElem + ((EltSize + 31) / 32) * NumElem;
  // ...
    return NumInsts <= 16;
  // ...
  return NumInsts <= 15;
  // ...
  if (isa<ConstantSDNode>(Idx))
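// Expanding a dynamic vector index into a compare/select chain costs roughly
// one compare per element plus one conditional move per 32-bit chunk of each
// element, so the expansion is only allowed up to a small instruction budget.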
SDValue SITargetLowering::performExtractVectorEltCombine(
    SDNode *N, DAGCombinerInfo &DCI) const {
  // ...
  EVT ResVT = N->getValueType(0);
  // ...
  if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
    // ...
    DCI.AddToWorklist(Elt0.getNode());
    DCI.AddToWorklist(Elt1.getNode());
    // ...
  }
  // ...
  if (!DCI.isBeforeLegalize())
    // ...
  auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
      VecSize > 32 && VecSize % 32 == 0 && Idx) {
    // ...
    unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
    unsigned EltIdx = BitIndex / 32;
    unsigned LeftoverBitIdx = BitIndex % 32;
    // ...
    DCI.AddToWorklist(Cast.getNode());
    // ...
    DCI.AddToWorklist(Elt.getNode());
    // ...
    DCI.AddToWorklist(Srl.getNode());
    // ...
    DCI.AddToWorklist(Trunc.getNode());
    // ...
    if (VecEltVT == ResVT) {
      // ...
    }
  }

SDValue
SITargetLowering::performInsertVectorEltCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  EVT IdxVT = Idx.getValueType();
  // ...
      Src.getOperand(0).getValueType() == MVT::f16) {
    return Src.getOperand(0);
  }

  if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
    APFloat Val = CFP->getValueAPF();
    bool LosesInfo = true;
    // ...
  }
  // ...
                                               DAGCombinerInfo &DCI) const {
  // ...
         "combine only useful on gfx8");

  SDValue TruncSrc = N->getOperand(0);
  EVT VT = N->getValueType(0);
  if (VT != MVT::f16)
    // ...
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
                                          /* ... */
                                          const SDNode *N1) const {
  // ...
  if (((VT == MVT::f32 &&
        // ...
       (VT == MVT::f16 && Subtarget->hasMadF16() &&
        // ...
  // ...
  EVT VT = N->getValueType(0);
  if (VT != MVT::i32 && VT != MVT::i64)
    // ...
  unsigned Opc = N->getOpcode();
  // ...
  return DAG.getNode(Opc, SL, VT, Add1, Op2);
  // ...
                                          DAGCombinerInfo &DCI) const {
  // ...
  EVT VT = N->getValueType(0);
  // ...
  if (!N->isDivergent() && Subtarget->hasSMulHi())
    // ...
  if (NumBits <= 32 || NumBits > 64)
    // ...
  unsigned NumUsers = 0;
  // ...
  bool MulSignedLo = false;
  if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
    // ...
  }
  // ...
  if (VT != MVT::i64) {
    // ...
  }
  // ...
      getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);

  if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
    // ...
    std::tie(AccumLo, AccumHi) = DAG.SplitScalar(Accum, SL, MVT::i32,
                                                 MVT::i32);
    // ...
    if (!MulLHSUnsigned32) {
      // ...
    }
    if (!MulRHSUnsigned32) {
      // ...
    }
  }
  // ...
  if (VT != MVT::i64)
    // ...
static std::optional<ByteProvider<SDValue>>
handleMulOperand(/* ... */) {
  // ...
  if (!Byte0 || Byte0->isConstantZero()) {
    return std::nullopt;
  }
  // ...
  if (Byte1 && !Byte1->isConstantZero()) {
    return std::nullopt;
  }
  // ...
}
// ...
  unsigned FirstCs = First & 0x0c0c0c0c;
  unsigned SecondCs = Second & 0x0c0c0c0c;
  unsigned FirstNoCs = First & ~0x0c0c0c0c;
  unsigned SecondNoCs = Second & ~0x0c0c0c0c;

  assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
  assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
  assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
  assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));

  return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
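// Merging two byte-select masks: a destination byte stays the zero selector
// (0x0c) only if both masks mark it zero, while each real byte selector
// survives from whichever mask provides it. The asserts check that no byte
// position is claimed by a real selector in both masks at once.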
for (int BPI = 0; BPI < 2; BPI++) {
    BPP = {Src1, Src0};
  unsigned ZeroMask = 0x0c0c0c0c;
  unsigned FMask = 0xFF << (8 * (3 - Step));
  unsigned FirstMask =
      (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
  unsigned SecondMask =
      (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
  int FirstGroup = -1;
  for (int I = 0; I < 2; I++) {
    auto MatchesFirst = [&BPP](DotSrc &IterElt) {
      return IterElt.SrcOp == *BPP.first.Src &&
             (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
  if (FirstGroup != -1) {
    auto MatchesSecond = [&BPP](DotSrc &IterElt) {
      return IterElt.SrcOp == *BPP.second.Src &&
             (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
    Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});

unsigned ZeroMask = 0x0c0c0c0c;
unsigned FMask = 0xFF << (8 * (3 - Step));
    ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
    ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
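// placeSources() groups the byte providers of both dot-product operands into
// DotSrc entries keyed by source value and dword offset, accumulating the
// per-byte permute masks that later become v_perm_b32 selects.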
if (Srcs.size() == 1) {
  auto Elt = Srcs.begin();
  if (Elt->PermMask == 0x3020100)

auto FirstElt = Srcs.begin();
auto SecondElt = std::next(FirstElt);
  auto FirstMask = FirstElt->PermMask;
  auto SecondMask = SecondElt->PermMask;
  unsigned FirstCs = FirstMask & 0x0c0c0c0c;
  unsigned FirstPlusFour = FirstMask | 0x04040404;
  FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
  FirstElt = std::next(SecondElt);
  if (FirstElt == Srcs.end())
  SecondElt = std::next(FirstElt);
  if (SecondElt == Srcs.end()) {
      DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
return Perms.size() == 2

for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
  EntryMask = EntryMask >> ((4 - ChainLength) * 8);
  auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
  EntryMask += ZeroMask;

auto Opcode = Op.getOpcode();
static std::optional<bool>
  bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
  bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
  assert(!(S0IsUnsigned && S0IsSigned));
  assert(!(S1IsUnsigned && S1IsSigned));
  if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
  if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
    return std::nullopt;
  if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
      (S1IsSigned && !(S0IsSigned || S0IsUnsigned)))
  if (!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned))
  if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
      (S1IsUnsigned && !(S0IsSigned || S0IsUnsigned)))
    return std::nullopt;
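// checkDot4MulSignedness returns true for a signed dot product, false for an
// unsigned one, and std::nullopt when the known bits of the two multiply
// operands disagree, which aborts the dot4 combine.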
                                          DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  if (SDValue Folded = tryFoldToMad64_32(N, DCI))
  if (SDValue V = reassociateScalarOps(N, DAG)) {
  std::optional<bool> IsSigned;
  int ChainLength = 0;
  for (int I = 0; I < 4; I++) {
    auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
    auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
    auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
        TempNode->getOperand(MulIdx), *Src0, *Src1,
        TempNode->getOperand(MulIdx)->getOperand(0),
        TempNode->getOperand(MulIdx)->getOperand(1), DAG);
      IsSigned = *IterIsSigned;
    if (*IterIsSigned != *IsSigned)
    auto AddIdx = 1 - MulIdx;
    if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
      Src2s.push_back(TempNode->getOperand(AddIdx));
          TempNode->getOperand(AddIdx), *Src0, *Src1,
          TempNode->getOperand(AddIdx)->getOperand(0),
          TempNode->getOperand(AddIdx)->getOperand(1), DAG);
      if (*IterIsSigned != *IsSigned)
      ChainLength = I + 2;
    TempNode = TempNode->getOperand(AddIdx);
    ChainLength = I + 1;
    if (TempNode->getNumOperands() < 2)
    LHS = TempNode->getOperand(0);
    RHS = TempNode->getOperand(1);
  if (ChainLength < 2)
  if (ChainLength < 4) {
  bool UseOriginalSrc = false;
  if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
      Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
      Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
      Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
    auto Src0Mask = Src0s.begin()->PermMask;
    SrcBytes.push_back(Src0Mask & 0xFF000000);
    bool UniqueEntries = true;
    for (auto I = 1; I < 4; I++) {
      auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
        UniqueEntries = false;
    if (UniqueEntries) {
      UseOriginalSrc = true;
      auto FirstElt = Src0s.begin();
      auto SecondElt = Src1s.begin();
                         SecondElt->DWordOffset);
  if (!UseOriginalSrc) {
      DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
                          : Intrinsic::amdgcn_udot4,

  if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
  unsigned Opc = LHS.getOpcode();
    Opc = RHS.getOpcode();
  auto Cond = RHS.getOperand(0);
  return DAG.getNode(Opc, SL, VTList, Args);
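// The chain walk above follows at most four mul/add links off the root add,
// recording byte sources and checking signedness agreement at each step; a
// fully matched chain is emitted as a single amdgcn_sdot4 or amdgcn_udot4
// intrinsic, reusing the original 32-bit sources when their permute masks
// already line up.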
                                          DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  if (VT != MVT::i32)
  unsigned Opc = RHS.getOpcode();
  auto Cond = RHS.getOperand(0);
  return DAG.getNode(Opc, SL, VTList, Args);

SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
                                                         DAGCombinerInfo &DCI) const {
  if (N->getValueType(0) != MVT::i32)
  unsigned LHSOpc = LHS.getOpcode();
  unsigned Opc = N->getOpcode();
                                           DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  if (A == LHS.getOperand(1)) {
    unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
    if (FusedOp != 0) {
      return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
  if (A == RHS.getOperand(1)) {
    unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
    if (FusedOp != 0) {
      return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
                                           DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  if (A == LHS.getOperand(1)) {
    unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
      return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
  if (A == RHS.getOperand(1)) {
    unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
      return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
                                           DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  bool IsNegative = false;
  if (CLHS->isExactlyValue(1.0) ||
      (IsNegative = CLHS->isExactlyValue(-1.0))) {

                                          DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
      (N->getFlags().hasAllowContract() &&
       FMA->getFlags().hasAllowContract())) {
  if (Vec1 == Vec2 || Vec3 == Vec4)
  if ((Vec1 == Vec3 && Vec2 == Vec4) ||
      (Vec1 == Vec4 && Vec2 == Vec3)) {
                                            DAGCombinerInfo &DCI) const {
  EVT VT = LHS.getValueType();
  auto CRHS = dyn_cast<ConstantSDNode>(RHS);
    CRHS = dyn_cast<ConstantSDNode>(LHS);
      return LHS.getOperand(0);
      isa<ConstantSDNode>(LHS.getOperand(1)) &&
      isa<ConstantSDNode>(LHS.getOperand(2)) &&
      LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
    const APInt &CT = LHS.getConstantOperandAPInt(1);
    const APInt &CF = LHS.getConstantOperandAPInt(2);
      return LHS.getOperand(0);
  if (VT != MVT::f32 && VT != MVT::f64 &&
                                                    DAGCombinerInfo &DCI) const {
  if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
    unsigned ShiftOffset = 8 * Offset;
      ShiftOffset -= C->getZExtValue();
      ShiftOffset += C->getZExtValue();
    if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
                         MVT::f32, Shifted);
    DCI.AddToWorklist(N);
    return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);

                                            DAGCombinerInfo &DCI) const {
    return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
  APFloat One(F.getSemantics(), "1.0");
    return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
switch (N->getOpcode()) {
  return performAddCombine(N, DCI);
  return performSubCombine(N, DCI);
  return performAddCarrySubCarryCombine(N, DCI);
  return performFAddCombine(N, DCI);
  return performFSubCombine(N, DCI);
  return performFDivCombine(N, DCI);
  return performSetCCCombine(N, DCI);
  return performMinMaxCombine(N, DCI);
  return performFMACombine(N, DCI);
  return performAndCombine(N, DCI);
  return performOrCombine(N, DCI);
  if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
      TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
  return performXorCombine(N, DCI);
  return performZeroExtendCombine(N, DCI);
  return performSignExtendInRegCombine(N, DCI);
  return performClassCombine(N, DCI);
  return performFCanonicalizeCombine(N, DCI);
  return performRcpCombine(N, DCI);
  return performUCharToFloatCombine(N, DCI);
  return performFCopySignCombine(N, DCI);
  return performCvtF32UByteNCombine(N, DCI);
  return performFMed3Combine(N, DCI);
  return performCvtPkRTZCombine(N, DCI);
  return performClampCombine(N, DCI);
  EVT VT = N->getValueType(0);
  if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
    EVT EltVT = Src.getValueType();
    if (EltVT != MVT::i16)
  return performExtractVectorEltCombine(N, DCI);
  return performInsertVectorEltCombine(N, DCI);
  return performFPRoundCombine(N, DCI);
  if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
  if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
    return performMemSDNodeCombine(MemNode, DCI);
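// Opcodes without a case above fall through to the generic combines in
// AMDGPUTargetLowering::PerformDAGCombine.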
default:
  return ~0u;
case AMDGPU::sub0:
  return 0;
case AMDGPU::sub1:
  return 1;
case AMDGPU::sub2:
  return 2;
case AMDGPU::sub3:
  return 3;
case AMDGPU::sub4:
  return 4;
unsigned Opcode = Node->getMachineOpcode();
if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
unsigned NewDmask = 0;
bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
                (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
unsigned TFCLane = 0;
bool HasChain = Node->getNumValues() > 1;
if (OldDmask == 0) {
  TFCLane = OldBitsSet;
if (I.getUse().getResNo() != 0)
if (!I->isMachineOpcode() ||
    I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
if (UsesTFC && Lane == TFCLane) {
for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
  Dmask &= ~(1 << Comp);
NewDmask |= 1 << Comp;
bool NoChannels = !NewDmask;
if (OldBitsSet == 1)
if (NewDmask == OldDmask)
unsigned NewChannels = BitsSet + UsesTFC;
assert(NewOpcode != -1 &&
       NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
       "failed to find equivalent MIMG op");
MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
MVT ResultVT = NewChannels == 1 ?
               NewChannels == 5 ? 8 : NewChannels);
if (NewChannels == 1) {
for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
  if (i || !NoChannels)
  if (NewUser != User) {
  case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
  case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
  case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
  case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
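// adjustWritemask() recomputes the MIMG dmask from the result lanes actually
// extracted via EXTRACT_SUBREG, switches to the equivalent MIMG opcode with
// fewer channels, and renumbers each user's subregister index; when TFE/LWE
// is in use one extra lane is reserved for the status dword.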
Op = Op.getOperand(0);
return isa<FrameIndexSDNode>(Op);

RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
SDValue SrcVal = Node->getOperand(2);
    MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
SDNode *Glued = Node->getGluedNode();
    = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
return ToResultReg.getNode();

for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
                  Node->getOperand(i).getValueType(),
                  Node->getOperand(i)), 0));
unsigned Opcode = Node->getMachineOpcode();
if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
    !TII->isGather4(Opcode) &&
  return adjustWritemask(Node, DAG);
if (Opcode == AMDGPU::INSERT_SUBREG ||
    Opcode == AMDGPU::REG_SEQUENCE) {
case AMDGPU::V_DIV_SCALE_F32_e64:
case AMDGPU::V_DIV_SCALE_F64_e64: {
  SDValue Src0 = Node->getOperand(1);
  SDValue Src1 = Node->getOperand(3);
  SDValue Src2 = Node->getOperand(5);
      (Src0 == Src1 || Src0 == Src2))
unsigned InitIdx = 0;
if (TII->isImage(MI)) {
  unsigned TFEVal = TFE ? TFE->getImm() : 0;
  unsigned LWEVal = LWE ? LWE->getImm() : 0;
  unsigned D16Val = D16 ? D16->getImm() : 0;
  if (!TFEVal && !LWEVal)
  assert(MO_Dmask && "Expected dmask operand in instruction");
  unsigned dmask = MO_Dmask->getImm();
  InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
      TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
  if (DstSize < InitIdx)
  InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
unsigned NewDst = 0;
for (; SizeLeft; SizeLeft--, CurrIdx++) {
  NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
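// InitIdx above sizes the up-front initialization of the destination register
// when TFE or LWE is enabled: one extra dword beyond the active dmask lanes
// (halved for packed D16 results), clamped against the destination register
// class size before the new vreg chain is tied back into MI.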
if (TII->isVOP3(MI.getOpcode())) {
  TII->legalizeOperandsVOP3(MRI, MI);
if (!MI.getDesc().operands().empty()) {
  unsigned Opc = MI.getOpcode();
  bool HasAGPRs = Info->mayNeedAGPRs();
    if ((I == Src2Idx) && (HasAGPRs))
    if (!Op.isReg() || !Op.getReg().isVirtual())
    auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
    if (!TRI->hasAGPRs(RC))
    auto *Src = MRI.getUniqueVRegDef(Op.getReg());
    if (!Src || !Src->isCopy() ||
        !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
    auto *NewRC = TRI->getEquivalentVGPRClass(RC);
    MRI.setRegClass(Op.getReg(), NewRC);
  if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
    if (Src2->isReg() && Src2->getReg().isVirtual()) {
      auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
      if (TRI->isVectorSuperClass(RC)) {
        auto *NewRC = TRI->getEquivalentAGPRClass(RC);
        MRI.setRegClass(Src2->getReg(), NewRC);
        if (Src2->isTied())
          MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
if (TII->isImage(MI))
  TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
                 MVT::v2i32, Ops0), 0);
                 RsrcDword2And3 & UINT64_C(0xFFFFFFFF));

std::pair<unsigned, const TargetRegisterClass *>
if (Constraint.size() == 1) {
  switch (Constraint[0]) {
    RC = &AMDGPU::SReg_32RegClass;
    RC = &AMDGPU::SGPR_64RegClass;
      return std::pair(0U, nullptr);
    RC = &AMDGPU::VGPR_32RegClass;
      return std::pair(0U, nullptr);
    RC = &AMDGPU::AGPR_32RegClass;
      return std::pair(0U, nullptr);
  return std::pair(0U, RC);
if (RegName.consume_front("v")) {
  RC = &AMDGPU::VGPR_32RegClass;
} else if (RegName.consume_front("s")) {
  RC = &AMDGPU::SGPR_32RegClass;
} else if (RegName.consume_front("a")) {
  RC = &AMDGPU::AGPR_32RegClass;
if (RegName.consume_front("[")) {
  RC = TRI->getVGPRClassForBitWidth(Width);
  RC = TRI->getSGPRClassForBitWidth(Width);
  RC = TRI->getAGPRClassForBitWidth(Width);
  Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
  return std::pair(Reg, RC);
if (!Failed && Idx < RC->getNumRegs())
Ret.second = TRI->getPhysRegBaseClass(Ret.first);
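// The parser above handles inline-asm register names such as "v0", "s[8:9]"
// or "a[0:3]": the leading letter selects the VGPR/SGPR/AGPR bank, an
// optional [lo:hi] range selects a multi-register tuple, and the register
// class is chosen from the requested bit width.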
if (Constraint.size() == 1) {
  switch (Constraint[0]) {
} else if (Constraint == "DA" ||
           Constraint == "DB") {

if (Constraint.size() == 1) {
  switch (Constraint[0]) {

Val = Val & maskTrailingOnes<uint64_t>(Size);
                 std::vector<SDValue> &Ops,
unsigned Size = Op.getScalarValueSizeInBits();
  Val = C->getSExtValue();
  Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
  Val = C->getSExtValue();
  Val = C->getValueAPF().bitcastToAPInt().getSExtValue();

if (Constraint.size() == 1) {
  switch (Constraint[0]) {
    return isInt<16>(Val);
    return isInt<32>(Val);
} else if (Constraint.size() == 2) {
  if (Constraint == "DA") {
    int64_t HiBits = static_cast<int32_t>(Val >> 32);
    int64_t LoBits = static_cast<int32_t>(Val);
  if (Constraint == "DB") {
                 unsigned MaxSize) const {
unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
MVT VT = Op.getSimpleValueType();
switch (UnalignedClassID) {
case AMDGPU::VReg_64RegClassID:
  return AMDGPU::VReg_64_Align2RegClassID;
case AMDGPU::VReg_96RegClassID:
  return AMDGPU::VReg_96_Align2RegClassID;
case AMDGPU::VReg_128RegClassID:
  return AMDGPU::VReg_128_Align2RegClassID;
case AMDGPU::VReg_160RegClassID:
  return AMDGPU::VReg_160_Align2RegClassID;
case AMDGPU::VReg_192RegClassID:
  return AMDGPU::VReg_192_Align2RegClassID;
case AMDGPU::VReg_224RegClassID:
  return AMDGPU::VReg_224_Align2RegClassID;
case AMDGPU::VReg_256RegClassID:
  return AMDGPU::VReg_256_Align2RegClassID;
case AMDGPU::VReg_288RegClassID:
  return AMDGPU::VReg_288_Align2RegClassID;
case AMDGPU::VReg_320RegClassID:
  return AMDGPU::VReg_320_Align2RegClassID;
case AMDGPU::VReg_352RegClassID:
  return AMDGPU::VReg_352_Align2RegClassID;
case AMDGPU::VReg_384RegClassID:
  return AMDGPU::VReg_384_Align2RegClassID;
case AMDGPU::VReg_512RegClassID:
  return AMDGPU::VReg_512_Align2RegClassID;
case AMDGPU::VReg_1024RegClassID:
  return AMDGPU::VReg_1024_Align2RegClassID;
case AMDGPU::AReg_64RegClassID:
  return AMDGPU::AReg_64_Align2RegClassID;
case AMDGPU::AReg_96RegClassID:
  return AMDGPU::AReg_96_Align2RegClassID;
case AMDGPU::AReg_128RegClassID:
  return AMDGPU::AReg_128_Align2RegClassID;
case AMDGPU::AReg_160RegClassID:
  return AMDGPU::AReg_160_Align2RegClassID;
case AMDGPU::AReg_192RegClassID:
  return AMDGPU::AReg_192_Align2RegClassID;
case AMDGPU::AReg_256RegClassID:
  return AMDGPU::AReg_256_Align2RegClassID;
case AMDGPU::AReg_512RegClassID:
  return AMDGPU::AReg_512_Align2RegClassID;
case AMDGPU::AReg_1024RegClassID:
  return AMDGPU::AReg_1024_Align2RegClassID;
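// getAlignedAGPRClassID maps each unaligned VGPR/AGPR tuple class to its
// _Align2 (even-aligned) variant, returning -1 for classes that need no
// change; finalizeLowering() below consults it when the subtarget requires
// aligned VGPR tuples.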
if (Info->isEntryFunction()) {
unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
    ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
    : TRI->getAlignedHighSGPRForRC(MF, 2, &AMDGPU::SGPR_64RegClass);
Info->setSGPRForEXECCopy(SReg);
                 Info->getStackPtrOffsetReg()));
if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
  MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
  MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
  MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
Info->limitOccupancy(MF);
if (ST.isWave32() && !MF.empty()) {
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      TII->fixImplicitOperands(MI);
if (ST.needsAlignedVGPRs()) {
  for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
    if (NewClassID != -1)
      MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
                 const APInt &DemandedElts,
                 unsigned Depth) const {
unsigned Opc = Op.getOpcode();
  unsigned IID = Op.getConstantOperandVal(0);
  case Intrinsic::amdgcn_mbcnt_lo:
  case Intrinsic::amdgcn_mbcnt_hi: {
    unsigned MaxActiveBits = std::max(Src1ValBits, ST.getWavefrontSizeLog2());
    MaxActiveBits += Src1ValBits ? 1 : 0;
    unsigned Size = Op.getValueType().getSizeInBits();
    if (MaxActiveBits < Size)
    Op, Known, DemandedElts, DAG, Depth);
unsigned MaxValue =
switch (MI->getOpcode()) {
case AMDGPU::G_INTRINSIC:
case AMDGPU::G_INTRINSIC_CONVERGENT: {
  case Intrinsic::amdgcn_workitem_id_x:
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::amdgcn_mbcnt_lo:
  case Intrinsic::amdgcn_mbcnt_hi: {
    unsigned Size = MRI.getType(R).getSizeInBits();
  case Intrinsic::amdgcn_groupstaticsize: {
case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
case AMDGPU::G_AMDGPU_SMED3:
case AMDGPU::G_AMDGPU_UMED3: {
  auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
                 unsigned Depth) const {
if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
  if (MaybeAlign RetAlign = Attrs.getRetAlignment())

if (Header->getAlignment() != PrefAlign)
  return Header->getAlignment();
unsigned LoopSize = 0;
  LoopSize += TII->getInstSizeInBytes(MI);
  if (LoopSize > 192)
if (LoopSize <= 64)
if (LoopSize <= 128)
  return CacheLineAlign;
  auto I = Exit->getFirstNonDebugInstr();
  if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
    return CacheLineAlign;
if (PreTerm == Pre->begin() ||
    std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
auto ExitHead = Exit->getFirstNonDebugInstr();
if (ExitHead == Exit->end() ||
    ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
return CacheLineAlign;
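// The alignment heuristic above: loops larger than 192 bytes get no special
// treatment, loops of at most 64 bytes keep the preferred alignment, and
// mid-sized loops are aligned to the cache line, with S_INST_PREFETCH
// instructions placed (and deduplicated) around the loop.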
N = N->getOperand(0).getNode();
switch (N->getOpcode()) {
  if (Reg.isPhysical() || MRI.isLiveIn(Reg))
    return !TRI->isSGPRReg(MRI, Reg);
  return !TRI->isSGPRReg(MRI, Reg);
  unsigned AS = L->getAddressSpace();
if (auto *A = dyn_cast<AtomicSDNode>(N)) {
  return A->readMem() && A->writeMem();

                 unsigned Depth) const {
if (Info->getMode().DX10Clamp)
static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
  auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);

return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=

   << "Hardware instruction generated for atomic "
   << " operation at memory scope " << MemScope;

if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
  Type *EltTy = VT->getElementType();
  return VT->getNumElements() == 2 &&

bool HasSystemScope =
if (HasSystemScope)
if (HasSystemScope)
if (HasSystemScope)
if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
                   : &AMDGPU::SReg_32RegClass;
if (!TRI->isSGPRClass(RC) && !isDivergent)
  return TRI->getEquivalentSGPRClass(RC);
if (TRI->isSGPRClass(RC) && isDivergent)
  return TRI->getEquivalentVGPRClass(RC);
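// In short: divergent values must live in VGPRs and uniform values in SGPRs,
// so the class is swapped to the equivalent VGPR or SGPR class whenever the
// divergence of the value disagrees with the register class.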
                      unsigned WaveSize) {
if (!IT || IT->getBitWidth() != WaveSize)
if (!isa<Instruction>(V))
if (!Visited.insert(V).second)
bool Result = false;
for (const auto *U : V->users()) {
  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
    if (V == U->getOperand(1)) {
      switch (Intrinsic->getIntrinsicID()) {
      case Intrinsic::amdgcn_if_break:
      case Intrinsic::amdgcn_if:
      case Intrinsic::amdgcn_else:
    if (V == U->getOperand(0)) {
      switch (Intrinsic->getIntrinsicID()) {
      case Intrinsic::amdgcn_end_cf:
      case Intrinsic::amdgcn_loop:
  Result = hasCFUser(U, Visited, WaveSize);
                      const Value *V) const {
if (const CallInst *CI = dyn_cast<CallInst>(V)) {
  if (CI->isInlineAsm()) {
    for (auto &TC : TargetConstraints) {
          SIRI, TC.ConstraintCode, TC.ConstraintVT).second;

for (; I != E; ++I) {
  if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) {

return MRI.hasOneNonDBGUse(N0);

if (I.getMetadata("amdgpu.noclobber"))
if (I.getMetadata("amdgpu.last.use"))

if (!Def->isMachineOpcode())
if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
  PhysReg = AMDGPU::SCC;
      TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));

       "this cannot be replaced with add");
       "target should have atomic fadd instructions");
16504 "generic atomicrmw expansion only supports FP32 operand in flat "
16578 for (
auto &
P : MDs)
16589 {
Addr},
nullptr,
"is.shared");
16590 Builder.
CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
16595 Value *LoadedShared = CreateNewAtomicRMW(Builder, CastToLocal, Val);
16600 Intrinsic::amdgcn_is_private, {}, {
Addr},
nullptr,
"is.private");
16606 Value *LoadedPrivate =
16607 Builder.
CreateLoad(ValTy, CastToPrivate,
"loaded.private");
16615 Value *LoadedGlobal = CreateNewAtomicRMW(Builder, CastToGlobal, Val);
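// The expansion above rewrites a flat FP atomicrmw as a runtime dispatch:
// amdgcn.is.shared and amdgcn.is.private test the pointer, the LDS path uses
// a local atomic, the private path uses a plain load/store, and the remaining
// case falls through to the global-memory atomic (LoadedGlobal).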
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
#define LLVM_ATTRIBUTE_UNUSED
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
iv Induction Variable Users
static const unsigned MaxDepth
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
unsigned const TargetRegisterInfo * TRI
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
const char LLVMTargetMachineRef TM
const SmallVectorImpl< MachineOperand > & Cond
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isHalf2OrBFloat2(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static bool isHalf2(Type *Ty)
bool unsafeFPAtomicsDisabled(Function *F)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool isBFloat2(Type *Ty)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
static constexpr int Concat[]
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
bool hasMadMacF32Insts() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
This class represents an incoming formal argument to a Function.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Min
*p = old <signed v ? old : v
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
LLVM Basic Block Representation.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
bool isFPPredicate() const
bool isIntPredicate() const
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowerinInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
iterator_range< arg_iterator > args()
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
bool hasD16Images() const
bool hasAtomicDsPkAdd16Insts() const
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
bool hasAtomicFMinFMaxF64FlatInsts() const
bool hasDot7Insts() const
bool hasApertureRegs() const
bool hasFlatInstOffsets() const
bool hasAtomicFMinFMaxF32FlatInsts() const
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasBCNT(unsigned Size) const
bool hasMultiDwordFlatScratchAddressing() const
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
bool hasPrivEnabledTrap2NopBug() const
bool hasUnalignedDSAccessEnabled() const
const SIInstrInfo * getInstrInfo() const override
bool hasDot1Insts() const
bool hasAtomicFaddRtnInsts() const
Align getStackAlignment() const
bool hasScalarSubwordLoads() const
bool enableFlatScratch() const
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
bool supportsGetDoorbellID() const
bool hasFlatAtomicFaddF32Inst() const
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
const SIFrameLowering * getFrameLowering() const override
bool hasUnalignedScratchAccess() const
bool hasAtomicFMinFMaxF32GlobalInsts() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
bool hasGFX940Insts() const
bool hasFullRate64Ops() const
bool isTrapHandlerEnabled() const
bool hasLDSFPAtomicAddF64() const
bool hasFlatGlobalInsts() const
bool getScalarizeGlobalBehavior() const
bool hasScalarSMulU64() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
bool hasNSAEncoding() const
bool hasSMemRealTime() const
bool usePRTStrictNull() const
bool hasAtomicFMinFMaxF64GlobalInsts() const
bool hasAtomicFlatPkAdd16Insts() const
bool hasUnalignedBufferAccessEnabled() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasImageGather4D16Bug() const
bool supportsMinMaxDenormModes() const
bool hasAtomicFaddInsts() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const
bool hasAtomicBufferPkAddBF16Inst() const
bool hasAtomicFaddNoRtnInsts() const
bool hasFlatBufferGlobalAtomicFaddF64Inst() const
bool hasScalarDwordx3Loads() const
bool hasLDSFPAtomicAddF32() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
bool hasDot8Insts() const
bool hasDS96AndDS128() const
bool useFlatForGlobal() const
Generation getGeneration() const
bool hasAtomicBufferGlobalPkAddF16Insts() const
bool hasScalarAddSub64() const
bool hasIEEEMinMax3() const
bool hasUnpackedD16VMem() const
bool hasAtomicGlobalPkAddBF16Inst() const
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
bool hasPackedTID() const
bool hasAddNoCarry() const
bool hasGWSAutoReplay() const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
int64_t getOffset() const
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Value * CreateFAdd(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
BasicBlock::iterator GetInsertPoint() const
BasicBlock * GetInsertBlock() const
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
LLVMContext & getContext() const
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
void getAllMetadata(SmallVectorImpl< std::pair< unsigned, MDNode * > > &MDs) const
Get all metadata attached to this Instruction.
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
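A short sketch of the metadata APIs above, as used when one instruction replaces another; transferMD is a hypothetical name:

  #include "llvm/IR/Instruction.h"
  using namespace llvm;

  static void transferMD(Instruction &Old, Instruction &New) {
    if (Old.hasMetadata())
      New.copyMetadata(Old);                         // copy every attached kind
    New.setMetadata(LLVMContext::MD_range, nullptr); // passing null clears a kind
    Old.replaceAllUsesWith(&New);
    Old.eraseFromParent();
  }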
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
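A minimal sketch of the LLT helpers above (the header path varies across LLVM versions):

  #include "llvm/CodeGen/LowLevelType.h"
  #include <cassert>
  using namespace llvm;

  static void lltExamples() {
    LLT S32 = LLT::scalar(32);           // a 32-bit "bag of bits"
    LLT P1 = LLT::pointer(1, 64);        // 64-bit pointer in addrspace(1)
    assert(S32.isScalar() && S32.getScalarSizeInBits() == 32);
    LLT S64 = S32.changeElementSize(64); // scalar widened to 64 bits
    (void)P1; (void)S64;
  }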
This is an important class for using LLVM in a threaded context.
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
void getSyncScopeNames(SmallVectorImpl< StringRef > &SSNs) const
getSyncScopeNames - Populates client supplied SmallVector with synchronization scope names registered with LLVMContext.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the vector's number of elements is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
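The MVT queries above, applied to a v4i32 (a minimal sketch; the header path varies across LLVM versions):

  #include "llvm/CodeGen/MachineValueType.h"
  #include <cassert>
  using namespace llvm;

  static void mvtExamples() {
    MVT V4I32 = MVT::getVectorVT(MVT::i32, 4);
    assert(V4I32.isVector() && !V4I32.isScalableVector());
    assert(V4I32.getVectorNumElements() == 4);
    assert(V4I32.getScalarType() == MVT::i32);
    assert(V4I32.getSizeInBits().getFixedValue() == 128); // total bits
    assert(V4I32.getStoreSize().getFixedValue() == 16);   // bytes per store
  }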
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor blocks that refer to FromMBB to refer to this.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into two pieces at SplitInst; the instructions after SplitInst are moved to a newly created successor block.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before 'Where'.
Align getAlignment() const
Return alignment of the basic block.
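A sketch of the block surgery these entries enable, in the shape a custom inserter uses; the function name and CFG wiring are illustrative:

  // splitAt leaves MI in BB and moves what follows into the returned block.
  static MachineBasicBlock *splitAndWire(MachineInstr &MI,
                                         MachineBasicBlock *BB,
                                         MachineBasicBlock *LoopBB) {
    MachineBasicBlock *RemainderBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
    BB->addSuccessor(LoopBB);          // edge for the branch to be emitted
    LoopBB->addSuccessor(RemainderBB); // loop exit falls into the remainder
    return RemainderBB;
  }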
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do so.
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual register for it.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
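A sketch of the MachineFunction entries above in their usual order: create a block, place it, and register a live-in (MF, BB, and PReg are assumed in scope; the register class is a placeholder):

  MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
  MF.insert(std::next(BB->getIterator()), NewBB);
  Register VReg = MF.addLiveIn(PReg, &AMDGPU::SReg_32RegClass);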
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
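These MachineInstrBuilder calls chain off BuildMI; a typical fragment, assuming BB, I, DL, TII, DstReg, and TargetMBB are in scope (the opcodes are real AMDGPU ones, chosen only for illustration):

  BuildMI(*BB, I, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
      .addImm(0);               // materialize a constant
  BuildMI(*BB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
      .addMBB(TargetMBB);       // conditional branch target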
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
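A sketch of allocating a MachineMemOperand with the flags above, for a 4-byte invariant, dereferenceable load (MF is an in-scope MachineFunction; the empty MachinePointerInfo is illustrative):

  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LLT::scalar(32), Align(4));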
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
A Module instance is used to store all the information related to an LLVM module.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information that they claimed they required.
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation functions.
const DebugLoc & getDebugLoc() const
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
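The SDNode/SDValue accessors above are typically combined into small pattern checks; a minimal sketch (the constant bound is arbitrary):

  #include "llvm/CodeGen/SelectionDAGNodes.h"
  using namespace llvm;

  static bool isAddOfSmallConst(SDValue V) {
    if (V.getOpcode() != ISD::ADD || !V.hasOneUse())
      return false;
    SDValue RHS = V.getOperand(1);
    return isa<ConstantSDNode>(RHS.getNode()) &&
           RHS.getNode()->getAsZExtVal() < 64;
  }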
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
bool hasWorkGroupIDZ() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to use the type for the given node type.
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset, memcpy, and memmove lowering.
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defining block.
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if N can be combined with other nodes to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g. INSERT_SUBREG).
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array, into the specified DAG.
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand flags to them.
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g. {edx}), return the register number and the register class for the register.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations, those with specific masks.
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was registered to use 'custom' lowering for that result type.
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of the specified type.
bool isMemOpUniform(const SDNode *N) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context) const override
This hook should be implemented to check whether the return values described by the Outs array can fit into the return registers.
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in the KnownZero/KnownOne bitsets.
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can be turned into a fence followed by an atomic load.
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' flag.
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array, into the specified DAG.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target has registered with invoke it for.
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance, because half-precision floating-point numbers are implicitly extended to float-precision) for an FMA instruction.
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations and not for other operations.
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the stride value encoded in the descriptor and added to the base address.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to use 'custom' lowering, and whose defined values are all legal.
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp/select instructions.
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled explicitly via copies.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with this index.
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in the KnownZero/KnownOne bitsets.
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (touches memory).
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
bool getAddrModeArguments(IntrinsicInst *, SmallVectorImpl< Value * > &, Type *&) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the address.
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scalars in some contexts.
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
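A sketch of the shape one of these overrides takes in a TargetLowering subclass; MyTargetLowering and the policy shown are hypothetical, not SITargetLowering's actual logic:

  TargetLowering::AtomicExpansionKind
  MyTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
    if (RMW->getOperation() == AtomicRMWInst::FAdd &&
        RMW->getType()->isFloatTy())
      return AtomicExpansionKind::None;  // leave for instruction selection
    return AtomicExpansionKind::CmpXChg; // expand to a cmpxchg loop
  }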
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representation.
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending or truncating it.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const Pass * getPass() const
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s), MachineInstr opcode, and operands.
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SDValue.
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the scalars and operating on each element individually.
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands, and they produce a value and a token chain.
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not necessarily identical pieces.
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from a potential vector) to the corresponding scalar type, then any-extending or truncating it.
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check for the vector case.
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
SDValue getRegister(unsigned Reg, EVT VT)
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or truncating it.
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
const TargetMachine & getTarget() const
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncating it.
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an SDValue.
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side, or an ISD::OR with a ConstantSDNode that is guaranteed to have the same semantics as an ADD.
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or truncating it.
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
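A minimal sketch of chaining the SelectionDAG builders above inside a lowering hook; all parameters are supplied by the caller:

  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;

  static SDValue loadAndIncrement(SelectionDAG &DAG, const SDLoc &DL,
                                  SDValue Chain, SDValue Ptr,
                                  MachinePointerInfo PtrInfo) {
    SDValue Ld = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo);
    SDValue One = DAG.getConstant(1, DL, MVT::i32);
    SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i32, Ld, One);
    // Return both the value and the new chain, as lowering hooks usually do.
    return DAG.getMergeValues({Add, Ld.getValue(1)}, DL);
  }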
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
iterator insert(iterator I, T &&Elt)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs with the length computed at compile time.
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
constexpr size_t size() const
size - Get the string size.
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
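StringSwitch is what constraint classification typically uses; a sketch with illustrative mappings, not the target's actual ones:

  #include "llvm/ADT/StringSwitch.h"
  #include "llvm/CodeGen/TargetLowering.h"
  using namespace llvm;

  static TargetLowering::ConstraintType classify(StringRef Constraint) {
    return StringSwitch<TargetLowering::ConstraintType>(Constraint)
        .Case("s", TargetLowering::C_RegisterClass)
        .Case("v", TargetLowering::C_RegisterClass)
        .Case("I", TargetLowering::C_Immediate)
        .Default(TargetLowering::C_Unknown);
  }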
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligned on entry to a function.
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider type.
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do about it.
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations and not for other operations.
LegalizeTypeAction
This enum indicates whether a type is legal for a target, and if not, what action should be used to make it valid.
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save and restore.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a wider type.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layout.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what to do about it.
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lowering.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save and restore.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/vector type.
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom DAG combiner for, by implementing the PerformDAGCombine virtual method.
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unaligned memory access.
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the data layout.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
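Inside the constructor of a TargetLowering subclass these calls typically appear in this order (the register class, operations, and the Subtarget variable are placeholders):

  addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
  setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // split into fsin/fcos
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);    // no direct i64->i16 store
  setBooleanContents(ZeroOrOneBooleanContent);
  setSchedulingPreference(Sched::RegPressure);
  computeRegisterProperties(Subtarget->getRegisterInfo()); // after all classes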
This class defines information used to lower LLVM code to legal SelectionDAG operators that the target instruction selector can accept natively.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contribute to the DemandedBits/DemandedElts.
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the caller.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to use the type for the given node type.
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op: at this point only the DemandedBits bits of its result are known to be used downstream; if that information can be used to simplify Op, do so and return true.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' flag.
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their prefixes, and also tie in the associated operand values.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo, setting OpInfo.ConstraintCode and OpInfo.ConstraintType.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command line.
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the command line.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDesc objects that represent all of the machine registers that the target has.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
A Use represents the edge between a Value definition and its users.
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< use_iterator > uses()
void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
constexpr bool isZero() const
const ParentTy * getParent() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
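A sketch of dispatching on the AMDGPUAS values above, as lowering code does when picking an expansion for a memory operation (the helper name is illustrative):

  static bool isDSAddressSpace(unsigned AS) {
    return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
  }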
@ BUFFER_ATOMIC_COND_SUB_U32
@ TBUFFER_LOAD_FORMAT_D16
@ TBUFFER_STORE_FORMAT_D16
@ BUFFER_STORE_FORMAT_D16
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
@ TowardZeroF32_TowardNegativeF64
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
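The isInlinableLiteral* helpers above gate whether an immediate can be encoded inline in an instruction; a sketch, assuming an in-scope GCNSubtarget:

  static bool canInline32(int32_t Imm, const GCNSubtarget &ST) {
    return AMDGPU::isInlinableLiteral32(Imm, ST.hasInv2PiInlineImm());
  }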
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
@ C
The default llvm calling convention, compatible with C.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual results.
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2*N], and return the full value as two results, each of type iN.
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store instruction, then an offset node.
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic function with side effects that does not return a value.
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter) to floating point.
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length and element type, this produces a concatenated vector result value.
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-precision floating point.
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to memory with one type and loaded from the same address with the other type.
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SET_ROUNDING
Set rounding mode.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the resultant vector type; the top elements 1 to N-1 are undefined.
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readfixedcounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width (W).
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant, which is required to be operand #1) half of the integer or float value specified as operand #0.
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defined outside of the scope of this SelectionDAG.
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined; 0 Round to 0; 1 Round to nearest, ties to even; 2 Round to +inf; 3 Round to -inf; 4 Round to nearest, ties to max magnitude; other values are target-dependent.
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of type iN containing the high bits of the result.
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially variable) element number IDX.
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value, and a value.
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) based on the boolean result of comparing ops #0 and #1 under the condition code in operand #4.
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo, ValHi, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmpLo, cmpHi, swapLo, swapHi).
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
@ SMULO
Same for multiplication.
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in a large integer register.
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the destination VT.
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0.0.
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic function with no side effects.
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN, ptr, amt).
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the destination VT.
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero or sign extended from a narrower type.
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W).
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target intrinsic function with side effects that returns a result.
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified, possibly variable, elements.
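The ISD opcodes listed above are mostly consumed by pattern-matching code in lowering and DAG combines. The following is an illustrative sketch (not code from this file) of that style, assuming LLVM's SelectionDAG headers; it tests whether a value is a BUILD_VECTOR whose leading element is the constant zero, looking through BITCASTs first.

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

static bool isZeroLeadingBuildVector(SDValue V) {
  // Skip over no-op representation changes.
  while (V.getOpcode() == ISD::BITCAST)
    V = V.getOperand(0);
  return V.getOpcode() == ISD::BUILD_VECTOR &&
         isNullConstant(V.getOperand(0)); // element 0 == 0
}

Note that peekThroughBitcasts, listed later in this index, performs the same bitcast-skipping loop.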
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
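For example, since (x < y) is the same predicate as (y > x), swapping operands maps SETLT to SETGT. A one-line sketch, assuming a recent LLVM where the helper is declared alongside ISD::CondCode:

#include "llvm/CodeGen/ISDOpcodes.h"

llvm::ISD::CondCode Swapped =
    llvm::ISD::getSetCCSwappedOperands(llvm::ISD::SETLT); // yields ISD::SETGT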
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
StringRef getName(ID id)
Return the LLVM name for an intrinsic, such as "llvm.ppc.altivec.lvx".
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
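For instance, looking up the name of a non-overloaded intrinsic (the AMDGPU barrier intrinsic is used here purely as an example ID):

#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

llvm::StringRef Name = llvm::Intrinsic::getName(llvm::Intrinsic::amdgcn_s_barrier);
// Name == "llvm.amdgcn.s.barrier"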
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ System
Synchronized with respect to all concurrently executing threads.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition code.
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions starting from FirstMI to LastMI (exclusive).
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
int popcount(T Value) noexcept
Count the number of set bits in a value.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
int countr_zero(T Val)
Count the number of 0s from the least significant bit to the most significant, stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64-bit version).
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the start of the kernel to the load.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
int countl_zero(T Val)
Count the number of 0s from the most significant bit to the least significant, stopping at the first 1.
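The bit-counting helpers above (popcount, bit_width, countr_zero, countl_zero) follow the std::bit naming scheme; a short sketch of their behavior:

#include <cassert>
#include <cstdint>
#include "llvm/ADT/bit.h"

static void bitOpsDemo() {
  assert(llvm::popcount(0xF0u) == 4);            // four set bits
  assert(llvm::countr_zero(0x8u) == 3);          // zeros below the lowest 1
  assert(llvm::countl_zero(uint32_t(1)) == 31);  // zeros above the highest 1
  assert(llvm::bit_width(5u) == 3);              // 0b101 needs 3 bits
}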
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point condition code.
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
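Several of the arithmetic helpers in this index come from llvm/Support/MathExtras.h; a compact sketch of what they compute:

#include <cassert>
#include <cstdint>
#include "llvm/Support/MathExtras.h"

static void mathExtrasDemo() {
  uint64_t V = 0x123456789ABCDEF0ull;
  assert(llvm::Hi_32(V) == 0x12345678u && llvm::Lo_32(V) == 0x9ABCDEF0u);
  assert(llvm::divideCeil(10, 4) == 3);          // ceil(10 / 4)
  assert(llvm::maxIntN(8) == 127 && llvm::minIntN(8) == -128);
  assert(llvm::isPowerOf2_32(64) && llvm::Log2_32(64) == 6);
  assert(llvm::alignDown(37, 8) == 32);          // largest multiple of 8 <= 37
}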
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
DWARFExpression::Operation Op
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
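The range helpers above (any_of, find_if, is_contained, drop_begin, append_range) all wrap iterator-pair algorithms so call sites can pass containers directly; a brief sketch:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

static void rangeHelpersDemo() {
  llvm::SmallVector<int, 8> Vals = {3, 5, 8, 13};
  bool HasEven = llvm::any_of(Vals, [](int V) { return V % 2 == 0; }); // true
  auto It = llvm::find_if(Vals, [](int V) { return V > 6; });          // -> 8
  bool Found = llvm::is_contained(Vals, 13);                           // true
  llvm::SmallVector<int, 8> Tail;
  llvm::append_range(Tail, llvm::drop_begin(Vals)); // Tail = {5, 8, 13}
  (void)HasEven; (void)It; (void)Found;
}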
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static const fltSemantics & IEEEsingle() LLVM_READNONE
static constexpr roundingMode rmNearestTiesToEven
static const fltSemantics & IEEEhalf() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
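A short sketch of the Align type together with the alignTo and commonAlignment helpers listed in this index:

#include <cstdint>
#include "llvm/Support/Alignment.h"

static void alignmentDemo() {
  llvm::Align A(16);                            // must be a nonzero power of two
  uint64_t Padded = llvm::alignTo(100, A);      // 112: next multiple of 16
  llvm::Align C = llvm::commonAlignment(A, 8);  // Align(8): offset 8 from a
  (void)Padded; (void)C;                        // 16-byte boundary is 8-aligned
}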
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
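An illustrative (made-up) use of the ArgDescriptor factories above, mirroring how AMDGPU argument lowering records whether a preloaded value lives in a register or on the stack; the include path and the chosen register are assumptions.

#include "AMDGPUArgumentUsageInfo.h"

llvm::ArgDescriptor InReg =
    llvm::ArgDescriptor::createRegister(llvm::AMDGPU::SGPR4); // passed in an SGPR
llvm::ArgDescriptor OnStack =
    llvm::ArgDescriptor::createStack(/*Offset=*/16);          // passed in memory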
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has fewer bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the vector's number of elements is a power of 2.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
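Tying the EVT queries above together, a hedged sketch (the assertions reflect v4f32's fixed 128-bit layout):

#include <cassert>
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"

static void evtDemo(llvm::LLVMContext &Ctx) {
  llvm::EVT VT = llvm::EVT::getVectorVT(Ctx, llvm::MVT::f32, 4); // v4f32
  assert(VT.isVector() && VT.isFloatingPoint());
  assert(VT.getVectorNumElements() == 4 && VT.getSizeInBits() == 128);
  llvm::EVT IntVT = VT.changeTypeToInteger();                    // v4i32
  assert(IntVT.isInteger() && IntVT.bitsEq(VT));
}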
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
bool isUnknown() const
Returns true if we don't know any bits.
void resetAll()
Resets the known state of all bits.
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known bits.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
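The KnownBits queries above operate on a pair of APInt masks (Zero and One); a minimal sketch, assuming llvm/Support/KnownBits.h:

#include <cassert>
#include "llvm/Support/KnownBits.h"

static void knownBitsDemo() {
  llvm::KnownBits Known(32);
  assert(Known.isUnknown());               // nothing known yet
  Known.Zero.setHighBits(24);              // top 24 bits proven zero
  assert(Known.countMinLeadingZeros() == 24);
  assert(Known.countMaxActiveBits() == 8); // value fits in the low 8 bits
  Known.resetAll();                        // back to fully unknown
}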
This class contains a discriminated union of information about pointers in memory operands, relating them back to the LLVM IR or to virtual locations (such as frame indices) that are exposed during codegen.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
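A small sketch of building one of these records for a fixed stack slot, as call lowering does when an argument is passed in memory; stackSlotInfo is a hypothetical helper.

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"

static llvm::MachinePointerInfo stackSlotInfo(llvm::MachineFunction &MF, int FI) {
  return llvm::MachinePointerInfo::getFixedStack(MF, FI, /*Offset=*/0);
}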
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise, pass NaN through.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const