#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "si-lower"

    "amdgpu-disable-loop-alignment",
    cl::desc("Do not align and prefetch loops"),

    "amdgpu-use-divergent-register-indexing",
    cl::desc("Use indirect register addressing for divergent indexes"),

  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
      return AMDGPU::SGPR0 + Reg;
                   {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                    MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                    MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                    MVT::i1, MVT::v32i32},

                   {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                    MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                    MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                    MVT::i1, MVT::v32i32},

                     {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);

                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},

                     {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
                      MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
                      MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},

                     {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
                      MVT::v3i16, MVT::v4i16, MVT::Other},

                     {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
       {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
        MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
        MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
        MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
        MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
        MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
        MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
        MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {

  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {

  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {

  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {

  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {

  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
                     {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},

                     {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},

                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);

                     {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},

                     {MVT::f32, MVT::f64}, Legal);

        {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
         MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
         MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

        {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
         MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {

                       {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
                        MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},

    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})

    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})

                       {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},

                     {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
                      MVT::v32f16, MVT::v32bf16},

                     {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);

                     {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

                     {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
                      MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,

                     {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
                      MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
                      MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
                      MVT::i16, MVT::bf16, MVT::i8, MVT::i128},

                     {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
                      MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
                      MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
  static const MCPhysReg RCRegs[] = {AMDGPU::MODE};

                                         EVT DestVT, EVT SrcVT) const {

                                         LLT DestTy, LLT SrcTy) const {
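  // Mixed-precision FMA/FMAD is only worthwhile when the subtarget actually
  // has the corresponding mad-mix / fma-mix instructions.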
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&

      return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);

    return VT.isInteger() ? MVT::i32 : MVT::f32;

      return (NumElts + 1) / 2;

      return NumElts * ((Size + 31) / 32);

    EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {
      if (ScalarVT == MVT::bf16) {
        RegisterVT = MVT::i32;
        IntermediateVT = MVT::v2bf16;

        RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
        IntermediateVT = RegisterVT;

      NumIntermediates = (NumElts + 1) / 2;
      return NumIntermediates;

      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

    if (Size < 16 && Subtarget->has16BitInsts()) {
      RegisterVT = MVT::i16;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

      RegisterVT = MVT::i32;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

    RegisterVT = MVT::i32;
    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts * ((Size + 31) / 32);
    return NumIntermediates;

      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
                                          unsigned MaxNumLanes) {
  assert(MaxNumLanes != 0);

  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());

                                         unsigned MaxNumLanes) {
  auto *ST = dyn_cast<StructType>(Ty);

  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));

          DL.getPointerSizeInBits(AS) == 192)

           DL.getPointerSizeInBits(AS) == 160) ||
           DL.getPointerSizeInBits(AS) == 192))

                                          unsigned IntrID) const {

  if (CI.hasMetadata(LLVMContext::MD_invariant_load))

    if (RsrcIntr->IsImage) {

    if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {

      Info.ptrVal = RsrcArg;

      if (RsrcIntr->IsImage) {
        unsigned MaxNumLanes = 4;

            std::numeric_limits<unsigned>::max());

      if (RsrcIntr->IsImage) {
        unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();

    if (RsrcIntr->IsImage && BaseOpcode->NoReturn) {
      Info.memVT = MVT::i32;
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();

  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
        std::numeric_limits<unsigned>::max());

  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {

  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.ptrVal = nullptr;

  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {

  case Intrinsic::amdgcn_global_atomic_csub: {

  case Intrinsic::amdgcn_image_bvh_intersect_ray: {

  case Intrinsic::amdgcn_global_atomic_fadd:
  case Intrinsic::amdgcn_global_atomic_fmin:
  case Intrinsic::amdgcn_global_atomic_fmax:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {

  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128: {

  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {

    Info.memVT = MVT::i32;

    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)

  case Intrinsic::amdgcn_global_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();

  case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
    Info.memVT = MVT::i32;

  case Intrinsic::amdgcn_addrspacecast_nonnull: {
    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();
                                            Type *&AccessTy) const {
  switch (II->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_csub:
  case Intrinsic::amdgcn_global_atomic_fadd:
  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_global_atomic_fmax:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fmin:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
    Ptr = II->getArgOperand(0);

  case Intrinsic::amdgcn_global_load_lds:
    Ptr = II->getArgOperand(1);

  AccessTy = II->getType();

                                             unsigned AddrSpace) const {

  return AM.Scale == 0 &&
             AM.BaseOffs, AddrSpace, FlatVariant));
    return isLegalMUBUFAddressingMode(AM);

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {

  if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))

  if (AM.HasBaseReg) {

    return isLegalMUBUFAddressingMode(AM);

    if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)

               : isLegalMUBUFAddressingMode(AM);

    unsigned Size, unsigned AddrSpace, Align Alignment,

        Alignment < RequiredAlignment)

      RequiredAlignment = Align(4);

        *IsFast = (Alignment >= RequiredAlignment) ? 64
                  : (Alignment < Align(4))         ? 32

        *IsFast = (Alignment >= RequiredAlignment) ? 96
                  : (Alignment < Align(4))         ? 32

      RequiredAlignment = Align(8);

        *IsFast = (Alignment >= RequiredAlignment) ? 128
                  : (Alignment < Align(4))         ? 32

      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;

    return Alignment >= RequiredAlignment ||

    bool AlignedBy4 = Alignment >= Align(4);
      *IsFast = AlignedBy4;

    return AlignedBy4 ||

    bool AlignedBy4 = Alignment >= Align(4);
      *IsFast = AlignedBy4;

    return Alignment >= Align(4) ||

  return Size >= 32 && Alignment >= Align(4);

                                                    unsigned *IsFast) const {
      Alignment, Flags, IsFast);
  if (Op.size() >= 16 &&

  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))

  const MemSDNode *MemNode = cast<MemSDNode>(N);

                                           unsigned DestAS) const {

  return TM.isNoopAddrSpaceCast(SrcAS, DestAS);

  const MemSDNode *MemNode = cast<MemSDNode>(N);

                                               unsigned Index) const {

  std::tie(InputPtrReg, RC, ArgTy) =

      MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

                                         const SDLoc &SL) const {

                                         const SDLoc &SL) const {

  std::optional<uint32_t> KnownSize =
  if (KnownSize.has_value())

    Val = getFPExtOrFPRound(DAG, Val, SL, VT);

SDValue SITargetLowering::lowerKernargMemParameter(

    int64_t OffsetDiff = Offset - AlignDownOffset;

    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);

    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);

  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);

      ExtType, SL, VA.getLocVT(), Chain, FIN,
    Reg = &WorkGroupIDX;
    RC = &AMDGPU::SReg_32RegClass;

    Reg = &WorkGroupIDY;
    RC = &AMDGPU::SReg_32RegClass;

    Reg = &WorkGroupIDZ;
    RC = &AMDGPU::SReg_32RegClass;

  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {

           "vector type argument should have been split");

      bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);

             "unexpected vector split in ps argument type");

        Info->markPSInputAllocated(PSInputNum);
        Info->markPSInputEnabled(PSInputNum);

  if (Info.hasWorkItemIDX()) {
        Info.hasWorkItemIDY()) ? 0x3ff : ~0u;

  if (Info.hasWorkItemIDY()) {
      unsigned Reg = AMDGPU::VGPR1;

  if (Info.hasWorkItemIDZ()) {
      unsigned Reg = AMDGPU::VGPR2;

  if (RegIdx == ArgVGPRs.size()) {

  unsigned Reg = ArgVGPRs[RegIdx];
  assert(Reg != AMDGPU::NoRegister);

                              unsigned NumArgRegs) {

  if (RegIdx == ArgSGPRs.size())

  unsigned Reg = ArgSGPRs[RegIdx];
  assert(Reg != AMDGPU::NoRegister);

  assert(Reg != AMDGPU::NoRegister);

  const unsigned Mask = 0x3ff;

  if (Info.hasWorkItemIDX()) {
    Info.setWorkItemIDX(Arg);

  if (Info.hasWorkItemIDY()) {
    Info.setWorkItemIDY(Arg);

  if (Info.hasWorkItemIDZ())

  const unsigned Mask = 0x3ff;

  if (Info.hasImplicitArgPtr())

  if (Info.hasWorkGroupIDX())

  if (Info.hasWorkGroupIDY())

  if (Info.hasWorkGroupIDZ())

  if (Info.hasLDSKernelId())

    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);

    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
  unsigned LastExplicitArgOffset =

  bool InPreloadSequence = true;

  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())

    int ArgIdx = Arg.getArgNo();

    if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
                               (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))

    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];

      unsigned ArgOffset = ArgLoc.getLocMemOffset();

      unsigned NumAllocSGPRs =
          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;

      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);

      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;

      if (PaddingSGPRs + NumAllocSGPRs + 1 >
        InPreloadSequence = false;

          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);

          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);

      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {

      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;

  if (Info.hasLDSKernelId()) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
                                            bool IsShader) const {

    assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");

  unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();

  unsigned NumRequiredSystemSGPRs =
      Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
      Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
  for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDY()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDZ()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupInfo()) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    unsigned PrivateSegmentWaveByteOffsetReg;

      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();

      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);

      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);

         Info.getNumPreloadedSGPRs() >= 16);

  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);

    HasStackObjects = true;

  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {

      Info.setScratchRSrcReg(PrivateSegmentBufferReg);

      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);

      Info.setScratchRSrcReg(ReservedBufferReg);

    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);

      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
        if (!MRI.isLiveIn(Reg)) {
          Info.setStackPtrOffsetReg(Reg);

  if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)

  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);

  return !Info->isEntryFunction();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());

    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;

    Entry->addLiveIn(*I);

    for (auto *Exit : Exits)
              TII->get(TargetOpcode::COPY), *I)

        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());

            !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
            !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());

            !Info->hasWorkGroupIDZ());

    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);

    unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
    if ((PsInputBits & 0x7F) == 0 ||
        ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))

  } else if (IsKernel) {

    Splits.append(Ins.begin(), Ins.end());

  } else if (!IsGraphics) {

  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {

    if (IsEntryFunc && VA.isMemLoc()) {

      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {

          int64_t OffsetDiff = Offset - AlignDownOffset;

              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];

          NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;

          if (PreloadRegs.size() == 1) {
            Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);

                                   TRI->getRegSizeInBits(*RC)));

            for (auto Reg : PreloadRegs) {

                                           PreloadRegs.size()),

          NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

            lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
                                     Alignment, Ins[i].Flags.isSExt(), &Ins[i]);

          dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));

    if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);

    if (AMDGPU::VGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::VGPR_32RegClass;
    else if (AMDGPU::SGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::SGPR_32RegClass;

  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain :
  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);

    unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
    for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
      if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))

  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {

    SDValue Arg = OutVals[RealRVLocIdx];

  if (!Info->isEntryFunction()) {

      if (AMDGPU::SReg_64RegClass.contains(*I))
      else if (AMDGPU::SReg_32RegClass.contains(*I))

  return DAG.getNode(Opc, DL, MVT::Other, RetOps);

    auto &ArgUsageInfo =
    CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);

    std::tie(OutgoingArg, ArgRC, ArgTy) =

    std::tie(IncomingArg, IncomingArgRC, Ty) =
    assert(IncomingArgRC == ArgRC);

    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;

      InputReg = getImplicitArgPtr(DAG, DL);

      std::optional<uint32_t> Id =
      if (Id.has_value()) {

      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);

      unsigned SpecialArgOffset =

  std::tie(OutgoingArg, ArgRC, Ty) =
    std::tie(OutgoingArg, ArgRC, Ty) =
    std::tie(OutgoingArg, ArgRC, Ty) =

  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");

    InputReg = InputReg.getNode() ?
    InputReg = InputReg.getNode() ?

  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {

          IncomingArgX ? *IncomingArgX
                       : IncomingArgY ? *IncomingArgY : *IncomingArgZ, ~0u);

    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
  if (Callee->isDivergent())

  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);

  if (!CallerPreserved)

  bool CCMatch = CallerCC == CalleeCC;

    if (Arg.hasByValAttr())

    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))

  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

  if (IsChainCallConv) {

    RequestedExec = CLI.Args.back();
    assert(RequestedExec.Node && "No node for EXEC");

      assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
      CLI.Outs.pop_back();

      assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
      CLI.Outs.pop_back();

           "Haven't popped all the pieces of the EXEC mask");

  bool IsSibCall = false;

        "unsupported call to variadic function ");

        "unsupported required tail call to function ");

        Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);

           "site marked musttail or on llvm.amdgcn.cs.chain");

  if (!TailCallOpt && IsTailCall)

  if (!IsSibCall || IsChainCallConv) {

    RegsToPass.emplace_back(IsChainCallConv
                                ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                                : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,

  MVT PtrVT = MVT::i32;

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {

      RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));

      int32_t Offset = LocMemOffset;

      unsigned OpSize = Flags.isByVal() ?

              ? Flags.getNonZeroByValAlign()

      if (Outs[i].Flags.isByVal()) {

            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);

            Outs[i].Flags.getNonZeroByValAlign(),
            nullptr, std::nullopt, DstInfo,

            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);

  if (!MemOpChains.empty())

  for (auto &RegToPass : RegsToPass) {
                             RegToPass.second, InGlue);

  if (IsTailCall && !IsSibCall) {

  std::vector<SDValue> Ops;
  Ops.push_back(Chain);
  Ops.push_back(Callee);

  if (IsChainCallConv)
    Ops.push_back(RequestedExec.Node);

  for (auto &RegToPass : RegsToPass) {
                                  RegToPass.second.getValueType()));

  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");

                              MVT::Glue, GlueOps),

    Ops.push_back(InGlue);

    return DAG.getNode(OPC, DL, NodeTys, Ops);

  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);

  uint64_t CalleePopBytes = NumBytes;
  EVT VT = Op.getValueType();

  MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();

  Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize);
  if (Alignment && *Alignment > StackAlign) {

  if (isa<ConstantSDNode>(Size))

  if (Op.getValueType() != MVT::i32)

  assert(Op.getValueType() == MVT::i32);

                              Op.getOperand(0), IntrinID, GetRoundBothImm);

  SDValue RoundModeTimesNumBits =

                                    TableEntry, EnumOffset);

  if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
        static_cast<uint32_t>(ConstMode->getZExtValue()),

    if (UseReducedTable) {

      SDValue RoundModeTimesNumBits =

      SDValue RoundModeTimesNumBits =

      NewMode = TruncTable;

                          ReadFirstLaneID, NewMode);

                    IntrinID, RoundBothImm, NewMode);

  if (Op->isDivergent())

  switch (cast<MemSDNode>(Op)->getAddressSpace()) {

  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = Src.getValueType();

  EVT DstVT = Op.getValueType();

  if (Op.getValueType() != MVT::i64)

                                 Op.getOperand(0), IntrinID, ModeHwRegImm);
                                 Op.getOperand(0), IntrinID, TrapHwRegImm);

  if (Op.getOperand(1).getValueType() != MVT::i64)

                            ReadFirstLaneID, NewModeReg);
                            ReadFirstLaneID, NewTrapReg);

  unsigned ModeHwReg =
  unsigned TrapHwReg =

                        IntrinID, ModeHwRegImm, NewModeReg);
                        IntrinID, TrapHwRegImm, NewTrapReg);
          .Case("m0", AMDGPU::M0)
          .Case("exec", AMDGPU::EXEC)
          .Case("exec_lo", AMDGPU::EXEC_LO)
          .Case("exec_hi", AMDGPU::EXEC_HI)
          .Case("flat_scratch", AMDGPU::FLAT_SCR)
          .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
          .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)

  if (Reg == AMDGPU::NoRegister) {

  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:

  case AMDGPU::FLAT_SCR:

    MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));

static std::pair<MachineBasicBlock *, MachineBasicBlock *>

  auto Next = std::next(I);

  return std::pair(LoopBB, RemainderBB);

  auto I = MI.getIterator();
  auto E = std::next(I);

    Src->setIsKill(false);

    Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

    BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)

                                 unsigned InitReg, unsigned ResultReg,
                                 unsigned PhiReg, unsigned InitSaveExecReg,
                                 int Offset, bool UseGPRIdxMode,
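  // Waterfall loop body: read the first active lane's index, compare it
  // against every lane, and execute one iteration with only the matching
  // lanes enabled.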
  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
  Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)

  BuildMI(LoopBB, I, DL,
          TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
                                 : AMDGPU::S_AND_SAVEEXEC_B64),

  MRI.setSimpleHint(NewExec, CondReg);

  if (UseGPRIdxMode) {
      SGPRIdxReg = CurrentIdxReg;

      SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)

      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)

      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)

  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
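  // Clear the lanes that were just processed from EXEC; the loop exits once
  // EXEC reaches zero.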
  BuildMI(LoopBB, I, DL,
          TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
                                 : AMDGPU::S_XOR_B64_term), Exec)

                             unsigned InitResultReg, unsigned PhiReg,
                             int Offset, bool UseGPRIdxMode,
                             Register &SGPRIdxReg) {

  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);

  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode, SGPRIdxReg);
static std::pair<unsigned, int>

  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;

    return std::pair(AMDGPU::sub0, Offset);

  assert(Idx->getReg() != AMDGPU::NoRegister);

      return Idx->getReg();

    Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {

    if (UseGPRIdxMode) {

          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)

    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)

  MI.eraseFromParent();

  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (Idx->getReg() == AMDGPU::NoRegister) {

    MI.eraseFromParent();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {

    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);

      const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
          TRI.getRegSizeInBits(*VecRC), 32, false);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(VecRC);

                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)

    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
        TRI.getRegSizeInBits(*VecRC), 32, false);
    BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)

  MI.eraseFromParent();
  bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));

    Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
    Register InitalValReg = MRI.createVirtualRegister(DstRegClass);

    Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
    Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
    Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);

    Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
    Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);

    bool IsWave32 = ST.isWave32();
    unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
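    // Identity value for the reduction: UINT32_MAX for an unsigned min,
    // 0 for an unsigned max.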
        (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;

    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)

    I = ComputeLoop->end();

        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)

        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
            .addReg(TmpSReg->getOperand(0).getReg())

    unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
    auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
                   .addReg(ActiveBits->getOperand(0).getReg());
    auto LaneValue = BuildMI(*ComputeLoop, I, DL,
                             TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
                         .addReg(FF1->getOperand(0).getReg());
    auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
                              .addReg(LaneValue->getOperand(0).getReg());

    unsigned BITSETOpc =
        IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
    auto NewActiveBits =
        BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
            .addReg(FF1->getOperand(0).getReg())
            .addReg(ActiveBits->getOperand(0).getReg());

    Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
        .addMBB(ComputeLoop);
    ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
        .addMBB(ComputeLoop);

    unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
        .addReg(NewActiveBits->getOperand(0).getReg())
    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))

  MI.eraseFromParent();
  switch (MI.getOpcode()) {
  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
  case AMDGPU::S_UADDO_PSEUDO:
  case AMDGPU::S_USUBO_PSEUDO: {

    unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
                       : AMDGPU::S_SUB_I32;

    MI.eraseFromParent();

  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO: {

    bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);

      unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

    MI.eraseFromParent();

  case AMDGPU::V_ADD_U64_PSEUDO:
  case AMDGPU::V_SUB_U64_PSEUDO: {

    bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);

    if (IsAdd && ST.hasLshlAddB64()) {

      TII->legalizeOperands(*Add);
      MI.eraseFromParent();

    const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    Register CarryReg = MRI.createVirtualRegister(CarryRC);
    Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);

                                          : &AMDGPU::VReg_64RegClass;
                                          : &AMDGPU::VReg_64RegClass;

        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);

        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);

        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);

    unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;

    unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;

    TII->legalizeOperands(*LoHalf);
    TII->legalizeOperands(*HiHalf);
    MI.eraseFromParent();
  case AMDGPU::S_ADD_CO_PSEUDO:
  case AMDGPU::S_SUB_CO_PSEUDO: {

    unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
                       ? AMDGPU::S_ADDC_U32
                       : AMDGPU::S_SUBB_U32;

      Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)

      Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)

    Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)

    unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
    assert(WaveSize == 64 || WaveSize == 32);

    if (WaveSize == 64) {
      if (ST.hasScalarCompareEq64()) {

            TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
            MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
            MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
        Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)

        (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;

    MI.eraseFromParent();

  case AMDGPU::SI_INIT_M0: {
            TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .add(MI.getOperand(0));
    MI.eraseFromParent();

  case AMDGPU::GET_GROUPSTATICSIZE: {
        .add(MI.getOperand(0))
    MI.eraseFromParent();

  case AMDGPU::GET_SHADERCYCLESHILO: {

    using namespace AMDGPU::Hwreg;
    Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
    Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));

    Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        .add(MI.getOperand(0))

    MI.eraseFromParent();
  case AMDGPU::SI_INDIRECT_SRC_V1:
  case AMDGPU::SI_INDIRECT_SRC_V2:
  case AMDGPU::SI_INDIRECT_SRC_V4:
  case AMDGPU::SI_INDIRECT_SRC_V8:
  case AMDGPU::SI_INDIRECT_SRC_V9:
  case AMDGPU::SI_INDIRECT_SRC_V10:
  case AMDGPU::SI_INDIRECT_SRC_V11:
  case AMDGPU::SI_INDIRECT_SRC_V12:
  case AMDGPU::SI_INDIRECT_SRC_V16:
  case AMDGPU::SI_INDIRECT_SRC_V32:

  case AMDGPU::SI_INDIRECT_DST_V1:
  case AMDGPU::SI_INDIRECT_DST_V2:
  case AMDGPU::SI_INDIRECT_DST_V4:
  case AMDGPU::SI_INDIRECT_DST_V8:
  case AMDGPU::SI_INDIRECT_DST_V9:
  case AMDGPU::SI_INDIRECT_DST_V10:
  case AMDGPU::SI_INDIRECT_DST_V11:
  case AMDGPU::SI_INDIRECT_DST_V12:
  case AMDGPU::SI_INDIRECT_DST_V16:
  case AMDGPU::SI_INDIRECT_DST_V32:

  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
  case AMDGPU::SI_KILL_I1_PSEUDO:

  case AMDGPU::V_CNDMASK_B64_PSEUDO: {

    Register SrcCond = MI.getOperand(3).getReg();

    Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
    Register SrcCondCopy = MRI.createVirtualRegister(CondRC);

                                          : &AMDGPU::VReg_64RegClass;
                                          : &AMDGPU::VReg_64RegClass;

        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);

        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);

        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);

    MI.eraseFromParent();

  case AMDGPU::SI_BR_UNDEF: {
        .add(MI.getOperand(0));
    MI.eraseFromParent();

  case AMDGPU::ADJCALLSTACKUP:
  case AMDGPU::ADJCALLSTACKDOWN: {

  case AMDGPU::SI_CALL_ISEL: {

    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);

    MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);

    MI.eraseFromParent();

  case AMDGPU::V_ADD_CO_U32_e32:
  case AMDGPU::V_SUB_CO_U32_e32:
  case AMDGPU::V_SUBREV_CO_U32_e32: {

    unsigned Opc = MI.getOpcode();

    bool NeedClampOperand = false;
    if (TII->pseudoToMCOpcode(Opc) == -1) {
      NeedClampOperand = true;

    if (TII->isVOP3(*I)) {

    I.add(MI.getOperand(1))
        .add(MI.getOperand(2));
    if (NeedClampOperand)

    TII->legalizeOperands(*I);

    MI.eraseFromParent();

  case AMDGPU::V_ADDC_U32_e32:
  case AMDGPU::V_SUBB_U32_e32:
  case AMDGPU::V_SUBBREV_U32_e32:
    TII->legalizeOperands(MI);

  case AMDGPU::DS_GWS_INIT:
  case AMDGPU::DS_GWS_SEMA_BR:
  case AMDGPU::DS_GWS_BARRIER:
    TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);

  case AMDGPU::DS_GWS_SEMA_V:
  case AMDGPU::DS_GWS_SEMA_P:
  case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:

  case AMDGPU::S_SETREG_B32: {

    const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
    const unsigned SetMask = WidthMask << Offset;

    unsigned SetDenormOp = 0;
    unsigned SetRoundOp = 0;

        SetRoundOp = AMDGPU::S_ROUND_MODE;
        SetDenormOp = AMDGPU::S_DENORM_MODE;

        SetRoundOp = AMDGPU::S_ROUND_MODE;

        SetDenormOp = AMDGPU::S_DENORM_MODE;

    if (SetRoundOp || SetDenormOp) {

      if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
        unsigned ImmVal = Def->getOperand(1).getImm();

          MI.eraseFromParent();

    MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));

  case AMDGPU::S_INVERSE_BALLOT_U32:
  case AMDGPU::S_INVERSE_BALLOT_U64:
    MI.setDesc(TII->get(AMDGPU::COPY));

  case AMDGPU::ENDPGM_TRAP: {
      MI.setDesc(TII->get(AMDGPU::S_ENDPGM));

    MI.eraseFromParent();

  case AMDGPU::SIMULATED_TRAP: {
    TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
    MI.eraseFromParent();
  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;

  EVT VT = N->getValueType(0);

  if (VT == MVT::f16) {

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
         VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
         VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
         VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
         VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
         VT == MVT::v32bf16);

                      : std::pair(Op0, Op0);

  switch (Op.getOpcode()) {

    assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");

    EVT VT = Op.getValueType();
      return lowerFSQRTF32(Op, DAG);
      return lowerFSQRTF64(Op, DAG);

    return LowerTrig(Op, DAG);

    return LowerGlobalAddress(MFI, Op, DAG);

    return lowerINSERT_SUBVECTOR(Op, DAG);
    return lowerINSERT_VECTOR_ELT(Op, DAG);
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
    return lowerVECTOR_SHUFFLE(Op, DAG);
    return lowerSCALAR_TO_VECTOR(Op, DAG);
    return lowerBUILD_VECTOR(Op, DAG);

    return lowerFP_ROUND(Op, DAG);
    return lowerFPTRUNC_ROUND(Op, DAG);
    return lowerTRAP(Op, DAG);
    return lowerDEBUGTRAP(Op, DAG);

    return lowerFMINNUM_FMAXNUM(Op, DAG);
    return lowerFLDEXP(Op, DAG);

    return lowerMUL(Op, DAG);
    return lowerXMULO(Op, DAG);
    return lowerXMUL_LOHI(Op, DAG);
  EVT FittingLoadVT = LoadVT;

SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
                                              bool IsIntrinsic) const {

  EVT LoadVT = M->getValueType(0);

  EVT EquivLoadVT = LoadVT;

                   VTList, Ops, M->getMemoryVT(), M->getMemOperand());

  EVT LoadVT = M->getValueType(0);

  assert(M->getNumValues() == 2 || M->getNumValues() == 3);
  bool IsTFE = M->getNumValues() == 3;

    return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),

    return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
                               M->getMemOperand(), DAG);

  SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
                                        M->getMemOperand(), DAG);

  EVT VT = N->getValueType(0);
  unsigned CondCode = N->getConstantOperandVal(3);

  EVT CmpVT = LHS.getValueType();
  if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {

  EVT VT = N->getValueType(0);

  unsigned CondCode = N->getConstantOperandVal(3);

  if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {

  EVT VT = N->getValueType(0);

                     Src.getOperand(1), Src.getOperand(2));

    Exec = AMDGPU::EXEC_LO;

    Exec = AMDGPU::EXEC;

  EVT VT = N->getValueType(0);

  unsigned IID = N->getConstantOperandVal(0);
  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;

    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16:

    case Intrinsic::amdgcn_writelane:

    case Intrinsic::amdgcn_readlane:

    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_permlane64:

    if (SDNode *GL = N->getGluedNode()) {
      GL = GL->getOperand(0).getNode();

  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
    Src1 = N->getOperand(2);
    if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
      Src2 = N->getOperand(3);

  if (ValSize == 32) {

    if (IID == Intrinsic::amdgcn_writelane) {

    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);

    return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;

  if (ValSize % 32 != 0)

    EVT VT = N->getValueType(0);

    unsigned NumOperands = N->getNumOperands();

    SDNode *GL = N->getGluedNode();

    for (unsigned i = 0; i != NE; ++i) {
      for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
        SDValue Operand = N->getOperand(j);

    return unrollLaneOp(LaneOp.getNode());

    SDValue Src0SubVec, Src1SubVec, Src2SubVec;
    for (unsigned i = 0, EltIdx = 0; i < ValSize / 32; i++) {

      if (IID == Intrinsic::amdgcn_writelane)

              ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
              : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));

    if (IID == Intrinsic::amdgcn_writelane)

    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
  switch (N->getOpcode()) {

    unsigned IID = N->getConstantOperandVal(0);

    case Intrinsic::amdgcn_make_buffer_rsrc:
      Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));

    case Intrinsic::amdgcn_cvt_pkrtz: {

    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16: {

      if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
      else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
      else if (IID == Intrinsic::amdgcn_cvt_pk_i16)

      EVT VT = N->getValueType(0);

    case Intrinsic::amdgcn_s_buffer_load: {

      EVT VT = Op.getValueType();
      assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");

      if (!Offset->isDivergent()) {

        LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);

      for (unsigned I = 0; I < Res.getNumOperands(); I++) {
        Results.push_back(Res.getOperand(I));

      Results.push_back(Res.getValue(1));

    EVT VT = N->getValueType(0);

    EVT SelectVT = NewVT;
    if (NewVT.bitsLT(MVT::i32)) {
      SelectVT = MVT::i32;

    if (NewVT != SelectVT)

    if (N->getValueType(0) != MVT::v2f16)

    if (N->getValueType(0) != MVT::v2f16)

    if (N->getValueType(0) != MVT::f16)

    if (I.getUse().get() != Value)

    if (I->getOpcode() == Opcode)

unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {

  switch (Intr->getConstantOperandVal(1)) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else:
  case Intrinsic::amdgcn_loop:
  case Intrinsic::amdgcn_end_cf:

  SDNode *Intr = BRCOND.getOperand(1).getNode();

    assert(BR && "brcond missing unconditional branch user");
    Target = BR->getOperand(1);

  unsigned CFNode = isCFIntrinsic(Intr);

    Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());

  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {

                      Intr->getOperand(0));
  MVT VT = Op.getSimpleValueType();

  if (Op.getConstantOperandVal(0) != 0)

  if (Info->isEntryFunction())

  return Op.getValueType().bitsLE(VT) ?

  if (Op.getOperand(0)->getValueType(0) != MVT::f32)

  int RoundMode = Op.getConstantOperandVal(1);

  unsigned HW_Mode = (RoundMode + 3) % 4;

                             Op->getOperand(0), RoundFlag);

  assert(Op.getValueType() == MVT::f16 &&
         "Do not know how to custom lower FP_ROUND for non-f16 type");

  EVT SrcVT = Src.getValueType();
  if (SrcVT != MVT::f64)

  EVT VT = Op.getValueType();

  bool IsIEEEMode = Info->getMode().IEEE;

  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||

  EVT VT = Op.getValueType();

  EVT ExpVT = Exp.getValueType();
  if (ExpVT == MVT::i16)

                     {Op.getOperand(0), Op.getOperand(1), TruncExp});

  EVT VT = Op.getValueType();

  assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");

  if (Op->isDivergent())

  if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
        DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);

  if (Op0SignBits >= 33 && Op1SignBits >= 33)
        DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);

  EVT VT = Op.getValueType();

    const APInt &C = RHSC->getAPIntValue();

    if (C.isPowerOf2()) {

      bool UseArithShift = isSigned && !C.isMinSignedValue();

                                        SL, VT, Result, ShiftAmt),

  if (Op->isDivergent()) {

    return lowerTrapEndpgm(Op, DAG);

                                          lowerTrapHsaQueuePtr(Op, DAG);

SDValue SITargetLowering::lowerTrapEndpgm(

    const SDLoc &DL, Align Alignment, ImplicitParameter Param) const {

SDValue SITargetLowering::lowerTrapHsaQueuePtr(

        loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);

    if (UserSGPR == AMDGPU::NoRegister) {

SDValue SITargetLowering::lowerTrapHsa(

        "debugtrap handler not supported",

SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,

                                   ? AMDGPU::SRC_SHARED_BASE
                                   : AMDGPU::SRC_PRIVATE_BASE;

        {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));

    return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);

  if (UserSGPR == AMDGPU::NoRegister) {

      DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
  if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
      isa<BasicBlockSDNode>(Val))

  if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
    return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);

  unsigned DestAS, SrcAS;

  bool IsNonNull = false;
  if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
    SrcAS = ASC->getSrcAddressSpace();
    Src = ASC->getOperand(0);
    DestAS = ASC->getDestAddressSpace();

           Op.getConstantOperandVal(0) ==
               Intrinsic::amdgcn_addrspacecast_nonnull);
    Src = Op->getOperand(1);
    SrcAS = Op->getConstantOperandVal(2);
    DestAS = Op->getConstantOperandVal(3);

    unsigned NullVal = TM.getNullPointerValue(DestAS);

    SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);

    unsigned NullVal = TM.getNullPointerValue(SrcAS);

      Op.getValueType() == MVT::i64) {

      Src.getValueType() == MVT::i64)

  EVT InsVT = Ins.getValueType();

    unsigned IdxVal = Idx->getAsZExtVal();

    assert(InsNumElts % 2 == 0 && "expect legal vector types");

    EVT NewInsVT = InsNumElts == 2 ? MVT::i32
                                       MVT::i32, InsNumElts / 2);

    for (unsigned I = 0; I != InsNumElts / 2; ++I) {

      if (InsNumElts == 2) {

    for (unsigned I = 0; I != InsNumElts; ++I) {

  auto KIdx = dyn_cast<ConstantSDNode>(Idx);
  if (NumElts == 4 && EltSize == 16 && KIdx) {

    unsigned Idx = KIdx->getZExtValue();
    bool InsertLo = Idx < 2;

                              InsertLo ? LoVec : HiVec,

  if (isa<ConstantSDNode>(Idx))

  assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
  const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);

                            DAG.getNOT(SL, BFM, IntVT), BCVec);

  EVT ResultVT = Op.getValueType();

  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))

  if (VecSize == 128 || VecSize == 256 || VecSize == 512) {

    if (VecSize == 128) {

    } else if (VecSize == 256) {

      for (unsigned P = 0; P < 4; ++P) {

                       Parts[0], Parts[1]));
                       Parts[2], Parts[3]));

      for (unsigned P = 0; P < 8; ++P) {

                       Parts[0], Parts[1], Parts[2], Parts[3]));
                       Parts[4], Parts[5], Parts[6], Parts[7]));

    EVT IdxVT = Idx.getValueType();

  Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);

  if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {

    return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
  EVT ResultVT = Op.getValueType();

  EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;

  int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();

      int VecIdx = Idx < SrcNumElts ? 0 : 1;
      int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;

      int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
      int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
      int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
      int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;

  EVT ResultVT = Op.getValueType();

  EVT VT = Op.getValueType();

  if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
      VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {

                       {CastLo, CastHi});

  if (VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16) {

      for (unsigned P = 0; P < 4; ++P)
        Parts[P].push_back(Op.getOperand(I + P * E));

    for (unsigned P = 0; P < 4; ++P) {

  if (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v32bf16) {

      for (unsigned P = 0; P < 8; ++P)
        Parts[P].push_back(Op.getOperand(I + P * E));

    for (unsigned P = 0; P < 8; ++P) {

  assert(VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16);

  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");

  EVT PtrVT = Op.getValueType();

  assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");

  SDValue Param = lowerKernargMemParameter(

                              "non-hsa intrinsic with hsa target",

                              "intrinsic not supported on subtarget",

  unsigned NumElts = Elts.size();

  if (NumElts <= 12) {

  for (unsigned i = 0; i < Elts.size(); ++i) {

  for (unsigned i = Elts.size(); i < NumElts; ++i)
    VecElts[i] = DAG.getUNDEF(MVT::f32);

  EVT SrcVT = Src.getValueType();

                                 bool Unpacked, bool IsD16, int DMaskPop,
                                 int NumVDataDwords, bool IsAtomicPacked16Bit,

  EVT ReqRetVT = ResultTypes[0];
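  // With packed D16 data (or packed 16-bit atomics) two return elements share
  // each dword, so halve (rounding up) the number of data dwords.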
  int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
                          ? (ReqRetNumElts + 1) / 2

  int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;

  MVT DataDwordVT = NumDataDwords == 1 ?

  MVT MaskPopVT = MaskPopDwords == 1 ?

  if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {

  if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
                          NumDataDwords - MaskPopDwords);

  EVT LegalReqRetVT = ReqRetVT;

    if (!Data.getValueType().isInteger())
                         Data.getValueType().changeTypeToInteger(), Data);

  if (Result->getNumValues() == 1)

                         SDValue *LWE, bool &IsTexFail) {
  auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());

                                      unsigned DimIdx, unsigned EndIdx,
                                      unsigned NumGradients) {

  for (unsigned I = DimIdx; I < EndIdx; I++) {

    if (((I + 1) >= EndIdx) ||
        ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
                                         I == DimIdx + NumGradients - 1))) {
      if (Addr.getValueType() != MVT::i16)
  unsigned IntrOpcode = Intr->BaseOpcode;
  int NumVDataDwords = 0;
  bool AdjustRetType = false;
  bool IsAtomicPacked16Bit = false;
  const unsigned ArgOffset = WithChain ? 2 : 1;
  unsigned DMaskLanes = 0;
  if (BaseOpcode->Atomic) {
    VData = Op.getOperand(2);
    IsAtomicPacked16Bit =
        (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
         Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
    if (BaseOpcode->AtomicX2) {
      ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
      DMask = Is64Bit ? 0xf : 0x3;
      NumVDataDwords = Is64Bit ? 4 : 2;
      DMask = Is64Bit ? 0x3 : 0x1;
      NumVDataDwords = Is64Bit ? 2 : 1;
    DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
    if (BaseOpcode->Store) {
      VData = Op.getOperand(2);
        VData = handleD16VData(VData, DAG, true);
    } else if (!BaseOpcode->NoReturn) {
        (!LoadVT.isVector() && DMaskLanes > 1))
        NumVDataDwords = (DMaskLanes + 1) / 2;
        NumVDataDwords = DMaskLanes;
      AdjustRetType = true;
  unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
      Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
  MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
  IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
  VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
  MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
  IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
  for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
    if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
      assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
                  {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
             "Bias needs to be converted to 16 bit in A16 mode");
  if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
    dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
               "require 16 bit args for both gradients and addresses");
    if (!ST->hasA16()) {
      LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
                           "support 16 bit addresses\n");
  if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
    IntrOpcode = G16MappingInfo->G16;
                        ArgOffset + Intr->GradientStart,
                        ArgOffset + Intr->CoordStart, Intr->NumGradients);
    for (unsigned I = ArgOffset + Intr->GradientStart;
         I < ArgOffset + Intr->CoordStart; I++)
                      ArgOffset + Intr->CoordStart, VAddrEnd,
    for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
  const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
  const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
  const bool UseNSA = ST->hasNSAEncoding() &&
                      VAddrs.size() >= ST->getNSAThreshold(MF) &&
                      (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
  const bool UsePartialNSA =
      UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
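// A standalone sketch of the NSA (non-sequential address) decision above,
// with assumed parameter names: NSA is chosen when the target supports it,
// there are enough address registers to be worth it, and the count either
// fits the NSA limit or partial NSA encoding can take the overflow.
static void chooseNSA(bool HasNSA, bool HasPartialNSA, unsigned NumVAddrs,
                      unsigned NSAThreshold, unsigned NSAMaxSize,
                      bool &UseNSA, bool &UsePartialNSA) {
  UseNSA = HasNSA && NumVAddrs >= NSAThreshold &&
           (NumVAddrs <= NSAMaxSize || HasPartialNSA);
  // Partial NSA is the fallback when the address count exceeds the limit.
  UsePartialNSA = UseNSA && HasPartialNSA && NumVAddrs > NSAMaxSize;
}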
  if (UsePartialNSA) {
                 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
  if (!BaseOpcode->Sampler) {
        Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
    Unorm = UnormConst ? True : False;
  SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
  bool IsTexFail = false;
  if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
      NumVDataDwords += 1;
    AdjustRetType = true;
  if (AdjustRetType) {
    if (DMaskLanes == 0 && !BaseOpcode->Store) {
      if (isa<MemSDNode>(Op))
    EVT NewVT = NumVDataDwords > 1 ?
    ResultTypes[0] = NewVT;
    if (ResultTypes.size() == 3) {
      ResultTypes.erase(&ResultTypes[1]);
  unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
  if (BaseOpcode->Atomic)
  if (BaseOpcode->Store || BaseOpcode->Atomic)
  if (UsePartialNSA) {
  if (BaseOpcode->Sampler)
  if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
                   ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
  if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
  if (BaseOpcode->HasD16)
  if (isa<MemSDNode>(Op))
  int NumVAddrDwords =
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX11Plus) {
                                   UseNSA ? AMDGPU::MIMGEncGfx11NSA
                                          : AMDGPU::MIMGEncGfx11Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX10Plus) {
                                   UseNSA ? AMDGPU::MIMGEncGfx10NSA
                                          : AMDGPU::MIMGEncGfx10Default,
                                   NumVDataDwords, NumVAddrDwords);
                                   NumVDataDwords, NumVAddrDwords);
        "requested image instruction is not supported on this GPU");
                                   NumVDataDwords, NumVAddrDwords);
                                   NumVDataDwords, NumVAddrDwords);
  if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
  if (BaseOpcode->AtomicX2) {
  if (BaseOpcode->NoReturn)
                           NumVDataDwords, IsAtomicPacked16Bit, DL);
  if (!Offset->isDivergent()) {
    return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
  unsigned NumLoads = 1;
  if (NumElts == 8 || NumElts == 16) {
    NumLoads = NumElts / 4;
  setBufferOffsets(Offset, DAG, &Ops[3],
                   NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
  for (unsigned i = 0; i < NumLoads; ++i) {
  if (NumElts == 8 || NumElts == 16)
  EVT VT = Op.getValueType();
  unsigned IntrinsicID = Op.getConstantOperandVal(0);
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_implicit_buffer_ptr: {
    return getPreloadedValue(DAG, *MFI, VT,
  case Intrinsic::amdgcn_dispatch_ptr:
  case Intrinsic::amdgcn_queue_ptr: {
          MF.getFunction(), "unsupported hsa intrinsic without hsa target",
    auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
    return getPreloadedValue(DAG, *MFI, VT, RegID);
  case Intrinsic::amdgcn_implicitarg_ptr: {
      return getImplicitArgPtr(DAG, DL);
8442 return getPreloadedValue(DAG, *MFI, VT,
8445 case Intrinsic::amdgcn_kernarg_segment_ptr: {
8451 return getPreloadedValue(DAG, *MFI, VT,
8454 case Intrinsic::amdgcn_dispatch_id: {
8457 case Intrinsic::amdgcn_rcp:
8459 case Intrinsic::amdgcn_rsq:
8461 case Intrinsic::amdgcn_rsq_legacy:
8465 case Intrinsic::amdgcn_rcp_legacy:
8469 case Intrinsic::amdgcn_rsq_clamp: {
8483 case Intrinsic::r600_read_ngroups_x:
8487 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8490 case Intrinsic::r600_read_ngroups_y:
8494 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8497 case Intrinsic::r600_read_ngroups_z:
8501 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8504 case Intrinsic::r600_read_global_size_x:
8508 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8511 case Intrinsic::r600_read_global_size_y:
8515 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8518 case Intrinsic::r600_read_global_size_z:
8522 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8525 case Intrinsic::r600_read_local_size_x:
8529 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
8531 case Intrinsic::r600_read_local_size_y:
8535 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
8537 case Intrinsic::r600_read_local_size_z:
8541 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
8543 case Intrinsic::amdgcn_workgroup_id_x:
8544 return getPreloadedValue(DAG, *MFI, VT,
8546 case Intrinsic::amdgcn_workgroup_id_y:
8547 return getPreloadedValue(DAG, *MFI, VT,
8549 case Intrinsic::amdgcn_workgroup_id_z:
8550 return getPreloadedValue(DAG, *MFI, VT,
8552 case Intrinsic::amdgcn_wave_id:
8553 return lowerWaveID(DAG,
Op);
8554 case Intrinsic::amdgcn_lds_kernel_id: {
8556 return getLDSKernelId(DAG,
DL);
8557 return getPreloadedValue(DAG, *MFI, VT,
8560 case Intrinsic::amdgcn_workitem_id_x:
8561 return lowerWorkitemID(DAG,
Op, 0, MFI->getArgInfo().WorkItemIDX);
8562 case Intrinsic::amdgcn_workitem_id_y:
8563 return lowerWorkitemID(DAG,
Op, 1, MFI->getArgInfo().WorkItemIDY);
8564 case Intrinsic::amdgcn_workitem_id_z:
8565 return lowerWorkitemID(DAG,
Op, 2, MFI->getArgInfo().WorkItemIDZ);
  case Intrinsic::amdgcn_wavefrontsize:
  case Intrinsic::amdgcn_s_buffer_load: {
    unsigned CPol = Op.getConstantOperandVal(3);
    return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
                        Op.getOperand(3),
  case Intrinsic::amdgcn_fdiv_fast:
    return lowerFDIV_FAST(Op, DAG);
8582 case Intrinsic::amdgcn_sin:
8585 case Intrinsic::amdgcn_cos:
8588 case Intrinsic::amdgcn_mul_u24:
8590 case Intrinsic::amdgcn_mul_i24:
8593 case Intrinsic::amdgcn_log_clamp: {
8599 case Intrinsic::amdgcn_fract:
8602 case Intrinsic::amdgcn_class:
8604 Op.getOperand(1),
Op.getOperand(2));
8605 case Intrinsic::amdgcn_div_fmas:
8607 Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3),
8610 case Intrinsic::amdgcn_div_fixup:
8612 Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
8614 case Intrinsic::amdgcn_div_scale: {
8627 SDValue Src0 =
Param->isAllOnes() ? Numerator : Denominator;
8630 Denominator, Numerator);
8632 case Intrinsic::amdgcn_icmp: {
8634 if (
Op.getOperand(1).getValueType() == MVT::i1 &&
8635 Op.getConstantOperandVal(2) == 0 &&
8640 case Intrinsic::amdgcn_fcmp: {
8643 case Intrinsic::amdgcn_ballot:
8645 case Intrinsic::amdgcn_fmed3:
8647 Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
8648 case Intrinsic::amdgcn_fdot2:
8650 Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3),
8652 case Intrinsic::amdgcn_fmul_legacy:
8654 Op.getOperand(1),
Op.getOperand(2));
8655 case Intrinsic::amdgcn_sffbh:
8657 case Intrinsic::amdgcn_sbfe:
8659 Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
8660 case Intrinsic::amdgcn_ubfe:
8662 Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
8663 case Intrinsic::amdgcn_cvt_pkrtz:
8664 case Intrinsic::amdgcn_cvt_pknorm_i16:
8665 case Intrinsic::amdgcn_cvt_pknorm_u16:
8666 case Intrinsic::amdgcn_cvt_pk_i16:
8667 case Intrinsic::amdgcn_cvt_pk_u16: {
8669 EVT VT =
Op.getValueType();
8672 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8674 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8676 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8678 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8684 return DAG.
getNode(Opcode,
DL, VT,
Op.getOperand(1),
Op.getOperand(2));
8687 Op.getOperand(1),
Op.getOperand(2));
8690 case Intrinsic::amdgcn_fmad_ftz:
8692 Op.getOperand(2),
Op.getOperand(3));
8694 case Intrinsic::amdgcn_if_break:
8696 Op->getOperand(1),
Op->getOperand(2)), 0);
8698 case Intrinsic::amdgcn_groupstaticsize: {
8710 case Intrinsic::amdgcn_is_shared:
8711 case Intrinsic::amdgcn_is_private: {
8713 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ?
8715 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
8723 case Intrinsic::amdgcn_perm:
8725 Op.getOperand(2),
Op.getOperand(3));
8726 case Intrinsic::amdgcn_reloc_constant: {
8730 auto RelocSymbol = cast<GlobalVariable>(
8736 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8737 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8738 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8739 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8740 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8741 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8742 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8743 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8744 if (
Op.getOperand(4).getValueType() == MVT::i32)
8750 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
8751 Op.getOperand(3), IndexKeyi32);
8753 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8754 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8755 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8756 if (
Op.getOperand(6).getValueType() == MVT::i32)
8762 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8763 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
8764 IndexKeyi32, Op.getOperand(7)});
8766 case Intrinsic::amdgcn_addrspacecast_nonnull:
8767 return lowerADDRSPACECAST(
Op, DAG);
8768 case Intrinsic::amdgcn_readlane:
8769 case Intrinsic::amdgcn_readfirstlane:
8770 case Intrinsic::amdgcn_writelane:
8771 case Intrinsic::amdgcn_permlane16:
8772 case Intrinsic::amdgcn_permlanex16:
8773 case Intrinsic::amdgcn_permlane64:
8778 return lowerImage(
Op, ImageDimIntr, DAG,
false);
8789 return DAG.
getRegister(AMDGPU::SGPR_NULL, MVT::i32);
                                                 unsigned NewOpcode) const {
  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
  auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
  auto *M = cast<MemSDNode>(Op);
                                 M->getMemOperand());
                                                 unsigned NewOpcode) const {
  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
  auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
  auto *M = cast<MemSDNode>(Op);
                                 M->getMemOperand());
  unsigned IntrID = Op.getConstantOperandVal(1);
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    unsigned IndexOperand = M->getConstantOperandVal(7);
    unsigned WaveRelease = M->getConstantOperandVal(8);
    unsigned WaveDone = M->getConstantOperandVal(9);
    unsigned OrderedCountIndex = IndexOperand & 0x3f;
    IndexOperand &= ~0x3f;
    unsigned CountDw = 0;
      CountDw = (IndexOperand >> 24) & 0xf;
      IndexOperand &= ~(0xf << 24);
      if (CountDw < 1 || CountDw > 4) {
            "ds_ordered_count: dword count must be between 1 and 4");
    if (WaveDone && !WaveRelease)
    unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
    unsigned ShaderType =
    unsigned Offset0 = OrderedCountIndex << 2;
    unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
      Offset1 |= (CountDw - 1) << 6;
      Offset1 |= ShaderType << 2;
    unsigned Offset = Offset0 | (Offset1 << 8);
                                   M->getVTList(), Ops, M->getMemoryVT(),
                                   M->getMemOperand());
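// A standalone sketch of the ds_ordered_count offset packing above. The two
// conditional fields are guarded by target checks that this excerpt does not
// show, so they are applied unconditionally here; names mirror the code:
static unsigned encodeDSOrderedOffset(unsigned OrderedCountIndex,
                                      unsigned WaveRelease, unsigned WaveDone,
                                      unsigned Instruction, unsigned ShaderType,
                                      unsigned CountDw) {
  unsigned Offset0 = OrderedCountIndex << 2;                 // low byte
  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
  Offset1 |= (CountDw - 1) << 6;                             // dword count
  Offset1 |= ShaderType << 2;                                // shader type
  return Offset0 | (Offset1 << 8);                           // 16-bit offset
}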
8909 case Intrinsic::amdgcn_raw_buffer_load:
8910 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8911 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8912 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8913 case Intrinsic::amdgcn_raw_buffer_load_format:
8914 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
8915 const bool IsFormat =
8916 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
8917 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
8919 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
8920 auto Offsets = splitBufferOffsets(
Op.getOperand(3), DAG);
8933 auto *
M = cast<MemSDNode>(
Op);
8934 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
8936 case Intrinsic::amdgcn_struct_buffer_load:
8937 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8938 case Intrinsic::amdgcn_struct_buffer_load_format:
8939 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
8940 case Intrinsic::amdgcn_struct_atomic_buffer_load:
8941 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
8942 const bool IsFormat =
8943 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
8944 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
8946 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
8947 auto Offsets = splitBufferOffsets(
Op.getOperand(4), DAG);
8960 return lowerIntrinsicLoad(cast<MemSDNode>(
Op), IsFormat, DAG, Ops);
8962 case Intrinsic::amdgcn_raw_tbuffer_load:
8963 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
8965 EVT LoadVT =
Op.getValueType();
8966 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
8967 auto Offsets = splitBufferOffsets(
Op.getOperand(3), DAG);
8986 Op->getVTList(), Ops, LoadVT,
M->getMemOperand(),
8989 case Intrinsic::amdgcn_struct_tbuffer_load:
8990 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
8992 EVT LoadVT =
Op.getValueType();
8993 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
8994 auto Offsets = splitBufferOffsets(
Op.getOperand(4), DAG);
9013 Op->getVTList(), Ops, LoadVT,
M->getMemOperand(),
9016 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9017 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9019 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9020 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9022 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9023 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9025 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9026 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9028 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9029 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9031 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9032 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9034 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9035 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9037 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9038 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9040 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9041 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9043 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9044 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9046 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9047 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9049 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9050 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9052 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9053 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9055 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9056 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9058 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9059 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9061 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9062 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9064 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9065 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9067 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9068 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9070 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9071 return lowerRawBufferAtomicIntrin(
Op, DAG,
9073 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9074 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9075 return lowerStructBufferAtomicIntrin(
Op, DAG,
9077 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9078 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9080 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9081 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9083 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9084 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9085 return lowerStructBufferAtomicIntrin(
Op, DAG,
9087 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9088 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9089 return lowerStructBufferAtomicIntrin(
Op, DAG,
9091 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9092 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9093 return lowerStructBufferAtomicIntrin(
Op, DAG,
9095 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9096 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9097 return lowerStructBufferAtomicIntrin(
Op, DAG,
9099 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9100 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9102 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9103 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9105 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9106 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9108 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9109 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9111 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9112 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9114 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9115 return lowerStructBufferAtomicIntrin(
Op, DAG,
9118 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9119 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9120 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(4), DAG);
9121 auto Offsets = splitBufferOffsets(
Op.getOperand(5), DAG);
9135 EVT VT =
Op.getValueType();
9136 auto *
M = cast<MemSDNode>(
Op);
9139 Op->getVTList(), Ops, VT,
M->getMemOperand());
9141 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9142 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9143 SDValue Rsrc = bufferRsrcPtrToVector(
Op->getOperand(4), DAG);
9144 auto Offsets = splitBufferOffsets(
Op.getOperand(6), DAG);
9158 EVT VT =
Op.getValueType();
9159 auto *
M = cast<MemSDNode>(
Op);
9162 Op->getVTList(), Ops, VT,
M->getMemOperand());
9164 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9166 SDValue NodePtr =
M->getOperand(2);
9167 SDValue RayExtent =
M->getOperand(3);
9168 SDValue RayOrigin =
M->getOperand(4);
9170 SDValue RayInvDir =
M->getOperand(6);
9188 const unsigned NumVDataDwords = 4;
9189 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9190 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9194 const unsigned BaseOpcodes[2][2] = {
9195 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9196 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9197 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9201 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9202 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9203 : AMDGPU::MIMGEncGfx10NSA,
9204 NumVDataDwords, NumVAddrDwords);
9208 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9209 : AMDGPU::MIMGEncGfx10Default,
9210 NumVDataDwords, NumVAddrDwords);
  auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
    if (Lanes[0].getValueSizeInBits() == 32) {
      for (unsigned I = 0; I < 3; ++I)
                                 { Lanes[0], Lanes[1] })));
                                 { Elt0, Lanes[0] })));
                                 { Lanes[1], Lanes[2] })));
  if (UseNSA && IsGFX11Plus) {
    for (unsigned I = 0; I < 3; ++I) {
                               {DirLanes[I], InvDirLanes[I]})));
    packLanes(RayOrigin, true);
    packLanes(RayDir, true);
    packLanes(RayInvDir, false);
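// A standalone sketch of the address-size computation visible earlier in this
// case: with 16-bit (A16) addresses the ray origin/dir/inv_dir components are
// packed two per dword, shrinking the VADDR footprint, and a 64-bit BVH node
// pointer adds one extra dword. Constants taken from the code above.
static unsigned bvhNumVAddrDwords(bool IsA16, bool Is64) {
  return IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
}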
9276 if (NumVAddrDwords > 12) {
9296 case Intrinsic::amdgcn_global_atomic_fmin:
9297 case Intrinsic::amdgcn_global_atomic_fmax:
9298 case Intrinsic::amdgcn_global_atomic_fmin_num:
9299 case Intrinsic::amdgcn_global_atomic_fmax_num:
9300 case Intrinsic::amdgcn_flat_atomic_fmin:
9301 case Intrinsic::amdgcn_flat_atomic_fmax:
9302 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9303 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9310 unsigned Opcode = 0;
9312 case Intrinsic::amdgcn_global_atomic_fmin:
9313 case Intrinsic::amdgcn_global_atomic_fmin_num:
9314 case Intrinsic::amdgcn_flat_atomic_fmin:
9315 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9319 case Intrinsic::amdgcn_global_atomic_fmax:
9320 case Intrinsic::amdgcn_global_atomic_fmax_num:
9321 case Intrinsic::amdgcn_flat_atomic_fmax:
9322 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9330 Ops,
M->getMemOperand());
9332 case Intrinsic::amdgcn_s_get_barrier_state: {
9336 bool IsInlinableBarID =
false;
9339 if (isa<ConstantSDNode>(
Op->getOperand(2))) {
9340 BarID = cast<ConstantSDNode>(
Op->getOperand(2))->getSExtValue();
9344 if (IsInlinableBarID) {
9345 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9349 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9361 return lowerImage(
Op, ImageDimIntr, DAG,
true);
9369SDValue SITargetLowering::getMemIntrinsicNode(
unsigned Opcode,
const SDLoc &
DL,
9379 bool IsTFE = VTList.
NumVTs == 3;
9382 unsigned NumOpDWords = NumValueDWords + 1;
9387 SDValue Op = getMemIntrinsicNode(Opcode,
DL, OpDWordsVTList, Ops,
9388 OpDWordsVT, OpDWordsMMO, DAG);
9403 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9409 WidenedMemVT, WidenedMMO);
9419 bool ImageStore)
const {
9454 for (
unsigned I = 0;
I < Elts.
size() / 2;
I += 1) {
9460 if ((NumElements % 2) == 1) {
9462 unsigned I = Elts.
size() / 2;
9478 if (NumElements == 3) {
9499 unsigned IntrinsicID =
Op.getConstantOperandVal(1);
9502 switch (IntrinsicID) {
9503 case Intrinsic::amdgcn_exp_compr: {
9507 "intrinsic not supported on subtarget",
DL.getDebugLoc());
9530 unsigned Opc =
Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9533 case Intrinsic::amdgcn_s_barrier: {
9536 unsigned WGSize =
ST.getFlatWorkGroupSizes(MF.
getFunction()).second;
9537 if (WGSize <=
ST.getWavefrontSize())
9539 Op.getOperand(0)), 0);
9543 if (
ST.hasSplitBarriers()) {
9548 MVT::Other, K,
Op.getOperand(0)),
9560 case Intrinsic::amdgcn_struct_tbuffer_store:
9561 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9565 VData = handleD16VData(VData, DAG);
9566 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9567 auto Offsets = splitBufferOffsets(
Op.getOperand(5), DAG);
9585 M->getMemoryVT(),
M->getMemOperand());
9588 case Intrinsic::amdgcn_raw_tbuffer_store:
9589 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9593 VData = handleD16VData(VData, DAG);
9594 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9595 auto Offsets = splitBufferOffsets(
Op.getOperand(4), DAG);
9613 M->getMemoryVT(),
M->getMemOperand());
9616 case Intrinsic::amdgcn_raw_buffer_store:
9617 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9618 case Intrinsic::amdgcn_raw_buffer_store_format:
9619 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9620 const bool IsFormat =
9621 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9622 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9629 VData = handleD16VData(VData, DAG);
9639 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9640 auto Offsets = splitBufferOffsets(
Op.getOperand(4), DAG);
9660 return handleByteShortBufferStores(DAG, VDataVT,
DL, Ops, M);
9663 M->getMemoryVT(),
M->getMemOperand());
9666 case Intrinsic::amdgcn_struct_buffer_store:
9667 case Intrinsic::amdgcn_struct_ptr_buffer_store:
9668 case Intrinsic::amdgcn_struct_buffer_store_format:
9669 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9670 const bool IsFormat =
9671 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9672 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9680 VData = handleD16VData(VData, DAG);
9690 auto Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9691 auto Offsets = splitBufferOffsets(
Op.getOperand(5), DAG);
9712 return handleByteShortBufferStores(DAG, VDataType,
DL, Ops, M);
9715 M->getMemoryVT(),
M->getMemOperand());
9717 case Intrinsic::amdgcn_raw_buffer_load_lds:
9718 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9719 case Intrinsic::amdgcn_struct_buffer_load_lds:
9720 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9724 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9725 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9726 unsigned OpOffset = HasVIndex ? 1 : 0;
9727 SDValue VOffset =
Op.getOperand(5 + OpOffset);
9729 unsigned Size =
Op->getConstantOperandVal(4);
9735 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9736 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9737 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9738 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9741 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9742 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9743 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9744 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9747 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9748 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9749 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9750 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
9758 if (HasVIndex && HasVOffset)
9764 else if (HasVOffset)
9767 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9771 unsigned Aux =
Op.getConstantOperandVal(8 + OpOffset);
9779 auto *
M = cast<MemSDNode>(
Op);
9806 case Intrinsic::amdgcn_global_load_lds: {
9808 unsigned Size =
Op->getConstantOperandVal(4);
9813 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
9816 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
9819 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
9823 auto *
M = cast<MemSDNode>(
Op);
9836 if (
LHS->isDivergent())
9840 RHS.getOperand(0).getValueType() == MVT::i32) {
9843 VOffset =
RHS.getOperand(0);
9848 if (!
Addr->isDivergent()) {
9864 LoadPtrI.
Offset =
Op->getConstantOperandVal(5);
9884 case Intrinsic::amdgcn_end_cf:
9886 Op->getOperand(2), Chain), 0);
9887 case Intrinsic::amdgcn_s_barrier_init:
9888 case Intrinsic::amdgcn_s_barrier_join:
9889 case Intrinsic::amdgcn_s_wakeup_barrier: {
9894 bool IsInlinableBarID =
false;
9897 if (isa<ConstantSDNode>(BarOp)) {
9898 BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue();
9902 if (IsInlinableBarID) {
9903 switch (IntrinsicID) {
9906 case Intrinsic::amdgcn_s_barrier_init:
9907 Opc = AMDGPU::S_BARRIER_INIT_IMM;
9909 case Intrinsic::amdgcn_s_barrier_join:
9910 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
9912 case Intrinsic::amdgcn_s_wakeup_barrier:
9913 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
9920 switch (IntrinsicID) {
9923 case Intrinsic::amdgcn_s_barrier_init:
9924 Opc = AMDGPU::S_BARRIER_INIT_M0;
9926 case Intrinsic::amdgcn_s_barrier_join:
9927 Opc = AMDGPU::S_BARRIER_JOIN_M0;
9929 case Intrinsic::amdgcn_s_wakeup_barrier:
9930 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
9935 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) {
9941 if (!IsInlinableBarID) {
9946 Op.getOperand(2), M0Val),
9950 }
else if (!IsInlinableBarID) {
9960 return lowerImage(
Op, ImageDimIntr, DAG,
true);
std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
  if ((C1 = dyn_cast<ConstantSDNode>(N0)))
    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      Overflow += ImmOffset;
    auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
    SDValue Ops[] = { N0, OverflowVal };
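// A standalone sketch of the split above, assuming MaxImm is an all-ones mask
// of the bits the MUBUF immediate-offset field can hold; the signed-overflow
// fix-up from the code above is omitted:
static void splitImmOffset(unsigned ImmOffset, unsigned MaxImm,
                           unsigned &KeptImm, unsigned &Overflow) {
  Overflow = ImmOffset & ~MaxImm; // bits that do not fit the immediate field
  KeptImm = ImmOffset - Overflow; // remainder still encodable as an immediate
}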
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                        Align Alignment) const {
  if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
    if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
    int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
        TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
    return MaybePointer;
10081 SDValue NumRecords =
Op->getOperand(3);
10084 auto [LowHalf, HighHalf] = DAG.
SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
10087 std::optional<uint32_t> ConstStride = std::nullopt;
10088 if (
auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
10089 ConstStride = ConstNode->getZExtValue();
10092 if (!ConstStride || *ConstStride != 0) {
10095 ShiftedStride = DAG.
getConstant(*ConstStride << 16, Loc, MVT::i32);
10106 NewHighHalf, NumRecords, Flags);
10116 bool IsTFE)
const {
10126 SDValue Op = getMemIntrinsicNode(Opc,
DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
10153 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
10157 Ops[1] = BufferStoreExt;
10162 M->getMemOperand());
10187SDValue SITargetLowering::widenLoad(
LoadSDNode *Ld, DAGCombinerInfo &DCI)
const {
10203 if ((MemVT.
isSimple() && !DCI.isAfterLegalizeDAG()) ||
10210 "unexpected vector extload");
10223 "unexpected fp extload");
10241 DCI.AddToWorklist(Cvt.
getNode());
10246 DCI.AddToWorklist(Cvt.
getNode());
10257 if (
Info.isEntryFunction())
10258 return Info.getUserSGPRInfo().hasFlatScratchInit();
10266 EVT MemVT =
Load->getMemoryVT();
10279 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10282 BasePtr, RealMemVT, MMO);
10312 assert(
Op.getValueType().getVectorElementType() == MVT::i32 &&
10313 "Custom lowering for non-i32 vectors hasn't been implemented.");
10316 unsigned AS =
Load->getAddressSpace();
10335 if (!
Op->isDivergent() && Alignment >=
Align(4) && NumElements < 32) {
10352 Alignment >=
Align(4) && NumElements < 32) {
10367 if (NumElements > 4)
10387 if (NumElements > 2)
10392 if (NumElements > 4)
10404 auto Flags =
Load->getMemOperand()->getFlags();
10406 Load->getAlign(), Flags, &
Fast) &&
10415 MemVT, *
Load->getMemOperand())) {
10425 EVT VT =
Op.getValueType();
10462 EVT VT =
Op.getValueType();
10465 bool AllowInaccurateRcp =
Flags.hasApproximateFuncs() ||
10472 if (!AllowInaccurateRcp && VT != MVT::f16)
10475 if (CLHS->isExactlyValue(1.0)) {
10492 if (CLHS->isExactlyValue(-1.0)) {
10501 if (!AllowInaccurateRcp && (VT != MVT::f16 || !
Flags.hasAllowReciprocal()))
10515 EVT VT =
Op.getValueType();
10518 bool AllowInaccurateDiv =
Flags.hasApproximateFuncs() ||
10520 if (!AllowInaccurateDiv)
10541 return DAG.
getNode(Opcode, SL, VT,
A,
B, Flags);
10554 return DAG.
getNode(Opcode, SL, VTList,
10563 return DAG.
getNode(Opcode, SL, VT, {
A,
B,
C}, Flags);
10576 return DAG.
getNode(Opcode, SL, VTList,
10582 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
10583 return FastLowered;
10610 const APFloat K0Val(0x1p+96f);
10613 const APFloat K1Val(0x1p-32f);
10640 assert(ST->hasDenormModeInst() &&
"Requires S_DENORM_MODE");
10641 uint32_t DPDenormModeDefault =
Info->getMode().fpDenormModeDPValue();
10642 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
10647 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
10648 return FastLowered;
10655 Flags.setNoFPExcept(
true);
10672 DenominatorScaled, Flags);
10674 DenominatorScaled, Flags);
10676 using namespace AMDGPU::Hwreg;
10677 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
10685 const bool HasDynamicDenormals =
10691 if (!PreservesDenormals) {
10699 if (HasDynamicDenormals) {
10703 SavedDenormMode =
SDValue(GetReg, 0);
10711 const SDValue EnableDenormValue =
10720 EnableDenorm = DAG.
getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
10721 {EnableDenormValue,
BitField, Glue});
10734 ApproxRcp, One, NegDivScale0, Flags);
10737 ApproxRcp, Fma0, Flags);
10740 Fma1, Fma1, Flags);
10743 NumeratorScaled,
Mul, Flags);
10746 Fma2, Fma1,
Mul, Fma2, Flags);
10749 NumeratorScaled, Fma3, Flags);
10751 if (!PreservesDenormals) {
10758 Fma4.
getValue(1), DisableDenormValue,
10761 assert(HasDynamicDenormals == (
bool)SavedDenormMode);
10762 const SDValue DisableDenormValue =
10763 HasDynamicDenormals
10768 AMDGPU::S_SETREG_B32, SL, MVT::Other,
10779 {Fma4, Fma1, Fma3, Scale},
Flags);
  if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
    return FastLowered;
                             NegDivScale0, Mul, DivScale1);
                             Fma4, Fma3, Mul, Scale);
  EVT VT = Op.getValueType();
  if (VT == MVT::f32)
    return LowerFDIV32(Op, DAG);
  if (VT == MVT::f64)
    return LowerFDIV64(Op, DAG);
  if (VT == MVT::f16)
    return LowerFDIV16(Op, DAG);
10870 EVT ResultExpVT =
Op->getValueType(1);
10871 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
10901 if (VT == MVT::i1) {
10904 Store->getBasePtr(), MVT::i1,
Store->getMemOperand());
10908 Store->getValue().getValueType().getScalarType() == MVT::i32);
10910 unsigned AS =
Store->getAddressSpace();
10929 if (NumElements > 4)
10936 VT, *
Store->getMemOperand()))
10946 if (NumElements > 2)
10950 if (NumElements > 4 ||
10959 auto Flags =
Store->getMemOperand()->getFlags();
10994 MVT VT =
Op.getValueType().getSimpleVT();
11163 EVT VT =
Op.getValueType();
11180 switch (
Op.getOpcode()) {
11206 EVT VT =
Op.getValueType();
11222 DAGCombinerInfo &DCI)
const {
11223 EVT VT =
N->getValueType(0);
11225 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11232 EVT SrcVT = Src.getValueType();
11238 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11241 DCI.AddToWorklist(Cvt.
getNode());
11244 if (ScalarVT != MVT::f32) {
11256 DAGCombinerInfo &DCI)
const {
11257 SDValue MagnitudeOp =
N->getOperand(0);
11258 SDValue SignOp =
N->getOperand(1);
11316 unsigned AddrSpace,
11318 DAGCombinerInfo &DCI)
const {
11348 AM.HasBaseReg =
true;
11349 AM.BaseOffs =
Offset.getSExtValue();
11354 EVT VT =
N->getValueType(0);
11360 Flags.setNoUnsignedWrap(
N->getFlags().hasNoUnsignedWrap() &&
11371 switch (
N->getOpcode()) {
11382 DAGCombinerInfo &DCI)
const {
11391 SDValue NewPtr = performSHLPtrCombine(
Ptr.getNode(),
N->getAddressSpace(),
11392 N->getMemoryVT(), DCI);
11396 NewOps[PtrIdx] = NewPtr;
11405 return (Opc ==
ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11406 (Opc ==
ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11415SDValue SITargetLowering::splitBinaryBitConstantOp(
11416 DAGCombinerInfo &DCI,
  if (V.getValueType() != MVT::i1)
  switch (V.getOpcode()) {
  if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
  if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
  if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
  if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
  uint32_t NonZeroByteMask = ~ZeroByteMask;
  if ((NonZeroByteMask & C) != NonZeroByteMask)
  assert(V.getValueSizeInBits() == 32);
  if (V.getNumOperands() != 2)
  switch (V.getOpcode()) {
    return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
    return (0x03020100 & ~ConstMask) | ConstMask;
    return uint32_t((0x030201000c0c0c0cull << C) >> 32);
    return uint32_t(0x0c0c0c0c03020100ull >> C);
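// A standalone sketch of the selector derivation above. In a v_perm_b32
// selector, byte values 0..3 pick the corresponding byte of the source and
// 0x0c produces zero; 0x03020100 is the identity selector. ConstByteMask is
// assumed to have each byte equal to 0x00 or 0xff, as validated above.
static unsigned permMaskForAnd(unsigned ConstByteMask) {
  // Bytes kept by the AND pass through; masked-out bytes become zero (0x0c).
  return (0x03020100 & ConstByteMask) | (0x0c0c0c0c & ~ConstByteMask);
}
static unsigned permMaskForShl(unsigned BitShift /* byte-aligned */) {
  // A byte-aligned left shift slides the identity selector up and fills the
  // vacated low selector bytes with the zero selector.
  return (unsigned)((0x030201000c0c0c0cULL << BitShift) >> 32);
}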
11518 DAGCombinerInfo &DCI)
const {
11519 if (DCI.isBeforeLegalize())
11523 EVT VT =
N->getValueType(0);
11529 if (VT == MVT::i64 && CRHS) {
11535 if (CRHS && VT == MVT::i32) {
11544 if (
auto *CShift = dyn_cast<ConstantSDNode>(
LHS->getOperand(1))) {
11545 unsigned Shift = CShift->getZExtValue();
11547 unsigned Offset = NB + Shift;
11548 if ((
Offset & (Bits - 1)) == 0) {
11551 LHS->getOperand(0),
11566 isa<ConstantSDNode>(
LHS.getOperand(2))) {
11572 Sel = (
LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
11587 if (
Y.getOpcode() !=
ISD::FABS ||
Y.getOperand(0) !=
X ||
11592 if (
X !=
LHS.getOperand(1))
11630 (
RHS.getOperand(0) ==
LHS.getOperand(0) &&
11631 LHS.getOperand(0) ==
LHS.getOperand(1))) {
11634 Mask->getZExtValue() & ~OrdMask :
11635 Mask->getZExtValue() & OrdMask;
  if (VT == MVT::i32 &&
      N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
    if (LHSMask != ~0u && RHSMask != ~0u) {
      if (LHSMask > RHSMask) {
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        for (unsigned I = 0; I < 32; I += 8) {
          if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
            Mask &= (0x0c << I) & 0xffffffff;
                           LHS.getOperand(0), RHS.getOperand(0),
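// A standalone sketch of the "used lanes" test above. Selector byte 0x0c means
// "constant zero"; any other value (0..3 here) reads a source byte. Marking
// used selector bytes with 0x0c0c0c0c lets two selectors be merged only when
// they never populate the same destination byte. The extra special-case
// exclusion in the code above is omitted.
static bool permSelectorsAreDisjoint(unsigned LHSMask, unsigned RHSMask) {
  unsigned LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
  unsigned RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
  return (LHSUsedLanes & RHSUsedLanes) == 0;
}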
11744static const std::optional<ByteProvider<SDValue>>
11746 unsigned Depth = 0) {
11749 return std::nullopt;
11751 if (
Op.getValueSizeInBits() < 8)
11752 return std::nullopt;
11754 if (
Op.getValueType().isVector())
11757 switch (
Op->getOpcode()) {
11768 auto *VTSign = cast<VTSDNode>(
Op->getOperand(1));
11769 NarrowVT = VTSign->getVT();
11772 return std::nullopt;
11775 if (SrcIndex >= NarrowByteWidth)
11776 return std::nullopt;
11782 auto ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
11784 return std::nullopt;
11786 uint64_t BitShift = ShiftOp->getZExtValue();
11788 if (BitShift % 8 != 0)
11789 return std::nullopt;
11791 SrcIndex += BitShift / 8;
11809static const std::optional<ByteProvider<SDValue>>
11811 unsigned StartingIndex = 0) {
11815 return std::nullopt;
11817 unsigned BitWidth =
Op.getScalarValueSizeInBits();
11819 return std::nullopt;
11821 return std::nullopt;
11823 bool IsVec =
Op.getValueType().isVector();
11824 switch (
Op.getOpcode()) {
11827 return std::nullopt;
11832 return std::nullopt;
11836 return std::nullopt;
11839 if (!
LHS->isConstantZero() && !
RHS->isConstantZero())
11840 return std::nullopt;
11841 if (!
LHS ||
LHS->isConstantZero())
11843 if (!
RHS ||
RHS->isConstantZero())
11845 return std::nullopt;
11850 return std::nullopt;
11852 auto BitMaskOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
11854 return std::nullopt;
11856 uint32_t BitMask = BitMaskOp->getZExtValue();
11860 if ((IndexMask & BitMask) != IndexMask) {
11863 if (IndexMask & BitMask)
11864 return std::nullopt;
11873 return std::nullopt;
11876 auto ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(2));
11877 if (!ShiftOp ||
Op.getValueType().isVector())
11878 return std::nullopt;
11880 uint64_t BitsProvided =
Op.getValueSizeInBits();
11881 if (BitsProvided % 8 != 0)
11882 return std::nullopt;
11884 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
11886 return std::nullopt;
11888 uint64_t ConcatSizeInBytes = BitsProvided / 4;
11889 uint64_t ByteShift = BitShift / 8;
11891 uint64_t NewIndex = (
Index + ByteShift) % ConcatSizeInBytes;
11892 uint64_t BytesProvided = BitsProvided / 8;
11893 SDValue NextOp =
Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
11894 NewIndex %= BytesProvided;
11901 return std::nullopt;
11903 auto ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
11905 return std::nullopt;
11907 uint64_t BitShift = ShiftOp->getZExtValue();
11909 return std::nullopt;
11911 auto BitsProvided =
Op.getScalarValueSizeInBits();
11912 if (BitsProvided % 8 != 0)
11913 return std::nullopt;
11915 uint64_t BytesProvided = BitsProvided / 8;
11916 uint64_t ByteShift = BitShift / 8;
11921 return BytesProvided - ByteShift >
Index
11929 return std::nullopt;
11931 auto ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
11933 return std::nullopt;
11935 uint64_t BitShift = ShiftOp->getZExtValue();
11936 if (BitShift % 8 != 0)
11937 return std::nullopt;
11938 uint64_t ByteShift = BitShift / 8;
11944 return Index < ByteShift
11947 Depth + 1, StartingIndex);
11956 return std::nullopt;
11963 auto *VTSign = cast<VTSDNode>(
Op->getOperand(1));
11964 NarrowBitWidth = VTSign->getVT().getSizeInBits();
11966 if (NarrowBitWidth % 8 != 0)
11967 return std::nullopt;
11968 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
11970 if (
Index >= NarrowByteWidth)
11972 ? std::optional<ByteProvider<SDValue>>(
11980 return std::nullopt;
11984 if (NarrowByteWidth >=
Index) {
11989 return std::nullopt;
11996 return std::nullopt;
12000 auto L = cast<LoadSDNode>(
Op.getNode());
12002 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
12003 if (NarrowBitWidth % 8 != 0)
12004 return std::nullopt;
12005 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12010 if (
Index >= NarrowByteWidth) {
12012 ? std::optional<ByteProvider<SDValue>>(
12017 if (NarrowByteWidth >
Index) {
12021 return std::nullopt;
12026 return std::nullopt;
12029 Depth + 1, StartingIndex);
12033 auto IdxOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12035 return std::nullopt;
12036 auto VecIdx = IdxOp->getZExtValue();
12037 auto ScalarSize =
Op.getScalarValueSizeInBits();
12038 if (ScalarSize < 32)
12039 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 +
Index;
12041 StartingIndex,
Index);
12046 return std::nullopt;
12048 auto PermMask = dyn_cast<ConstantSDNode>(
Op->getOperand(2));
12050 return std::nullopt;
12053 (PermMask->getZExtValue() & (0xFF << (
Index * 8))) >> (
Index * 8);
12054 if (IdxMask > 0x07 && IdxMask != 0x0c)
12055 return std::nullopt;
12057 auto NextOp =
Op.getOperand(IdxMask > 0x03 ? 0 : 1);
12058 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12060 return IdxMask != 0x0c ?
calculateSrcByte(NextOp, StartingIndex, NextIndex)
12066 return std::nullopt;
  return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
  auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
  auto MemVT = L->getMemoryVT();
  return L->getMemoryVT().getSizeInBits() == 16;
  int Low8 = Mask & 0xff;
  int Hi8 = (Mask & 0xff00) >> 8;
  assert(Low8 < 8 && Hi8 < 8);
  bool IsConsecutive = (Hi8 - Low8 == 1);
  bool Is16Aligned = !(Low8 % 2);
  return IsConsecutive && Is16Aligned;
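// A standalone sketch of the check above, isolated: two selector bytes
// describe one 16-bit half of a source only if they pick consecutive bytes and
// the pair starts on an even (16-bit aligned) byte.
static bool selects16BitHalf(int Mask /* low 16 bits of a perm selector */) {
  int Low8 = Mask & 0xff;
  int Hi8 = (Mask & 0xff00) >> 8;
  bool IsConsecutive = (Hi8 - Low8 == 1);
  bool Is16Aligned = !(Low8 % 2);
  return IsConsecutive && Is16Aligned;
}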
12121 int Low16 = PermMask & 0xffff;
12122 int Hi16 = (PermMask & 0xffff0000) >> 16;
12132 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12134 if (!OtherOpIs16Bit)
12142 unsigned DWordOffset) {
12145 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12147 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12152 if (Src.getValueType().isVector()) {
12153 auto ScalarTySize = Src.getScalarValueSizeInBits();
12154 auto ScalarTy = Src.getValueType().getScalarType();
12155 if (ScalarTySize == 32) {
12159 if (ScalarTySize > 32) {
12162 DAG.
getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12163 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12170 assert(ScalarTySize < 32);
12171 auto NumElements =
TypeSize / ScalarTySize;
12172 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12173 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12174 auto NumElementsIn32 = 32 / ScalarTySize;
12175 auto NumAvailElements = DWordOffset < Trunc32Elements
12177 : NumElements - NormalizedTrunc;
12190 auto ShiftVal = 32 * DWordOffset;
12198 [[maybe_unused]]
EVT VT =
N->getValueType(0);
12203 for (
int i = 0; i < 4; i++) {
12205 std::optional<ByteProvider<SDValue>>
P =
12208 if (!
P ||
P->isConstantZero())
12213 if (PermNodes.
size() != 4)
12216 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12217 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12219 for (
size_t i = 0; i < PermNodes.
size(); i++) {
12220 auto PermOp = PermNodes[i];
12223 int SrcByteAdjust = 4;
12227 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
12228 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12230 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12231 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12235 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12236 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12239 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12241 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12244 SDValue Op = *PermNodes[FirstSrc.first].Src;
12246 assert(
Op.getValueSizeInBits() == 32);
12250 int Low16 = PermMask & 0xffff;
12251 int Hi16 = (PermMask & 0xffff0000) >> 16;
12253 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
12254 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
12257 if (WellFormedLow && WellFormedHi)
12261 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src :
Op;
12270 assert(
Op.getValueType().isByteSized() &&
12288 DAGCombinerInfo &DCI)
const {
12293 EVT VT =
N->getValueType(0);
12294 if (VT == MVT::i1) {
12299 if (Src !=
RHS.getOperand(0))
12304 if (!CLHS || !CRHS)
12308 static const uint32_t MaxMask = 0x3ff;
12322 isa<ConstantSDNode>(
LHS.getOperand(2))) {
12327 Sel |=
LHS.getConstantOperandVal(2);
12336 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12340 auto usesCombinedOperand = [](
SDNode *OrUse) {
12343 !OrUse->getValueType(0).isVector())
12347 for (
auto VUse : OrUse->uses()) {
12348 if (!VUse->getValueType(0).isVector())
12355 if (VUse->getOpcode() == VectorwiseOp)
12361 if (!
any_of(
N->uses(), usesCombinedOperand))
12367 if (LHSMask != ~0u && RHSMask != ~0u) {
12370 if (LHSMask > RHSMask) {
12377 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12378 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12381 if (!(LHSUsedLanes & RHSUsedLanes) &&
12384 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12386 LHSMask &= ~RHSUsedLanes;
12387 RHSMask &= ~LHSUsedLanes;
12389 LHSMask |= LHSUsedLanes & 0x04040404;
12395 LHS.getOperand(0),
RHS.getOperand(0),
12399 if (LHSMask == ~0u || RHSMask == ~0u) {
12405 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12420 if (SrcVT == MVT::i32) {
12426 DCI.AddToWorklist(LowOr.
getNode());
12427 DCI.AddToWorklist(HiBits.
getNode());
12435 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(
N->getOperand(1));
12439 N->getOperand(0), CRHS))
12447 DAGCombinerInfo &DCI)
const {
12448 if (
SDValue RV = reassociateScalarOps(
N, DCI.DAG))
12457 EVT VT =
N->getValueType(0);
12458 if (CRHS && VT == MVT::i64) {
12480 LHS->getOperand(0), FNegLHS, FNegRHS);
12489 DAGCombinerInfo &DCI)
const {
12494 EVT VT =
N->getValueType(0);
12495 if (VT != MVT::i32)
12499 if (Src.getValueType() != MVT::i16)
12506SITargetLowering::performSignExtendInRegCombine(
SDNode *
N,
12507 DAGCombinerInfo &DCI)
const {
12509 auto *VTSign = cast<VTSDNode>(
N->getOperand(1));
12514 VTSign->getVT() == MVT::i8) ||
12516 VTSign->getVT() == MVT::i16))) {
12518 "s_buffer_load_{u8, i8} are supported "
12519 "in GFX12 (or newer) architectures.");
12520 EVT VT = Src.getValueType();
12525 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12531 auto *
M = cast<MemSDNode>(Src);
12532 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12533 Opc,
DL, ResList, Ops,
M->getMemoryVT(),
M->getMemOperand());
12538 VTSign->getVT() == MVT::i8) ||
12540 VTSign->getVT() == MVT::i16)) &&
12542 auto *
M = cast<MemSDNode>(Src);
12554 SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
12555 Src.getOperand(0).getValueType());
12558 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc,
SDLoc(
N),
12560 Ops,
M->getMemoryVT(),
12561 M->getMemOperand());
12562 return DCI.DAG.getMergeValues({BufferLoadSignExt,
12569 DAGCombinerInfo &DCI)
const {
12577 if (
N->getOperand(0).isUndef())
12584 DAGCombinerInfo &DCI)
const {
12585 EVT VT =
N->getValueType(0);
12611 unsigned Opcode =
Op.getOpcode();
12615 if (
auto *CFP = dyn_cast<ConstantFPSDNode>(
Op)) {
12616 const auto &
F = CFP->getValueAPF();
12617 if (
F.isNaN() &&
F.isSignaling())
12619 if (!
F.isDenormal())
12682 if (
Op.getValueType() == MVT::i32) {
12687 if (
auto *
RHS = dyn_cast<ConstantSDNode>(
Op.getOperand(1))) {
12688 if (
RHS->getZExtValue() == 0xffff0000) {
12698 return Op.getValueType().getScalarType() != MVT::f16;
12766 if (
Op.getValueType() == MVT::i16) {
12777 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
12779 switch (IntrinsicID) {
12780 case Intrinsic::amdgcn_cvt_pkrtz:
12781 case Intrinsic::amdgcn_cubeid:
12782 case Intrinsic::amdgcn_frexp_mant:
12783 case Intrinsic::amdgcn_fdot2:
12784 case Intrinsic::amdgcn_rcp:
12785 case Intrinsic::amdgcn_rsq:
12786 case Intrinsic::amdgcn_rsq_clamp:
12787 case Intrinsic::amdgcn_rcp_legacy:
12788 case Intrinsic::amdgcn_rsq_legacy:
12789 case Intrinsic::amdgcn_trig_preop:
12790 case Intrinsic::amdgcn_log:
12791 case Intrinsic::amdgcn_exp2:
12792 case Intrinsic::amdgcn_sqrt:
12813 unsigned Opcode =
MI->getOpcode();
12815 if (Opcode == AMDGPU::G_FCANONICALIZE)
12818 std::optional<FPValueAndVReg> FCR;
12821 if (FCR->Value.isSignaling())
12823 if (!FCR->Value.isDenormal())
12834 case AMDGPU::G_FADD:
12835 case AMDGPU::G_FSUB:
12836 case AMDGPU::G_FMUL:
12837 case AMDGPU::G_FCEIL:
12838 case AMDGPU::G_FFLOOR:
12839 case AMDGPU::G_FRINT:
12840 case AMDGPU::G_FNEARBYINT:
12841 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
12842 case AMDGPU::G_INTRINSIC_TRUNC:
12843 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
12844 case AMDGPU::G_FMA:
12845 case AMDGPU::G_FMAD:
12846 case AMDGPU::G_FSQRT:
12847 case AMDGPU::G_FDIV:
12848 case AMDGPU::G_FREM:
12849 case AMDGPU::G_FPOW:
12850 case AMDGPU::G_FPEXT:
12851 case AMDGPU::G_FLOG:
12852 case AMDGPU::G_FLOG2:
12853 case AMDGPU::G_FLOG10:
12854 case AMDGPU::G_FPTRUNC:
12855 case AMDGPU::G_AMDGPU_RCP_IFLAG:
12856 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
12857 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
12858 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
12859 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
12861 case AMDGPU::G_FNEG:
12862 case AMDGPU::G_FABS:
12863 case AMDGPU::G_FCOPYSIGN:
12865 case AMDGPU::G_FMINNUM:
12866 case AMDGPU::G_FMAXNUM:
12867 case AMDGPU::G_FMINNUM_IEEE:
12868 case AMDGPU::G_FMAXNUM_IEEE:
12869 case AMDGPU::G_FMINIMUM:
12870 case AMDGPU::G_FMAXIMUM: {
12878 case AMDGPU::G_BUILD_VECTOR:
12883 case AMDGPU::G_INTRINSIC:
12884 case AMDGPU::G_INTRINSIC_CONVERGENT:
12886 case Intrinsic::amdgcn_fmul_legacy:
12887 case Intrinsic::amdgcn_fmad_ftz:
12888 case Intrinsic::amdgcn_sqrt:
12889 case Intrinsic::amdgcn_fmed3:
12890 case Intrinsic::amdgcn_sin:
12891 case Intrinsic::amdgcn_cos:
12892 case Intrinsic::amdgcn_log:
12893 case Intrinsic::amdgcn_exp2:
12894 case Intrinsic::amdgcn_log_clamp:
12895 case Intrinsic::amdgcn_rcp:
12896 case Intrinsic::amdgcn_rcp_legacy:
12897 case Intrinsic::amdgcn_rsq:
12898 case Intrinsic::amdgcn_rsq_clamp:
12899 case Intrinsic::amdgcn_rsq_legacy:
12900 case Intrinsic::amdgcn_div_scale:
12901 case Intrinsic::amdgcn_div_fmas:
12902 case Intrinsic::amdgcn_div_fixup:
12903 case Intrinsic::amdgcn_fract:
12904 case Intrinsic::amdgcn_cvt_pkrtz:
12905 case Intrinsic::amdgcn_cubeid:
12906 case Intrinsic::amdgcn_cubema:
12907 case Intrinsic::amdgcn_cubesc:
12908 case Intrinsic::amdgcn_cubetc:
12909 case Intrinsic::amdgcn_frexp_mant:
12910 case Intrinsic::amdgcn_fdot2:
12911 case Intrinsic::amdgcn_trig_preop:
12926SDValue SITargetLowering::getCanonicalConstantFP(
12929 if (
C.isDenormal()) {
12943 if (
C.isSignaling()) {
12962 return Op.isUndef() || isa<ConstantFPSDNode>(
Op);
12965SDValue SITargetLowering::performFCanonicalizeCombine(
12967 DAGCombinerInfo &DCI)
const {
12970 EVT VT =
N->getValueType(0);
12979 EVT VT =
N->getValueType(0);
12980 return getCanonicalConstantFP(DAG,
SDLoc(
N), VT, CFP->getValueAPF());
12996 EVT EltVT =
Lo.getValueType();
12999 for (
unsigned I = 0;
I != 2; ++
I) {
13002 NewElts[
I] = getCanonicalConstantFP(DAG, SL, EltVT,
13003 CFP->getValueAPF());
13004 }
else if (
Op.isUndef()) {
13016 if (isa<ConstantFPSDNode>(NewElts[1]))
13017 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
13022 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
13073 if (!MinK || !MaxK)
13086 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->
hasMed3_16()))
13087 return DAG.
getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
13129 if (
Info->getMode().DX10Clamp) {
13138 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->
hasMed3_16())) {
13170 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.
hasMin3Max3_16());
13173 return (VT == MVT::f32 || VT == MVT::f16) && Subtarget.
hasIEEEMinMax3();
13178 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.
hasMin3Max3_16());
13187 DAGCombinerInfo &DCI)
const {
13190 EVT VT =
N->getValueType(0);
13191 unsigned Opc =
N->getOpcode();
13205 N->getValueType(0),
13218 N->getValueType(0),
13228 if (
SDValue Med3 = performIntMed3ImmCombine(
13233 if (
SDValue Med3 = performIntMed3ImmCombine(
13239 if (
SDValue Med3 = performIntMed3ImmCombine(
13244 if (
SDValue Med3 = performIntMed3ImmCombine(
13254 (VT == MVT::f32 || VT == MVT::f64 ||
13258 if (
SDValue Res = performFPMed3ImmCombine(DAG,
SDLoc(
N), Op0, Op1))
13269 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13270 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
13279 DAGCombinerInfo &DCI)
const {
13280 EVT VT =
N->getValueType(0);
13303 if (
Info->getMode().DX10Clamp) {
13306 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13309 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
13312 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13323 DAGCombinerInfo &DCI)
const {
13327 return DCI.DAG.getUNDEF(
N->getValueType(0));
                                                  bool IsDivergentIdx,
  unsigned VecSize = EltSize * NumElem;
  if (VecSize <= 64 && EltSize < 32)
  if (IsDivergentIdx)
  unsigned NumInsts = NumElem + ((EltSize + 31) / 32) * NumElem;
    return NumInsts <= 16;
    return NumInsts <= 15;
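// A standalone sketch of the cost model above: expanding a dynamic-index
// extract/insert into a compare+select chain costs roughly one compare per
// element plus one select per 32-bit chunk of each element, and the expansion
// is only considered worthwhile below a small instruction budget (15 or 16 in
// the code above, depending on the target).
static bool worthExpandingDynIndex(unsigned EltSize, unsigned NumElem,
                                   unsigned Budget) {
  unsigned NumInsts = NumElem + ((EltSize + 31) / 32) * NumElem;
  return NumInsts <= Budget;
}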
13371 if (isa<ConstantSDNode>(
Idx))
13384SDValue SITargetLowering::performExtractVectorEltCombine(
13385 SDNode *
N, DAGCombinerInfo &DCI)
const {
13391 EVT ResVT =
N->getValueType(0);
13410 if (Vec.
hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13438 DCI.AddToWorklist(Elt0.
getNode());
13439 DCI.AddToWorklist(Elt1.
getNode());
  if (!DCI.isBeforeLegalize())
  auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
      VecSize > 32 && VecSize % 32 == 0 && Idx) {
    unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
    unsigned EltIdx = BitIndex / 32;
    unsigned LeftoverBitIdx = BitIndex % 32;
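// A standalone sketch of the index math above: when a vector of small (8/16
// bit) elements is reinterpreted as 32-bit words, element Idx lives in word
// EltIdx starting LeftoverBitIdx bits in, so it can be recovered with one
// extract, one shift and one truncate.
static void locateSubDwordElt(unsigned Idx, unsigned VecEltSize,
                              unsigned &EltIdx, unsigned &LeftoverBitIdx) {
  unsigned BitIndex = Idx * VecEltSize;
  EltIdx = BitIndex / 32;
  LeftoverBitIdx = BitIndex % 32;
}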
13478 DCI.AddToWorklist(Cast.
getNode());
13482 DCI.AddToWorklist(Elt.
getNode());
13485 DCI.AddToWorklist(Srl.
getNode());
13489 DCI.AddToWorklist(Trunc.
getNode());
13491 if (VecEltVT == ResVT) {
13503SITargetLowering::performInsertVectorEltCombine(
SDNode *
N,
13504 DAGCombinerInfo &DCI)
const {
13518 EVT IdxVT =
Idx.getValueType();
13535 Src.getOperand(0).getValueType() == MVT::f16) {
13536 return Src.getOperand(0);
13539 if (
auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13540 APFloat Val = CFP->getValueAPF();
13541 bool LosesInfo =
true;
13551 DAGCombinerInfo &DCI)
const {
13553 "combine only useful on gfx8");
13555 SDValue TruncSrc =
N->getOperand(0);
13556 EVT VT =
N->getValueType(0);
13557 if (VT != MVT::f16)
13595unsigned SITargetLowering::getFusedOpcode(
const SelectionDAG &DAG,
13597 const SDNode *N1)
const {
13602 if (((VT == MVT::f32 &&
13604 (VT == MVT::f16 && Subtarget->
hasMadF16() &&
13624 EVT VT =
N->getValueType(0);
13625 if (VT != MVT::i32 && VT != MVT::i64)
13631 unsigned Opc =
N->getOpcode();
13654 return DAG.
getNode(Opc, SL, VT, Add1, Op2);
13676 DAGCombinerInfo &DCI)
const {
13680 EVT VT =
N->getValueType(0);
13690 if (!
N->isDivergent() && Subtarget->
hasSMulHi())
13694 if (NumBits <= 32 || NumBits > 64)
13706 unsigned NumUsers = 0;
13731 bool MulSignedLo =
false;
13732 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
13741 if (VT != MVT::i64) {
13764 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
13766 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
13768 std::tie(AccumLo, AccumHi) = DAG.
SplitScalar(Accum, SL, MVT::i32, MVT::i32);
13770 if (!MulLHSUnsigned32) {
13777 if (!MulRHSUnsigned32) {
13788 if (VT != MVT::i64)
13795static std::optional<ByteProvider<SDValue>>
13798 if (!Byte0 || Byte0->isConstantZero()) {
13799 return std::nullopt;
13802 if (Byte1 && !Byte1->isConstantZero()) {
13803 return std::nullopt;
13809   unsigned FirstCs = First & 0x0c0c0c0c;
13810   unsigned SecondCs = Second & 0x0c0c0c0c;
13811   unsigned FirstNoCs = First & ~0x0c0c0c0c;
13812   unsigned SecondNoCs = Second & ~0x0c0c0c0c;
13812 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
13814 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
13815 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
13816 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
13817 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
13819 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
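The block above merges two V_PERM_B32 byte-select masks in which the byte value 0x0c means "produce a constant zero"; the asserts require that every byte lane is a zero-selector in at least one of the two inputs. A minimal self-contained sketch of the same merge (my own illustration with a hypothetical name, not the in-tree helper):

#include <cassert>
#include <cstdint>
#include <cstdio>

// Merge two byte-permute masks. 0x0c in a byte lane means "emit zero"; any
// other value selects a real source byte. For every lane, at least one of the
// two masks must be the zero-selector, so the merge never has to choose
// between two real bytes.
static uint32_t mergePermMasks(uint32_t First, uint32_t Second) {
  uint32_t FirstCs = First & 0x0c0c0c0c;
  uint32_t SecondCs = Second & 0x0c0c0c0c;
  uint32_t FirstNoCs = First & ~0x0c0c0c0c;
  uint32_t SecondNoCs = Second & ~0x0c0c0c0c;
  assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
  assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
  assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
  assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
  // Keep every real selector byte, and keep 0x0c only where both masks agree
  // the lane is zero.
  return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
}

int main() {
  // 0x0c0c0100 places source bytes 1 and 0 in the low lanes and zeros the rest;
  // 0x07060c0c places source bytes 7 and 6 in the high lanes and zeros the rest.
  std::printf("0x%08x\n", mergePermMasks(0x0c0c0100u, 0x07060c0cu)); // 0x07060100
  return 0;
}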
13843   for (int BPI = 0; BPI < 2; BPI++) {
13846 BPP = {Src1, Src0};
13848 unsigned ZeroMask = 0x0c0c0c0c;
13849 unsigned FMask = 0xFF << (8 * (3 - Step));
13851 unsigned FirstMask =
13852 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13853 unsigned SecondMask =
13854 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13858 int FirstGroup = -1;
13859   for (int I = 0; I < 2; I++) {
13861     auto MatchesFirst = [&BPP](DotSrc &IterElt) {
13862       return IterElt.SrcOp == *BPP.first.Src &&
13863              (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
13873   if (FirstGroup != -1) {
13875     auto MatchesSecond = [&BPP](DotSrc &IterElt) {
13876       return IterElt.SrcOp == *BPP.second.Src &&
13877              (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
13883       Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
13891 unsigned ZeroMask = 0x0c0c0c0c;
13892 unsigned FMask = 0xFF << (8 * (3 - Step));
13896       ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13900       ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13911   if (Srcs.size() == 1) {
13912     auto Elt = Srcs.begin();
13916     if (Elt->PermMask == 0x3020100)
13923   auto FirstElt = Srcs.begin();
13924   auto SecondElt = std::next(FirstElt);
13931 auto FirstMask = FirstElt->PermMask;
13932 auto SecondMask = SecondElt->PermMask;
13934 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
13935 unsigned FirstPlusFour = FirstMask | 0x04040404;
13938 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
13950     FirstElt = std::next(SecondElt);
13951     if (FirstElt == Srcs.end())
13954     SecondElt = std::next(FirstElt);
13957     if (SecondElt == Srcs.end()) {
13963           DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
13969   return Perms.size() == 2
13975   for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
13976 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
13977 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
13978 EntryMask += ZeroMask;
13983   auto Opcode = Op.getOpcode();
13989static std::optional<bool>
14000 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
14003 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
14005 assert(!(S0IsUnsigned && S0IsSigned));
14006 assert(!(S1IsUnsigned && S1IsSigned));
14014 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
14020 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14021 return std::nullopt;
14033 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14034 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14039 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14045 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14046 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14047 return std::nullopt;
14053                                           DAGCombinerInfo &DCI) const {
14055   EVT VT = N->getValueType(0);
14062   if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14067   if (SDValue V = reassociateScalarOps(N, DAG)) {
14074 std::optional<bool> IsSigned;
14080 int ChainLength = 0;
14081     for (int I = 0; I < 4; I++) {
14082       auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
14085       auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14088       auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14093 TempNode->getOperand(MulIdx), *Src0, *Src1,
14094 TempNode->getOperand(MulIdx)->getOperand(0),
14095 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14099 IsSigned = *IterIsSigned;
14100 if (*IterIsSigned != *IsSigned)
14103 auto AddIdx = 1 - MulIdx;
14106       if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14107         Src2s.push_back(TempNode->getOperand(AddIdx));
14117 TempNode->getOperand(AddIdx), *Src0, *Src1,
14118 TempNode->getOperand(AddIdx)->getOperand(0),
14119 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14123 if (*IterIsSigned != *IsSigned)
14127           ChainLength = I + 2;
14131 TempNode = TempNode->getOperand(AddIdx);
14133       ChainLength = I + 1;
14134 if (TempNode->getNumOperands() < 2)
14136 LHS = TempNode->getOperand(0);
14137 RHS = TempNode->getOperand(1);
14140 if (ChainLength < 2)
14146 if (ChainLength < 4) {
14156     bool UseOriginalSrc = false;
14157     if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
14158         Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14159         Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14160         Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
14162       auto Src0Mask = Src0s.begin()->PermMask;
14163       SrcBytes.push_back(Src0Mask & 0xFF000000);
14164       bool UniqueEntries = true;
14165       for (auto I = 1; I < 4; I++) {
14166         auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14169           UniqueEntries = false;
14175       if (UniqueEntries) {
14176         UseOriginalSrc = true;
14178         auto FirstElt = Src0s.begin();
14182         auto SecondElt = Src1s.begin();
14184                                        SecondElt->DWordOffset);
14193     if (!UseOriginalSrc) {
14200         DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14203 : Intrinsic::amdgcn_udot4,
14213 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14218   unsigned Opc = LHS.getOpcode();
14223     Opc = RHS.getOpcode();
14229     auto Cond = RHS.getOperand(0);
14237     return DAG.getNode(Opc, SL, VTList, Args);
14251                                           DAGCombinerInfo &DCI) const {
14253   EVT VT = N->getValueType(0);
14255 if (VT != MVT::i32)
14264   unsigned Opc = RHS.getOpcode();
14270     auto Cond = RHS.getOperand(0);
14278     return DAG.getNode(Opc, SL, VTList, Args);
14292SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
14293                                                          DAGCombinerInfo &DCI) const {
14295   if (N->getValueType(0) != MVT::i32)
14306   unsigned LHSOpc = LHS.getOpcode();
14307   unsigned Opc = N->getOpcode();
14317                                            DAGCombinerInfo &DCI) const {
14322   EVT VT = N->getValueType(0);
14334     if (A == LHS.getOperand(1)) {
14335       unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14336       if (FusedOp != 0) {
14338         return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
14346     if (A == RHS.getOperand(1)) {
14347       unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14348       if (FusedOp != 0) {
14350         return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
14359                                            DAGCombinerInfo &DCI) const {
14365   EVT VT = N->getValueType(0);
14378     if (A == LHS.getOperand(1)) {
14379       unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14384         return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
14393     if (A == RHS.getOperand(1)) {
14394       unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14397         return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
14406                                            DAGCombinerInfo &DCI) const {
14409   EVT VT = N->getValueType(0);
14423     bool IsNegative = false;
14424     if (CLHS->isExactlyValue(1.0) ||
14425         (IsNegative = CLHS->isExactlyValue(-1.0))) {
14441                                           DAGCombinerInfo &DCI) const {
14443   EVT VT = N->getValueType(0);
14465       (N->getFlags().hasAllowContract() &&
14466        FMA->getFlags().hasAllowContract())) {
14500 if (Vec1 == Vec2 || Vec3 == Vec4)
14506 if ((Vec1 == Vec3 && Vec2 == Vec4) ||
14507 (Vec1 == Vec4 && Vec2 == Vec3)) {
14516                                            DAGCombinerInfo &DCI) const {
14522   EVT VT = LHS.getValueType();
14525   auto CRHS = dyn_cast<ConstantSDNode>(RHS);
14527     CRHS = dyn_cast<ConstantSDNode>(LHS);
14551       return LHS.getOperand(0);
14557       isa<ConstantSDNode>(LHS.getOperand(1)) &&
14558       isa<ConstantSDNode>(LHS.getOperand(2)) &&
14559       LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
14566     const APInt &CT = LHS.getConstantOperandAPInt(1);
14567     const APInt &CF = LHS.getConstantOperandAPInt(2);
14575 return LHS.getOperand(0);
14579 if (VT != MVT::f32 && VT != MVT::f64 &&
14612                                                   DAGCombinerInfo &DCI) const {
14630   if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
14634     unsigned ShiftOffset = 8 * Offset;
14636       ShiftOffset -= C->getZExtValue();
14638       ShiftOffset += C->getZExtValue();
14640     if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
14642                          MVT::f32, Shifted);
14653     DCI.AddToWorklist(N);
14660     return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
14666                                           DAGCombinerInfo &DCI) const {
14676     return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
14679     APFloat One(F.getSemantics(), "1.0");
14681     return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
14691   switch (N->getOpcode()) {
14693     return performAddCombine(N, DCI);
14695     return performSubCombine(N, DCI);
14698     return performAddCarrySubCarryCombine(N, DCI);
14700     return performFAddCombine(N, DCI);
14702     return performFSubCombine(N, DCI);
14704     return performFDivCombine(N, DCI);
14706     return performSetCCCombine(N, DCI);
14719     return performMinMaxCombine(N, DCI);
14721     return performFMACombine(N, DCI);
14723     return performAndCombine(N, DCI);
14725     return performOrCombine(N, DCI);
14728     if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
14729         TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14735     return performXorCombine(N, DCI);
14737     return performZeroExtendCombine(N, DCI);
14739     return performSignExtendInRegCombine(N, DCI);
14741     return performClassCombine(N, DCI);
14743     return performFCanonicalizeCombine(N, DCI);
14745     return performRcpCombine(N, DCI);
14760     return performUCharToFloatCombine(N, DCI);
14762     return performFCopySignCombine(N, DCI);
14767     return performCvtF32UByteNCombine(N, DCI);
14769     return performFMed3Combine(N, DCI);
14771     return performCvtPkRTZCombine(N, DCI);
14773     return performClampCombine(N, DCI);
14776     EVT VT = N->getValueType(0);
14779     if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
14782       EVT EltVT = Src.getValueType();
14783       if (EltVT != MVT::i16)
14793     return performExtractVectorEltCombine(N, DCI);
14795     return performInsertVectorEltCombine(N, DCI);
14797     return performFPRoundCombine(N, DCI);
14799     if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
14805     if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
14806       return performMemSDNodeCombine(MemNode, DCI);
14819   default: return ~0u;
14820   case AMDGPU::sub0: return 0;
14821   case AMDGPU::sub1: return 1;
14822   case AMDGPU::sub2: return 2;
14823   case AMDGPU::sub3: return 3;
14824   case AMDGPU::sub4: return 4;
14831   unsigned Opcode = Node->getMachineOpcode();
14835   if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
14841   unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
14842   unsigned NewDmask = 0;
14845   bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
14846                   (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
14849   unsigned TFCLane = 0;
14850   bool HasChain = Node->getNumValues() > 1;
14852 if (OldDmask == 0) {
14860 TFCLane = OldBitsSet;
14868     if (I.getUse().getResNo() != 0)
14872     if (!I->isMachineOpcode() ||
14873         I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
14885     if (UsesTFC && Lane == TFCLane) {
14890       for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
14892 Dmask &= ~(1 << Comp);
14900 NewDmask |= 1 << Comp;
14905 bool NoChannels = !NewDmask;
14912 if (OldBitsSet == 1)
14918 if (NewDmask == OldDmask)
14927 unsigned NewChannels = BitsSet + UsesTFC;
14931   assert(NewOpcode != -1 &&
14932          NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
14933          "failed to find equivalent MIMG op");
14941   MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
14943 MVT ResultVT = NewChannels == 1 ?
14945 NewChannels == 5 ? 8 : NewChannels);
14959 if (NewChannels == 1) {
14969   for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
14974 if (i || !NoChannels)
14979     if (NewUser != User) {
14987     case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
14988     case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
14989     case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
14990     case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
15000     Op = Op.getOperand(0);
15002   return isa<FrameIndexSDNode>(Op);
15011 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
15012 SDValue SrcVal = Node->getOperand(2);
15020 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
15022 SDNode *Glued = Node->getGluedNode();
15024     = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
15031   return ToResultReg.getNode();
15036   for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
15044                               Node->getOperand(i).getValueType(),
15045                               Node->getOperand(i)), 0));
15056   unsigned Opcode = Node->getMachineOpcode();
15058   if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
15059       !TII->isGather4(Opcode) &&
15061 return adjustWritemask(Node, DAG);
15064 if (Opcode == AMDGPU::INSERT_SUBREG ||
15065 Opcode == AMDGPU::REG_SEQUENCE) {
15071 case AMDGPU::V_DIV_SCALE_F32_e64:
15072 case AMDGPU::V_DIV_SCALE_F64_e64: {
15076 SDValue Src0 = Node->getOperand(1);
15077 SDValue Src1 = Node->getOperand(3);
15078 SDValue Src2 = Node->getOperand(5);
15082 (Src0 == Src1 || Src0 == Src2))
15139 unsigned InitIdx = 0;
15141   if (TII->isImage(MI)) {
15149     unsigned TFEVal = TFE ? TFE->getImm() : 0;
15150     unsigned LWEVal = LWE ? LWE->getImm() : 0;
15151     unsigned D16Val = D16 ? D16->getImm() : 0;
15153     if (!TFEVal && !LWEVal)
15164     assert(MO_Dmask && "Expected dmask operand in instruction");
15166     unsigned dmask = MO_Dmask->getImm();
15173     InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
15179         TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15180     if (DstSize < InitIdx)
15183       InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15191   Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
15192   unsigned NewDst = 0;
15201   for (; SizeLeft; SizeLeft--, CurrIdx++) {
15202     NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
15220   MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
15233   if (TII->isVOP3(MI.getOpcode())) {
15235     TII->legalizeOperandsVOP3(MRI, MI);
15240     if (!MI.getDesc().operands().empty()) {
15241       unsigned Opc = MI.getOpcode();
15242       bool HasAGPRs = Info->mayNeedAGPRs();
15250         if ((I == Src2Idx) && (HasAGPRs))
15253         if (!Op.isReg() || !Op.getReg().isVirtual())
15255         auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
15256         if (!TRI->hasAGPRs(RC))
15258         auto *Src = MRI.getUniqueVRegDef(Op.getReg());
15259         if (!Src || !Src->isCopy() ||
15260             !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
15262         auto *NewRC = TRI->getEquivalentVGPRClass(RC);
15266         MRI.setRegClass(Op.getReg(), NewRC);
15273       if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
15274         if (Src2->isReg() && Src2->getReg().isVirtual()) {
15275           auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
15276           if (TRI->isVectorSuperClass(RC)) {
15277             auto *NewRC = TRI->getEquivalentAGPRClass(RC);
15278             MRI.setRegClass(Src2->getReg(), NewRC);
15279             if (Src2->isTied())
15280               MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
15289   if (TII->isImage(MI))
15290     TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
15316 MVT::v2i32, Ops0), 0);
15346 RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
15368std::pair<unsigned, const TargetRegisterClass *>
15375   if (Constraint.size() == 1) {
15377 switch (Constraint[0]) {
15384 RC = &AMDGPU::SReg_32RegClass;
15387 RC = &AMDGPU::SGPR_64RegClass;
15392         return std::pair(0U, nullptr);
15399       RC = &AMDGPU::VGPR_32RegClass;
15404         return std::pair(0U, nullptr);
15413       RC = &AMDGPU::AGPR_32RegClass;
15418         return std::pair(0U, nullptr);
15427       return std::pair(0U, RC);
15432   if (RegName.consume_front("v")) {
15433     RC = &AMDGPU::VGPR_32RegClass;
15434   } else if (RegName.consume_front("s")) {
15435     RC = &AMDGPU::SGPR_32RegClass;
15436   } else if (RegName.consume_front("a")) {
15437     RC = &AMDGPU::AGPR_32RegClass;
15442     if (RegName.consume_front("[")) {
15452         RC = TRI->getVGPRClassForBitWidth(Width);
15454         RC = TRI->getSGPRClassForBitWidth(Width);
15456         RC = TRI->getAGPRClassForBitWidth(Width);
15458         Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15459         return std::pair(Reg, RC);
15464   if (!Failed && Idx < RC->getNumRegs())
15472     Ret.second = TRI->getPhysRegBaseClass(Ret.first);
15478   if (Constraint.size() == 1) {
15479     switch (Constraint[0]) {
15488   } else if (Constraint == "DA" ||
15489              Constraint == "DB") {
15497   if (Constraint.size() == 1) {
15498     switch (Constraint[0]) {
15514   Val = Val & maskTrailingOnes<uint64_t>(Size);
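The masking above clamps an inline-asm immediate to the operand's bit width before the constraint checks that follow. A tiny sketch of the same clamping, under the assumption that llvm::maskTrailingOnes<uint64_t>(Size) is simply the low-Size-bits mask for Size in [0, 64]:

#include <cstdint>
// Keep only the low `Size` bits of an immediate (illustrative helper, not the
// in-tree maskTrailingOnes).
static uint64_t truncateImmToSize(uint64_t Val, unsigned Size) {
  uint64_t Mask = Size >= 64 ? ~0ull : ((1ull << Size) - 1);
  return Val & Mask;
}
// truncateImmToSize(0xFFFFFFFF00000010, 16) == 0x10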
15521 std::vector<SDValue> &Ops,
15536   unsigned Size = Op.getScalarValueSizeInBits();
15544       Val = C->getSExtValue();
15548       Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15554     if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
15557       Val = C->getSExtValue();
15561       Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15571   if (Constraint.size() == 1) {
15572     switch (Constraint[0]) {
15576       return isInt<16>(Val);
15580       return isInt<32>(Val);
15587   } else if (Constraint.size() == 2) {
15588     if (Constraint == "DA") {
15589       int64_t HiBits = static_cast<int32_t>(Val >> 32);
15590       int64_t LoBits = static_cast<int32_t>(Val);
15594     if (Constraint == "DB") {
15602                                                   unsigned MaxSize) const {
15603   unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
15606   MVT VT = Op.getSimpleValueType();
15631 switch (UnalignedClassID) {
15632 case AMDGPU::VReg_64RegClassID:
15633 return AMDGPU::VReg_64_Align2RegClassID;
15634 case AMDGPU::VReg_96RegClassID:
15635 return AMDGPU::VReg_96_Align2RegClassID;
15636 case AMDGPU::VReg_128RegClassID:
15637 return AMDGPU::VReg_128_Align2RegClassID;
15638 case AMDGPU::VReg_160RegClassID:
15639 return AMDGPU::VReg_160_Align2RegClassID;
15640 case AMDGPU::VReg_192RegClassID:
15641 return AMDGPU::VReg_192_Align2RegClassID;
15642 case AMDGPU::VReg_224RegClassID:
15643 return AMDGPU::VReg_224_Align2RegClassID;
15644 case AMDGPU::VReg_256RegClassID:
15645 return AMDGPU::VReg_256_Align2RegClassID;
15646 case AMDGPU::VReg_288RegClassID:
15647 return AMDGPU::VReg_288_Align2RegClassID;
15648 case AMDGPU::VReg_320RegClassID:
15649 return AMDGPU::VReg_320_Align2RegClassID;
15650 case AMDGPU::VReg_352RegClassID:
15651 return AMDGPU::VReg_352_Align2RegClassID;
15652 case AMDGPU::VReg_384RegClassID:
15653 return AMDGPU::VReg_384_Align2RegClassID;
15654 case AMDGPU::VReg_512RegClassID:
15655 return AMDGPU::VReg_512_Align2RegClassID;
15656 case AMDGPU::VReg_1024RegClassID:
15657 return AMDGPU::VReg_1024_Align2RegClassID;
15658 case AMDGPU::AReg_64RegClassID:
15659 return AMDGPU::AReg_64_Align2RegClassID;
15660 case AMDGPU::AReg_96RegClassID:
15661 return AMDGPU::AReg_96_Align2RegClassID;
15662 case AMDGPU::AReg_128RegClassID:
15663 return AMDGPU::AReg_128_Align2RegClassID;
15664 case AMDGPU::AReg_160RegClassID:
15665 return AMDGPU::AReg_160_Align2RegClassID;
15666 case AMDGPU::AReg_192RegClassID:
15667 return AMDGPU::AReg_192_Align2RegClassID;
15668 case AMDGPU::AReg_256RegClassID:
15669 return AMDGPU::AReg_256_Align2RegClassID;
15670 case AMDGPU::AReg_512RegClassID:
15671 return AMDGPU::AReg_512_Align2RegClassID;
15672 case AMDGPU::AReg_1024RegClassID:
15673 return AMDGPU::AReg_1024_Align2RegClassID;
15689   if (Info->isEntryFunction()) {
15696     unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
15698             ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
15699             : TRI->getAlignedHighSGPRForRC(MF, 2,
15700                                            &AMDGPU::SGPR_64RegClass);
15701     Info->setSGPRForEXECCopy(SReg);
15704                                Info->getStackPtrOffsetReg()));
15705   if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
15706     MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
15710   if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
15711     MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
15713   if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
15714     MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
15716   Info->limitOccupancy(MF);
15718   if (ST.isWave32() && !MF.empty()) {
15719     for (auto &MBB : MF) {
15720       for (auto &MI : MBB) {
15721         TII->fixImplicitOperands(MI);
15731   if (ST.needsAlignedVGPRs()) {
15732     for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
15738       if (NewClassID != -1)
15739         MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
15748 const APInt &DemandedElts,
15750                                                      unsigned Depth) const {
15752   unsigned Opc = Op.getOpcode();
15755     unsigned IID = Op.getConstantOperandVal(0);
15757 case Intrinsic::amdgcn_mbcnt_lo:
15758 case Intrinsic::amdgcn_mbcnt_hi: {
15764 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
15774       Op, Known, DemandedElts, DAG, Depth);
15789 unsigned MaxValue =
15798   switch (MI->getOpcode()) {
15799 case AMDGPU::G_INTRINSIC:
15800 case AMDGPU::G_INTRINSIC_CONVERGENT: {
15803 case Intrinsic::amdgcn_workitem_id_x:
15806 case Intrinsic::amdgcn_workitem_id_y:
15809 case Intrinsic::amdgcn_workitem_id_z:
15812 case Intrinsic::amdgcn_mbcnt_lo:
15813 case Intrinsic::amdgcn_mbcnt_hi: {
15825 case Intrinsic::amdgcn_groupstaticsize: {
15836 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
15839 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
15842 case AMDGPU::G_AMDGPU_SMED3:
15843 case AMDGPU::G_AMDGPU_UMED3: {
15844     auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
15871                                            unsigned Depth) const {
15873   if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
15879   if (MaybeAlign RetAlign = Attrs.getRetAlignment())
15906 if (Header->getAlignment() != PrefAlign)
15907 return Header->getAlignment();
15909 unsigned LoopSize = 0;
15917     LoopSize += TII->getInstSizeInBytes(MI);
15918 if (LoopSize > 192)
15923 if (LoopSize <= 64)
15926 if (LoopSize <= 128)
15927 return CacheLineAlign;
15933 auto I = Exit->getFirstNonDebugInstr();
15934   if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
15935 return CacheLineAlign;
15944   if (PreTerm == Pre->begin() ||
15945       std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
15949 auto ExitHead = Exit->getFirstNonDebugInstr();
15950 if (ExitHead == Exit->end() ||
15951 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
15956 return CacheLineAlign;
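The surrounding fragments pick the loop alignment from the loop's byte size (the 64/128/192 thresholds are visible above; what exactly the small and oversized paths return is my reading of the fragments) and, for the largest accepted loops, place an S_INST_PREFETCH ahead of the header and after the exit. A hedged sketch of that size-based decision:

// Sketch only: summarizes the size thresholds in the fragments above.
// The concrete alignment values and the prefetch placement are assumptions.
enum class LoopAlignChoice { KeepDefault, CacheLine, CacheLineWithPrefetch };
static LoopAlignChoice chooseLoopAlignment(unsigned LoopSizeInBytes) {
  if (LoopSizeInBytes <= 64 || LoopSizeInBytes > 192)
    return LoopAlignChoice::KeepDefault;         // not worth extra padding
  if (LoopSizeInBytes <= 128)
    return LoopAlignChoice::CacheLine;           // align header to the cache line
  return LoopAlignChoice::CacheLineWithPrefetch; // also prefetch the loop body
}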
15964     N = N->getOperand(0).getNode();
15975   switch (N->getOpcode()) {
15983     if (Reg.isPhysical() || MRI.isLiveIn(Reg))
15984       return !TRI->isSGPRReg(MRI, Reg);
15990     return !TRI->isSGPRReg(MRI, Reg);
15994 unsigned AS = L->getAddressSpace();
16025   if (auto *A = dyn_cast<AtomicSDNode>(N)) {
16027     return A->readMem() && A->writeMem();
16062                                                         unsigned Depth) const {
16067   if (Info->getMode().DX10Clamp)
16079   if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
16102            << "Hardware instruction generated for atomic "
16104            << " operation at memory scope " << MemScope;
16108   if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
16109 Type *EltTy = VT->getElementType();
16110 return VT->getNumElements() == 2 &&
16129   if (auto *IT = dyn_cast<IntegerType>(Ty)) {
16130     unsigned BW = IT->getBitWidth();
16131     return BW == 32 || BW == 64;
16143   if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
16145     unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
16146 return BW == 32 || BW == 64;
16153 return VT->getNumElements() == 2 &&
16154 VT->getElementType()->getPrimitiveSizeInBits() == 16;
16164 bool HasSystemScope) {
16171 if (HasSystemScope) {
16178   return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
16204 bool HasSystemScope =
16394 if (HasSystemScope)
16434 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16436 : &AMDGPU::SReg_32RegClass;
16437   if (!TRI->isSGPRClass(RC) && !isDivergent)
16438     return TRI->getEquivalentSGPRClass(RC);
16439   if (TRI->isSGPRClass(RC) && isDivergent)
16440     return TRI->getEquivalentVGPRClass(RC);
16452 unsigned WaveSize) {
16457   if (!IT || IT->getBitWidth() != WaveSize)
16460   if (!isa<Instruction>(V))
16462   if (!Visited.insert(V).second)
16464   bool Result = false;
16465   for (const auto *U : V->users()) {
16466     if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
16467 if (V == U->getOperand(1)) {
16468 switch (Intrinsic->getIntrinsicID()) {
16472 case Intrinsic::amdgcn_if_break:
16473 case Intrinsic::amdgcn_if:
16474 case Intrinsic::amdgcn_else:
16479 if (V == U->getOperand(0)) {
16480 switch (Intrinsic->getIntrinsicID()) {
16484 case Intrinsic::amdgcn_end_cf:
16485 case Intrinsic::amdgcn_loop:
16491       Result = hasCFUser(U, Visited, WaveSize);
16500                                                const Value *V) const {
16501   if (const CallInst *CI = dyn_cast<CallInst>(V)) {
16502     if (CI->isInlineAsm()) {
16511       for (auto &TC : TargetConstraints) {
16515             SIRI, TC.ConstraintCode, TC.ConstraintVT).second;
16528   for (; I != E; ++I) {
16529     if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) {
16552 return MRI.hasOneNonDBGUse(N0);
16559   if (I.getMetadata("amdgpu.noclobber"))
16561   if (I.getMetadata("amdgpu.last.use"))
16571 if (!Def->isMachineOpcode())
16581   if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
16582 PhysReg = AMDGPU::SCC;
16584 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
16598 "this cannot be replaced with add");
16604 "target should have atomic fadd instructions");
16607 "generic atomicrmw expansion only supports FP32 operand in flat "
16654   bool ReturnValueIsUsed = !AI->use_empty();
16673   std::prev(BB->end())->eraseFromParent();
16676       {Addr}, nullptr, "is.shared");
16677   Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
16693       Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
16700                                                 Alignment, "loaded.private");
16710 Value *LoadedGlobal = AI;
16721 if (ReturnValueIsUsed) {
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
#define LLVM_ATTRIBUTE_UNUSED
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
iv Induction Variable Users
static const unsigned MaxDepth
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
unsigned const TargetRegisterInfo * TRI
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
const SmallVectorImpl< MachineOperand > & Cond
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
static constexpr int Concat[]
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
bool hasMadMacF32Insts() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
This class represents an incoming formal argument to a Function.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Min
*p = old <signed v ? old : v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
bool isFPPredicate() const
bool isIntPredicate() const
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowerinInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
bool hasMemoryAtomicFaddF32DenormalSupport() const
bool hasD16Images() const
bool hasAtomicDsPkAdd16Insts() const
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
bool hasAtomicFMinFMaxF64FlatInsts() const
bool hasDot7Insts() const
bool hasApertureRegs() const
bool hasFlatInstOffsets() const
bool hasAtomicFMinFMaxF32FlatInsts() const
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasBCNT(unsigned Size) const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
bool hasMultiDwordFlatScratchAddressing() const
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
bool hasPrivEnabledTrap2NopBug() const
bool hasUnalignedDSAccessEnabled() const
const SIInstrInfo * getInstrInfo() const override
bool hasDot1Insts() const
bool hasAtomicFaddRtnInsts() const
Align getStackAlignment() const
bool hasScalarSubwordLoads() const
bool enableFlatScratch() const
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
bool supportsGetDoorbellID() const
bool hasFlatAtomicFaddF32Inst() const
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
const SIFrameLowering * getFrameLowering() const override
bool hasUnalignedScratchAccess() const
bool hasAtomicFMinFMaxF32GlobalInsts() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
bool isTrapHandlerEnabled() const
bool hasLDSFPAtomicAddF64() const
bool hasFlatGlobalInsts() const
bool getScalarizeGlobalBehavior() const
bool hasScalarSMulU64() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
bool hasNSAEncoding() const
bool hasSMemRealTime() const
bool usePRTStrictNull() const
bool hasAtomicFMinFMaxF64GlobalInsts() const
bool hasAtomicFlatPkAdd16Insts() const
bool hasUnalignedBufferAccessEnabled() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasImageGather4D16Bug() const
bool supportsMinMaxDenormModes() const
bool hasAtomicFaddInsts() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const
bool hasAtomicBufferPkAddBF16Inst() const
bool hasAtomicFaddNoRtnInsts() const
bool hasFlatBufferGlobalAtomicFaddF64Inst() const
bool hasScalarDwordx3Loads() const
bool hasLDSFPAtomicAddF32() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
bool hasDot8Insts() const
bool hasDS96AndDS128() const
bool useFlatForGlobal() const
Generation getGeneration() const
bool hasAtomicBufferGlobalPkAddF16Insts() const
bool hasScalarAddSub64() const
bool hasIEEEMinMax3() const
bool hasUnpackedD16VMem() const
bool hasAtomicGlobalPkAddBF16Inst() const
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
bool hasPackedTID() const
bool hasAddNoCarry() const
bool hasGWSAutoReplay() const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
int64_t getOffset() const
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
BasicBlock::iterator GetInsertPoint() const
BasicBlock * GetInsertBlock() const
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
LLVMContext & getContext() const
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
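The IRBuilder calls above are typically combined as below when an IR-level expansion has to split control flow; this is only a sketch, and F, Entry, Cond, Ptr and Ty are placeholders rather than names from this file:
  // Branch on Cond, load on the slow path, and merge the two paths with a PHI.
  IRBuilder<> Builder(Entry);                      // Entry: an unterminated BasicBlock*
  BasicBlock *Slow  = BasicBlock::Create(F->getContext(), "slow", F);
  BasicBlock *Merge = BasicBlock::Create(F->getContext(), "merge", F);
  Builder.CreateCondBr(Cond, Slow, Merge);         // terminate the entry block

  Builder.SetInsertPoint(Slow);                    // fill in the slow path
  Value *Loaded = Builder.CreateAlignedLoad(Ty, Ptr, Align(4), "loaded");
  Builder.CreateBr(Merge);

  Builder.SetInsertPoint(Merge);                   // join the two paths
  PHINode *Phi = Builder.CreatePHI(Ty, 2, "result");
  Phi->addIncoming(Loaded, Slow);
  Phi->addIncoming(PoisonValue::get(Ty), Entry);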
Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except that the copy has no parent and no name.
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
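A small hedged sketch of how the instruction-manipulation calls above fit together; I and TargetBB are placeholders supplied by the surrounding pass:
  Instruction *Copy = I->clone();                  // identical copy, not yet in any block
  Copy->insertInto(TargetBB, TargetBB->end());     // link the copy at the end of TargetBB
  I->replaceAllUsesWith(Copy);
  I->eraseFromParent();                            // unlink and delete the original
  // A replacement built from scratch (rather than cloned) would carry metadata over
  // explicitly:  NewInst->copyMetadata(*OrigInst);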
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
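A hedged sketch of the low-level types these helpers produce (the address-space constant is the one documented further below):
  LLT S32 = LLT::scalar(32);                            // plain 32-bit "bag of bits"
  LLT P1  = LLT::pointer(AMDGPUAS::GLOBAL_ADDRESS, 64); // 64-bit pointer in global memory
  LLT S16 = S32.changeElementSize(16);                  // same shape, 16-bit elements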
This is an important class for using LLVM in a threaded context.
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
void getSyncScopeNames(SmallVectorImpl< StringRef > &SSNs) const
getSyncScopeNames - Populates client supplied SmallVector with synchronization scope names registered...
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if this vector type has a power-of-2 number of elements.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
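As an illustrative sketch (not code from this file), the MVT queries above compose like this when legalization code derives related types:
  MVT VecVT  = MVT::getVectorVT(MVT::i32, 4);                             // v4i32
  MVT EltVT  = VecVT.getScalarType();                                     // i32
  unsigned NumElts = VecVT.isVector() ? VecVT.getVectorNumElements() : 1; // 4
  MVT WideVT = MVT::getIntegerVT(VecVT.getSizeInBits().getFixedValue());  // i128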
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split this basic block into two pieces at SplitInst.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Align getAlignment() const
Return alignment of the basic block.
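A rough, hypothetical sketch of the control-flow surgery the block-level calls above enable during custom insertion; BB is a placeholder MachineBasicBlock*, and the real hooks in this file do considerably more bookkeeping:
  MachineFunction *MF = BB->getParent();
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(BB->getBasicBlock());
  MF->insert(std::next(BB->getIterator()), LoopBB);  // place the new block right after BB
  LoopBB->transferSuccessorsAndUpdatePHIs(BB);       // LoopBB inherits BB's successors
  BB->addSuccessor(LoopBB);                          // BB now flows only into LoopBB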
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
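These builder helpers are normally chained off BuildMI when machine IR is emitted by hand; a hedged sketch, with TII, BB, MI, DstReg, SrcReg and LoopBB as placeholders:
  const DebugLoc &DL = MI.getDebugLoc();
  BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DstReg)
      .addReg(SrcReg)                                // register operand
      .addImm(16);                                   // immediate operand
  BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
      .addMBB(LoopBB);                               // branch-target block operand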
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
A Module instance is used to store all the information related to an LLVM module.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns true if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool hasWorkGroupIDZ() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if N can be combined with an FMUL to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isMemOpUniform(const SDNode *N) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns true if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
bool getAddrModeArguments(IntrinsicInst *, SmallVectorImpl< Value * > &, Type *&) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
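The per-opcode lowering helpers listed above are reached through LowerOperation; what follows is only a simplified sketch of that dispatch shape, not the actual switch in this file (which handles far more opcodes and defers the rest to the base class):
  SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    switch (Op.getOpcode()) {
    case ISD::GET_ROUNDING: return lowerGET_ROUNDING(Op, DAG);
    case ISD::SET_ROUNDING: return lowerSET_ROUNDING(Op, DAG);
    case ISD::PREFETCH:     return lowerPREFETCH(Op, DAG);
    default:                return SDValue();   // let common or base-class code handle it
    }
  }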
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const Pass * getPass() const
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
SDValue getRegister(unsigned Reg, EVT VT)
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
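A hedged sketch showing how the node-construction helpers above are combined inside a lowering routine; DAG, DL, VT, X and Limit are placeholders:
  // Clamp X to [0, Limit]: compare, select, then take the unsigned minimum.
  SDValue Zero    = DAG.getConstant(0, DL, VT);
  SDValue IsNeg   = DAG.getSetCC(DL, MVT::i1, X, Zero, ISD::SETLT);  // i1 kept for brevity
  SDValue Clamped = DAG.getSelect(DL, VT, IsNeg, Zero, X);
  SDValue Result  = DAG.getNode(ISD::UMIN, DL, VT, Clamped, Limit);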
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
iterator insert(iterator I, T &&Elt)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
StringRef - Represent a constant reference to a string, i.e.
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
constexpr size_t size() const
size - Get the string size.
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
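Inline-asm constraint strings are commonly classified with these two classes; a hypothetical helper, not this target's actual logic:
  static bool looksLikeVGPRConstraint(StringRef C) {
    if (C.starts_with("{v"))                     // a named physical register, e.g. "{v5}"
      return C.ends_with("}");
    return StringSwitch<bool>(C)
        .Case("v", true)                         // generic VGPR register class
        .Default(false);
  }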
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
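A target's TargetLowering constructor drives these configuration hooks roughly as sketched below; the register class, operations and preferences shown are illustrative placeholders, not this target's real tables:
  addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);     // make i32 legal in SGPRs
  computeRegisterProperties(Subtarget->getRegisterInfo());  // derive legality tables
  setOperationAction(ISD::BR_CC, MVT::i32, Expand);         // expand unsupported ops
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);          // no truncating i64->i32 store
  setBooleanContents(ZeroOrOneBooleanContent);
  setSchedulingPreference(Sched::RegPressure);
  setTargetDAGCombine({ISD::FADD, ISD::FSUB});              // ask for custom combines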
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
A Use represents the edge between a Value definition and its users.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< use_iterator > uses()
void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
constexpr bool isZero() const
const ParentTy * getParent() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
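A hypothetical helper illustrating the kind of address-space checks the lowering queries above perform with these enumerators:
  static bool isFlatCompatible(unsigned AS) {
    // Flat addressing can reach global, local and private memory.
    return AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS ||
           AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS;
  }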
@ BUFFER_ATOMIC_COND_SUB_U32
@ TBUFFER_LOAD_FORMAT_D16
@ TBUFFER_STORE_FORMAT_D16
@ BUFFER_STORE_FORMAT_D16
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
@ TowardZeroF32_TowardNegativeF64
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
@ C
The default llvm calling convention, compatible with C.
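For the AMDGPU calling conventions above, the listed helpers (isEntryFunctionCC, isShader, isCompute, isGraphics) are the canonical classifiers. The sketch below, assuming an LLVM build environment, shows the kind of check they encapsulate; the switch covers only a subset of the entry conventions and is illustrative, not exhaustive.

#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Function.h"

// Illustrative subset only: real code should call AMDGPU::isEntryFunctionCC.
static bool looksLikeWaveEntry(const llvm::Function &F) {
  switch (F.getCallingConv()) {
  case llvm::CallingConv::AMDGPU_KERNEL: // code object kernel
  case llvm::CallingConv::AMDGPU_CS:     // compute shader entry
  case llvm::CallingConv::AMDGPU_PS:     // pixel shader entry
    return true;
  default:
    return false; // e.g. AMDGPU_Gfx callees and plain C-convention functions
  }
}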
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ FPTRUNC_ROUND
FPTRUNC_ROUND - This corresponds to the fptrunc_round intrinsic.
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SET_ROUNDING
Set rounding mode.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of a floating-point class property, as defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
@ SMULO
Overflow-aware multiplication; follows the same pattern as the overflow-aware addition and subtraction nodes.
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
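A small sketch of the stated semantics, assuming an LLVM build (the exact header providing the declaration may vary by LLVM version): swapping the operands of a signed less-than yields signed greater-than, while symmetric predicates are unchanged.

#include "llvm/CodeGen/ISDOpcodes.h" // header location may differ across versions
#include <cassert>

int main() {
  using namespace llvm;
  // (X < Y) holds exactly when (Y > X) holds.
  assert(ISD::getSetCCSwappedOperands(ISD::SETLT) == ISD::SETGT);
  // Equality does not depend on operand order.
  assert(ISD::getSetCCSwappedOperands(ISD::SETEQ) == ISD::SETEQ);
  return 0;
}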
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
StringRef getName(ID id)
Return the LLVM name for an intrinsic, such as "llvm.ppc.altivec.lvx".
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ System
Synchronized with respect to all concurrently executing threads.
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
int popcount(T Value) noexcept
Count the number of set bits in a value.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
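As a usage sketch combining the two helpers above (assuming an LLVM build; both are declared in the SelectionDAG headers), a DAG combine can strip bitcasts before testing for a constant integer zero.

#include "llvm/CodeGen/SelectionDAG.h"

// Returns true if V is a constant integer zero, possibly hidden behind bitcasts.
static bool isBitcastedNullConstant(llvm::SDValue V) {
  return llvm::isNullConstant(llvm::peekThroughBitcasts(V));
}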
testing::Matcher< const detail::ErrorHolder & > Failed()
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is congruent to Skew modulo Align.
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
int countr_zero(T Val)
Count the number of zero bits from the least significant bit toward the most significant bit, stopping at the first set bit.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
int countl_zero(T Val)
Count the number of zero bits from the most significant bit toward the least significant bit, stopping at the first set bit.
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
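A sketch of the IR-predicate-to-ISD mapping performed by getICmpCondCode and getFCmpCondCode, assuming an LLVM build (upstream declares them in llvm/CodeGen/Analysis.h).

#include "llvm/CodeGen/Analysis.h"
#include "llvm/IR/InstrTypes.h"
#include <cassert>

int main() {
  using namespace llvm;
  assert(getICmpCondCode(CmpInst::ICMP_SLT) == ISD::SETLT);  // signed less-than
  assert(getICmpCondCode(CmpInst::ICMP_UGE) == ISD::SETUGE); // unsigned >=
  assert(getFCmpCondCode(CmpInst::FCMP_OLT) == ISD::SETOLT); // ordered less-than
  return 0;
}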
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
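Several of the entries above are small bit-inspection helpers from llvm/ADT/bit.h and llvm/Support/MathExtras.h. A sketch of their behavior on concrete values, assuming an LLVM build:

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

int main() {
  using namespace llvm;
  uint32_t V = 0x00F0u;               // bits 4..7 set
  assert(popcount(V) == 4);           // four set bits
  assert(countr_zero(V) == 4);        // lowest set bit is bit 4
  assert(countl_zero(V) == 24);       // 24 leading zeros in a 32-bit value
  assert(bit_width(V) == 8);          // 8 bits are needed to represent 0xF0
  assert(Log2_32(V) == 7);            // floor(log2(0xF0))
  uint64_t W = 0x123456789ABCDEF0ull;
  assert(Hi_32(W) == 0x12345678u);    // upper 32 bits
  assert(Lo_32(W) == 0x9ABCDEF0u);    // lower 32 bits
  return 0;
}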
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
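The alignment and rounding helpers above (alignTo, alignDown, divideCeil, PowerOf2Ceil, commonAlignment) come from llvm/Support/MathExtras.h and llvm/Support/Alignment.h. A sketch on concrete values, assuming an LLVM build:

#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

int main() {
  using namespace llvm;
  assert(alignTo(uint64_t(10), Align(8)) == 16);     // round up to a multiple of 8
  assert(alignDown(10u, 8u) == 8u);                  // round down to a multiple of 8
  assert(divideCeil(10u, 8u) == 2u);                 // ceil(10 / 8)
  assert(PowerOf2Ceil(10) == 16);                    // next power of two
  assert(isPowerOf2_32(16u));                        // 16 is a power of two
  assert(commonAlignment(Align(16), 8) == Align(8)); // alignment implied by offset 8
  return 0;
}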
DWARFExpression::Operation Op
@ TowardZero
roundTowardZero.
@ NearestTiesToEven
roundTiesToEven.
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
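The range helpers above (any_of, find_if, is_contained, drop_begin, append_range) are thin wrappers from llvm/ADT/STLExtras.h. A sketch, assuming an LLVM build:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>

int main() {
  using namespace llvm;
  SmallVector<int, 8> Vals = {1, 2, 3, 4};

  assert(is_contained(Vals, 3));                                 // membership test
  assert(any_of(Vals, [](int V) { return V > 3; }));             // predicate over a range
  assert(*find_if(Vals, [](int V) { return V % 2 == 0; }) == 2); // first even element

  SmallVector<int, 8> Tail;
  append_range(Tail, drop_begin(Vals, 2));                       // copy all but the first two
  assert(Tail.size() == 2 && Tail[0] == 3 && Tail[1] == 4);
  return 0;
}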
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
static const fltSemantics & IEEEhalf() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has fewer bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the given vector type has a power-of-2 number of elements.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
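The EVT queries above can be exercised directly; the sketch below builds a <4 x float> EVT and checks the documented properties, assuming an LLVM build (EVT lives in llvm/CodeGen/ValueTypes.h).

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

int main() {
  using namespace llvm;
  LLVMContext Ctx;
  EVT V4F32 = EVT::getVectorVT(Ctx, MVT::f32, 4);      // <4 x float>

  assert(V4F32.isVector() && V4F32.isFloatingPoint());
  assert(V4F32.getVectorNumElements() == 4);
  assert(V4F32.getScalarType() == MVT::f32);
  assert(V4F32.getFixedSizeInBits() == 128);           // 4 x 32 bits
  assert(V4F32.getStoreSize().getFixedValue() == 16);  // bytes written by a store
  assert(V4F32.changeTypeToInteger() == MVT::v4i32);   // equivalently sized integer vector
  return 0;
}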
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
bool isUnknown() const
Returns true if we don't know any bits.
void resetAll()
Resets the known state of all bits.
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute the known bits resulting from the addition of LHS and RHS.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
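A sketch of the KnownBits entries above on a 32-bit example, assuming an LLVM build and the KnownBits::add signature shown:

#include "llvm/Support/KnownBits.h"
#include <cassert>

int main() {
  using namespace llvm;
  KnownBits LHS(32), RHS(32);
  assert(LHS.isUnknown());            // nothing known yet

  LHS.Zero.setHighBits(24);           // LHS is known to fit in the low 8 bits
  RHS.Zero.setHighBits(24);           // RHS is known to fit in the low 8 bits
  KnownBits Sum = KnownBits::add(LHS, RHS);
  // Two 8-bit values sum to at most 9 bits, so at least 23 leading zeros remain known.
  assert(Sum.countMinLeadingZeros() >= 23);

  Sum.resetAll();                     // back to the fully unknown state
  assert(Sum.isUnknown());
  return 0;
}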
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const