#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "si-lower"

    cl::desc("Do not align and prefetch loops"),

    "amdgpu-use-divergent-register-indexing", cl::Hidden,
    cl::desc("Use indirect register addressing for divergent indexes"),
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
      return AMDGPU::SGPR0 + Reg;

      {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
       MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
       MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
       MVT::i1, MVT::v32i32},

      {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
       MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
       MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
       MVT::i1, MVT::v32i32},

      {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);

      {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
       MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
       MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},

      {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
       MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
       MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},

      {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
       MVT::v3i16, MVT::v4i16, MVT::Other},

      {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);

       {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
        MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
        MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
        MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
        MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
        MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
        MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
        MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
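// 64-bit-element vector types: their vector operations are handled by
// promoting to the equivalent 32-bit integer vector types.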
  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {

  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {

  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {

  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {

  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {

      {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},

      {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
       MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},

      {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);

      {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
       MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
       MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
       MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},

      {MVT::f32, MVT::f64}, Legal);

       {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
        MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
        MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {

      {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

      {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

       {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
        MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {

      {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);

      {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
       MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},

    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})

    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})

      {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},

      {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
       MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
       MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
       MVT::v32f16, MVT::v32bf16},

      {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);

      {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

      {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
       MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,

      {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
       MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
       MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
       MVT::i16, MVT::bf16, MVT::i8, MVT::i128},

      {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
       MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
       MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
       MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},

  static const MCPhysReg RCRegs[] = {AMDGPU::MODE};

                                          EVT DestVT, EVT SrcVT) const {

                                       LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&

    return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
  return VT.isInteger() ? MVT::i32 : MVT::f32;

      return (NumElts + 1) / 2;
    return NumElts * ((Size + 31) / 32);
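// Calling-convention type breakdown: 16-bit scalars are packed two per 32-bit
// register, and larger scalars consume ceil(Size / 32) registers each.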
    unsigned &NumIntermediates, MVT &RegisterVT) const {
    if (ScalarVT == MVT::bf16) {
      RegisterVT = MVT::i32;
      IntermediateVT = MVT::v2bf16;
      RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
      IntermediateVT = RegisterVT;
    NumIntermediates = (NumElts + 1) / 2;
    return NumIntermediates;
    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts;
    return NumIntermediates;
  if (Size < 16 && Subtarget->has16BitInsts()) {
    RegisterVT = MVT::i16;
    IntermediateVT = ScalarVT;
    NumIntermediates = NumElts;
    return NumIntermediates;
    RegisterVT = MVT::i32;
    IntermediateVT = ScalarVT;
    NumIntermediates = NumElts;
    return NumIntermediates;
    RegisterVT = MVT::i32;
    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts * ((Size + 31) / 32);
    return NumIntermediates;
      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);

                               unsigned MaxNumLanes) {
  assert(MaxNumLanes != 0);
  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());

                                  unsigned MaxNumLanes) {
  auto *ST = dyn_cast<StructType>(Ty);
  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));

      DL.getPointerSizeInBits(AS) == 192)
      DL.getPointerSizeInBits(AS) == 160) ||
      DL.getPointerSizeInBits(AS) == 192))
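// getTgtMemIntrinsic: describe the memory access performed by each target
// intrinsic (memVT, pointer value, flags) so a MachineMemOperand can be built.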
                                          unsigned IntrID) const {
  if (CI.hasMetadata(LLVMContext::MD_invariant_load))

    if (RsrcIntr->IsImage) {
    if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
      Info.ptrVal = RsrcArg;

    bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
      if (RsrcIntr->IsImage) {
        unsigned MaxNumLanes = 4;
          std::numeric_limits<unsigned>::max());
      if (RsrcIntr->IsImage) {
        unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
      if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
        Info.memVT = MVT::i32;
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
        std::numeric_limits<unsigned>::max());
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.ptrVal = nullptr;
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
  case Intrinsic::amdgcn_global_atomic_csub: {
  case Intrinsic::amdgcn_image_bvh_intersect_ray: {
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_atomic_cond_sub_u32: {
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64: {
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
    Info.memVT = MVT::i32;
    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
  case Intrinsic::amdgcn_global_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
  case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
    Info.memVT = MVT::i32;
  case Intrinsic::amdgcn_s_prefetch_data: {
  case Intrinsic::amdgcn_addrspacecast_nonnull: {
    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();
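// getAddrModeArguments: report the pointer operand and accessed type for
// intrinsics that read or write memory through one of their arguments.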
                                            Type *&AccessTy) const {
  switch (II->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_csub:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
    Ptr = II->getArgOperand(0);
  case Intrinsic::amdgcn_global_load_lds:
    Ptr = II->getArgOperand(1);
  AccessTy = II->getType();

                                                  unsigned AddrSpace) const {
  return AM.Scale == 0 &&
             AM.BaseOffs, AddrSpace, FlatVariant));

    return isLegalMUBUFAddressingMode(AM);

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
  if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
  if (AM.HasBaseReg) {
    return isLegalMUBUFAddressingMode(AM);
  if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
               : isLegalMUBUFAddressingMode(AM);
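// allowsMisalignedMemoryAccessesImpl for DS/LDS: score the access speed by
// how well the alignment matches the natural 64/96/128-bit access width.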
    unsigned Size, unsigned AddrSpace, Align Alignment,
    Align RequiredAlignment(
        Alignment < RequiredAlignment)
      RequiredAlignment = Align(4);
        *IsFast = (Alignment >= RequiredAlignment) ? 64
                  : (Alignment < Align(4)) ? 32
        *IsFast = (Alignment >= RequiredAlignment) ? 96
                  : (Alignment < Align(4)) ? 32
      RequiredAlignment = Align(8);
        *IsFast = (Alignment >= RequiredAlignment) ? 128
                  : (Alignment < Align(4)) ? 32
      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
    return Alignment >= RequiredAlignment ||
    bool AlignedBy4 = Alignment >= Align(4);
      *IsFast = AlignedBy4;
    return Alignment >= Align(4) ||
  return Size >= 32 && Alignment >= Align(4);

    unsigned *IsFast) const {
      Alignment, Flags, IsFast);

  if (Op.size() >= 16 &&
  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
  const MemSDNode *MemNode = cast<MemSDNode>(N);

                                           unsigned DestAS) const {
  return TM.isNoopAddrSpaceCast(SrcAS, DestAS);

                                               unsigned Index) const {
  auto [InputPtrReg, RC, ArgTy] =
      Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

                                        const SDLoc &SL) const {
                                      const SDLoc &SL) const {
  std::optional<uint32_t> KnownSize =
  if (KnownSize.has_value())
    Val = getFPExtOrFPRound(DAG, Val, SL, VT);

SDValue SITargetLowering::lowerKernargMemParameter(
  int64_t OffsetDiff = Offset - AlignDownOffset;
    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
      ExtType, SL, VA.getLocVT(), Chain, FIN,

SDValue SITargetLowering::getPreloadedValue(
    Reg = &WorkGroupIDX;
    RC = &AMDGPU::SReg_32RegClass;
    Reg = &WorkGroupIDY;
    RC = &AMDGPU::SReg_32RegClass;
    Reg = &WorkGroupIDZ;
    RC = &AMDGPU::SReg_32RegClass;

  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
           "vector type argument should have been split");
      bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
             "unexpected vector split in ps argument type");
        Info->markPSInputAllocated(PSInputNum);
          Info->markPSInputEnabled(PSInputNum);

  if (Info.hasWorkItemIDX()) {
  if (Info.hasWorkItemIDY()) {
      Info.setWorkItemIDY(
      unsigned Reg = AMDGPU::VGPR1;
  if (Info.hasWorkItemIDZ()) {
      Info.setWorkItemIDZ(
      unsigned Reg = AMDGPU::VGPR2;

  if (RegIdx == ArgVGPRs.size()) {
  unsigned Reg = ArgVGPRs[RegIdx];
  assert(Reg != AMDGPU::NoRegister);

                               unsigned NumArgRegs) {
  if (RegIdx == ArgSGPRs.size())
  unsigned Reg = ArgSGPRs[RegIdx];
  assert(Reg != AMDGPU::NoRegister);
  assert(Reg != AMDGPU::NoRegister);

  const unsigned Mask = 0x3ff;
  if (Info.hasWorkItemIDX()) {
    Info.setWorkItemIDX(Arg);
  if (Info.hasWorkItemIDY()) {
    Info.setWorkItemIDY(Arg);
  if (Info.hasWorkItemIDZ())
  const unsigned Mask = 0x3ff;

  if (Info.hasImplicitArgPtr())
  if (Info.hasWorkGroupIDX())
  if (Info.hasWorkGroupIDY())
  if (Info.hasWorkGroupIDZ())
  if (Info.hasLDSKernelId())

    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
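// Kernel-argument preloading: walk the explicit kernel arguments in order and
// assign them to user SGPRs; the sequence stops at the first argument that
// cannot be preloaded.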
  bool InPreloadSequence = true;
  bool AlignedForImplictArgs = false;
  unsigned ImplicitArgOffset = 0;
  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())
    unsigned ArgIdx = Arg.getArgNo();
    if (InIdx < Ins.size() &&
        (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           Ins[InIdx].getOrigArgIndex() == ArgIdx;
      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];
      unsigned ArgOffset = ArgLoc.getLocMemOffset();
      unsigned NumAllocSGPRs =
          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
      if (Arg.hasAttribute("amdgpu-hidden-argument")) {
        if (!AlignedForImplictArgs) {
              alignTo(LastExplicitArgOffset,
              LastExplicitArgOffset;
          AlignedForImplictArgs = true;
        ArgOffset += ImplicitArgOffset;
      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        assert(InIdx >= 1 && "No previous SGPR");
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
        InPreloadSequence = false;
          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {
      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;

  if (Info.hasLDSKernelId()) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

                                      bool IsShader) const {
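// Allocate the system SGPRs (workgroup IDs, workgroup info, private segment
// wave byte offset) required by this function.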
  assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
  unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
  unsigned NumRequiredSystemSGPRs =
      Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
      Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
  for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
    if (Info.hasWorkGroupIDY()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
    if (Info.hasWorkGroupIDZ()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
  if (Info.hasWorkGroupInfo()) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
  if (Info.hasPrivateSegmentWaveByteOffset()) {
    unsigned PrivateSegmentWaveByteOffsetReg;
      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
         Info.getNumPreloadedSGPRs() >= 16);

  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);
    HasStackObjects = true;
  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
      Info.setScratchRSrcReg(ReservedBufferReg);
    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
        if (!MRI.isLiveIn(Reg)) {
          Info.setStackPtrOffsetReg(Reg);
  if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);

  return !Info->isEntryFunction();

  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;
    Entry->addLiveIn(*I);
  for (auto *Exit : Exits)
            TII->get(TargetOpcode::COPY), *I)

        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
           !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
           !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
           !Info->hasWorkGroupIDZ());
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);
    unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
    if ((PsInputBits & 0x7F) == 0 ||
        ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
  } else if (IsKernel) {
    Splits.append(Ins.begin(), Ins.end());
  } else if (!IsGraphics) {

  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
    if (IsEntryFunc && VA.isMemLoc()) {
      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
          int64_t OffsetDiff = Offset - AlignDownOffset;
              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
          NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
                                  Ins[i].Flags.isSExt(), &Ins[i]);
              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
          if (PreloadRegs.size() == 1) {
            Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
                                  TRI->getRegSizeInBits(*RC)));
            for (auto Reg : PreloadRegs) {
                                           PreloadRegs.size()),
          NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
                                  Ins[i].Flags.isSExt(), &Ins[i]);
              "hidden argument in kernel signature was not preloaded",
            lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
                                     Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
          dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
    if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
      if (AMDGPU::VGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::VGPR_32RegClass;
      else if (AMDGPU::SGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::SGPR_32RegClass;
  Info->setBytesInStackArgArea(StackArgSize);
  return Chains.empty() ? Chain

  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
  unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
    if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))

  bool IsWaveEnd = Info->returnsVoid() && IsShader;
  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {
    SDValue Arg = OutVals[RealRVLocIdx];
  if (!Info->isEntryFunction()) {
      if (AMDGPU::SReg_64RegClass.contains(*I))
      else if (AMDGPU::SReg_32RegClass.contains(*I))
  return DAG.getNode(Opc, DL, MVT::Other, RetOps);

    auto &ArgUsageInfo =
    CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
    const auto [OutgoingArg, ArgRC, ArgTy] =
    const auto [IncomingArg, IncomingArgRC, Ty] =
    assert(IncomingArgRC == ArgRC);
    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
      InputReg = getImplicitArgPtr(DAG, DL);
      std::optional<uint32_t> Id =
      if (Id.has_value()) {
    if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
      unsigned SpecialArgOffset =
  auto [OutgoingArg, ArgRC, Ty] =
    std::tie(OutgoingArg, ArgRC, Ty) =
    std::tie(OutgoingArg, ArgRC, Ty) =
  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
                 : IncomingArgY ? *IncomingArgY
  if (OutgoingArg->isRegister()) {
    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
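// Tail-call eligibility: a divergent callee, byval arguments, clobbered
// callee-saved registers across differing calling conventions, or divergent
// values bound to SGPR argument registers all disqualify a tail call.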
  if (Callee->isDivergent())
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (!CallerPreserved)
  bool CCMatch = CallerCC == CalleeCC;
    if (Arg.hasByValAttr())
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
  for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
    if (!CCVA.isRegLoc())
    if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
          dbgs() << "Cannot tail call due to divergent outgoing argument in "

  if (IsChainCallConv) {
    RequestedExec = CLI.Args.back();
    assert(RequestedExec.Node && "No node for EXEC");
    assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
    CLI.Outs.pop_back();
      assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
      CLI.Outs.pop_back();
           "Haven't popped all the pieces of the EXEC mask");

  bool IsSibCall = false;
        "unsupported call to variadic function ");
          "unsupported required tail call to function ");
        Outs, OutVals, Ins, DAG);
           "site marked musttail or on llvm.amdgcn.cs.chain");
  if (!TailCallOpt && IsTailCall)
  if (!IsSibCall || IsChainCallConv) {
    RegsToPass.emplace_back(IsChainCallConv
                                ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                                : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
  const unsigned NumSpecialInputs = RegsToPass.size();
  MVT PtrVT = MVT::i32;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
      RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
      int32_t Offset = LocMemOffset;
      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
                            ? Flags.getNonZeroByValAlign()
      if (Outs[i].Flags.isByVal()) {
            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
                              Outs[i].Flags.getNonZeroByValAlign(),
                              nullptr, std::nullopt, DstInfo,
            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
  if (!MemOpChains.empty())

  unsigned ArgIdx = 0;
  for (auto [Reg, Val] : RegsToPass) {
    if (ArgIdx++ >= NumSpecialInputs &&
        (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {

  if (IsTailCall && !IsSibCall) {
  std::vector<SDValue> Ops({Chain});
    Ops.push_back(Callee);
    Ops.push_back(Callee);
  if (IsChainCallConv)
    Ops.push_back(RequestedExec.Node);
  for (auto &[Reg, Val] : RegsToPass)
  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");
                                    MVT::Glue, GlueOps),
    Ops.push_back(InGlue);
    return DAG.getNode(OPC, DL, MVT::Other, Ops);
  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);
  uint64_t CalleePopBytes = NumBytes;

  EVT VT = Op.getValueType();
  Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
         "Stack grows upwards for AMDGPU");
  Chain = BaseAddr.getValue(1);
  if (Alignment > StackAlign) {
    uint64_t StackAlignMask = ScaledAlignment - 1;
  assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
  if (isa<ConstantSDNode>(Size)) {
  if (Op.getValueType() != MVT::i32)
  assert(Op.getValueType() == MVT::i32);
      Op.getOperand(0), IntrinID, GetRoundBothImm);
  SDValue RoundModeTimesNumBits =
      TableEntry, EnumOffset);
  if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
        static_cast<uint32_t>(ConstMode->getZExtValue()),
  if (UseReducedTable) {
    SDValue RoundModeTimesNumBits =
    SDValue RoundModeTimesNumBits =
    NewMode = TruncTable;
                          ReadFirstLaneID, NewMode);
      IntrinID, RoundBothImm, NewMode);
  if (Op->isDivergent())

  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = Src.getValueType();
  EVT DstVT = Op.getValueType();
  if (Op.getValueType() != MVT::i64)
      Op.getOperand(0), IntrinID, ModeHwRegImm);
      Op.getOperand(0), IntrinID, TrapHwRegImm);
  if (Op.getOperand(1).getValueType() != MVT::i64)
                                   ReadFirstLaneID, NewModeReg);
                                   ReadFirstLaneID, NewTrapReg);
  unsigned ModeHwReg =
  unsigned TrapHwReg =
      IntrinID, ModeHwRegImm, NewModeReg);
      IntrinID, TrapHwRegImm, NewTrapReg);

          .Case("m0", AMDGPU::M0)
          .Case("exec", AMDGPU::EXEC)
          .Case("exec_lo", AMDGPU::EXEC_LO)
          .Case("exec_hi", AMDGPU::EXEC_HI)
          .Case("flat_scratch", AMDGPU::FLAT_SCR)
          .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
          .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
  if (Reg == AMDGPU::NoRegister) {
        "\" for subtarget."));
  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:
  case AMDGPU::FLAT_SCR:

    MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));

static std::pair<MachineBasicBlock *, MachineBasicBlock *>
  auto Next = std::next(I);
  return std::pair(LoopBB, RemainderBB);

  auto I = MI.getIterator();
  auto E = std::next(I);
    Src->setIsKill(false);
    BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
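// Waterfall loop over a divergent index: each iteration reads one lane's index
// with V_READFIRSTLANE_B32, masks EXEC to the lanes that share it, performs
// the indexed access, and repeats until every lane has been handled.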
    unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
    unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
  Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);
  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
          TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
                                 : AMDGPU::S_AND_SAVEEXEC_B64),
  MRI.setSimpleHint(NewExec, CondReg);
  if (UseGPRIdxMode) {
      SGPRIdxReg = CurrentIdxReg;
      SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
          TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
                                 : AMDGPU::S_XOR_B64_term),

    unsigned InitResultReg, unsigned PhiReg, int Offset,
    bool UseGPRIdxMode, Register &SGPRIdxReg) {
  const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode, SGPRIdxReg);
  LoopBB->removeSuccessor(RemainderBB);
  LoopBB->addSuccessor(LandingPad);
static std::pair<unsigned, int>
  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
    return std::pair(AMDGPU::sub0, Offset);

  assert(Idx->getReg() != AMDGPU::NoRegister);
      return Idx->getReg();
  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  const bool UseGPRIdxMode = ST.useVGPRIndexMode();
  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
    MI.eraseFromParent();
  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
                              UseGPRIdxMode, SGPRIdxReg);
  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
  MI.eraseFromParent();

  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  const bool UseGPRIdxMode = ST.useVGPRIndexMode();
  if (Idx->getReg() == AMDGPU::NoRegister) {
    MI.eraseFromParent();
  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
      const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
          TRI.getRegSizeInBits(*VecRC), 32, false);
    MI.eraseFromParent();
  Register PhiReg = MRI.createVirtualRegister(VecRC);
                              UseGPRIdxMode, SGPRIdxReg);
  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
        TRI.getRegSizeInBits(*VecRC), 32, false);
    BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
  MI.eraseFromParent();
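// Wave reduction: for a uniform (SGPR) source the value itself is the result;
// otherwise loop over the active lanes, picking each lane with S_FF1, reading
// its value with V_READLANE_B32, accumulating, and clearing the lane bit until
// none remain.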
  bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
    Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
    Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
    Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
    Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
    Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
    Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
    Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
    bool IsWave32 = ST.isWave32();
    unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
        (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
    I = ComputeLoop->end();
        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
            .addReg(TmpSReg->getOperand(0).getReg())
    unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
    auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
                   .addReg(ActiveBits->getOperand(0).getReg());
    auto LaneValue = BuildMI(*ComputeLoop, I, DL,
                             TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
                         .addReg(FF1->getOperand(0).getReg());
    auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
                              .addReg(LaneValue->getOperand(0).getReg());
    unsigned BITSETOpc =
        IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
    auto NewActiveBits =
        BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
            .addReg(FF1->getOperand(0).getReg())
            .addReg(ActiveBits->getOperand(0).getReg());
    Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
        .addMBB(ComputeLoop);
    ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
        .addMBB(ComputeLoop);
    unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
        .addReg(NewActiveBits->getOperand(0).getReg())
    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
  MI.eraseFromParent();

  switch (MI.getOpcode()) {
  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
  case AMDGPU::S_UADDO_PSEUDO:
  case AMDGPU::S_USUBO_PSEUDO: {
    unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
                       : AMDGPU::S_SUB_I32;
    MI.eraseFromParent();
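  // 64-bit scalar add/sub pseudos either map to S_ADD_U64/S_SUB_U64 directly
  // or are split into S_ADD_U32 + S_ADDC_U32 (or the SUB forms) on the 32-bit
  // halves of the operands.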
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO: {
    bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
      unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
    MI.eraseFromParent();
  case AMDGPU::V_ADD_U64_PSEUDO:
  case AMDGPU::V_SUB_U64_PSEUDO: {
    bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
    if (IsAdd && ST.hasLshlAddB64()) {
      TII->legalizeOperands(*Add);
      MI.eraseFromParent();
    const auto *CarryRC = TRI->getWaveMaskRegClass();
    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register CarryReg = MRI.createVirtualRegister(CarryRC);
    Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
                                     : &AMDGPU::VReg_64RegClass;
                                     : &AMDGPU::VReg_64RegClass;
        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    TII->legalizeOperands(*LoHalf);
    TII->legalizeOperands(*HiHalf);
    MI.eraseFromParent();
  case AMDGPU::S_ADD_CO_PSEUDO:
  case AMDGPU::S_SUB_CO_PSEUDO: {
    unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
                       ? AMDGPU::S_ADDC_U32
                       : AMDGPU::S_SUBB_U32;
    Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
    Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
    Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
    unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
    assert(WaveSize == 64 || WaveSize == 32);
    if (WaveSize == 64) {
      if (ST.hasScalarCompareEq64()) {
            TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
            MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
            MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
        Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
        (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
    MI.eraseFromParent();
  case AMDGPU::SI_INIT_M0: {
            TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .add(MI.getOperand(0));
    MI.eraseFromParent();
  case AMDGPU::GET_GROUPSTATICSIZE: {
        .add(MI.getOperand(0))
    MI.eraseFromParent();
  case AMDGPU::GET_SHADERCYCLESHILO: {
    using namespace AMDGPU::Hwreg;
    Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
    Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .add(MI.getOperand(0))
    MI.eraseFromParent();
  case AMDGPU::SI_INDIRECT_SRC_V1:
  case AMDGPU::SI_INDIRECT_SRC_V2:
  case AMDGPU::SI_INDIRECT_SRC_V4:
  case AMDGPU::SI_INDIRECT_SRC_V8:
  case AMDGPU::SI_INDIRECT_SRC_V9:
  case AMDGPU::SI_INDIRECT_SRC_V10:
  case AMDGPU::SI_INDIRECT_SRC_V11:
  case AMDGPU::SI_INDIRECT_SRC_V12:
  case AMDGPU::SI_INDIRECT_SRC_V16:
  case AMDGPU::SI_INDIRECT_SRC_V32:
  case AMDGPU::SI_INDIRECT_DST_V1:
  case AMDGPU::SI_INDIRECT_DST_V2:
  case AMDGPU::SI_INDIRECT_DST_V4:
  case AMDGPU::SI_INDIRECT_DST_V8:
  case AMDGPU::SI_INDIRECT_DST_V9:
  case AMDGPU::SI_INDIRECT_DST_V10:
  case AMDGPU::SI_INDIRECT_DST_V11:
  case AMDGPU::SI_INDIRECT_DST_V12:
  case AMDGPU::SI_INDIRECT_DST_V16:
  case AMDGPU::SI_INDIRECT_DST_V32:
  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
  case AMDGPU::SI_KILL_I1_PSEUDO:
  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
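    // Expand the 64-bit select pseudo into two 32-bit V_CNDMASK operations on
    // the sub-registers, sharing a copy of the condition mask.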
    Register SrcCond = MI.getOperand(3).getReg();
    Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    const auto *CondRC = TRI->getWaveMaskRegClass();
    Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
                                     : &AMDGPU::VReg_64RegClass;
                                     : &AMDGPU::VReg_64RegClass;
        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
    MI.eraseFromParent();
  case AMDGPU::SI_BR_UNDEF: {
        .add(MI.getOperand(0));
    MI.eraseFromParent();
  case AMDGPU::ADJCALLSTACKUP:
  case AMDGPU::ADJCALLSTACKDOWN: {
  case AMDGPU::SI_CALL_ISEL: {
    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
    MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
    MI.eraseFromParent();
  case AMDGPU::V_ADD_CO_U32_e32:
  case AMDGPU::V_SUB_CO_U32_e32:
  case AMDGPU::V_SUBREV_CO_U32_e32: {
    unsigned Opc = MI.getOpcode();
    bool NeedClampOperand = false;
    if (TII->pseudoToMCOpcode(Opc) == -1) {
      NeedClampOperand = true;
    if (TII->isVOP3(*I)) {
    I.add(MI.getOperand(1)).add(MI.getOperand(2));
    if (NeedClampOperand)
    TII->legalizeOperands(*I);
    MI.eraseFromParent();
  case AMDGPU::V_ADDC_U32_e32:
  case AMDGPU::V_SUBB_U32_e32:
  case AMDGPU::V_SUBBREV_U32_e32:
    TII->legalizeOperands(MI);
  case AMDGPU::DS_GWS_INIT:
  case AMDGPU::DS_GWS_SEMA_BR:
  case AMDGPU::DS_GWS_BARRIER:
    TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
  case AMDGPU::DS_GWS_SEMA_V:
  case AMDGPU::DS_GWS_SEMA_P:
  case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
  case AMDGPU::S_SETREG_B32: {
    const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
    const unsigned SetMask = WidthMask << Offset;
    unsigned SetDenormOp = 0;
    unsigned SetRoundOp = 0;
      SetRoundOp = AMDGPU::S_ROUND_MODE;
      SetDenormOp = AMDGPU::S_DENORM_MODE;
      SetRoundOp = AMDGPU::S_ROUND_MODE;
      SetDenormOp = AMDGPU::S_DENORM_MODE;
    if (SetRoundOp || SetDenormOp) {
      if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
        unsigned ImmVal = Def->getOperand(1).getImm();
          MI.eraseFromParent();
    MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
  case AMDGPU::S_INVERSE_BALLOT_U32:
  case AMDGPU::S_INVERSE_BALLOT_U64:
    MI.setDesc(TII->get(AMDGPU::COPY));
  case AMDGPU::ENDPGM_TRAP: {
    MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
    MI.eraseFromParent();
  case AMDGPU::SIMULATED_TRAP: {
    TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
    MI.eraseFromParent();

  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;

  EVT VT = N->getValueType(0);
  if (VT == MVT::f16) {
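// splitUnaryVectorOp / splitBinaryVectorOp / splitTernaryVectorOp: wide vector
// operations are split into lo/hi halves, the operation is applied to each
// half, and the two results are concatenated back together.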
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
         VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
         VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
      DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
      DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
         VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
         VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
         VT == MVT::v32bf16);
                                 : std::pair(Op0, Op0);
      DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
      DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
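// LowerOperation: dispatch each custom-lowered node to its dedicated routine.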
  switch (Op.getOpcode()) {
    return LowerBRCOND(Op, DAG);
    return LowerRETURNADDR(Op, DAG);
    assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    EVT VT = Op.getValueType();
      return lowerFSQRTF32(Op, DAG);
      return lowerFSQRTF64(Op, DAG);
    return LowerTrig(Op, DAG);
    return LowerSELECT(Op, DAG);
    return LowerFDIV(Op, DAG);
    return LowerFFREXP(Op, DAG);
    return LowerATOMIC_CMP_SWAP(Op, DAG);
    return LowerSTORE(Op, DAG);
    return LowerGlobalAddress(MFI, Op, DAG);
    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
    return LowerINTRINSIC_W_CHAIN(Op, DAG);
    return LowerINTRINSIC_VOID(Op, DAG);
    return lowerADDRSPACECAST(Op, DAG);
    return lowerINSERT_SUBVECTOR(Op, DAG);
    return lowerINSERT_VECTOR_ELT(Op, DAG);
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
    return lowerVECTOR_SHUFFLE(Op, DAG);
    return lowerSCALAR_TO_VECTOR(Op, DAG);
    return lowerBUILD_VECTOR(Op, DAG);
    return lowerFP_ROUND(Op, DAG);
    return lowerTRAP(Op, DAG);
    return lowerDEBUGTRAP(Op, DAG);
    return lowerFMINNUM_FMAXNUM(Op, DAG);
    return lowerFLDEXP(Op, DAG);
    return lowerMUL(Op, DAG);
    return lowerXMULO(Op, DAG);
    return lowerXMUL_LOHI(Op, DAG);

  EVT FittingLoadVT = LoadVT;
SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
                                              bool IsIntrinsic) const {
  EVT LoadVT = M->getValueType(0);
  EVT EquivLoadVT = LoadVT;
                     M->getMemoryVT(), M->getMemOperand());

  EVT LoadVT = M->getValueType(0);
  assert(M->getNumValues() == 2 || M->getNumValues() == 3);
  bool IsTFE = M->getNumValues() == 3;
    return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
  return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
                             M->getMemOperand(), DAG);
  SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
                                        M->getMemOperand(), DAG);

  EVT VT = N->getValueType(0);
  unsigned CondCode = N->getConstantOperandVal(3);
  EVT CmpVT = LHS.getValueType();
  if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
    unsigned PromoteOp =

  EVT VT = N->getValueType(0);
  unsigned CondCode = N->getConstantOperandVal(3);
  if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {

  EVT VT = N->getValueType(0);
                     Src.getOperand(1), Src.getOperand(2));
    Exec = AMDGPU::EXEC_LO;
    Exec = AMDGPU::EXEC;

  EVT VT = N->getValueType(0);
  unsigned IID = N->getConstantOperandVal(0);
  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;
  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;
  unsigned SplitSize = 32;
  if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
      ST->hasDPALU_DPP() &&
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_update_dpp:
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_set_inactive:
  case Intrinsic::amdgcn_set_inactive_chain_arg:
  case Intrinsic::amdgcn_mov_dpp8:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_permlane64:
  if (SDNode *GL = N->getGluedNode()) {
    GL = GL->getOperand(0).getNode();
  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IID == Intrinsic::amdgcn_mov_dpp8 ||
      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    Src1 = N->getOperand(2);
    if (IID == Intrinsic::amdgcn_writelane ||
        IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
      Src2 = N->getOperand(3);
  if (ValSize == SplitSize) {
    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    if (IID == Intrinsic::amdgcn_writelane) {
    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
    return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
  if (ValSize % SplitSize != 0)
    EVT VT = N->getValueType(0);
    unsigned NumOperands = N->getNumOperands();
    SDNode *GL = N->getGluedNode();
    for (unsigned i = 0; i != NE; ++i) {
      for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
        SDValue Operand = N->getOperand(j);
  if (SplitSize == 32) {
    return unrollLaneOp(LaneOp.getNode());
  unsigned SubVecNumElt =
  SDValue Src0SubVec, Src1SubVec, Src2SubVec;
  for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
    if (IID == Intrinsic::amdgcn_writelane)
        IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
            ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
            : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
    EltIdx += SubVecNumElt;
  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
  if (IID == Intrinsic::amdgcn_writelane)
  SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);

  switch (N->getOpcode()) {
    unsigned IID = N->getConstantOperandVal(0);
    case Intrinsic::amdgcn_make_buffer_rsrc:
      Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
    case Intrinsic::amdgcn_cvt_pkrtz: {
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16: {
      if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
      else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
      else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
      EVT VT = N->getValueType(0);
    case Intrinsic::amdgcn_s_buffer_load: {
      EVT VT = Op.getValueType();
      assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
      if (!Offset->isDivergent()) {
        LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
    for (unsigned I = 0; I < Res.getNumOperands(); I++) {
      Results.push_back(Res.getOperand(I));
    Results.push_back(Res.getValue(1));
    EVT VT = N->getValueType(0);
    EVT SelectVT = NewVT;
    if (NewVT.bitsLT(MVT::i32)) {
      SelectVT = MVT::i32;
    if (NewVT != SelectVT)
    if (N->getValueType(0) != MVT::v2f16)
    if (N->getValueType(0) != MVT::v2f16)
    if (N->getValueType(0) != MVT::f16)
    if (U.get() != Value)
    if (U.getUser()->getOpcode() == Opcode)
unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
  switch (Intr->getConstantOperandVal(1)) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else:
  case Intrinsic::amdgcn_loop:
  case Intrinsic::amdgcn_end_cf:

  SDNode *Intr = BRCOND.getOperand(1).getNode();
  assert(BR && "brcond missing unconditional branch user");
  Target = BR->getOperand(1);
  unsigned CFNode = isCFIntrinsic(Intr);
  Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
                  Intr->getOperand(0));

  MVT VT = Op.getSimpleValueType();
  if (Op.getConstantOperandVal(0) != 0)
  if (Info->isEntryFunction())

  return Op.getValueType().bitsLE(VT)

  assert(Op.getValueType() == MVT::f16 &&
         "Do not know how to custom lower FP_ROUND for non-f16 type");
  EVT SrcVT = Src.getValueType();
  if (SrcVT != MVT::f64)

  EVT VT = Op.getValueType();
  bool IsIEEEMode = Info->getMode().IEEE;
  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||

  EVT VT = Op.getValueType();
  EVT ExpVT = Exp.getValueType();
  if (ExpVT == MVT::i16)
                     {Op.getOperand(0), Op.getOperand(1), TruncExp});

  switch (Op->getOpcode()) {

                                              DAGCombinerInfo &DCI) const {
  const unsigned Opc = Op.getOpcode();
                : Op->getOperand(0).getValueType();
  if (DCI.isBeforeLegalizeOps() ||
  auto &DAG = DCI.DAG;
    LHS = Op->getOperand(1);
    RHS = Op->getOperand(2);
    LHS = Op->getOperand(0);
    RHS = Op->getOperand(1);
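// lowerMUL (i64): a uniform 64-bit multiply can use S_MUL_U64; when both
// operands are known to fit in 32 bits, the narrower S_MUL_U64_U32 /
// S_MUL_I64_I32 pseudos are used instead.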
  EVT VT = Op.getValueType();
  assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
  if (Op->isDivergent())
  if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
        DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
  if (Op0SignBits >= 33 && Op1SignBits >= 33)
        DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);

  EVT VT = Op.getValueType();
    const APInt &C = RHSC->getAPIntValue();
    if (C.isPowerOf2()) {
      bool UseArithShift = isSigned && !C.isMinSignedValue();
  if (Op->isDivergent()) {

    return lowerTrapEndpgm(Op, DAG);
             : lowerTrapHsaQueuePtr(Op, DAG);

SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
                                             ImplicitParameter Param) const {
        loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
  if (UserSGPR == AMDGPU::NoRegister) {
                                   "debugtrap handler not supported",
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
                           ? AMDGPU::SRC_SHARED_BASE
                           : AMDGPU::SRC_PRIVATE_BASE;
        {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
    return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
  if (UserSGPR == AMDGPU::NoRegister) {

  if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
      isa<BasicBlockSDNode>(Val))
  if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
    return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
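// lowerADDRSPACECAST: casts to or from the flat address space compare against
// the source/destination null value and build the flat pointer from the
// segment aperture; the addrspacecast.nonnull intrinsic skips the null check.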
  unsigned DestAS, SrcAS;
  bool IsNonNull = false;
  if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
    SrcAS = ASC->getSrcAddressSpace();
    Src = ASC->getOperand(0);
    DestAS = ASC->getDestAddressSpace();
           Op.getConstantOperandVal(0) ==
               Intrinsic::amdgcn_addrspacecast_nonnull);
    Src = Op->getOperand(1);
    SrcAS = Op->getConstantOperandVal(2);
    DestAS = Op->getConstantOperandVal(3);
    unsigned NullVal = TM.getNullPointerValue(DestAS);
    SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
    unsigned NullVal = TM.getNullPointerValue(SrcAS);
      Op.getValueType() == MVT::i64) {
      Src.getValueType() == MVT::i64)

  EVT InsVT = Ins.getValueType();
  unsigned IdxVal = Idx->getAsZExtVal();
    assert(InsNumElts % 2 == 0 && "expect legal vector types");
    EVT NewInsVT = InsNumElts == 2 ? MVT::i32
                                     MVT::i32, InsNumElts / 2);
    for (unsigned I = 0; I != InsNumElts / 2; ++I) {
      if (InsNumElts == 2) {
  for (unsigned I = 0; I != InsNumElts; ++I) {

  auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
  if (NumElts == 4 && EltSize == 16 && KIdx) {
    unsigned Idx = KIdx->getZExtValue();
    bool InsertLo = Idx < 2;
  if (isa<ConstantSDNode>(Idx))
  assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
  const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);

  EVT ResultVT = Op.getValueType();
  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
  if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
    if (VecSize == 128) {
    } else if (VecSize == 256) {
      for (unsigned P = 0; P < 4; ++P) {
                               Parts[0], Parts[1]));
                               Parts[2], Parts[3]));
      for (unsigned P = 0; P < 8; ++P) {
                               Parts[0], Parts[1], Parts[2], Parts[3]));
                               Parts[4], Parts[5], Parts[6], Parts[7]));
  EVT IdxVT = Idx.getValueType();
  Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
  if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {

  return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7546 EVT ResultVT =
Op.getValueType();
7549 EVT PackVT = ResultVT.
isInteger() ? MVT::v2i16 : MVT::v2f16;
7551 int SrcNumElts =
Op.getOperand(0).getValueType().getVectorNumElements();
7567 int VecIdx =
Idx < SrcNumElts ? 0 : 1;
7568 int EltIdx =
Idx < SrcNumElts ?
Idx :
Idx - SrcNumElts;
7576 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
7577 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
7578 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
7579 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
7598 EVT ResultVT =
Op.getValueType();
7614 EVT VT =
Op.getValueType();
7616 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
7650 for (
unsigned P = 0;
P < NumParts; ++
P) {
7652 PartVT, SL, {
Op.getOperand(
P * 2),
Op.getOperand(
P * 2 + 1)});
7685 assert(isInt<32>(
Offset + 4) &&
"32-bit offset is expected!");
7723 EVT PtrVT =
Op.getValueType();
7739 assert(PtrVT == MVT::i32 &&
"32-bit pointer is expected.");
7811 SDValue Param = lowerKernargMemParameter(
7821 "non-hsa intrinsic with hsa target",
7830 "intrinsic not supported on subtarget",
7840 unsigned NumElts = Elts.
size();
7842 if (NumElts <= 12) {
7851 for (
unsigned i = 0; i < Elts.
size(); ++i) {
7857 for (
unsigned i = Elts.
size(); i < NumElts; ++i)
7858 VecElts[i] = DAG.
getUNDEF(MVT::f32);
EVT SrcVT = Src.getValueType();

                               bool Unpacked, bool IsD16, int DMaskPop,
                               int NumVDataDwords, bool IsAtomicPacked16Bit,

EVT ReqRetVT = ResultTypes[0];

int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
                        ? (ReqRetNumElts + 1) / 2

int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;

if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {

if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
                         NumDataDwords - MaskPopDwords);

EVT LegalReqRetVT = ReqRetVT;

if (!Data.getValueType().isInteger())
                     Data.getValueType().changeTypeToInteger(), Data);

if (Result->getNumValues() == 1)

                         SDValue *LWE, bool &IsTexFail) {
auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
                                      unsigned DimIdx, unsigned EndIdx,
                                      unsigned NumGradients) {
for (unsigned I = DimIdx; I < EndIdx; I++) {

  if (((I + 1) >= EndIdx) ||
      ((NumGradients / 2) % 2 == 1 &&
       (I == DimIdx + (NumGradients / 2) - 1 ||
        I == DimIdx + NumGradients - 1))) {
    if (Addr.getValueType() != MVT::i16)
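// When packing 16-bit image operands, addresses and gradients are stored two
// per dword. The condition above pads the last element of a group with undef
// whenever the group has an odd length, so that the next coordinate group
// starts on a fresh dword rather than sharing one across groups.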
unsigned IntrOpcode = Intr->BaseOpcode;

int NumVDataDwords = 0;
bool AdjustRetType = false;
bool IsAtomicPacked16Bit = false;

const unsigned ArgOffset = WithChain ? 2 : 1;

unsigned DMaskLanes = 0;

if (BaseOpcode->Atomic) {
  VData = Op.getOperand(2);

  IsAtomicPacked16Bit =
      (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
       Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);

  if (BaseOpcode->AtomicX2) {
    ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
    DMask = Is64Bit ? 0xf : 0x3;
    NumVDataDwords = Is64Bit ? 4 : 2;

    DMask = Is64Bit ? 0x3 : 0x1;
    NumVDataDwords = Is64Bit ? 2 : 1;

  DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);

if (BaseOpcode->Store) {
  VData = Op.getOperand(2);

    VData = handleD16VData(VData, DAG, true);

} else if (!BaseOpcode->NoReturn) {

    (!LoadVT.isVector() && DMaskLanes > 1))

    NumVDataDwords = (DMaskLanes + 1) / 2;

    NumVDataDwords = DMaskLanes;

  AdjustRetType = true;
unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;

    Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;

VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;

for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
  if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
    assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");

                  {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});

           "Bias needs to be converted to 16 bit in A16 mode");

if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
    dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
               "require 16 bit args for both gradients and addresses");

if (!ST->hasA16()) {
  LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
                       "support 16 bit addresses\n");

if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
  IntrOpcode = G16MappingInfo->G16;

                            ArgOffset + Intr->GradientStart,
                            ArgOffset + Intr->CoordStart, Intr->NumGradients);
for (unsigned I = ArgOffset + Intr->GradientStart;
     I < ArgOffset + Intr->CoordStart; I++)

                            ArgOffset + Intr->CoordStart, VAddrEnd,

for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)

const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
const bool UseNSA = ST->hasNSAEncoding() &&
                    VAddrs.size() >= ST->getNSAThreshold(MF) &&
                    (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
const bool UsePartialNSA =
    UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;

if (UsePartialNSA) {
              ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
} else if (!UseNSA) {
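// Non-sequential-address (NSA) encodings let the image address operands live
// in independent VGPRs instead of one contiguous register tuple. NSA is used
// once the address count reaches the subtarget's threshold; on targets with
// only a partial NSA encoding, an over-long address list keeps the first
// NSAMaxSize - 1 addresses separate and packs the remainder into a trailing
// contiguous tuple (UsePartialNSA). Otherwise everything is packed into one
// tuple.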
if (!BaseOpcode->Sampler) {

      Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);

  Unorm = UnormConst ? True : False;

SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
bool IsTexFail = false;
if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))

  NumVDataDwords += 1;
  AdjustRetType = true;

if (AdjustRetType) {
  if (DMaskLanes == 0 && !BaseOpcode->Store) {
    if (isa<MemSDNode>(Op))

                  MVT::i32, NumVDataDwords)

  ResultTypes[0] = NewVT;
  if (ResultTypes.size() == 3) {
    ResultTypes.erase(&ResultTypes[1]);

unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
if (BaseOpcode->Atomic)

if (BaseOpcode->Store || BaseOpcode->Atomic)

if (UsePartialNSA) {

if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)

if (BaseOpcode->Sampler) {

if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)

    ST->hasFeature(AMDGPU::FeatureR128A16)

if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)

if (BaseOpcode->HasD16)

if (isa<MemSDNode>(Op))

int NumVAddrDwords =

                                 NumVDataDwords, NumVAddrDwords);
} else if (IsGFX11Plus) {
                                 UseNSA ? AMDGPU::MIMGEncGfx11NSA
                                        : AMDGPU::MIMGEncGfx11Default,
                                 NumVDataDwords, NumVAddrDwords);
} else if (IsGFX10Plus) {
                                 UseNSA ? AMDGPU::MIMGEncGfx10NSA
                                        : AMDGPU::MIMGEncGfx10Default,
                                 NumVDataDwords, NumVAddrDwords);

                                 NumVDataDwords, NumVAddrDwords);

    "requested image instruction is not supported on this GPU");

                                 NumVDataDwords, NumVAddrDwords);

                                 NumVDataDwords, NumVAddrDwords);

if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {

if (BaseOpcode->AtomicX2) {

if (BaseOpcode->NoReturn)

                          NumVDataDwords, IsAtomicPacked16Bit, DL);
if (!Offset->isDivergent()) {

  return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);

unsigned NumLoads = 1;

if (NumElts == 8 || NumElts == 16) {
  NumLoads = NumElts / 4;

setBufferOffsets(Offset, DAG, &Ops[3],
                 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));

for (unsigned i = 0; i < NumLoads; ++i) {

if (NumElts == 8 || NumElts == 16)
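// Wide s_buffer_load results (8 or 16 x i32/f32) are emitted as several
// 4-dword loads; setBufferOffsets writes the split offset into Ops[3] with an
// alignment matching the combined width so each piece gets a valid scalar
// buffer-load offset, and the pieces are concatenated afterwards.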
EVT VT = Op.getValueType();

unsigned IntrinsicID = Op.getConstantOperandVal(0);

switch (IntrinsicID) {
case Intrinsic::amdgcn_implicit_buffer_ptr: {
  return getPreloadedValue(DAG, *MFI, VT,
case Intrinsic::amdgcn_dispatch_ptr:
case Intrinsic::amdgcn_queue_ptr: {
      MF.getFunction(), "unsupported hsa intrinsic without hsa target",

  auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
  return getPreloadedValue(DAG, *MFI, VT, RegID);
case Intrinsic::amdgcn_implicitarg_ptr: {
    return getImplicitArgPtr(DAG, DL);
  return getPreloadedValue(DAG, *MFI, VT,
case Intrinsic::amdgcn_kernarg_segment_ptr: {
  return getPreloadedValue(DAG, *MFI, VT,
case Intrinsic::amdgcn_dispatch_id: {
case Intrinsic::amdgcn_rcp:
case Intrinsic::amdgcn_rsq:
case Intrinsic::amdgcn_rsq_legacy:
case Intrinsic::amdgcn_rcp_legacy:
case Intrinsic::amdgcn_rsq_clamp: {
case Intrinsic::r600_read_ngroups_x:
  return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
case Intrinsic::r600_read_ngroups_y:
  return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
case Intrinsic::r600_read_ngroups_z:
  return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
case Intrinsic::r600_read_global_size_x:
  return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
case Intrinsic::r600_read_global_size_y:
  return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
case Intrinsic::r600_read_global_size_z:
  return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
case Intrinsic::r600_read_local_size_x:
  return lowerImplicitZextParam(DAG, Op, MVT::i16,
case Intrinsic::r600_read_local_size_y:
  return lowerImplicitZextParam(DAG, Op, MVT::i16,
case Intrinsic::r600_read_local_size_z:
  return lowerImplicitZextParam(DAG, Op, MVT::i16,
case Intrinsic::amdgcn_workgroup_id_x:
  return getPreloadedValue(DAG, *MFI, VT,
case Intrinsic::amdgcn_workgroup_id_y:
  return getPreloadedValue(DAG, *MFI, VT,
case Intrinsic::amdgcn_workgroup_id_z:
  return getPreloadedValue(DAG, *MFI, VT,
case Intrinsic::amdgcn_wave_id:
  return lowerWaveID(DAG, Op);
case Intrinsic::amdgcn_lds_kernel_id: {
    return getLDSKernelId(DAG, DL);
  return getPreloadedValue(DAG, *MFI, VT,
case Intrinsic::amdgcn_workitem_id_x:
  return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
case Intrinsic::amdgcn_workitem_id_y:
  return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
case Intrinsic::amdgcn_workitem_id_z:
  return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
case Intrinsic::amdgcn_wavefrontsize:
case Intrinsic::amdgcn_s_buffer_load: {
  unsigned CPol = Op.getConstantOperandVal(3);

  return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
                      Op.getOperand(3), DAG);
case Intrinsic::amdgcn_fdiv_fast:
  return lowerFDIV_FAST(Op, DAG);
case Intrinsic::amdgcn_sin:
case Intrinsic::amdgcn_cos:
case Intrinsic::amdgcn_mul_u24:
case Intrinsic::amdgcn_mul_i24:
case Intrinsic::amdgcn_log_clamp: {
case Intrinsic::amdgcn_fract:
case Intrinsic::amdgcn_class:
case Intrinsic::amdgcn_div_fmas:
                     Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
case Intrinsic::amdgcn_div_fixup:
                     Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_div_scale: {
  SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
                     Denominator, Numerator);
case Intrinsic::amdgcn_icmp: {
  if (Op.getOperand(1).getValueType() == MVT::i1 &&
      Op.getConstantOperandVal(2) == 0 &&
case Intrinsic::amdgcn_fcmp: {
case Intrinsic::amdgcn_ballot:
case Intrinsic::amdgcn_fmed3:
                     Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_fdot2:
                     Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
case Intrinsic::amdgcn_fmul_legacy:
case Intrinsic::amdgcn_sffbh:
case Intrinsic::amdgcn_sbfe:
                     Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_ubfe:
                     Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_cvt_pkrtz:
case Intrinsic::amdgcn_cvt_pknorm_i16:
case Intrinsic::amdgcn_cvt_pknorm_u16:
case Intrinsic::amdgcn_cvt_pk_i16:
case Intrinsic::amdgcn_cvt_pk_u16: {
  EVT VT = Op.getValueType();

  if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
  else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
  else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
  else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)

    return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));

      DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
case Intrinsic::amdgcn_fmad_ftz:
                     Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_if_break:
                     Op->getOperand(1), Op->getOperand(2)),
case Intrinsic::amdgcn_groupstaticsize: {
case Intrinsic::amdgcn_is_shared:
case Intrinsic::amdgcn_is_private: {
  unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
  SDValue Aperture = getSegmentAperture(AS, SL, DAG);
case Intrinsic::amdgcn_perm:
                     Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_reloc_constant: {
  auto *RelocSymbol = cast<GlobalVariable>(
case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
  if (Op.getOperand(4).getValueType() == MVT::i32)
                     Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
                     Op.getOperand(3), IndexKeyi32);
case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
  if (Op.getOperand(6).getValueType() == MVT::i32)
                     {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
                      Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
                      IndexKeyi32, Op.getOperand(7)});
case Intrinsic::amdgcn_addrspacecast_nonnull:
  return lowerADDRSPACECAST(Op, DAG);
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_writelane:
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16:
case Intrinsic::amdgcn_permlane64:
case Intrinsic::amdgcn_set_inactive:
case Intrinsic::amdgcn_set_inactive_chain_arg:
case Intrinsic::amdgcn_mov_dpp8:
case Intrinsic::amdgcn_update_dpp:

  return lowerImage(Op, ImageDimIntr, DAG, false);

return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
                                                  unsigned NewOpcode) const {
  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
  auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);

  auto *M = cast<MemSDNode>(Op);
                                 M->getMemOperand());

                                                  unsigned NewOpcode) const {
  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
  auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);

  auto *M = cast<MemSDNode>(Op);
                                 M->getMemOperand());
unsigned IntrID = Op.getConstantOperandVal(1);

case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap: {

  unsigned IndexOperand = M->getConstantOperandVal(7);
  unsigned WaveRelease = M->getConstantOperandVal(8);
  unsigned WaveDone = M->getConstantOperandVal(9);

  unsigned OrderedCountIndex = IndexOperand & 0x3f;
  IndexOperand &= ~0x3f;
  unsigned CountDw = 0;

    CountDw = (IndexOperand >> 24) & 0xf;
    IndexOperand &= ~(0xf << 24);

    if (CountDw < 1 || CountDw > 4) {
          "ds_ordered_count: dword count must be between 1 and 4");

  if (WaveDone && !WaveRelease)

  unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
  unsigned ShaderType =
  unsigned Offset0 = OrderedCountIndex << 2;
  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);

    Offset1 |= (CountDw - 1) << 6;

    Offset1 |= ShaderType << 2;

  unsigned Offset = Offset0 | (Offset1 << 8);

                                 M->getVTList(), Ops, M->getMemoryVT(),
                                 M->getMemOperand());
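// ds_ordered_count packs its controls into the 16-bit instruction offset:
// Offset0 (bits 0-7) carries the ordered-count index shifted left by 2, and
// Offset1 (bits 8-15) carries wave_release (bit 0), wave_done (bit 1), the
// shader type, the add/swap selector (bit 4) and, where supported, the dword
// count minus one (bits 6-7). As a rough example, assuming shader type 0: a
// ds_ordered_add with index 1, wave_release = 1, wave_done = 0 and one dword
// gives Offset0 = 0x04 and Offset1 = 0x01, i.e. a combined offset of 0x0104.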
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_ptr_buffer_load:
case Intrinsic::amdgcn_raw_atomic_buffer_load:
case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
case Intrinsic::amdgcn_raw_buffer_load_format:
case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
  const bool IsFormat =
      IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
      IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;

  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
  auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);

  auto *M = cast<MemSDNode>(Op);
  return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);

case Intrinsic::amdgcn_struct_buffer_load:
case Intrinsic::amdgcn_struct_ptr_buffer_load:
case Intrinsic::amdgcn_struct_buffer_load_format:
case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
case Intrinsic::amdgcn_struct_atomic_buffer_load:
case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
  const bool IsFormat =
      IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
      IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;

  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
  auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);

  return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);

case Intrinsic::amdgcn_raw_tbuffer_load:
case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
  EVT LoadVT = Op.getValueType();
  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
  auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);

                                 Op->getVTList(), Ops, LoadVT,
                                 M->getMemOperand(),

case Intrinsic::amdgcn_struct_tbuffer_load:
case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
  EVT LoadVT = Op.getValueType();
  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
  auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);

                                 Op->getVTList(), Ops, LoadVT,
                                 M->getMemOperand(),
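// All of the raw/struct buffer loads above share one pattern: the resource
// operand (either a <4 x i32> descriptor or an addrspace(8) buffer pointer)
// is normalized by bufferRsrcPtrToVector, and the combined offset operand is
// split by splitBufferOffsets into a register voffset plus an immediate that
// fits the MUBUF/MTBUF offset field before the memory intrinsic node is
// built.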
case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
  return lowerStructBufferAtomicIntrin(Op, DAG,
case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
  return lowerStructBufferAtomicIntrin(Op, DAG,
case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
  return lowerStructBufferAtomicIntrin(Op, DAG,
case Intrinsic::amdgcn_raw_buffer_atomic_swap:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
case Intrinsic::amdgcn_raw_buffer_atomic_add:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
case Intrinsic::amdgcn_raw_buffer_atomic_sub:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
case Intrinsic::amdgcn_raw_buffer_atomic_smin:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
case Intrinsic::amdgcn_raw_buffer_atomic_umin:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
case Intrinsic::amdgcn_raw_buffer_atomic_smax:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
case Intrinsic::amdgcn_raw_buffer_atomic_umax:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
case Intrinsic::amdgcn_raw_buffer_atomic_and:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
case Intrinsic::amdgcn_raw_buffer_atomic_or:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
case Intrinsic::amdgcn_raw_buffer_atomic_xor:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
case Intrinsic::amdgcn_raw_buffer_atomic_inc:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
case Intrinsic::amdgcn_raw_buffer_atomic_dec:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
  return lowerRawBufferAtomicIntrin(Op, DAG,
case Intrinsic::amdgcn_struct_buffer_atomic_swap:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
  return lowerStructBufferAtomicIntrin(Op, DAG,
case Intrinsic::amdgcn_struct_buffer_atomic_add:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
case Intrinsic::amdgcn_struct_buffer_atomic_sub:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
case Intrinsic::amdgcn_struct_buffer_atomic_smin:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  return lowerStructBufferAtomicIntrin(Op, DAG,
case Intrinsic::amdgcn_struct_buffer_atomic_umin:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  return lowerStructBufferAtomicIntrin(Op, DAG,
case Intrinsic::amdgcn_struct_buffer_atomic_smax:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  return lowerStructBufferAtomicIntrin(Op, DAG,
case Intrinsic::amdgcn_struct_buffer_atomic_umax:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  return lowerStructBufferAtomicIntrin(Op, DAG,
case Intrinsic::amdgcn_struct_buffer_atomic_and:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
case Intrinsic::amdgcn_struct_buffer_atomic_or:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
case Intrinsic::amdgcn_struct_buffer_atomic_xor:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
case Intrinsic::amdgcn_struct_buffer_atomic_inc:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
case Intrinsic::amdgcn_struct_buffer_atomic_dec:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
  return lowerStructBufferAtomicIntrin(Op, DAG,

case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
  auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);

  EVT VT = Op.getValueType();
  auto *M = cast<MemSDNode>(Op);
                                 Op->getVTList(), Ops, VT,
                                 M->getMemOperand());
case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
  SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
  auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);

  EVT VT = Op.getValueType();
  auto *M = cast<MemSDNode>(Op);
                                 Op->getVTList(), Ops, VT,
                                 M->getMemOperand());
case Intrinsic::amdgcn_image_bvh_intersect_ray: {
  SDValue NodePtr = M->getOperand(2);
  SDValue RayExtent = M->getOperand(3);
  SDValue RayOrigin = M->getOperand(4);
  SDValue RayInvDir = M->getOperand(6);

  const unsigned NumVDataDwords = 4;
  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
  const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;

  const unsigned BaseOpcodes[2][2] = {
      {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
      {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
       AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};

                           IsGFX12Plus ? AMDGPU::MIMGEncGfx12
                           : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
                                       : AMDGPU::MIMGEncGfx10NSA,
                           NumVDataDwords, NumVAddrDwords);

                           IsGFX11 ? AMDGPU::MIMGEncGfx11Default
                                   : AMDGPU::MIMGEncGfx10Default,
                           NumVDataDwords, NumVAddrDwords);

  auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
    if (Lanes[0].getValueSizeInBits() == 32) {
      for (unsigned I = 0; I < 3; ++I)

  if (UseNSA && IsGFX11Plus) {
    for (unsigned I = 0; I < 3; ++I) {
                                {DirLanes[I], InvDirLanes[I]})));

    packLanes(RayOrigin, true);
    packLanes(RayDir, true);
    packLanes(RayInvDir, false);

  if (NumVAddrDwords > 12) {
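// amdgcn.image.bvh.intersect.ray repacks the ray operands to match the
// selected MIMG encoding: with a16 the origin/dir/inv_dir components are
// packed two to a dword, and on GFX11+ NSA forms the dir and inv_dir lanes
// are interleaved pairwise (the DirLanes/InvDirLanes loop above). Node
// pointers can be 32- or 64-bit, which is why the vaddr dword count varies
// between 8/9 and 11/12.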
case Intrinsic::amdgcn_global_atomic_fmin_num:
case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_flat_atomic_fmin_num:
case Intrinsic::amdgcn_flat_atomic_fmax_num: {
  unsigned Opcode = 0;
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num: {
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num: {
                     Ops, M->getMemOperand());
case Intrinsic::amdgcn_s_get_barrier_state:
case Intrinsic::amdgcn_s_get_named_barrier_state: {
  if (isa<ConstantSDNode>(Op->getOperand(2))) {
    uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
    if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
      BarID = (BarID >> 4) & 0x3F;
    Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;

    Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
    if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {

  return lowerImage(Op, ImageDimIntr, DAG, true);
SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,

  bool IsTFE = VTList.NumVTs == 3;

  unsigned NumOpDWords = NumValueDWords + 1;

  SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
                                   OpDWordsVT, OpDWordsMMO, DAG);

      (VT == MVT::v3i32 || VT == MVT::v3f32)) {

                             WidenedMemVT, WidenedMMO);

                                      bool ImageStore) const {

  for (unsigned I = 0; I < Elts.size() / 2; I += 1) {

  if ((NumElements % 2) == 1) {
    unsigned I = Elts.size() / 2;

  if (NumElements == 3) {
unsigned IntrinsicID = Op.getConstantOperandVal(1);

switch (IntrinsicID) {
case Intrinsic::amdgcn_exp_compr: {
      "intrinsic not supported on subtarget", DL.getDebugLoc());

  unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;

case Intrinsic::amdgcn_s_barrier:
case Intrinsic::amdgcn_s_barrier_signal:
case Intrinsic::amdgcn_s_barrier_wait: {

  unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
  if (WGSize <= ST.getWavefrontSize()) {
    if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal)
      return Op.getOperand(0);

                              MVT::Other, Op.getOperand(0)),

  if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
                              MVT::Other, K, Op.getOperand(0)),
case Intrinsic::amdgcn_struct_tbuffer_store:
case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
    VData = handleD16VData(VData, DAG);
  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
  auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);

                                 M->getMemoryVT(), M->getMemOperand());

case Intrinsic::amdgcn_raw_tbuffer_store:
case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
    VData = handleD16VData(VData, DAG);
  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
  auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);

                                 M->getMemoryVT(), M->getMemOperand());

case Intrinsic::amdgcn_raw_buffer_store:
case Intrinsic::amdgcn_raw_ptr_buffer_store:
case Intrinsic::amdgcn_raw_buffer_store_format:
case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
  const bool IsFormat =
      IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
      IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;

    VData = handleD16VData(VData, DAG);

  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
  auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);

    return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);

                                 M->getMemoryVT(), M->getMemOperand());

case Intrinsic::amdgcn_struct_buffer_store:
case Intrinsic::amdgcn_struct_ptr_buffer_store:
case Intrinsic::amdgcn_struct_buffer_store_format:
case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
  const bool IsFormat =
      IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
      IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;

    VData = handleD16VData(VData, DAG);

  auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
  auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);

    return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);

                                 M->getMemoryVT(), M->getMemOperand());
case Intrinsic::amdgcn_raw_buffer_load_lds:
case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
case Intrinsic::amdgcn_struct_buffer_load_lds:
case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
      IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
      IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
  unsigned OpOffset = HasVIndex ? 1 : 0;
  SDValue VOffset = Op.getOperand(5 + OpOffset);

  unsigned Size = Op->getConstantOperandVal(4);

    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
                                 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
                    : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
                                 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;

    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
                                 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
                    : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
                                 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;

    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
                                 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
                    : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
                                 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;

    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
                                 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
                    : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
                                 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;

    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
                                 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
                    : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
                                 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;

  if (HasVIndex && HasVOffset)

  else if (HasVOffset)

  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);

  unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);

  auto *M = cast<MemSDNode>(Op);
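// The *.buffer.load.lds intrinsics pick a BUFFER_LOAD_*_LDS pseudo based on
// the transfer size (1, 2, 4, 12 or 16 bytes) and on which of vindex/voffset
// are present (BOTHEN / IDXEN / OFFEN / OFFSET addressing). The loaded data
// never passes through VGPRs; it is written straight to LDS, with the LDS
// destination address supplied through M0 on this LDS-DMA path.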
case Intrinsic::amdgcn_global_load_lds: {
  unsigned Size = Op->getConstantOperandVal(4);
    Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
    Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
    Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
    Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
    Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;

  auto *M = cast<MemSDNode>(Op);

  if (LHS->isDivergent())

      RHS.getOperand(0).getValueType() == MVT::i32) {
    VOffset = RHS.getOperand(0);

  if (!Addr->isDivergent()) {

  LoadPtrI.Offset = Op->getConstantOperandVal(5);
case Intrinsic::amdgcn_end_cf:
                        Op->getOperand(2), Chain),

case Intrinsic::amdgcn_s_barrier_init:
case Intrinsic::amdgcn_s_barrier_signal_var: {
  unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
                     ? AMDGPU::S_BARRIER_INIT_M0
                     : AMDGPU::S_BARRIER_SIGNAL_M0;

  constexpr unsigned ShAmt = 16;

case Intrinsic::amdgcn_s_barrier_join:
case Intrinsic::amdgcn_s_wakeup_barrier: {
  if (isa<ConstantSDNode>(BarOp)) {
    uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_s_barrier_join:
      Opc = AMDGPU::S_BARRIER_JOIN_IMM;
    case Intrinsic::amdgcn_s_wakeup_barrier:
      Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
    unsigned BarID = (BarVal >> 4) & 0x3F;

    switch (IntrinsicID) {
    case Intrinsic::amdgcn_s_barrier_join:
      Opc = AMDGPU::S_BARRIER_JOIN_M0;
    case Intrinsic::amdgcn_s_wakeup_barrier:
      Opc = AMDGPU::S_WAKEUP_BARRIER_M0;

case Intrinsic::amdgcn_s_prefetch_data: {
    return Op.getOperand(0);

case Intrinsic::amdgcn_s_buffer_prefetch_data: {
      Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),

                                 Op->getVTList(), Ops, M->getMemoryVT(),
                                 M->getMemOperand());

  return lowerImage(Op, ImageDimIntr, DAG, true);
std::pair<SDValue, SDValue>

  if ((C1 = dyn_cast<ConstantSDNode>(N0)))

    C1 = cast<ConstantSDNode>(N0.getOperand(1));

  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;

    auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);

    SDValue Ops[] = {N0, OverflowVal};
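// splitBufferOffsets breaks a combined byte offset into a register part and
// an immediate that fits the MUBUF offset field: bits above the field width
// become "Overflow" and are moved into the register operand, the rest stays
// immediate. For example, with a 12-bit field (MaxImm = 4095, the common
// case) a constant offset of 5000 splits into an immediate of 904 plus 4096
// carried in the register.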
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                        Align Alignment) const {
  if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
    if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {

    int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
        TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {

SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
    return MaybePointer;

  SDValue NumRecords = Op->getOperand(3);

  auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);

  std::optional<uint32_t> ConstStride = std::nullopt;
  if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
    ConstStride = ConstNode->getZExtValue();

  if (!ConstStride || *ConstStride != 0) {

    ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);

                   NewHighHalf, NumRecords, Flags);
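// The pointer-to-resource lowering above builds a <4 x i32> buffer
// descriptor: the 64-bit base pointer is split into two dwords, the 16-bit
// stride is shifted into the high half of the second dword, and the
// num-records and flags operands fill the remaining two dwords.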
                                                      bool IsTFE) const {

  SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);

  if (VDataType == MVT::f16 || VDataType == MVT::bf16)

  Ops[1] = BufferStoreExt;

                                 M->getMemOperand());

                                          DAGCombinerInfo &DCI) const {

  if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||

         "unexpected vector extload");

         "unexpected fp extload");

  DCI.AddToWorklist(Cvt.getNode());

  DCI.AddToWorklist(Cvt.getNode());

  if (Info.isEntryFunction())
    return Info.getUserSGPRInfo().hasFlatScratchInit();

  EVT MemVT = Load->getMemoryVT();

  EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;

  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
         "Custom lowering for non-i32 vectors hasn't been implemented.");

  unsigned AS = Load->getAddressSpace();

      Alignment >= Align(4) && NumElements < 32) {

    if (NumElements > 4)

    if (NumElements > 2)

    if (NumElements > 4)

  auto Flags = Load->getMemOperand()->getFlags();

                                     Load->getAlign(), Flags, &Fast) &&

                                      MemVT, *Load->getMemOperand())) {

  EVT VT = Op.getValueType();
EVT VT = Op.getValueType();

bool AllowInaccurateRcp =

if (!AllowInaccurateRcp && VT != MVT::f16)

  if (CLHS->isExactlyValue(1.0)) {

  if (CLHS->isExactlyValue(-1.0)) {

if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))

EVT VT = Op.getValueType();

bool AllowInaccurateDiv =

if (!AllowInaccurateDiv)

  return DAG.getNode(Opcode, SL, VT, A, B, Flags);

  return DAG.getNode(Opcode, SL, VTList,

  return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);

  return DAG.getNode(Opcode, SL, VTList,

if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
  return FastLowered;

unsigned FMADOpCode =

SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,

const APFloat K0Val(0x1p+96f);

const APFloat K1Val(0x1p-32f);

assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);

if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
  return FastLowered;

Flags.setNoFPExcept(true);

using namespace AMDGPU::Hwreg;
const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);

const bool HasDynamicDenormals =

if (!PreservesDenormals) {
  if (HasDynamicDenormals) {
    SavedDenormMode = SDValue(GetReg, 0);

    const SDValue EnableDenormValue =

    const SDValue EnableDenormValue =
    EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
                                      {EnableDenormValue, BitField, Glue});

                        ApproxRcp, One, NegDivScale0, Flags);

                        ApproxRcp, Fma0, Flags);

                        NumeratorScaled, Mul, Flags);

                        NumeratorScaled, Fma3, Flags);

if (!PreservesDenormals) {

                      DisableDenormValue, Fma4.getValue(2))

  assert(HasDynamicDenormals == (bool)SavedDenormMode);
  const SDValue DisableDenormValue =
      HasDynamicDenormals

          AMDGPU::S_SETREG_B32, SL, MVT::Other,

                          {Fma4, Fma1, Fma3, Scale}, Flags);
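// f32 division is lowered to the div_scale / div_fmas / div_fixup sequence
// with a Newton-Raphson refinement of the hardware reciprocal. The
// refinement needs FP32 denormals to be preserved, so denormal flushing is
// switched off around it: via S_DENORM_MODE where the subtarget has it,
// otherwise by writing the MODE register with S_SETREG_B32; when the
// function's denormal mode is dynamic, the previously read mode
// (SavedDenormMode) is restored afterwards.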
if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
  return FastLowered;

EVT VT = Op.getValueType();

if (VT == MVT::f32)
  return LowerFDIV32(Op, DAG);

if (VT == MVT::f64)
  return LowerFDIV64(Op, DAG);

if (VT == MVT::f16)
  return LowerFDIV16(Op, DAG);

EVT ResultExpVT = Op->getValueType(1);
EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
if (VT == MVT::i1) {
                      Store->getBasePtr(), MVT::i1, Store->getMemOperand());

       Store->getValue().getValueType().getScalarType() == MVT::i32);

unsigned AS = Store->getAddressSpace();

  if (NumElements > 4)

                                      VT, *Store->getMemOperand()))

  if (NumElements > 2)

  if (NumElements > 4 ||

auto Flags = Store->getMemOperand()->getFlags();

MVT VT = Op.getValueType().getSimpleVT();

EVT VT = Op.getValueType();

switch (Op.getOpcode()) {

EVT VT = Op.getValueType();

                                 Op->getVTList(), Ops, VT,
SITargetLowering::performUCharToFloatCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);

  if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)

  EVT SrcVT = Src.getValueType();

  if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
    DCI.AddToWorklist(Cvt.getNode());

    if (ScalarVT != MVT::f32) {

                                           DAGCombinerInfo &DCI) const {
  SDValue MagnitudeOp = N->getOperand(0);
  SDValue SignOp = N->getOperand(1);

SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
                                               DAGCombinerInfo &DCI) const {

  AM.HasBaseReg = true;
  AM.BaseOffs = Offset.getSExtValue();

  EVT VT = N->getValueType(0);

  Flags.setNoUnsignedWrap(
      N->getFlags().hasNoUnsignedWrap() &&

  switch (N->getOpcode()) {

                                          DAGCombinerInfo &DCI) const {

  SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
                                        N->getMemoryVT(), DCI);

  NewOps[PtrIdx] = NewPtr;

  return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
         (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||

SDValue SITargetLowering::splitBinaryBitConstantOp(
    DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,

  if (V.getValueType() != MVT::i1)
  switch (V.getOpcode()) {

  if (!(C & 0x000000ff))
    ZeroByteMask |= 0x000000ff;
  if (!(C & 0x0000ff00))
    ZeroByteMask |= 0x0000ff00;
  if (!(C & 0x00ff0000))
    ZeroByteMask |= 0x00ff0000;
  if (!(C & 0xff000000))
    ZeroByteMask |= 0xff000000;
  uint32_t NonZeroByteMask = ~ZeroByteMask;
  if ((NonZeroByteMask & C) != NonZeroByteMask)

  assert(V.getValueSizeInBits() == 32);

  if (V.getNumOperands() != 2)

  switch (V.getOpcode()) {

    return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);

    return (0x03020100 & ~ConstMask) | ConstMask;

    return uint32_t((0x030201000c0c0c0cull << C) >> 32);

    return uint32_t(0x0c0c0c0c03020100ull >> C);
                                            DAGCombinerInfo &DCI) const {
  if (DCI.isBeforeLegalize())

  EVT VT = N->getValueType(0);

  if (VT == MVT::i64 && CRHS) {

  if (CRHS && VT == MVT::i32) {

    if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
      unsigned Shift = CShift->getZExtValue();
      unsigned Offset = NB + Shift;
      if ((Offset & (Bits - 1)) == 0) {

      isa<ConstantSDNode>(LHS.getOperand(2))) {

    Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);

    if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||

    if (X != LHS.getOperand(1))

        dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));

      (RHS.getOperand(0) == LHS.getOperand(0) &&
       LHS.getOperand(0) == LHS.getOperand(1))) {

    unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
                                        : Mask->getZExtValue() & OrdMask;

      N->isDivergent() &&
      TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {

    if (LHSMask != ~0u && RHSMask != ~0u) {

      if (LHSMask > RHSMask) {

      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;

      if (!(LHSUsedLanes & RHSUsedLanes) &&
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {

        for (unsigned I = 0; I < 32; I += 8) {
          if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
            Mask &= (0x0c << I) & 0xffffffff;
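// V_PERM_B32 selects result bytes from the concatenation of its two sources;
// a selector byte of 0x0c produces zero. The 0x0c0c0c0c arithmetic above
// computes which byte lanes each existing mask actually reads so that two
// masked values can be merged into a single V_PERM when their used lanes do
// not overlap, e.g. masks 0x0c0c0100 and 0x03020c0c touch disjoint bytes and
// can be combined.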
static const std::optional<ByteProvider<SDValue>>
                 unsigned Depth = 0) {
    return std::nullopt;

  if (Op.getValueSizeInBits() < 8)
    return std::nullopt;

  if (Op.getValueType().isVector())

  switch (Op->getOpcode()) {

    auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
    NarrowVT = VTSign->getVT();
      return std::nullopt;

    if (SrcIndex >= NarrowByteWidth)
      return std::nullopt;

    auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getZExtValue();

    if (BitShift % 8 != 0)
      return std::nullopt;

    SrcIndex += BitShift / 8;

static const std::optional<ByteProvider<SDValue>>
                       unsigned StartingIndex = 0) {
    return std::nullopt;

  unsigned BitWidth = Op.getScalarValueSizeInBits();
    return std::nullopt;
    return std::nullopt;

  bool IsVec = Op.getValueType().isVector();
  switch (Op.getOpcode()) {
      return std::nullopt;

      return std::nullopt;

      return std::nullopt;

    if (!LHS->isConstantZero() && !RHS->isConstantZero())
      return std::nullopt;
    if (!LHS || LHS->isConstantZero())
    if (!RHS || RHS->isConstantZero())
    return std::nullopt;

      return std::nullopt;

    auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
      return std::nullopt;

    uint32_t BitMask = BitMaskOp->getZExtValue();
    uint32_t IndexMask = 0xFF << (Index * 8);

    if ((IndexMask & BitMask) != IndexMask) {
      if (IndexMask & BitMask)
        return std::nullopt;

      return std::nullopt;

    auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
    if (!ShiftOp || Op.getValueType().isVector())
      return std::nullopt;

    uint64_t BitsProvided = Op.getValueSizeInBits();
    if (BitsProvided % 8 != 0)
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
      return std::nullopt;

    uint64_t ConcatSizeInBytes = BitsProvided / 4;
    uint64_t ByteShift = BitShift / 8;

    uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
    uint64_t BytesProvided = BitsProvided / 8;
    SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
    NewIndex %= BytesProvided;

      return std::nullopt;

    auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getZExtValue();
      return std::nullopt;

    auto BitsProvided = Op.getScalarValueSizeInBits();
    if (BitsProvided % 8 != 0)
      return std::nullopt;

    uint64_t BytesProvided = BitsProvided / 8;
    uint64_t ByteShift = BitShift / 8;

    return BytesProvided - ByteShift > Index

      return std::nullopt;

    auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getZExtValue();
    if (BitShift % 8 != 0)
      return std::nullopt;
    uint64_t ByteShift = BitShift / 8;

    return Index < ByteShift
                       Depth + 1, StartingIndex);

      return std::nullopt;

    auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
    NarrowBitWidth = VTSign->getVT().getSizeInBits();
    if (NarrowBitWidth % 8 != 0)
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowBitWidth / 8;

    if (Index >= NarrowByteWidth)
               ? std::optional<ByteProvider<SDValue>>(

      return std::nullopt;

    if (NarrowByteWidth >= Index) {

    return std::nullopt;

      return std::nullopt;

    auto *L = cast<LoadSDNode>(Op.getNode());

    unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
    if (NarrowBitWidth % 8 != 0)
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowBitWidth / 8;

    if (Index >= NarrowByteWidth) {
               ? std::optional<ByteProvider<SDValue>>(

    if (NarrowByteWidth > Index) {

    return std::nullopt;

      return std::nullopt;

                               Depth + 1, StartingIndex);

    auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
      return std::nullopt;
    auto VecIdx = IdxOp->getZExtValue();
    auto ScalarSize = Op.getScalarValueSizeInBits();
    if (ScalarSize < 32)
      Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
                          StartingIndex, Index);

      return std::nullopt;

    auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
      return std::nullopt;

        (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
    if (IdxMask > 0x07 && IdxMask != 0x0c)
      return std::nullopt;

    auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
    auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;

    return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)

    return std::nullopt;
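// calculateSrcByte / calculateByteProvider walk an i32 expression tree
// (shifts, extends, and-masks, loads, extract_vector_elt, existing PERM
// nodes) and report, for each byte of the result, which source value and
// byte index produces it, or that the byte is known to be zero. The OR
// combine and the dot-product combines below use this to decide whether a
// byte shuffle can be expressed as a single V_PERM_B32 or fed into
// dot4-style instructions.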
  return !OpVT.isVector() && OpVT.getSizeInBits() == 16;

  auto ExtType = cast<LoadSDNode>(L)->getExtensionType();

  auto MemVT = L->getMemoryVT();

  return L->getMemoryVT().getSizeInBits() == 16;

  int Low8 = Mask & 0xff;
  int Hi8 = (Mask & 0xff00) >> 8;

  assert(Low8 < 8 && Hi8 < 8);

  bool IsConsecutive = (Hi8 - Low8 == 1);
  bool Is16Aligned = !(Low8 % 2);

  return IsConsecutive && Is16Aligned;

  int Low16 = PermMask & 0xffff;
  int Hi16 = (PermMask & 0xffff0000) >> 16;

  auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
  if (!OtherOpIs16Bit)

                                 unsigned DWordOffset) {

  auto TypeSize = Src.getValueSizeInBits().getFixedValue();

  assert(Src.getValueSizeInBits().isKnownMultipleOf(8));

  if (Src.getValueType().isVector()) {
    auto ScalarTySize = Src.getScalarValueSizeInBits();
    auto ScalarTy = Src.getValueType().getScalarType();
    if (ScalarTySize == 32) {

    if (ScalarTySize > 32) {
          DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
      auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));

    assert(ScalarTySize < 32);
    auto NumElements = TypeSize / ScalarTySize;
    auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
    auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
    auto NumElementsIn32 = 32 / ScalarTySize;
    auto NumAvailElements = DWordOffset < Trunc32Elements
                                : NumElements - NormalizedTrunc;

  auto ShiftVal = 32 * DWordOffset;

  [[maybe_unused]] EVT VT = N->getValueType(0);

  for (int i = 0; i < 4; i++) {
    std::optional<ByteProvider<SDValue>> P =

    if (!P || P->isConstantZero())

  if (PermNodes.size() != 4)

  std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
  std::optional<std::pair<unsigned, unsigned>> SecondSrc;

  for (size_t i = 0; i < PermNodes.size(); i++) {
    auto PermOp = PermNodes[i];

    int SrcByteAdjust = 4;

    if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
        ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
      if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
          ((PermOp.SrcOffset / 4) != SecondSrc->second))

      SecondSrc = {i, PermNodes[i].SrcOffset / 4};
      assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));

    assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
    PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);

  SDValue Op = *PermNodes[FirstSrc.first].Src;

  assert(Op.getValueSizeInBits() == 32);

  int Low16 = PermMask & 0xffff;
  int Hi16 = (PermMask & 0xffff0000) >> 16;

  bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
  bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);

  if (WellFormedLow && WellFormedHi)

  SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;

  assert(Op.getValueType().isByteSized() &&
                                           DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  if (VT == MVT::i1) {

    if (Src != RHS.getOperand(0))

    if (!CLHS || !CRHS)

    static const uint32_t MaxMask = 0x3ff;

      isa<ConstantSDNode>(LHS.getOperand(2))) {

    Sel |= LHS.getConstantOperandVal(2);

      N->isDivergent() &&
      TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {

    auto usesCombinedOperand = [](SDNode *OrUse) {
          !OrUse->getValueType(0).isVector())

      for (auto *VUser : OrUse->users()) {
        if (!VUser->getValueType(0).isVector())

        if (VUser->getOpcode() == VectorwiseOp)

    if (!any_of(N->users(), usesCombinedOperand))

    if (LHSMask != ~0u && RHSMask != ~0u) {

      if (LHSMask > RHSMask) {

      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;

      if (!(LHSUsedLanes & RHSUsedLanes) &&
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {

        LHSMask &= ~RHSUsedLanes;
        RHSMask &= ~LHSUsedLanes;

        LHSMask |= LHSUsedLanes & 0x04040404;

    if (LHSMask == ~0u || RHSMask == ~0u) {

  if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())

    if (SrcVT == MVT::i32) {

      DCI.AddToWorklist(LowOr.getNode());
      DCI.AddToWorklist(HiBits.getNode());

  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
                                  N->getOperand(0), CRHS))
                                            DAGCombinerInfo &DCI) const {
  if (SDValue RV = reassociateScalarOps(N, DCI.DAG))

  EVT VT = N->getValueType(0);
  if (CRHS && VT == MVT::i64) {

                       LHS->getOperand(0), FNegLHS, FNegRHS);

                                                   DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  if (VT != MVT::i32)

  if (Src.getValueType() != MVT::i16)

SITargetLowering::performSignExtendInRegCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {

  auto *VTSign = cast<VTSDNode>(N->getOperand(1));
        VTSign->getVT() == MVT::i8) ||
        VTSign->getVT() == MVT::i16))) {
           "s_buffer_load_{u8, i8} are supported "
           "in GFX12 (or newer) architectures.");
    EVT VT = Src.getValueType();

    SDVTList ResList = DCI.DAG.getVTList(MVT::i32);

    auto *M = cast<MemSDNode>(Src);
    SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
        Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());

        VTSign->getVT() == MVT::i8) ||
        VTSign->getVT() == MVT::i16)) &&

    auto *M = cast<MemSDNode>(Src);
    SDValue Ops[] = {Src.getOperand(0),
                     Src.getOperand(6), Src.getOperand(7)};

        DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());

    SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
        Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());

    return DCI.DAG.getMergeValues(
                                             DAGCombinerInfo &DCI) const {

  if (N->getOperand(0).isUndef())

                                           DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);

  unsigned Opcode = Op.getOpcode();

  if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
    const auto &F = CFP->getValueAPF();
    if (F.isNaN() && F.isSignaling())
    if (!F.isDenormal())

    if (Op.getValueType() == MVT::i32) {
      if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
        if (RHS->getZExtValue() == 0xffff0000) {

    return Op.getValueType().getScalarType() != MVT::f16;

    if (Op.getValueType() == MVT::i16) {

    unsigned IntrinsicID = Op.getConstantOperandVal(0);
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_sqrt:
  unsigned Opcode = MI->getOpcode();

  if (Opcode == AMDGPU::G_FCANONICALIZE)

  std::optional<FPValueAndVReg> FCR;
    if (FCR->Value.isSignaling())
    if (!FCR->Value.isDenormal())

  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FCEIL:
  case AMDGPU::G_FFLOOR:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FSQRT:
  case AMDGPU::G_FDIV:
  case AMDGPU::G_FREM:
  case AMDGPU::G_FPOW:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_FLOG:
  case AMDGPU::G_FLOG2:
  case AMDGPU::G_FLOG10:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
  case AMDGPU::G_FNEG:
  case AMDGPU::G_FABS:
  case AMDGPU::G_FCOPYSIGN:
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM: {
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmad_ftz:
    case Intrinsic::amdgcn_sqrt:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_cos:
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_div_scale:
    case Intrinsic::amdgcn_div_fmas:
    case Intrinsic::amdgcn_div_fixup:
    case Intrinsic::amdgcn_fract:
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cubema:
    case Intrinsic::amdgcn_cubesc:
    case Intrinsic::amdgcn_cubetc:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_trig_preop:
  if (C.isDenormal()) {

  if (C.isSignaling()) {

  return Op.isUndef() || isa<ConstantFPSDNode>(Op);

SITargetLowering::performFCanonicalizeCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);

    EVT VT = N->getValueType(0);
    return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());

    EVT EltVT = Lo.getValueType();

    for (unsigned I = 0; I != 2; ++I) {
          getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
      } else if (Op.isUndef()) {

    if (isa<ConstantFPSDNode>(NewElts[1]))
      NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])

      NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])

  if (!MinK || !MaxK)

  if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
    return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);

  if (Info->getMode().DX10Clamp) {

  if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {

  return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());

  return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
                                                DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  unsigned Opc = N->getOpcode();

    if (SDValue Med3 = performIntMed3ImmCombine(

    if (SDValue Med3 = performIntMed3ImmCombine(

    if (SDValue Med3 = performIntMed3ImmCombine(

    if (SDValue Med3 = performIntMed3ImmCombine(

      (VT == MVT::f32 || VT == MVT::f64 ||

    if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))

  return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
         (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));

                                              DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);

  if (Info->getMode().DX10Clamp) {
    if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))

    if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))

    if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))

                                                 DAGCombinerInfo &DCI) const {
    return DCI.DAG.getUNDEF(N->getValueType(0));

                                          bool IsDivergentIdx,
  unsigned VecSize = EltSize * NumElem;

  if (VecSize <= 64 && EltSize < 32)

  if (IsDivergentIdx)

  unsigned NumInsts = NumElem +
                      ((EltSize + 31) / 32) * NumElem;

    return NumInsts <= 16;

    return NumInsts <= 15;

  if (isa<ConstantSDNode>(Idx))
13582SITargetLowering::performExtractVectorEltCombine(SDNode *N,
13583 DAGCombinerInfo &DCI) const {
13589 EVT ResVT = N->getValueType(0);
13608 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13636 DCI.AddToWorklist(Elt0.getNode());
13637 DCI.AddToWorklist(Elt1.getNode());
13659 if (!DCI.isBeforeLegalize())
13665 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
13666 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
13667 VecSize > 32 && VecSize % 32 == 0 && Idx) {
13670 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
13671 unsigned EltIdx = BitIndex / 32;
13672 unsigned LeftoverBitIdx = BitIndex % 32;
13676 DCI.AddToWorklist(Cast.getNode());
13680 DCI.AddToWorklist(Elt.getNode());
13683 DCI.AddToWorklist(Srl.getNode());
13687 DCI.AddToWorklist(Trunc.getNode());
13689 if (VecEltVT == ResVT) {
13701SITargetLowering::performInsertVectorEltCombine(SDNode *N,
13702 DAGCombinerInfo &DCI) const {
13716 EVT IdxVT = Idx.getValueType();
13733 Src.getOperand(0).getValueType() == MVT::f16) {
13734 return Src.getOperand(0);
13737 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13738 APFloat Val = CFP->getValueAPF();
13739 bool LosesInfo = true;
13749 DAGCombinerInfo &DCI) const {
13751 "combine only useful on gfx8");
13753 SDValue TruncSrc = N->getOperand(0);
13754 EVT VT = N->getValueType(0);
13755 if (VT != MVT::f16)
13793unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
13795 const SDNode *N1) const {
13800 if (((VT == MVT::f32 &&
13802 (VT == MVT::f16 && Subtarget->hasMadF16() &&
13822 EVT VT = N->getValueType(0);
13823 if (VT != MVT::i32 && VT != MVT::i64)
13829 unsigned Opc = N->getOpcode();
13852 return DAG.getNode(Opc, SL, VT, Add1, Op2);
13872 DAGCombinerInfo &DCI) const {
13876 EVT VT = N->getValueType(0);
13886 if (!N->isDivergent() && Subtarget->hasSMulHi())
13890 if (NumBits <= 32 || NumBits > 64)
13902 unsigned NumUsers = 0;
13927 bool MulSignedLo = false;
13928 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
13937 if (VT != MVT::i64) {
13960 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
13962 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
13963 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
13965 if (!MulLHSUnsigned32) {
13972 if (!MulRHSUnsigned32) {
13983 if (VT != MVT::i64)
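For reference, the mad64_32 node built above behaves like a scalar 64-bit multiply-add of the 32-bit low halves; the high-half corrections are the !MulLHSUnsigned32 / !MulRHSUnsigned32 paths in the fragments. This is a hedged standalone model of that arithmetic (the function name and wrapper are illustrative, not the in-tree code).

#include <cstdint>
// Models the full-width product accumulated by v_mad_u64_u32:
// a 32x32 -> 64-bit multiply plus a 64-bit addend.
static uint64_t mad64_32(uint32_t A, uint32_t B, uint64_t Accum) {
  return static_cast<uint64_t>(A) * B + Accum;
}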
13990static std::optional<ByteProvider<SDValue>>
13993 if (!Byte0 || Byte0->isConstantZero()) {
13994 return std::nullopt;
13997 if (Byte1 && !Byte1->isConstantZero()) {
13998 return std::nullopt;
14004 unsigned FirstCs = First & 0x0c0c0c0c;
14005 unsigned SecondCs = Second & 0x0c0c0c0c;
14006 unsigned FirstNoCs = First & ~0x0c0c0c0c;
14007 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
14009 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
14010 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
14011 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
14012 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
14014 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
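A standalone check of the mask-merging arithmetic above. In a v_perm_b32 byte selector, 0x0c selects a constant zero byte (hence the repeated 0x0c0c0c0c constant), so the two input masks are expected to supply real selectors for disjoint byte lanes, which is what the asserts verify; merging keeps the non-0x0c selector of each lane. The example values are illustrative.

#include <cassert>
#include <cstdint>
static uint32_t addPermMasksModel(uint32_t First, uint32_t Second) {
  uint32_t FirstCs = First & 0x0c0c0c0c;
  uint32_t SecondCs = Second & 0x0c0c0c0c;
  uint32_t FirstNoCs = First & ~0x0c0c0c0c;
  uint32_t SecondNoCs = Second & ~0x0c0c0c0c;
  return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
}
int main() {
  // Each byte lane takes the selector that is not the 0x0c "zero" code.
  assert(addPermMasksModel(0x0c0c0100, 0x07060c0c) == 0x07060100);
  return 0;
}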
14038 for (int BPI = 0; BPI < 2; BPI++) {
14041 BPP = {Src1, Src0};
14043 unsigned ZeroMask = 0x0c0c0c0c;
14044 unsigned FMask = 0xFF << (8 * (3 - Step));
14046 unsigned FirstMask =
14047 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14048 unsigned SecondMask =
14049 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14053 int FirstGroup = -1;
14054 for (int I = 0; I < 2; I++) {
14056 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
14057 return IterElt.SrcOp == *BPP.first.Src &&
14058 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
14068 if (FirstGroup != -1) {
14070 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
14071 return IterElt.SrcOp == *BPP.second.Src &&
14072 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
14078 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
14086 unsigned ZeroMask = 0x0c0c0c0c;
14087 unsigned FMask = 0xFF << (8 * (3 - Step));
14091 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14095 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14104 if (Srcs.size() == 1) {
14105 auto *Elt = Srcs.begin();
14109 if (Elt->PermMask == 0x3020100)
14116 auto *FirstElt = Srcs.begin();
14117 auto *SecondElt = std::next(FirstElt);
14124 auto FirstMask = FirstElt->PermMask;
14125 auto SecondMask = SecondElt->PermMask;
14127 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
14128 unsigned FirstPlusFour = FirstMask | 0x04040404;
14131 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
14143 FirstElt = std::next(SecondElt);
14144 if (FirstElt == Srcs.end())
14147 SecondElt = std::next(FirstElt);
14150 if (SecondElt == Srcs.end()) {
14156 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
14162 return Perms.size() == 2
14168 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
14169 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
14170 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
14171 EntryMask += ZeroMask;
14176 auto Opcode = Op.getOpcode();
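A worked example of the fixMasks adjustment at lines 14169-14171 above, for a chain of length 2; the helper below is purely illustrative and hard-codes that case.

#include <cstdint>
static uint32_t fixMaskForChain2(uint32_t EntryMask) {
  EntryMask >>= (4 - 2) * 8;   // drop the byte lanes beyond the chain
  EntryMask += 0x0c0c0000;     // mark the dropped lanes as constant zero
  return EntryMask;            // e.g. 0x05040100 -> 0x0c0c0504
}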
14182static std::optional<bool>
14193 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
14196 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
14198 assert(!(S0IsUnsigned && S0IsSigned));
14199 assert(!(S1IsUnsigned && S1IsSigned));
14207 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
14213 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14214 return std::nullopt;
14226 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14227 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14232 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14238 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14239 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14240 return std::nullopt;
14246 DAGCombinerInfo &DCI) const {
14248 EVT VT = N->getValueType(0);
14255 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14260 if (SDValue V = reassociateScalarOps(N, DAG)) {
14267 std::optional<bool> IsSigned;
14273 int ChainLength = 0;
14274 for (int I = 0; I < 4; I++) {
14275 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
14278 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14281 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14286 TempNode->getOperand(MulIdx), *Src0, *Src1,
14287 TempNode->getOperand(MulIdx)->getOperand(0),
14288 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14292 IsSigned = *IterIsSigned;
14293 if (*IterIsSigned != *IsSigned)
14296 auto AddIdx = 1 - MulIdx;
14299 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14300 Src2s.push_back(TempNode->getOperand(AddIdx));
14310 TempNode->getOperand(AddIdx), *Src0, *Src1,
14311 TempNode->getOperand(AddIdx)->getOperand(0),
14312 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14316 if (*IterIsSigned != *IsSigned)
14320 ChainLength = I + 2;
14324 TempNode = TempNode->getOperand(AddIdx);
14326 ChainLength = I + 1;
14327 if (TempNode->getNumOperands() < 2)
14329 LHS = TempNode->getOperand(0);
14330 RHS = TempNode->getOperand(1);
14333 if (ChainLength < 2)
14339 if (ChainLength < 4) {
14349 bool UseOriginalSrc = false;
14350 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
14351 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14352 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14353 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
14355 auto Src0Mask = Src0s.begin()->PermMask;
14356 SrcBytes.push_back(Src0Mask & 0xFF000000);
14357 bool UniqueEntries = true;
14358 for (auto I = 1; I < 4; I++) {
14359 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14362 UniqueEntries = false;
14368 if (UniqueEntries) {
14369 UseOriginalSrc = true;
14371 auto *FirstElt = Src0s.begin();
14375 auto *SecondElt = Src1s.begin();
14377 SecondElt->DWordOffset);
14386 if (!UseOriginalSrc) {
14393 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14396 : Intrinsic::amdgcn_udot4,
14406 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14411 unsigned Opc = LHS.getOpcode();
14416 Opc = RHS.getOpcode();
14423 auto Cond = RHS.getOperand(0);
14431 return DAG.getNode(Opc, SL, VTList, Args);
14445 DAGCombinerInfo &DCI) const {
14447 EVT VT = N->getValueType(0);
14449 if (VT != MVT::i32)
14458 unsigned Opc = RHS.getOpcode();
14465 auto Cond = RHS.getOperand(0);
14473 return DAG.getNode(Opc, SL, VTList, Args);
14488SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
14489 DAGCombinerInfo &DCI) const {
14491 if (N->getValueType(0) != MVT::i32)
14502 unsigned LHSOpc = LHS.getOpcode();
14503 unsigned Opc = N->getOpcode();
14513 DAGCombinerInfo &DCI) const {
14518 EVT VT = N->getValueType(0);
14530 if (A == LHS.getOperand(1)) {
14531 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14532 if (FusedOp != 0) {
14534 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
14542 if (A == RHS.getOperand(1)) {
14543 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14544 if (FusedOp != 0) {
14546 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
14555 DAGCombinerInfo &DCI) const {
14561 EVT VT = N->getValueType(0);
14574 if (A == LHS.getOperand(1)) {
14575 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14576 if (FusedOp != 0) {
14580 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
14589 if (A == RHS.getOperand(1)) {
14590 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14591 if (FusedOp != 0) {
14593 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
14602 DAGCombinerInfo &DCI) const {
14605 EVT VT = N->getValueType(0);
14619 bool IsNegative = false;
14620 if (CLHS->isExactlyValue(1.0) ||
14621 (IsNegative = CLHS->isExactlyValue(-1.0))) {
14637 DAGCombinerInfo &DCI) const {
14639 EVT VT = N->getValueType(0);
14653 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
14668 if (ScalarVT == MVT::f32 &&
14674 if (TrueNodeExpVal == INT_MIN)
14677 if (FalseNodeExpVal == INT_MIN)
14697 DAGCombinerInfo &DCI) const {
14699 EVT VT = N->getValueType(0);
14720 (N->getFlags().hasAllowContract() &&
14721 FMA->getFlags().hasAllowContract())) {
14755 if (Vec1 == Vec2 || Vec3 == Vec4)
14761 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
14770 DAGCombinerInfo &DCI) const {
14776 EVT VT = LHS.getValueType();
14779 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14781 CRHS = dyn_cast<ConstantSDNode>(LHS);
14805 return LHS.getOperand(0);
14811 isa<ConstantSDNode>(LHS.getOperand(1)) &&
14812 isa<ConstantSDNode>(LHS.getOperand(2)) &&
14813 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
14820 const APInt &CT = LHS.getConstantOperandAPInt(1);
14821 const APInt &CF = LHS.getConstantOperandAPInt(2);
14829 return LHS.getOperand(0);
14833 if (VT != MVT::f32 && VT != MVT::f64 &&
14849 const unsigned IsInfMask =
14851 const unsigned IsFiniteMask =
14865SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
14866 DAGCombinerInfo &DCI) const {
14884 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
14888 unsigned ShiftOffset = 8 * Offset;
14890 ShiftOffset -= C->getZExtValue();
14892 ShiftOffset += C->getZExtValue();
14894 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
14896 MVT::f32, Shifted);
14907 DCI.AddToWorklist(N);
14914 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
14920 DAGCombinerInfo &DCI) const {
14930 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
14933 APFloat One(F.getSemantics(), "1.0");
14935 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
14942 switch (N->getOpcode()) {
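The ShiftOffset arithmetic above absorbs a whole-byte shift of the source into the byte index of the conversion. This is a hedged standalone model of that equivalence (cvtF32UByte is a stand-in for the CVT_F32_UBYTEn behaviour, not the in-tree code).

#include <cassert>
#include <cstdint>
// Convert byte N of a 32-bit value to float.
static float cvtF32UByte(uint32_t X, unsigned N) {
  return static_cast<float>((X >> (8 * N)) & 0xFF);
}
int main() {
  uint32_t X = 0xAABBCCDD;
  // A shift right by one byte plus ubyte1 reads the same byte as ubyte2.
  assert(cvtF32UByte(X >> 8, 1) == cvtF32UByte(X, 2)); // both read 0xBB
  return 0;
}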
14958 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
14968 switch (N->getOpcode()) {
14970 return performAddCombine(N, DCI);
14972 return performSubCombine(N, DCI);
14975 return performAddCarrySubCarryCombine(N, DCI);
14977 return performFAddCombine(N, DCI);
14979 return performFSubCombine(N, DCI);
14981 return performFDivCombine(N, DCI);
14983 return performFMulCombine(N, DCI);
14985 return performSetCCCombine(N, DCI);
14998 return performMinMaxCombine(N, DCI);
15000 return performFMACombine(N, DCI);
15002 return performAndCombine(N, DCI);
15004 return performOrCombine(N, DCI);
15007 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
15008 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
15014 return performXorCombine(N, DCI);
15016 return performZeroExtendCombine(N, DCI);
15018 return performSignExtendInRegCombine(N, DCI);
15020 return performClassCombine(N, DCI);
15022 return performFCanonicalizeCombine(N, DCI);
15024 return performRcpCombine(N, DCI);
15039 return performUCharToFloatCombine(N, DCI);
15041 return performFCopySignCombine(N, DCI);
15046 return performCvtF32UByteNCombine(N, DCI);
15048 return performFMed3Combine(N, DCI);
15050 return performCvtPkRTZCombine(N, DCI);
15052 return performClampCombine(N, DCI);
15055 EVT VT = N->getValueType(0);
15058 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
15061 EVT EltVT = Src.getValueType();
15062 if (EltVT != MVT::i16)
15072 return performExtractVectorEltCombine(N, DCI);
15074 return performInsertVectorEltCombine(N, DCI);
15076 return performFPRoundCombine(N, DCI);
15078 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
15084 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
15085 return performMemSDNodeCombine(MemNode, DCI);
15116 unsigned Opcode = Node->getMachineOpcode();
15120 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
15125 unsigned DmaskIdx =
15127 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
15128 unsigned NewDmask = 0;
15131 bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
15132 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
15135 unsigned TFCLane = 0;
15136 bool HasChain = Node->getNumValues() > 1;
15138 if (OldDmask == 0) {
15146 TFCLane = OldBitsSet;
15153 if (Use.getResNo() != 0)
15159 if (!User->isMachineOpcode() ||
15160 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
15172 if (UsesTFC && Lane == TFCLane) {
15177 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
15179 Dmask &= ~(1 << Comp);
15187 NewDmask |= 1 << Comp;
15192 bool NoChannels = !NewDmask;
15199 if (OldBitsSet == 1)
15205 if (NewDmask == OldDmask)
15214 unsigned NewChannels = BitsSet + UsesTFC;
15218 assert(NewOpcode != -1 &&
15219 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
15220 "failed to find equivalent MIMG op");
15228 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
15230 MVT ResultVT = NewChannels == 1
15233 : NewChannels == 5 ? 8
15247 if (NewChannels == 1) {
15257 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
15262 if (i || !NoChannels)
15267 if (NewUser != User) {
15277 Idx = AMDGPU::sub1;
15280 Idx = AMDGPU::sub2;
15283 Idx = AMDGPU::sub3;
15286 Idx = AMDGPU::sub4;
15297 Op = Op.getOperand(0);
15299 return isa<FrameIndexSDNode>(Op);
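A hedged sketch of the dmask trimming that adjustWritemask performs: for each result lane still extracted, find the corresponding enabled channel of the old dmask and keep only those bits. The helper name and structure are assumptions for illustration, not the in-tree function.

#include <bit>
#include <cassert>
#include <initializer_list>
static unsigned trimDmask(unsigned OldDmask,
                          std::initializer_list<unsigned> UsedLanes) {
  unsigned NewDmask = 0;
  for (unsigned Lane : UsedLanes) {
    unsigned Dmask = OldDmask, Comp = 0;
    // Walk to the Lane-th enabled channel, mirroring the loop at 15177 above.
    for (unsigned i = 0; i <= Lane && Dmask != 0; ++i) {
      Comp = std::countr_zero(Dmask);
      Dmask &= ~(1u << Comp);
    }
    NewDmask |= 1u << Comp;
  }
  return NewDmask;
}
int main() {
  // Only lanes 0 and 2 of a 4-channel image load are used: 0b1111 -> 0b0101.
  assert(trimDmask(0xF, {0, 2}) == 0x5);
  return 0;
}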
15309 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
15310 SDValue SrcVal = Node->getOperand(2);
15318 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
15320 SDNode *Glued = Node->getGluedNode();
15322 Node->getOperand(0), SL, VReg, SrcVal,
15328 return ToResultReg.getNode();
15333 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
15341 Node->getOperand(i).getValueType(),
15342 Node->getOperand(i)),
15354 unsigned Opcode = Node->getMachineOpcode();
15356 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
15357 !TII->isGather4(Opcode) &&
15359 return adjustWritemask(Node, DAG);
15362 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
15368 case AMDGPU::V_DIV_SCALE_F32_e64:
15369 case AMDGPU::V_DIV_SCALE_F64_e64: {
15373 SDValue Src0 = Node->getOperand(1);
15374 SDValue Src1 = Node->getOperand(3);
15375 SDValue Src2 = Node->getOperand(5);
15379 (Src0 == Src1 || Src0 == Src2))
15436 unsigned InitIdx = 0;
15438 if (TII->isImage(MI)) {
15446 unsigned TFEVal = TFE ? TFE->getImm() : 0;
15447 unsigned LWEVal = LWE ? LWE->getImm() : 0;
15448 unsigned D16Val = D16 ? D16->getImm() : 0;
15450 if (!TFEVal && !LWEVal)
15461 assert(MO_Dmask && "Expected dmask operand in instruction");
15463 unsigned dmask = MO_Dmask->getImm();
15470 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
15476 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15477 if (DstSize < InitIdx)
15480 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15488 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
15489 unsigned NewDst = 0;
15498 for (; SizeLeft; SizeLeft--, CurrIdx++) {
15499 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
15519 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
15532 if (TII->isVOP3(MI.getOpcode())) {
15534 TII->legalizeOperandsVOP3(MRI, MI);
15539 if (!MI.getDesc().operands().empty()) {
15540 unsigned Opc = MI.getOpcode();
15541 bool HasAGPRs = Info->mayNeedAGPRs();
15549 if ((I == Src2Idx) && (HasAGPRs))
15552 if (!Op.isReg() || !Op.getReg().isVirtual())
15554 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
15555 if (!TRI->hasAGPRs(RC))
15557 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
15558 if (!Src || !Src->isCopy() ||
15559 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
15561 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
15565 MRI.setRegClass(Op.getReg(), NewRC);
15568 if (TII->isMAI(MI)) {
15574 AMDGPU::OpName::scale_src0);
15575 if (Src0Idx != -1) {
15577 AMDGPU::OpName::scale_src1);
15578 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
15579 TII->usesConstantBus(MRI, MI, Src1Idx))
15580 TII->legalizeOpWithMove(MI, Src1Idx);
15588 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
15589 if (Src2->isReg() && Src2->getReg().isVirtual()) {
15590 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
15591 if (TRI->isVectorSuperClass(RC)) {
15592 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
15593 MRI.setRegClass(Src2->getReg(), NewRC);
15594 if (Src2->isTied())
15595 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
15604 if (TII->isImage(MI))
15605 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
15679std::pair<unsigned, const TargetRegisterClass *>
15686 if (Constraint.size() == 1) {
15688 switch (Constraint[0]) {
15695 RC = &AMDGPU::SReg_32RegClass;
15698 RC = &AMDGPU::SGPR_64RegClass;
15703 return std::pair(0U, nullptr);
15710 RC = &AMDGPU::VGPR_32RegClass;
15715 return std::pair(0U, nullptr);
15724 RC = &AMDGPU::AGPR_32RegClass;
15729 return std::pair(0U, nullptr);
15738 return std::pair(0U, RC);
15743 if (RegName.consume_front("v")) {
15744 RC = &AMDGPU::VGPR_32RegClass;
15745 } else if (RegName.consume_front("s")) {
15746 RC = &AMDGPU::SGPR_32RegClass;
15747 } else if (RegName.consume_front("a")) {
15748 RC = &AMDGPU::AGPR_32RegClass;
15753 if (RegName.consume_front("[")) {
15764 return std::pair(0U, nullptr);
15767 RC = TRI->getVGPRClassForBitWidth(Width);
15769 RC = TRI->getSGPRClassForBitWidth(Width);
15771 RC = TRI->getAGPRClassForBitWidth(Width);
15773 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15774 return std::pair(Reg, RC);
15780 return std::pair(0U, nullptr);
15782 if (!Failed && Idx < RC->getNumRegs())
15790 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
15796 if (Constraint.size() == 1) {
15797 switch (Constraint[0]) {
15807 } else if (Constraint == "DA" || Constraint == "DB") {
15815 if (Constraint.size() == 1) {
15816 switch (Constraint[0]) {
15833 Val = Val & maskTrailingOnes<uint64_t>(Size);
15840 std::vector<SDValue> &Ops,
15855 unsigned Size = Op.getScalarValueSizeInBits();
15863 Val = C->getSExtValue();
15867 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15873 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
15876 Val = C->getSExtValue();
15880 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15890 if (Constraint.size() == 1) {
15891 switch (Constraint[0]) {
15895 return isInt<16>(Val);
15899 return isInt<32>(Val);
15906 } else if (Constraint.size() == 2) {
15907 if (Constraint == "DA") {
15908 int64_t HiBits = static_cast<int32_t>(Val >> 32);
15909 int64_t LoBits = static_cast<int32_t>(Val);
15913 if (Constraint == "DB") {
15921 unsigned MaxSize) const {
15922 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
15925 MVT VT = Op.getSimpleValueType();
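The register-name constraints parsed above use a "v", "s", or "a" bank prefix, optionally followed by a "[first:last]" range (for example "v5" or "s[8:11]"). The parser below is a simplified, hypothetical stand-in that only illustrates the syntax; it is not the in-tree implementation.

#include <cstdio>
#include <string>
struct ParsedReg { char Bank; unsigned First; unsigned Width; };
static bool parseRegConstraint(const std::string &Name, ParsedReg &Out) {
  if (Name.empty() || (Name[0] != 'v' && Name[0] != 's' && Name[0] != 'a'))
    return false;
  Out.Bank = Name[0];
  unsigned First = 0, Last = 0;
  if (std::sscanf(Name.c_str() + 1, "[%u:%u]", &First, &Last) == 2) {
    Out.First = First;
    Out.Width = Last - First + 1;   // e.g. "s[8:11]" -> 4 consecutive SGPRs
    return Last >= First;
  }
  if (std::sscanf(Name.c_str() + 1, "%u", &First) == 1) {
    Out.First = First;
    Out.Width = 1;                  // e.g. "v5" -> a single VGPR
    return true;
  }
  return false;
}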
15950 switch (UnalignedClassID) {
15951 case AMDGPU::VReg_64RegClassID:
15952 return AMDGPU::VReg_64_Align2RegClassID;
15953 case AMDGPU::VReg_96RegClassID:
15954 return AMDGPU::VReg_96_Align2RegClassID;
15955 case AMDGPU::VReg_128RegClassID:
15956 return AMDGPU::VReg_128_Align2RegClassID;
15957 case AMDGPU::VReg_160RegClassID:
15958 return AMDGPU::VReg_160_Align2RegClassID;
15959 case AMDGPU::VReg_192RegClassID:
15960 return AMDGPU::VReg_192_Align2RegClassID;
15961 case AMDGPU::VReg_224RegClassID:
15962 return AMDGPU::VReg_224_Align2RegClassID;
15963 case AMDGPU::VReg_256RegClassID:
15964 return AMDGPU::VReg_256_Align2RegClassID;
15965 case AMDGPU::VReg_288RegClassID:
15966 return AMDGPU::VReg_288_Align2RegClassID;
15967 case AMDGPU::VReg_320RegClassID:
15968 return AMDGPU::VReg_320_Align2RegClassID;
15969 case AMDGPU::VReg_352RegClassID:
15970 return AMDGPU::VReg_352_Align2RegClassID;
15971 case AMDGPU::VReg_384RegClassID:
15972 return AMDGPU::VReg_384_Align2RegClassID;
15973 case AMDGPU::VReg_512RegClassID:
15974 return AMDGPU::VReg_512_Align2RegClassID;
15975 case AMDGPU::VReg_1024RegClassID:
15976 return AMDGPU::VReg_1024_Align2RegClassID;
15977 case AMDGPU::AReg_64RegClassID:
15978 return AMDGPU::AReg_64_Align2RegClassID;
15979 case AMDGPU::AReg_96RegClassID:
15980 return AMDGPU::AReg_96_Align2RegClassID;
15981 case AMDGPU::AReg_128RegClassID:
15982 return AMDGPU::AReg_128_Align2RegClassID;
15983 case AMDGPU::AReg_160RegClassID:
15984 return AMDGPU::AReg_160_Align2RegClassID;
15985 case AMDGPU::AReg_192RegClassID:
15986 return AMDGPU::AReg_192_Align2RegClassID;
15987 case AMDGPU::AReg_256RegClassID:
15988 return AMDGPU::AReg_256_Align2RegClassID;
15989 case AMDGPU::AReg_512RegClassID:
15990 return AMDGPU::AReg_512_Align2RegClassID;
15991 case AMDGPU::AReg_1024RegClassID:
15992 return AMDGPU::AReg_1024_Align2RegClassID;
16008 if (Info->isEntryFunction()) {
16015 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
16017 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
16018 : TRI->getAlignedHighSGPRForRC(MF, 2,
16019 &AMDGPU::SGPR_64RegClass);
16020 Info->setSGPRForEXECCopy(SReg);
16023 Info->getStackPtrOffsetReg()));
16024 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
16025 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
16029 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
16030 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
16032 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
16033 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
16035 Info->limitOccupancy(MF);
16037 if (ST.isWave32() && !MF.empty()) {
16038 for (auto &MBB : MF) {
16039 for (auto &MI : MBB) {
16040 TII->fixImplicitOperands(MI);
16050 if (ST.needsAlignedVGPRs()) {
16051 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
16057 if (NewClassID != -1)
16058 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
16067 const APInt &DemandedElts,
16069 unsigned Depth) const {
16071 unsigned Opc = Op.getOpcode();
16074 unsigned IID = Op.getConstantOperandVal(0);
16076 case Intrinsic::amdgcn_mbcnt_lo:
16077 case Intrinsic::amdgcn_mbcnt_hi: {
16083 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
16093 Op, Known, DemandedElts, DAG, Depth);
16108 unsigned MaxValue =
16117 switch (MI->getOpcode()) {
16118 case AMDGPU::G_INTRINSIC:
16119 case AMDGPU::G_INTRINSIC_CONVERGENT: {
16122 case Intrinsic::amdgcn_workitem_id_x:
16125 case Intrinsic::amdgcn_workitem_id_y:
16128 case Intrinsic::amdgcn_workitem_id_z:
16131 case Intrinsic::amdgcn_mbcnt_lo:
16132 case Intrinsic::amdgcn_mbcnt_hi: {
16144 case Intrinsic::amdgcn_groupstaticsize: {
16155 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
16158 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
16161 case AMDGPU::G_AMDGPU_SMED3:
16162 case AMDGPU::G_AMDGPU_UMED3: {
16163 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
16190 unsigned Depth) const {
16192 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
16198 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
16225 if (Header->getAlignment() != PrefAlign)
16226 return Header->getAlignment();
16228 unsigned LoopSize = 0;
16236 LoopSize += TII->getInstSizeInBytes(MI);
16237 if (LoopSize > 192)
16242 if (LoopSize <= 64)
16245 if (LoopSize <= 128)
16246 return CacheLineAlign;
16252 auto I = Exit->getFirstNonDebugInstr();
16253 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
16254 return CacheLineAlign;
16263 if (PreTerm == Pre->begin() ||
16264 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
16268 auto ExitHead = Exit->getFirstNonDebugInstr();
16269 if (ExitHead == Exit->end() ||
16270 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
16275 return CacheLineAlign;
16283 N = N->getOperand(0).getNode();
16293 switch (N->getOpcode()) {
16301 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
16302 return !TRI->isSGPRReg(MRI, Reg);
16304 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
16308 return !TRI->isSGPRReg(MRI, Reg);
16312 unsigned AS = L->getAddressSpace();
16343 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
16345 return A->readMem() && A->writeMem();
16380 unsigned Depth) const {
16385 if (Info->getMode().DX10Clamp)
16397 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
16417 << "Hardware instruction generated for atomic "
16419 << " operation at memory scope " << MemScope;
16423 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
16424 Type *EltTy = VT->getElementType();
16425 return VT->getNumElements() == 2 &&
16444 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
16445 unsigned BW = IT->getBitWidth();
16446 return BW == 32 || BW == 64;
16458 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
16460 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
16461 return BW == 32 || BW == 64;
16468 return VT->getNumElements() == 2 &&
16469 VT->getElementType()->getPrimitiveSizeInBits() == 16;
16479 bool HasSystemScope) {
16486 if (HasSystemScope) {
16493 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
16506 const MDNode *NoaliasAddrSpaceMD =
16507 I->getMetadata(LLVMContext::MD_noalias_addrspace);
16508 if (!NoaliasAddrSpaceMD)
16511 for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E;
16513 auto *Low = mdconst::extract<ConstantInt>(
16516 auto *High = mdconst::extract<ConstantInt>(
16538 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
16551 bool HasSystemScope =
16738 if (HasSystemScope)
16790 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16791 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
16792 : &AMDGPU::SReg_32RegClass;
16793 if (!TRI->isSGPRClass(RC) && !isDivergent)
16794 return TRI->getEquivalentSGPRClass(RC);
16795 if (TRI->isSGPRClass(RC) && isDivergent)
16796 return TRI->getEquivalentVGPRClass(RC);
16808 unsigned WaveSize) {
16813 if (!IT || IT->getBitWidth() != WaveSize)
16816 if (!isa<Instruction>(V))
16818 if (!Visited.insert(V).second)
16820 bool Result = false;
16821 for (const auto *U : V->users()) {
16822 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
16823 if (V == U->getOperand(1)) {
16824 switch (Intrinsic->getIntrinsicID()) {
16828 case Intrinsic::amdgcn_if_break:
16829 case Intrinsic::amdgcn_if:
16830 case Intrinsic::amdgcn_else:
16835 if (V == U->getOperand(0)) {
16836 switch (Intrinsic->getIntrinsicID()) {
16840 case Intrinsic::amdgcn_end_cf:
16841 case Intrinsic::amdgcn_loop:
16847 Result = hasCFUser(U, Visited, WaveSize);
16856 const Value *V) const {
16857 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
16858 if (CI->isInlineAsm()) {
16867 for (auto &TC : TargetConstraints) {
16909 return MRI.hasOneNonDBGUse(N0);
16916 if (I.getMetadata("amdgpu.noclobber"))
16918 if (I.getMetadata("amdgpu.last.use"))
16928 if (!Def->isMachineOpcode())
16938 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
16939 PhysReg = AMDGPU::SCC;
16941 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
16996 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
17007 Alignment = RMW->getAlign();
17022 RMW->getType()->isFloatTy();
17025 bool ReturnValueIsUsed = !AI->use_empty();
17034 if (FullFlatEmulation) {
17045 std::prev(BB->end())->eraseFromParent();
17048 Value *LoadedShared = nullptr;
17049 if (FullFlatEmulation) {
17051 Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
17052 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
17060 LoadedShared = Clone;
17067 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
17075 Value *LoadedPrivate;
17078 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
17081 LoadedPrivate, RMW->getValOperand());
17085 auto [ResultLoad, Equal] =
17100 if (FullFlatEmulation) {
17110 if (!FullFlatEmulation) {
17115 MDNode *RangeNotPrivate =
17118 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
17126 if (ReturnValueIsUsed) {
17129 if (FullFlatEmulation)
17144 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
17145 ConstVal && ConstVal->isNullValue()) {
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
#define LLVM_ATTRIBUTE_UNUSED
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
Returns the sub type a function will return at a given Idx; should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx.
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
iv Induction Variable Users
static const unsigned MaxDepth
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
unsigned const TargetRegisterInfo * TRI
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static constexpr Register SPReg
const SmallVectorImpl< MachineOperand > & Cond
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static constexpr int Concat[]
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
bool hasCvtPkF16F32Inst() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool hasBF16ConversionInsts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, ...
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
LLVM_READONLY int getExactLog2Abs() const
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
This class represents an incoming formal argument to a Function.
bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
const Function * getParent() const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
Value * getNewValOperand()
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getCompareOperand()
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Min
*p = old <signed v ? old : v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
bool isFPPredicate() const
bool isIntPredicate() const
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Argument * getArg(unsigned i) const
bool hasMemoryAtomicFaddF32DenormalSupport() const
bool hasD16Images() const
bool hasMinimum3Maximum3F32() const
bool useVGPRIndexMode() const
bool hasAtomicDsPkAdd16Insts() const
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
bool hasAtomicFMinFMaxF64FlatInsts() const
bool hasDot7Insts() const
bool hasApertureRegs() const
bool hasFlatInstOffsets() const
bool hasAtomicFMinFMaxF32FlatInsts() const
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasBCNT(unsigned Size) const
bool hasLDSLoadB96_B128() const
Returns true if the target supports global_load_lds_dwordx3/global_load_lds_dwordx4 or buffer_load_dw...
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
bool hasMultiDwordFlatScratchAddressing() const
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
bool hasPrivEnabledTrap2NopBug() const
bool hasUnalignedDSAccessEnabled() const
const SIInstrInfo * getInstrInfo() const override
bool hasDot1Insts() const
bool hasAtomicFaddRtnInsts() const
Align getStackAlignment() const
bool hasScalarSubwordLoads() const
bool enableFlatScratch() const
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
bool supportsGetDoorbellID() const
bool hasFlatAtomicFaddF32Inst() const
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasMinimum3Maximum3PKF16() const
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
const SIFrameLowering * getFrameLowering() const override
bool hasMinimum3Maximum3F16() const
bool hasAtomicFMinFMaxF32GlobalInsts() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
bool isTrapHandlerEnabled() const
bool hasLDSFPAtomicAddF64() const
bool hasFlatGlobalInsts() const
bool getScalarizeGlobalBehavior() const
bool hasScalarSMulU64() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
bool hasNSAEncoding() const
bool hasSMemRealTime() const
bool usePRTStrictNull() const
bool hasAtomicFMinFMaxF64GlobalInsts() const
bool hasUnalignedScratchAccessEnabled() const
bool hasAtomicFlatPkAdd16Insts() const
bool hasUnalignedBufferAccessEnabled() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasImageGather4D16Bug() const
bool hasDot10Insts() const
bool supportsMinMaxDenormModes() const
bool hasAtomicFaddInsts() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const
bool hasAtomicBufferPkAddBF16Inst() const
bool hasAtomicFaddNoRtnInsts() const
bool hasFlatBufferGlobalAtomicFaddF64Inst() const
bool hasScalarDwordx3Loads() const
bool hasLDSFPAtomicAddF32() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
bool hasDot8Insts() const
bool hasDS96AndDS128() const
bool useFlatForGlobal() const
Generation getGeneration() const
bool hasAtomicBufferGlobalPkAddF16Insts() const
bool hasScalarAddSub64() const
bool hasUnpackedD16VMem() const
bool hasAtomicGlobalPkAddBF16Inst() const
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
bool hasPackedTID() const
bool hasAddNoCarry() const
bool hasGWSAutoReplay() const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
int64_t getOffset() const
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
BasicBlock::iterator GetInsertPoint() const
BasicBlock * GetInsertBlock() const
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
LLVMContext & getContext() const
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
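As a rough illustration of the IRBuilder methods listed above (a generic sketch, not code from this file), assuming an existing Function *F, a condition Value *Cond, and a pointer Value *Ptr:

// Illustrative only: build a small diamond with a load and a PHI.
LLVMContext &Ctx = F->getContext();
BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", F);
BasicBlock *Then  = BasicBlock::Create(Ctx, "then", F);
BasicBlock *Merge = BasicBlock::Create(Ctx, "merge", F);
IRBuilder<> B(Entry);
B.CreateCondBr(Cond, Then, Merge);                // conditional branch out of entry
B.SetInsertPoint(Then);
Value *V = B.CreateAlignedLoad(B.getInt32Ty(), Ptr, MaybeAlign(4), "v");
B.CreateBr(Merge);                                // unconditional branch to merge
B.SetInsertPoint(Merge);
PHINode *Phi = B.CreatePHI(B.getInt32Ty(), 2, "phi");
Phi->addIncoming(V, Then);
Phi->addIncoming(B.getInt32(0), Entry);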
Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
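A quick sketch of the LLT helpers listed above (illustrative values only):

LLT S32 = LLT::scalar(32);                  // 32-bit scalar "bag of bits"
LLT P1  = LLT::pointer(1, 64);              // 64-bit pointer in address space 1
bool IsScalar = S32.isScalar();             // true
unsigned Bits = S32.getScalarSizeInBits();  // 32
TypeSize PtrBits = P1.getSizeInBits();      // 64 bits
LLT S16 = S32.changeElementSize(16);        // scalar case: simply a 16-bit scalar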
This is an important class for using LLVM in a threaded context.
std::optional< StringRef > getSyncScopeName(SyncScope::ID Id) const
getSyncScopeName - Returns the name of a SyncScope::ID registered with LLVMContext,...
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
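The two sync-scope calls above round-trip between a textual scope name and its SyncScope::ID. The "agent" string below is an AMDGPU-style example and purely illustrative; Ctx is an existing LLVMContext:

SyncScope::ID SSID = Ctx.getOrInsertSyncScopeID("agent");     // register (or look up) the scope
std::optional<StringRef> Name = Ctx.getSyncScopeName(SSID);   // yields "agent" if registered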
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
const MDOperand & getOperand(unsigned I) const
unsigned getNumOperands() const
Return number of MDNode operands.
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
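A short sketch of the MVT queries above (illustrative only):

MVT V4I32 = MVT::getVectorVT(MVT::i32, 4);      // v4i32
unsigned NElts = V4I32.getVectorNumElements();  // 4
MVT Elt = V4I32.getScalarType();                // i32
TypeSize Bits = V4I32.getSizeInBits();          // 128 bits
TypeSize Bytes = V4I32.getStoreSize();          // 16 bytes written by a store
MVT I48 = MVT::getIntegerVT(48);                // an odd-sized integer type
bool Pow2 = V4I32.isPow2VectorType();           // true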
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Align getAlignment() const
Return alignment of the basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
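The MachineInstrBuilder helpers above are normally driven through BuildMI. A hedged sketch, assuming TII, MRI, MBB, an insertion iterator I, a DebugLoc DL and a branch target MachineBasicBlock *Target are in scope; AMDGPU::S_MOV_B32 and AMDGPU::S_CBRANCH_SCC1 are just convenient opcodes to show the pattern:

// Sketch only: materialize an immediate, then emit a conditional branch.
Register Dst = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Dst)
    .addImm(42);                 // immediate operand
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
    .addMBB(Target);             // basic-block operand for the branch target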
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
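Combined with MachineFunction::getMachineMemOperand listed earlier, the flags above are or'd together when describing a memory access. A hedged sketch, assuming MF is an existing MachineFunction:

// Sketch only: describe a 32-bit, 4-byte-aligned, invariant load from an unknown location.
MachineMemOperand *MMO = MF.getMachineMemOperand(
    MachinePointerInfo(),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
        MachineMemOperand::MOInvariant,
    LLT::scalar(32), Align(4));
Align A = MMO->getBaseAlign();                   // Align(4)
MachineMemOperand::Flags Fl = MMO->getFlags();   // the or'd flags from above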
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
A Module instance is used to store all the information related to an LLVM module.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool hasWorkGroupIDZ() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node can be combined with an fmul to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g. {edx}), return the register number and the register class for the register.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
bool getAddrModeArguments(IntrinsicInst *, SmallVectorImpl< Value * > &, Type *&) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
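Hooks such as LowerOperation and ReplaceNodeResults listed above are the usual entry points for custom lowering. The dispatch typically has the generic shape below; this is a sketch with a hypothetical MyTargetLowering class and an illustrative lowerGlobalAddress helper, not the actual SITargetLowering implementation:

// Generic shape of a LowerOperation override; the cases are illustrative.
SDValue MyTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  case ISD::GlobalAddress:
    return lowerGlobalAddress(Op, DAG);   // hypothetical helper
  case ISD::FP_EXTEND:
    return lowerFP_EXTEND(Op, DAG);       // mirrors the helper style listed above
  default:
    llvm_unreachable("unexpected custom lowering");
  }
}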
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getRegister(Register Reg, EVT VT)
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
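The SelectionDAG builders above compose in a fairly uniform way. A small hedged sketch of building an extending load followed by an add; Chain, Ptr and DL are assumed to exist in the surrounding lowering code:

// Sketch only: zero-extend a 16-bit load to i32, add a constant, and merge value + chain.
SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
SDValue Ld = DAG.getExtLoad(ISD::ZEXTLOAD, DL, MVT::i32, Chain, Ptr,
                            MachinePointerInfo(), MVT::i16);
SDValue Sum = DAG.getNode(ISD::ADD, DL, MVT::i32, Ld, Zero);
SDValue Pair = DAG.getMergeValues({Sum, Ld.getValue(1)}, DL);  // result value and output chain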
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
iterator insert(iterator I, T &&Elt)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
constexpr size_t size() const
size - Get the string size.
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
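StringRef and StringSwitch above combine naturally when parsing textual names such as register or constraint strings. An illustrative sketch (the .Default call is part of StringSwitch even though it is not listed above):

StringRef Name = "vcc_lo";
if (Name.starts_with("vcc")) { /* ... */ }
unsigned Kind = StringSwitch<unsigned>(Name)
                    .Case("vcc_lo", 1)
                    .Case("vcc_hi", 2)
                    .Default(0);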
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
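The configuration setters above (addRegisterClass, setOperationAction, setTruncStoreAction, setBooleanContents, setTargetDAGCombine, computeRegisterProperties) are what a target's TargetLowering constructor calls to describe legality. A generic hedged sketch as it would appear inside a TargetLowering subclass constructor; MyTarget::GPR32RegClass is a placeholder register class and STI an assumed subtarget, not names from this file:

// Generic shape of target-lowering setup; not the SI constructor itself.
addRegisterClass(MVT::i32, &MyTarget::GPR32RegClass);   // hypothetical register class
setOperationAction(ISD::SDIV, MVT::i32, Expand);        // expand to a sequence/libcall
setOperationAction(ISD::BITCAST, MVT::f32, Legal);
setTruncStoreAction(MVT::i64, MVT::i16, Expand);
setBooleanContents(ZeroOrOneBooleanContent);
setTargetDAGCombine({ISD::FADD, ISD::FMUL});            // request combine callbacks
computeRegisterProperties(STI.getRegisterInfo());       // derive the remaining properties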
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g. {edx}), return the register number and the register class for the register.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
A Use represents the edge between a Value definition and its users.
User * getUser() const
Returns the User that contains this Use.
unsigned getOperandNo() const
Return the operand # of this use in its User.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< use_iterator > uses()
void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
constexpr bool isZero() const
const ParentTy * getParent() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ BUFFER_ATOMIC_COND_SUB_U32
@ TBUFFER_LOAD_FORMAT_D16
@ TBUFFER_STORE_FORMAT_D16
@ BUFFER_STORE_FORMAT_D16
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
@ TowardZeroF32_TowardNegativeF64
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
@ C
The default llvm calling convention, compatible with C.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SET_ROUNDING
Set rounding mode.
@ SIGN_EXTEND
Conversion operators; SIGN_EXTEND widens an integer by replicating its sign bit into the new high bits.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
@ SMULO
SMULO/UMULO - Same as the overflow-reporting add/subtract nodes above, but for multiplication.
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum, which behave the same as FMINNUM_IEEE and FMAXNUM_IEEE except when either operand is an sNaN.
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
Function * getDeclarationIfExists(Module *M, ID id, ArrayRef< Type * > Tys, FunctionType *FT=nullptr)
This version supports overloaded intrinsics.
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ System
Synchronized with respect to all concurrently executing threads.
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
int popcount(T Value) noexcept
Count the number of set bits in a value.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
testing::Matcher< const detail::ErrorHolder & > Failed()
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
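For illustration, a minimal sketch of bit_width (the demo function name is invented for this example, and llvm/ADT/bit.h is assumed to declare bit_width as listed above):
  #include "llvm/ADT/bit.h"
  #include <cassert>
  void bitWidthDemo() {
    assert(llvm::bit_width(0b101u) == 3); // 5 needs three bits
    assert(llvm::bit_width(1u) == 1);
    assert(llvm::bit_width(0u) == 0);     // zero needs no bits
  }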
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is congruent to Skew modulo Align.
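A minimal sketch of alignDown in use (demo function name invented here; llvm/Support/MathExtras.h is assumed to provide the declaration above):
  #include "llvm/Support/MathExtras.h"
  #include <cassert>
  void alignDownDemo() {
    assert(llvm::alignDown(20u, 8u) == 16u);     // largest multiple of 8 <= 20
    assert(llvm::alignDown(22u, 8u, 5u) == 21u); // largest value <= 22 that is 5 (mod 8)
  }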
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the smallest power of two that is greater than or equal to the given value.
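A short, illustrative-only sketch of PowerOf2Ceil (function name made up; llvm/Support/MathExtras.h assumed):
  #include "llvm/Support/MathExtras.h"
  #include <cassert>
  void powerOf2CeilDemo() {
    assert(llvm::PowerOf2Ceil(17) == 32); // rounds up to the next power of two
    assert(llvm::PowerOf2Ceil(16) == 16); // powers of two are returned unchanged
  }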
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
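As a hedged sketch of what isShiftedMask_64 accepts and rejects (demo function invented; llvm/Support/MathExtras.h assumed):
  #include "llvm/Support/MathExtras.h"
  #include <cassert>
  void shiftedMaskDemo() {
    assert(llvm::isShiftedMask_64(0x00FF0000));  // one contiguous run of ones
    assert(!llvm::isShiftedMask_64(0xFF00FF00)); // two separate runs
    assert(!llvm::isShiftedMask_64(0));          // the run must be non-empty
  }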
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor of the base-2 logarithm of the specified value, or -1 if the value is zero.
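A minimal sketch of the floor semantics of Log2_32 (demo function invented; llvm/Support/MathExtras.h assumed):
  #include "llvm/Support/MathExtras.h"
  #include <cassert>
  void log2Demo() {
    assert(llvm::Log2_32(32) == 5u);
    assert(llvm::Log2_32(33) == 5u); // floor semantics: 33 lies between 2^5 and 2^6
  }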
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant bit, stopping at the first 1.
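A small sketch contrasting countr_zero and countl_zero (demo function invented; llvm/ADT/bit.h assumed):
  #include "llvm/ADT/bit.h"
  #include <cassert>
  #include <cstdint>
  void countZeroDemo() {
    uint32_t V = 0b101000;              // 40
    assert(llvm::countr_zero(V) == 3);  // three trailing zero bits
    assert(llvm::countl_zero(V) == 26); // 32 minus the 6 significant bits
  }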
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
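A minimal sketch of splitting a 64-bit value with Hi_32/Lo_32 (demo function invented; llvm/Support/MathExtras.h assumed):
  #include "llvm/Support/MathExtras.h"
  #include <cassert>
  #include <cstdint>
  void splitDemo() {
    uint64_t V = 0x1234567890ABCDEFULL;
    assert(llvm::Hi_32(V) == 0x12345678u); // upper 32 bits
    assert(llvm::Lo_32(V) == 0x90ABCDEFu); // lower 32 bits
  }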
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
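A short worked example of divideCeil (demo function invented; llvm/Support/MathExtras.h assumed):
  #include "llvm/Support/MathExtras.h"
  #include <cassert>
  void divideCeilDemo() {
    assert(llvm::divideCeil(10u, 3u) == 4u); // 10/3 rounded up
    assert(llvm::divideCeil(9u, 3u) == 3u);  // exact division is unchanged
  }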
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns the smallest multiple of A that is needed to store Size bytes.
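A minimal sketch of alignTo with an Align argument (demo function invented; llvm/Support/Alignment.h assumed):
  #include "llvm/Support/Alignment.h"
  #include <cassert>
  void alignToDemo() {
    assert(llvm::alignTo(10, llvm::Align(8)) == 16); // 10 rounded up to a multiple of 8
    assert(llvm::alignTo(16, llvm::Align(8)) == 16); // already aligned
  }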
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
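A quick sketch of the N-bit signed ranges returned by maxIntN/minIntN (demo function invented; llvm/Support/MathExtras.h assumed):
  #include "llvm/Support/MathExtras.h"
  #include <cassert>
  void intRangeDemo() {
    assert(llvm::maxIntN(8) == 127);   // 2^(8-1) - 1
    assert(llvm::minIntN(8) == -128);  // -2^(8-1)
    assert(llvm::maxIntN(16) == 32767);
  }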
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that is known to hold at a byte offset of Offset from an A-aligned address.
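A hedged sketch of commonAlignment (demo function invented; llvm/Support/Alignment.h assumed):
  #include "llvm/Support/Alignment.h"
  #include <cassert>
  void commonAlignmentDemo() {
    // A 16-byte aligned base plus an offset of 8 is only known to be 8-byte aligned.
    assert(llvm::commonAlignment(llvm::Align(16), 8) == llvm::Align(8));
    // An offset of 32 preserves the full base alignment.
    assert(llvm::commonAlignment(llvm::Align(16), 32) == llvm::Align(16));
  }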
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
static const fltSemantics & IEEEhalf() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
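For illustration, a minimal sketch of building EVTs (demo function invented; the usual llvm/CodeGen/ValueTypes.h and llvm/IR/LLVMContext.h headers are assumed):
  #include "llvm/CodeGen/ValueTypes.h"
  #include "llvm/IR/LLVMContext.h"
  #include <cassert>
  void evtDemo() {
    llvm::LLVMContext Ctx;
    llvm::EVT V4I32 = llvm::EVT::getVectorVT(Ctx, llvm::MVT::i32, 4);
    assert(V4I32.isVector() && V4I32.getVectorNumElements() == 4);
    assert(V4I32.getSizeInBits() == 128);    // 4 x 32 bits
    llvm::EVT I48 = llvm::EVT::getIntegerVT(Ctx, 48);
    assert(!I48.isSimple());                 // i48 has no MVT, so it is an extended EVT
  }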
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has fewer bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the given vector type has a power-of-2 number of elements.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
bool isUnknown() const
Returns true if we don't know any bits.
void resetAll()
Resets the known state of all bits.
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute the known bits resulting from the addition of LHS and RHS.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
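A hedged sketch combining KnownBits::add and countMinLeadingZeros (demo function invented; llvm/Support/KnownBits.h assumed; the exact count shown reflects the standard carry analysis):
  #include "llvm/Support/KnownBits.h"
  #include <cassert>
  void knownBitsDemo() {
    llvm::KnownBits L(8), R(8);
    L.Zero.setHighBits(4); // L is known to be < 16 (top four bits are zero)
    R.Zero.setHighBits(4); // likewise for R
    llvm::KnownBits Sum = llvm::KnownBits::add(L, R);
    // The sum is at most 15 + 15 = 30, so the top three bits are known zero.
    assert(Sum.countMinLeadingZeros() == 3);
  }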
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const