#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "si-lower"

    cl::desc("Do not align and prefetch loops"),

    "amdgpu-use-divergent-register-indexing", cl::Hidden,
    cl::desc("Use indirect register addressing for divergent indexes"),

  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
      return AMDGPU::SGPR0 + Reg;
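  // The loop above walks the SGPR_32 register class in index order and, for
  // the first register that passes its availability check, forms the physical
  // register as AMDGPU::SGPR0 + Reg and returns it.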
      {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
       MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
       MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
       MVT::i1, MVT::v32i32},

      {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
       MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
       MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
       MVT::i1, MVT::v32i32},

      {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);

      {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
       MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
       MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},

      {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
       MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
       MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},

      {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
       MVT::v3i16, MVT::v4i16, MVT::Other},

      {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);

      {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
       MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
       MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
       MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
       MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
       MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
       MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
       MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {

  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {

  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {

  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {

  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {

      {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},

      {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
       MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},

      {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);

      {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
       MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
       MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
       MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},

      {MVT::f32, MVT::f64}, Legal);
      {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
       MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
       MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {

      {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

      {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

      {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
       MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {

      {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);

      {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
       MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},

  for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})

  for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})

      {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},

      {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
       MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
       MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
       MVT::v32f16, MVT::v32bf16},

      {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);

      {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

      {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
       MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,

      {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
       MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
       MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
       MVT::i16, MVT::bf16, MVT::i8, MVT::i128},

      {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
       MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
       MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
       MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
  static const MCPhysReg RCRegs[] = {AMDGPU::MODE};

                                           EVT DestVT, EVT SrcVT) const {

                                           LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&

    return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);

  return VT.isInteger() ? MVT::i32 : MVT::f32;

      return (NumElts + 1) / 2;

    return NumElts * ((Size + 31) / 32);

                        unsigned &NumIntermediates, MVT &RegisterVT) const {

    if (ScalarVT == MVT::bf16) {
      RegisterVT = MVT::i32;
      IntermediateVT = MVT::v2bf16;

      RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
      IntermediateVT = RegisterVT;

    NumIntermediates = (NumElts + 1) / 2;
    return NumIntermediates;

    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts;
    return NumIntermediates;

  if (Size < 16 && Subtarget->has16BitInsts()) {
    RegisterVT = MVT::i16;
    IntermediateVT = ScalarVT;
    NumIntermediates = NumElts;
    return NumIntermediates;

    RegisterVT = MVT::i32;
    IntermediateVT = ScalarVT;
    NumIntermediates = NumElts;
    return NumIntermediates;

    RegisterVT = MVT::i32;
    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts * ((Size + 31) / 32);
    return NumIntermediates;

      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
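  // Taken together, the fragments above implement the calling-convention
  // breakdown for vectors: bf16 elements travel in i32 registers with a
  // v2bf16 intermediate type, other 16-bit elements are packed two per
  // register as v2i16/v2f16, scalars narrower than 16 bits use i16 registers
  // when 16-bit instructions are available, and everything else is widened
  // to i32 registers with the register count rounded up per 32 bits.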
                                 unsigned MaxNumLanes) {
  assert(MaxNumLanes != 0);

  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());

                                 unsigned MaxNumLanes) {
  auto *ST = dyn_cast<StructType>(Ty);

  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));

      DL.getPointerSizeInBits(AS) == 192)

       DL.getPointerSizeInBits(AS) == 160) ||
       DL.getPointerSizeInBits(AS) == 192))

                                          unsigned IntrID) const {
  if (CI.hasMetadata(LLVMContext::MD_invariant_load))

    if (RsrcIntr->IsImage) {

    if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
      Info.ptrVal = RsrcArg;

  bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;

    if (RsrcIntr->IsImage) {
      unsigned MaxNumLanes = 4;

          std::numeric_limits<unsigned>::max());
    if (RsrcIntr->IsImage) {
      unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();

    if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
      Info.memVT = MVT::i32;

  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();

  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
        std::numeric_limits<unsigned>::max());

  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {

  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.ptrVal = nullptr;

  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {

  case Intrinsic::amdgcn_global_atomic_csub: {

  case Intrinsic::amdgcn_image_bvh_intersect_ray: {

  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_atomic_cond_sub_u32: {

  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64: {

  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
    Info.memVT = MVT::i32;

    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)

  case Intrinsic::amdgcn_global_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();

  case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
    Info.memVT = MVT::i32;

  case Intrinsic::amdgcn_s_prefetch_data: {

  case Intrinsic::amdgcn_addrspacecast_nonnull: {
    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();
                                            Type *&AccessTy) const {
  switch (II->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_csub:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
    Ptr = II->getArgOperand(0);
  case Intrinsic::amdgcn_global_load_lds:
    Ptr = II->getArgOperand(1);

  AccessTy = II->getType();

                                                unsigned AddrSpace) const {
  return AM.Scale == 0 &&
                                    AM.BaseOffs, AddrSpace, FlatVariant));

    return isLegalMUBUFAddressingMode(AM);

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
  if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))

  if (AM.HasBaseReg) {

    return isLegalMUBUFAddressingMode(AM);

  if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)

               : isLegalMUBUFAddressingMode(AM);
    unsigned Size, unsigned AddrSpace, Align Alignment,

    Align RequiredAlignment(
        Alignment < RequiredAlignment)

      RequiredAlignment = Align(4);

        *IsFast = (Alignment >= RequiredAlignment) ? 64
                  : (Alignment < Align(4))         ? 32

        *IsFast = (Alignment >= RequiredAlignment) ? 96
                  : (Alignment < Align(4))         ? 32

      RequiredAlignment = Align(8);

        *IsFast = (Alignment >= RequiredAlignment) ? 128
                  : (Alignment < Align(4))         ? 32

      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;

    return Alignment >= RequiredAlignment ||

    bool AlignedBy4 = Alignment >= Align(4);
      *IsFast = AlignedBy4;

    return Alignment >= Align(4) ||

  return Size >= 32 && Alignment >= Align(4);
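  // In this hook *IsFast is not a plain boolean: it is set to the access
  // width, in bits, that the hardware handles efficiently at the given
  // alignment (e.g. 64/96/128 for sufficiently aligned DS accesses, 32 when
  // only 4-byte aligned), and 0 when the misaligned access would be slow.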
                                                    unsigned *IsFast) const {
                                            Alignment, Flags, IsFast);

  if (Op.size() >= 16 &&

  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))

  const MemSDNode *MemNode = cast<MemSDNode>(N);

                                           unsigned DestAS) const {
  return TM.isNoopAddrSpaceCast(SrcAS, DestAS);

                                               unsigned Index) const {

  auto [InputPtrReg, RC, ArgTy] =

      Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

                                                 const SDLoc &SL) const {

                                                 const SDLoc &SL) const {
  std::optional<uint32_t> KnownSize =
  if (KnownSize.has_value())

    Val = getFPExtOrFPRound(DAG, Val, SL, VT);

SDValue SITargetLowering::lowerKernargMemParameter(

    int64_t OffsetDiff = Offset - AlignDownOffset;

    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);

    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);

  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);

      ExtType, SL, VA.getLocVT(), Chain, FIN,
SDValue SITargetLowering::getPreloadedValue(

    Reg = &WorkGroupIDX;
    RC = &AMDGPU::SReg_32RegClass;

    Reg = &WorkGroupIDY;
    RC = &AMDGPU::SReg_32RegClass;

    Reg = &WorkGroupIDZ;
    RC = &AMDGPU::SReg_32RegClass;

  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {

           "vector type argument should have been split");

      bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);

               "unexpected vector split in ps argument type");

        Info->markPSInputAllocated(PSInputNum);
        Info->markPSInputEnabled(PSInputNum);

  if (Info.hasWorkItemIDX()) {

  if (Info.hasWorkItemIDY()) {
      Info.setWorkItemIDY(
      unsigned Reg = AMDGPU::VGPR1;

  if (Info.hasWorkItemIDZ()) {
      Info.setWorkItemIDZ(
      unsigned Reg = AMDGPU::VGPR2;

  if (RegIdx == ArgVGPRs.size()) {

  unsigned Reg = ArgVGPRs[RegIdx];
  assert(Reg != AMDGPU::NoRegister);

                               unsigned NumArgRegs) {
  if (RegIdx == ArgSGPRs.size())

  unsigned Reg = ArgSGPRs[RegIdx];
  assert(Reg != AMDGPU::NoRegister);

  assert(Reg != AMDGPU::NoRegister);
  const unsigned Mask = 0x3ff;

  if (Info.hasWorkItemIDX()) {
    Info.setWorkItemIDX(Arg);

  if (Info.hasWorkItemIDY()) {
    Info.setWorkItemIDY(Arg);

  if (Info.hasWorkItemIDZ())

  const unsigned Mask = 0x3ff;

  if (Info.hasImplicitArgPtr())

  if (Info.hasWorkGroupIDX())

  if (Info.hasWorkGroupIDY())

  if (Info.hasWorkGroupIDZ())

  if (Info.hasLDSKernelId())

    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);

    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
  bool InPreloadSequence = true;
  bool AlignedForImplictArgs = false;
  unsigned ImplicitArgOffset = 0;
  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())

    unsigned ArgIdx = Arg.getArgNo();

    if (InIdx < Ins.size() &&
        (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))

    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           Ins[InIdx].getOrigArgIndex() == ArgIdx;
      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];
      unsigned ArgOffset = ArgLoc.getLocMemOffset();
      unsigned NumAllocSGPRs =
          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;

      if (Arg.hasAttribute("amdgpu-hidden-argument")) {
        if (!AlignedForImplictArgs) {
              alignTo(LastExplicitArgOffset,
              LastExplicitArgOffset;
          AlignedForImplictArgs = true;
        ArgOffset += ImplicitArgOffset;

      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        assert(InIdx >= 1 && "No previous SGPR");
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);

      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
        InPreloadSequence = false;

          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);

      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {

      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;

  if (Info.hasLDSKernelId()) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
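  // The preload loop above assigns "inreg" kernel arguments to user SGPRs in
  // declaration order: hidden arguments are first rebased past the implicit
  // argument block, arguments smaller than four bytes reuse the previous
  // argument's SGPR, padding between arguments is accounted for in SGPR
  // units, and the first argument that cannot be placed ends the preload
  // sequence by clearing InPreloadSequence.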
                                    bool IsShader) const {

    assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");

  unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();

    unsigned NumRequiredSystemSGPRs =
        Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
        Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
    for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDY()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDZ()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupInfo()) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    unsigned PrivateSegmentWaveByteOffsetReg;

      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();

      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);

      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);

         Info.getNumPreloadedSGPRs() >= 16);

  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);

    HasStackObjects = true;

  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {

      Info.setScratchRSrcReg(PrivateSegmentBufferReg);

      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);

      Info.setScratchRSrcReg(ReservedBufferReg);

  if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
    Info.setStackPtrOffsetReg(AMDGPU::SGPR32);

    for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
      if (!MRI.isLiveIn(Reg)) {
        Info.setStackPtrOffsetReg(Reg);

  if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)

  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);
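  // Scratch setup for entry functions, as the fragments above show: when
  // flat scratch is not in use, the private segment buffer (or a reserved
  // SGPR quad) becomes the scratch resource descriptor; SGPR32 is preferred
  // as the stack pointer if it is not already a live-in, otherwise the first
  // free SGPR_32 is taken; and SGPR33 serves as the frame pointer whenever
  // the frame lowering decides one is required.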
  return !Info->isEntryFunction();

  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());

    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;

    Entry->addLiveIn(*I);

    for (auto *Exit : Exits)
              TII->get(TargetOpcode::COPY), *I)

        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());

          !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
          !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());

           !Info->hasWorkGroupIDZ());
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);

    unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
    if ((PsInputBits & 0x7F) == 0 ||
        ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))

  } else if (IsKernel) {

    Splits.append(Ins.begin(), Ins.end());

  } else if (!IsGraphics) {

  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {

    if (IsEntryFunc && VA.isMemLoc()) {

      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {

          int64_t OffsetDiff = Offset - AlignDownOffset;

              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];

          NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;

          if (PreloadRegs.size() == 1) {
            Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);

                                       TRI->getRegSizeInBits(*RC)));

            for (auto Reg : PreloadRegs) {

                                              PreloadRegs.size()),

          NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

            "hidden argument in kernel signature was not preloaded",

            lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
                                     Alignment, Ins[i].Flags.isSExt(), &Ins[i]);

          dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));

    if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);

      if (AMDGPU::VGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::VGPR_32RegClass;
      else if (AMDGPU::SGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::SGPR_32RegClass;

  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain
  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);

    unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
    for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
      if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))

  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {

    SDValue Arg = OutVals[RealRVLocIdx];

  if (!Info->isEntryFunction()) {

      if (AMDGPU::SReg_64RegClass.contains(*I))
      else if (AMDGPU::SReg_32RegClass.contains(*I))

  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
    auto &ArgUsageInfo =
    CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);

    const auto [OutgoingArg, ArgRC, ArgTy] =

    const auto [IncomingArg, IncomingArgRC, Ty] =
    assert(IncomingArgRC == ArgRC);

    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;

      InputReg = getImplicitArgPtr(DAG, DL);

      std::optional<uint32_t> Id =
      if (Id.has_value()) {

    if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))

      unsigned SpecialArgOffset =

  auto [OutgoingArg, ArgRC, Ty] =
    std::tie(OutgoingArg, ArgRC, Ty) =
    std::tie(OutgoingArg, ArgRC, Ty) =

  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");

  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {

                           : IncomingArgY ? *IncomingArgY

  if (OutgoingArg->isRegister()) {
    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
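  // This is the tail of the special-input forwarding logic: implicit ABI
  // values the callee needs (implicit argument pointer, workgroup IDs, and
  // the workitem IDs) are copied from the caller's incoming arguments into
  // the callee's expected registers or stack slots. The three workitem IDs
  // share one VGPR as packed 10-bit fields, which is why the 0x3ff masks
  // appear in the input-allocation fragments earlier.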
  if (Callee->isDivergent())

  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);

  if (!CallerPreserved)

  bool CCMatch = CallerCC == CalleeCC;

    if (Arg.hasByValAttr())

    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))

  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

  for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {

    if (!CCVA.isRegLoc())

    if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
          dbgs() << "Cannot tail call due to divergent outgoing argument in "
  if (IsChainCallConv) {

    RequestedExec = CLI.Args.back();
    assert(RequestedExec.Node && "No node for EXEC");

    assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
    CLI.Outs.pop_back();

      assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
      CLI.Outs.pop_back();

           "Haven't popped all the pieces of the EXEC mask");

  bool IsSibCall = false;

        "unsupported call to variadic function ");

          "unsupported required tail call to function ");

                                                   Outs, OutVals, Ins, DAG);

           "site marked musttail or on llvm.amdgcn.cs.chain");

  if (!TailCallOpt && IsTailCall)

  if (!IsSibCall || IsChainCallConv) {

    RegsToPass.emplace_back(IsChainCallConv
                                ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                                : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,

  const unsigned NumSpecialInputs = RegsToPass.size();

  MVT PtrVT = MVT::i32;

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {

      RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));

      int32_t Offset = LocMemOffset;

      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()

                              ? Flags.getNonZeroByValAlign()

      if (Outs[i].Flags.isByVal()) {

            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);

                               Outs[i].Flags.getNonZeroByValAlign(),
                               nullptr, std::nullopt, DstInfo,

            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);

  if (!MemOpChains.empty())

  unsigned ArgIdx = 0;
  for (auto [Reg, Val] : RegsToPass) {
    if (ArgIdx++ >= NumSpecialInputs &&
        (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {

  if (IsTailCall && !IsSibCall) {

  std::vector<SDValue> Ops({Chain});

    Ops.push_back(Callee);

    Ops.push_back(Callee);

  if (IsChainCallConv)
    Ops.push_back(RequestedExec.Node);

  for (auto &[Reg, Val] : RegsToPass)

  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");

                                 MVT::Glue, GlueOps),

    Ops.push_back(InGlue);

    return DAG.getNode(OPC, DL, MVT::Other, Ops);

  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);

  uint64_t CalleePopBytes = NumBytes;
  EVT VT = Op.getValueType();

  MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();

         "Stack grows upwards for AMDGPU");

  if (Alignment && *Alignment > StackAlign) {

  if (isa<ConstantSDNode>(Size))

  if (Op.getValueType() != MVT::i32)

  assert(Op.getValueType() == MVT::i32);

                              Op.getOperand(0), IntrinID, GetRoundBothImm);

  SDValue RoundModeTimesNumBits =

                                    TableEntry, EnumOffset);

  if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
        static_cast<uint32_t>(ConstMode->getZExtValue()),

    if (UseReducedTable) {
      SDValue RoundModeTimesNumBits =

      SDValue RoundModeTimesNumBits =

      NewMode = TruncTable;

                          ReadFirstLaneID, NewMode);

                    IntrinID, RoundBothImm, NewMode);

  if (Op->isDivergent())

  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = Src.getValueType();

  EVT DstVT = Op.getValueType();

  if (Op.getValueType() != MVT::i64)

                              Op.getOperand(0), IntrinID, ModeHwRegImm);
                              Op.getOperand(0), IntrinID, TrapHwRegImm);

  if (Op.getOperand(1).getValueType() != MVT::i64)

                            ReadFirstLaneID, NewModeReg);
                            ReadFirstLaneID, NewTrapReg);

  unsigned ModeHwReg =
  unsigned TrapHwReg =

                                IntrinID, ModeHwRegImm, NewModeReg);
                                IntrinID, TrapHwRegImm, NewTrapReg);

          .Case("m0", AMDGPU::M0)
          .Case("exec", AMDGPU::EXEC)
          .Case("exec_lo", AMDGPU::EXEC_LO)
          .Case("exec_hi", AMDGPU::EXEC_HI)
          .Case("flat_scratch", AMDGPU::FLAT_SCR)
          .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
          .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
  if (Reg == AMDGPU::NoRegister) {
                                     "\" for subtarget."));

  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:

  case AMDGPU::FLAT_SCR:

  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));

static std::pair<MachineBasicBlock *, MachineBasicBlock *>

  auto Next = std::next(I);

  return std::pair(LoopBB, RemainderBB);

  auto I = MI.getIterator();
  auto E = std::next(I);

  Src->setIsKill(false);

  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))

  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)

                           unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
                           unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,

  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
  Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)

          TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
                                 : AMDGPU::S_AND_SAVEEXEC_B64),

  MRI.setSimpleHint(NewExec, CondReg);

  if (UseGPRIdxMode) {
      SGPRIdxReg = CurrentIdxReg;

      SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)

      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)

      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)

  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

          TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
                                 : AMDGPU::S_XOR_B64_term),
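  // This is the body of the waterfall loop used to index vector registers
  // with a divergent index: V_READFIRSTLANE_B32 picks the index of the first
  // active lane, V_CMP_EQ_U32 selects every lane that shares that index,
  // S_AND_SAVEEXEC restricts execution to those lanes while the chosen index
  // is written to M0 (or to an SGPR index register in GPR-idx mode), and the
  // terminating S_XOR clears the handled lanes from the saved exec mask so
  // the loop repeats until every lane has been serviced.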
                              unsigned InitResultReg, unsigned PhiReg, int Offset,
                              bool UseGPRIdxMode, Register &SGPRIdxReg) {

  const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode, SGPRIdxReg);

  LoopBB->removeSuccessor(RemainderBB);
  LoopBB->addSuccessor(LandingPad);
static std::pair<unsigned, int>

  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;

    return std::pair(AMDGPU::sub0, Offset);

  assert(Idx->getReg() != AMDGPU::NoRegister);

    return Idx->getReg();

  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {

    if (UseGPRIdxMode) {

          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

                                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)

    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)

  MI.eraseFromParent();

  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (Idx->getReg() == AMDGPU::NoRegister) {

    MI.eraseFromParent();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {

    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);

      const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
          TRI.getRegSizeInBits(*VecRC), 32, false);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(VecRC);

                                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)

    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
        TRI.getRegSizeInBits(*VecRC), 32, false);
    BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)

  MI.eraseFromParent();
  bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));

    Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
    Register InitalValReg = MRI.createVirtualRegister(DstRegClass);

    Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
    Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
    Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);

    Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
    Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);

    bool IsWave32 = ST.isWave32();
    unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

        (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;

    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)

    I = ComputeLoop->end();

        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)

        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
            .addReg(TmpSReg->getOperand(0).getReg())

    unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
    auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
                   .addReg(ActiveBits->getOperand(0).getReg());
    auto LaneValue = BuildMI(*ComputeLoop, I, DL,
                             TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
                         .addReg(FF1->getOperand(0).getReg());
    auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
                              .addReg(LaneValue->getOperand(0).getReg());

    unsigned BITSETOpc =
        IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
    auto NewActiveBits =
        BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
            .addReg(FF1->getOperand(0).getReg())
            .addReg(ActiveBits->getOperand(0).getReg());

    Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
        .addMBB(ComputeLoop);
    ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
        .addMBB(ComputeLoop);

    unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
        .addReg(NewActiveBits->getOperand(0).getReg())
    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))

  MI.eraseFromParent();
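  // Wave-reduction loop for a divergent source: S_FF1 finds the lowest set
  // bit of the remaining exec mask, V_READLANE_B32 reads that lane's value,
  // the scalar min/max opcode folds it into the accumulator, S_BITSET0
  // clears the processed bit, and S_CMP_LG + S_CBRANCH_SCC1 repeat the loop
  // while any active lanes remain.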
  switch (MI.getOpcode()) {
  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
  case AMDGPU::S_UADDO_PSEUDO:
  case AMDGPU::S_USUBO_PSEUDO: {

    unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
                       : AMDGPU::S_SUB_I32;

    MI.eraseFromParent();

  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO: {

    bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);

      unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

    MI.eraseFromParent();
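    // Expansion of S_ADD_U64_PSEUDO / S_SUB_U64_PSEUDO: when the native
    // 64-bit scalar add is not used, both operands are split into sub0/sub1
    // halves and recombined with S_ADD_U32 / S_ADDC_U32 (or S_SUB_U32 /
    // S_SUBB_U32), carrying through SCC between the two halves.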
  case AMDGPU::V_ADD_U64_PSEUDO:
  case AMDGPU::V_SUB_U64_PSEUDO: {

    bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);

    if (IsAdd && ST.hasLshlAddB64()) {

      TII->legalizeOperands(*Add);
      MI.eraseFromParent();

    const auto *CarryRC = TRI->getWaveMaskRegClass();

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    Register CarryReg = MRI.createVirtualRegister(CarryRC);
    Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);

                                           : &AMDGPU::VReg_64RegClass;
                                           : &AMDGPU::VReg_64RegClass;

        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);

        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);

        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;

    unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;

    TII->legalizeOperands(*LoHalf);
    TII->legalizeOperands(*HiHalf);
    MI.eraseFromParent();
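    // The vector variant: V_ADD_U64_PSEUDO is a single instruction on
    // targets with the 64-bit lshl_add, and otherwise expands to
    // V_ADD_CO_U32_e64 on the low halves plus V_ADDC_U32_e64 on the high
    // halves, threading the carry through a wave-mask carry register.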
  case AMDGPU::S_ADD_CO_PSEUDO:
  case AMDGPU::S_SUB_CO_PSEUDO: {

    unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
                       ? AMDGPU::S_ADDC_U32
                       : AMDGPU::S_SUBB_U32;

      Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)

      Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)

      Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)

    unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
    assert(WaveSize == 64 || WaveSize == 32);

    if (WaveSize == 64) {
      if (ST.hasScalarCompareEq64()) {

            TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);

            MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
            MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);

        Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)

        (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;

    MI.eraseFromParent();

  case AMDGPU::SI_INIT_M0: {
            TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .add(MI.getOperand(0));
    MI.eraseFromParent();

  case AMDGPU::GET_GROUPSTATICSIZE: {
        .add(MI.getOperand(0))
    MI.eraseFromParent();

  case AMDGPU::GET_SHADERCYCLESHILO: {

    using namespace AMDGPU::Hwreg;
    Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
    Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));

    Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        .add(MI.getOperand(0))

    MI.eraseFromParent();
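    // GET_SHADERCYCLESHILO reads SHADER_CYCLES_HI, then SHADER_CYCLES, then
    // SHADER_CYCLES_HI again; sampling the high half on both sides of the
    // low read is what lets the expansion assemble a consistent 64-bit cycle
    // count even if the low counter wraps between the reads.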
  case AMDGPU::SI_INDIRECT_SRC_V1:
  case AMDGPU::SI_INDIRECT_SRC_V2:
  case AMDGPU::SI_INDIRECT_SRC_V4:
  case AMDGPU::SI_INDIRECT_SRC_V8:
  case AMDGPU::SI_INDIRECT_SRC_V9:
  case AMDGPU::SI_INDIRECT_SRC_V10:
  case AMDGPU::SI_INDIRECT_SRC_V11:
  case AMDGPU::SI_INDIRECT_SRC_V12:
  case AMDGPU::SI_INDIRECT_SRC_V16:
  case AMDGPU::SI_INDIRECT_SRC_V32:

  case AMDGPU::SI_INDIRECT_DST_V1:
  case AMDGPU::SI_INDIRECT_DST_V2:
  case AMDGPU::SI_INDIRECT_DST_V4:
  case AMDGPU::SI_INDIRECT_DST_V8:
  case AMDGPU::SI_INDIRECT_DST_V9:
  case AMDGPU::SI_INDIRECT_DST_V10:
  case AMDGPU::SI_INDIRECT_DST_V11:
  case AMDGPU::SI_INDIRECT_DST_V12:
  case AMDGPU::SI_INDIRECT_DST_V16:
  case AMDGPU::SI_INDIRECT_DST_V32:

  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
  case AMDGPU::SI_KILL_I1_PSEUDO:

  case AMDGPU::V_CNDMASK_B64_PSEUDO: {

    Register SrcCond = MI.getOperand(3).getReg();

    Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    const auto *CondRC = TRI->getWaveMaskRegClass();
    Register SrcCondCopy = MRI.createVirtualRegister(CondRC);

                                            : &AMDGPU::VReg_64RegClass;
                                            : &AMDGPU::VReg_64RegClass;

        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);

        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);

    MI.eraseFromParent();

  case AMDGPU::SI_BR_UNDEF: {
        .add(MI.getOperand(0));
    MI.eraseFromParent();

  case AMDGPU::ADJCALLSTACKUP:
  case AMDGPU::ADJCALLSTACKDOWN: {

  case AMDGPU::SI_CALL_ISEL: {

    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);

    MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);

    MI.eraseFromParent();

  case AMDGPU::V_ADD_CO_U32_e32:
  case AMDGPU::V_SUB_CO_U32_e32:
  case AMDGPU::V_SUBREV_CO_U32_e32: {

    unsigned Opc = MI.getOpcode();

    bool NeedClampOperand = false;
    if (TII->pseudoToMCOpcode(Opc) == -1) {
      NeedClampOperand = true;

    if (TII->isVOP3(*I)) {

    I.add(MI.getOperand(1)).add(MI.getOperand(2));
    if (NeedClampOperand)

    TII->legalizeOperands(*I);

    MI.eraseFromParent();

  case AMDGPU::V_ADDC_U32_e32:
  case AMDGPU::V_SUBB_U32_e32:
  case AMDGPU::V_SUBBREV_U32_e32:
    TII->legalizeOperands(MI);

  case AMDGPU::DS_GWS_INIT:
  case AMDGPU::DS_GWS_SEMA_BR:
  case AMDGPU::DS_GWS_BARRIER:
    TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);

  case AMDGPU::DS_GWS_SEMA_V:
  case AMDGPU::DS_GWS_SEMA_P:
  case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:

  case AMDGPU::S_SETREG_B32: {

    const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
    const unsigned SetMask = WidthMask << Offset;

    unsigned SetDenormOp = 0;
    unsigned SetRoundOp = 0;

        SetRoundOp = AMDGPU::S_ROUND_MODE;
        SetDenormOp = AMDGPU::S_DENORM_MODE;

        SetRoundOp = AMDGPU::S_ROUND_MODE;
        SetDenormOp = AMDGPU::S_DENORM_MODE;

    if (SetRoundOp || SetDenormOp) {

      if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
        unsigned ImmVal = Def->getOperand(1).getImm();

          MI.eraseFromParent();

    MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));

  case AMDGPU::S_INVERSE_BALLOT_U32:
  case AMDGPU::S_INVERSE_BALLOT_U64:
    MI.setDesc(TII->get(AMDGPU::COPY));

  case AMDGPU::ENDPGM_TRAP: {
    MI.setDesc(TII->get(AMDGPU::S_ENDPGM));

    MI.eraseFromParent();

  case AMDGPU::SIMULATED_TRAP: {
    TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
    MI.eraseFromParent();
  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;

  EVT VT = N->getValueType(0);

  if (VT == MVT::f16) {

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
         VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
         VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);

      DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
      DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
         VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
         VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
         VT == MVT::v32bf16);

                 : std::pair(Op0, Op0);

      DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
      DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());

  switch (Op.getOpcode()) {
    return LowerBRCOND(Op, DAG);
    return LowerRETURNADDR(Op, DAG);
    assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    EVT VT = Op.getValueType();
      return lowerFSQRTF32(Op, DAG);
      return lowerFSQRTF64(Op, DAG);
    return LowerTrig(Op, DAG);
    return LowerSELECT(Op, DAG);
    return LowerFDIV(Op, DAG);
    return LowerFFREXP(Op, DAG);
    return LowerATOMIC_CMP_SWAP(Op, DAG);
    return LowerSTORE(Op, DAG);
    return LowerGlobalAddress(MFI, Op, DAG);
    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
    return LowerINTRINSIC_W_CHAIN(Op, DAG);
    return LowerINTRINSIC_VOID(Op, DAG);
    return lowerADDRSPACECAST(Op, DAG);
    return lowerINSERT_SUBVECTOR(Op, DAG);
    return lowerINSERT_VECTOR_ELT(Op, DAG);
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
    return lowerVECTOR_SHUFFLE(Op, DAG);
    return lowerSCALAR_TO_VECTOR(Op, DAG);
    return lowerBUILD_VECTOR(Op, DAG);
    return lowerFP_ROUND(Op, DAG);
    return lowerTRAP(Op, DAG);
    return lowerDEBUGTRAP(Op, DAG);
    return lowerFMINNUM_FMAXNUM(Op, DAG);
    return lowerFLDEXP(Op, DAG);
    return lowerMUL(Op, DAG);
    return lowerXMULO(Op, DAG);
    return lowerXMUL_LOHI(Op, DAG);
  EVT FittingLoadVT = LoadVT;

SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
                                              bool IsIntrinsic) const {

  EVT LoadVT = M->getValueType(0);

  EVT EquivLoadVT = LoadVT;

                  M->getMemoryVT(), M->getMemOperand());

  EVT LoadVT = M->getValueType(0);

  assert(M->getNumValues() == 2 || M->getNumValues() == 3);
  bool IsTFE = M->getNumValues() == 3;

    return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),

    return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
                               M->getMemOperand(), DAG);

  SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
                                        M->getMemOperand(), DAG);

  EVT VT = N->getValueType(0);

  unsigned CondCode = N->getConstantOperandVal(3);

  EVT CmpVT = LHS.getValueType();
  if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
    unsigned PromoteOp =

  EVT VT = N->getValueType(0);

  unsigned CondCode = N->getConstantOperandVal(3);

  if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {

  EVT VT = N->getValueType(0);

                      Src.getOperand(1), Src.getOperand(2));

    Exec = AMDGPU::EXEC_LO;
    Exec = AMDGPU::EXEC;

  EVT VT = N->getValueType(0);

  unsigned IID = N->getConstantOperandVal(0);
  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;
  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;
  unsigned SplitSize = 32;
  if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
      ST->hasDPALU_DPP() &&

    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16:
    case Intrinsic::amdgcn_update_dpp:

    case Intrinsic::amdgcn_writelane:

    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_set_inactive:
    case Intrinsic::amdgcn_set_inactive_chain_arg:
    case Intrinsic::amdgcn_mov_dpp8:

    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_permlane64:

    if (SDNode *GL = N->getGluedNode()) {
      GL = GL->getOperand(0).getNode();

  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IID == Intrinsic::amdgcn_mov_dpp8 ||
      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    Src1 = N->getOperand(2);
    if (IID == Intrinsic::amdgcn_writelane ||
        IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
      Src2 = N->getOperand(3);

  if (ValSize == SplitSize) {

    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {

    if (IID == Intrinsic::amdgcn_writelane) {

    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
    return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;

  if (ValSize % SplitSize != 0)

    EVT VT = N->getValueType(0);

    unsigned NumOperands = N->getNumOperands();
    SDNode *GL = N->getGluedNode();

    for (unsigned i = 0; i != NE; ++i) {
      for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
        SDValue Operand = N->getOperand(j);

  if (SplitSize == 32) {
    return unrollLaneOp(LaneOp.getNode());

  unsigned SubVecNumElt =

  SDValue Src0SubVec, Src1SubVec, Src2SubVec;
  for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {

    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||

    if (IID == Intrinsic::amdgcn_writelane)

        IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
            ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
            : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
    EltIdx += SubVecNumElt;

  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)

  if (IID == Intrinsic::amdgcn_writelane)

  SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
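  // Lane-intrinsic lowering: a 32-bit value maps directly onto the hardware
  // lane operation, while wider values are bitcast to a vector of 32-bit
  // (or, for update_dpp on DPALU-DPP targets, 64-bit) pieces, the lane op is
  // created per piece, and the pieces are reassembled into the original type.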
  switch (N->getOpcode()) {

    unsigned IID = N->getConstantOperandVal(0);
    case Intrinsic::amdgcn_make_buffer_rsrc:
      Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
    case Intrinsic::amdgcn_cvt_pkrtz: {

    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16: {

      if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
      else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
      else if (IID == Intrinsic::amdgcn_cvt_pk_i16)

      EVT VT = N->getValueType(0);

    case Intrinsic::amdgcn_s_buffer_load: {

      EVT VT = Op.getValueType();
      assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");

      if (!Offset->isDivergent()) {

        LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);

      for (unsigned I = 0; I < Res.getNumOperands(); I++) {
        Results.push_back(Res.getOperand(I));

      Results.push_back(Res.getValue(1));

    EVT VT = N->getValueType(0);

    EVT SelectVT = NewVT;
    if (NewVT.bitsLT(MVT::i32)) {
      SelectVT = MVT::i32;

    if (NewVT != SelectVT)

    if (N->getValueType(0) != MVT::v2f16)

    if (N->getValueType(0) != MVT::v2f16)

    if (N->getValueType(0) != MVT::f16)

    if (U.get() != Value)
    if (U.getUser()->getOpcode() == Opcode)
unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
  switch (Intr->getConstantOperandVal(1)) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else:
  case Intrinsic::amdgcn_loop:
  case Intrinsic::amdgcn_end_cf:

  SDNode *Intr = BRCOND.getOperand(1).getNode();

    assert(BR && "brcond missing unconditional branch user");
    Target = BR->getOperand(1);

  unsigned CFNode = isCFIntrinsic(Intr);

    Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());

  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {

                  Intr->getOperand(0));

  MVT VT = Op.getSimpleValueType();

  if (Op.getConstantOperandVal(0) != 0)

  if (Info->isEntryFunction())

  return Op.getValueType().bitsLE(VT)

  assert(Op.getValueType() == MVT::f16 &&
         "Do not know how to custom lower FP_ROUND for non-f16 type");

  EVT SrcVT = Src.getValueType();
  if (SrcVT != MVT::f64)

  EVT VT = Op.getValueType();

  bool IsIEEEMode = Info->getMode().IEEE;

  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||

  EVT VT = Op.getValueType();

  EVT ExpVT = Exp.getValueType();
  if (ExpVT == MVT::i16)

                     {Op.getOperand(0), Op.getOperand(1), TruncExp});

  switch (Op->getOpcode()) {
                                              DAGCombinerInfo &DCI) const {
  const unsigned Opc = Op.getOpcode();

                : Op->getOperand(0).getValueType();

  if (DCI.isBeforeLegalizeOps() ||

  auto &DAG = DCI.DAG;

    LHS = Op->getOperand(1);
    RHS = Op->getOperand(2);

    LHS = Op->getOperand(0);
    RHS = Op->getOperand(1);

  EVT VT = Op.getValueType();

  assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");

  if (Op->isDivergent())

  if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
        DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);

  if (Op0SignBits >= 33 && Op1SignBits >= 33)
        DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
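  // 64-bit multiply lowering for uniform values: when both operands have at
  // least 32 known leading zero bits the multiply is emitted as the unsigned
  // 32x32->64 pseudo, and when both have at least 33 sign bits the signed
  // variant is used instead; divergent multiplies take a different path.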
  EVT VT = Op.getValueType();

    const APInt &C = RHSC->getAPIntValue();

    if (C.isPowerOf2()) {

      bool UseArithShift = isSigned && !C.isMinSignedValue();

  if (Op->isDivergent()) {

    return lowerTrapEndpgm(Op, DAG);

             : lowerTrapHsaQueuePtr(Op, DAG);

SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
                                             ImplicitParameter Param) const {

        loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);

  if (UserSGPR == AMDGPU::NoRegister) {

                                          "debugtrap handler not supported",
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,

                             ? AMDGPU::SRC_SHARED_BASE
                             : AMDGPU::SRC_PRIVATE_BASE;

        {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));

    return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);

  if (UserSGPR == AMDGPU::NoRegister) {

  if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
      isa<BasicBlockSDNode>(Val))

  if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
    return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);

  unsigned DestAS, SrcAS;

  bool IsNonNull = false;
  if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
    SrcAS = ASC->getSrcAddressSpace();
    Src = ASC->getOperand(0);
    DestAS = ASC->getDestAddressSpace();

           Op.getConstantOperandVal(0) ==
               Intrinsic::amdgcn_addrspacecast_nonnull);
    Src = Op->getOperand(1);
    SrcAS = Op->getConstantOperandVal(2);
    DestAS = Op->getConstantOperandVal(3);

    unsigned NullVal = TM.getNullPointerValue(DestAS);

      SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);

      unsigned NullVal = TM.getNullPointerValue(SrcAS);

      Op.getValueType() == MVT::i64) {

      Src.getValueType() == MVT::i64)
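  // Address-space cast lowering: a flat->LDS/private cast truncates the
  // 64-bit pointer to its 32-bit offset, while an LDS/private->flat cast
  // rebuilds a 64-bit pointer by pairing the 32-bit offset with the segment
  // aperture base obtained from getSegmentAperture (SRC_SHARED_BASE /
  // SRC_PRIVATE_BASE, or the queue pointer on older targets). Unless the
  // cast is known non-null (amdgcn.addrspacecast.nonnull or a provably
  // non-null source), the result is selected against the target's null value
  // for the address space involved.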
  EVT InsVT = Ins.getValueType();

  unsigned IdxVal = Idx->getAsZExtVal();

    assert(InsNumElts % 2 == 0 && "expect legal vector types");

    EVT NewInsVT = InsNumElts == 2 ? MVT::i32
                                     MVT::i32, InsNumElts / 2);

    for (unsigned I = 0; I != InsNumElts / 2; ++I) {

      if (InsNumElts == 2) {

  for (unsigned I = 0; I != InsNumElts; ++I) {

  auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
  if (NumElts == 4 && EltSize == 16 && KIdx) {

    unsigned Idx = KIdx->getZExtValue();
    bool InsertLo = Idx < 2;

  if (isa<ConstantSDNode>(Idx))

  assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");

  const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
  EVT ResultVT = Op.getValueType();

  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))

  if (VecSize == 128 || VecSize == 256 || VecSize == 512) {

    if (VecSize == 128) {

    } else if (VecSize == 256) {

      for (unsigned P = 0; P < 4; ++P) {

                                Parts[0], Parts[1]));
                                Parts[2], Parts[3]));

      for (unsigned P = 0; P < 8; ++P) {

                        Parts[0], Parts[1], Parts[2], Parts[3]));
                        Parts[4], Parts[5], Parts[6], Parts[7]));

    EVT IdxVT = Idx.getValueType();

  Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);

  if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {

    return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
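  // The predicate above checks whether two neighbouring shuffle-mask entries
  // select adjacent, even-aligned source elements; such pairs can be moved
  // as a single packed 32-bit (v2i16/v2f16) element instead of two separate
  // 16-bit extract/insert operations, which the shuffle lowering below
  // exploits.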
  EVT ResultVT = Op.getValueType();

  EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;

  int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();

      int VecIdx = Idx < SrcNumElts ? 0 : 1;
      int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;

      int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
      int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
      int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
      int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;

  EVT ResultVT = Op.getValueType();

  EVT VT = Op.getValueType();

  if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {

    for (unsigned P = 0; P < NumParts; ++P) {
          PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});

  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
7715 EVT PtrVT =
Op.getValueType();
7731 assert(PtrVT == MVT::i32 &&
"32-bit pointer is expected.");
7803 SDValue Param = lowerKernargMemParameter(
7813 "non-hsa intrinsic with hsa target",
7822 "intrinsic not supported on subtarget",
7832 unsigned NumElts = Elts.
size();
7834 if (NumElts <= 12) {
7843 for (
unsigned i = 0; i < Elts.
size(); ++i) {
7849 for (
unsigned i = Elts.
size(); i < NumElts; ++i)
7850 VecElts[i] = DAG.
getUNDEF(MVT::f32);
EVT SrcVT = Src.getValueType();
                              bool Unpacked, bool IsD16, int DMaskPop,
                              int NumVDataDwords, bool IsAtomicPacked16Bit,
EVT ReqRetVT = ResultTypes[0];
int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
                        ? (ReqRetNumElts + 1) / 2
int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
                          NumDataDwords - MaskPopDwords);
EVT LegalReqRetVT = ReqRetVT;
if (!Data.getValueType().isInteger())
      Data.getValueType().changeTypeToInteger(), Data);
if (Result->getNumValues() == 1)
                         SDValue *LWE, bool &IsTexFail) {
auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
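// Editorial note: the return-value reconstruction above accounts for D16
// images -- when the target keeps 16-bit data packed (and for packed 16-bit
// atomics), two elements share one dword, so the data dword count is
// (ReqRetNumElts + 1) / 2 and the dmask-popped portion is rounded up the same
// way ((DMaskPop + 1) / 2) before the result is cast back to the requested
// type.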
                                     unsigned DimIdx, unsigned EndIdx,
                                     unsigned NumGradients) {
for (unsigned I = DimIdx; I < EndIdx; I++) {
  if (((I + 1) >= EndIdx) ||
      ((NumGradients / 2) % 2 == 1 &&
       (I == DimIdx + (NumGradients / 2) - 1 ||
        I == DimIdx + NumGradients - 1))) {
    if (Addr.getValueType() != MVT::i16)
unsigned IntrOpcode = Intr->BaseOpcode;
int NumVDataDwords = 0;
bool AdjustRetType = false;
bool IsAtomicPacked16Bit = false;
const unsigned ArgOffset = WithChain ? 2 : 1;
unsigned DMaskLanes = 0;
if (BaseOpcode->Atomic) {
  VData = Op.getOperand(2);
  IsAtomicPacked16Bit =
      (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
       Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
  if (BaseOpcode->AtomicX2) {
    ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
    DMask = Is64Bit ? 0xf : 0x3;
    NumVDataDwords = Is64Bit ? 4 : 2;
    DMask = Is64Bit ? 0x3 : 0x1;
    NumVDataDwords = Is64Bit ? 2 : 1;
  DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
  if (BaseOpcode->Store) {
    VData = Op.getOperand(2);
      VData = handleD16VData(VData, DAG, true);
  } else if (!BaseOpcode->NoReturn) {
        (!LoadVT.isVector() && DMaskLanes > 1))
      NumVDataDwords = (DMaskLanes + 1) / 2;
      NumVDataDwords = DMaskLanes;
    AdjustRetType = true;
unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
    Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
  if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
    assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
        {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
           "Bias needs to be converted to 16 bit in A16 mode");
if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
      dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
                 "require 16 bit args for both gradients and addresses");
  if (!ST->hasA16()) {
    LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
                         "support 16 bit addresses\n");
if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
  IntrOpcode = G16MappingInfo->G16;
                            ArgOffset + Intr->GradientStart,
                            ArgOffset + Intr->CoordStart, Intr->NumGradients);
  for (unsigned I = ArgOffset + Intr->GradientStart;
       I < ArgOffset + Intr->CoordStart; I++)
                            ArgOffset + Intr->CoordStart, VAddrEnd,
  for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
const bool UseNSA = ST->hasNSAEncoding() &&
                    VAddrs.size() >= ST->getNSAThreshold(MF) &&
                    (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
const bool UsePartialNSA =
    UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
if (UsePartialNSA) {
    ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
} else if (!UseNSA) {
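// Editorial note on the NSA decision above (inferred from the fragments):
// with the non-sequential-address (NSA) encoding each address component may
// live in its own VGPR, so the address operands are passed separately when
// the count reaches the subtarget's NSA threshold and fits the encoding's
// maximum, or when partial NSA is available -- in which case the tail beyond
// NSAMaxSize - 1 is concatenated into the last operand (the drop_front call
// above). Without NSA, all VAddrs are concatenated into one contiguous
// register tuple.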
if (!BaseOpcode->Sampler) {
      Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
  Unorm = UnormConst ? True : False;
SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
bool IsTexFail = false;
if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
  NumVDataDwords += 1;
  AdjustRetType = true;
if (AdjustRetType) {
  if (DMaskLanes == 0 && !BaseOpcode->Store) {
    if (isa<MemSDNode>(Op))
                             MVT::i32, NumVDataDwords)
  ResultTypes[0] = NewVT;
  if (ResultTypes.size() == 3) {
    ResultTypes.erase(&ResultTypes[1]);
unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
if (BaseOpcode->Atomic)
if (BaseOpcode->Store || BaseOpcode->Atomic)
if (UsePartialNSA) {
if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
if (BaseOpcode->Sampler) {
if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
    ST->hasFeature(AMDGPU::FeatureR128A16)
if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
if (BaseOpcode->HasD16)
if (isa<MemSDNode>(Op))
int NumVAddrDwords =
                                NumVDataDwords, NumVAddrDwords);
} else if (IsGFX11Plus) {
                                UseNSA ? AMDGPU::MIMGEncGfx11NSA
                                       : AMDGPU::MIMGEncGfx11Default,
                                NumVDataDwords, NumVAddrDwords);
} else if (IsGFX10Plus) {
                                UseNSA ? AMDGPU::MIMGEncGfx10NSA
                                       : AMDGPU::MIMGEncGfx10Default,
                                NumVDataDwords, NumVAddrDwords);
                                NumVDataDwords, NumVAddrDwords);
           "requested image instruction is not supported on this GPU");
                                NumVDataDwords, NumVAddrDwords);
                                NumVDataDwords, NumVAddrDwords);
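// Editorial note: the final opcode is resolved through the MIMG info tables
// for the active encoding family (GFX12, GFX11 NSA/default, GFX10
// NSA/default, or the pre-GFX10 encodings tried last), keyed on the
// data/address dword counts; if no encoding can represent the request, the
// "not supported on this GPU" diagnostic above is emitted. Summary inferred
// from the fragments.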
if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
if (BaseOpcode->AtomicX2) {
if (BaseOpcode->NoReturn)
                         NumVDataDwords, IsAtomicPacked16Bit, DL);
if (!Offset->isDivergent()) {
return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
unsigned NumLoads = 1;
if (NumElts == 8 || NumElts == 16) {
  NumLoads = NumElts / 4;
setBufferOffsets(Offset, DAG, &Ops[3],
                 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
for (unsigned i = 0; i < NumLoads; ++i) {
if (NumElts == 8 || NumElts == 16)
EVT VT = Op.getValueType();
unsigned IntrinsicID = Op.getConstantOperandVal(0);
8518 switch (IntrinsicID) {
8519 case Intrinsic::amdgcn_implicit_buffer_ptr: {
8522 return getPreloadedValue(DAG, *MFI, VT,
8525 case Intrinsic::amdgcn_dispatch_ptr:
8526 case Intrinsic::amdgcn_queue_ptr: {
8529 MF.
getFunction(),
"unsupported hsa intrinsic without hsa target",
8535 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
8538 return getPreloadedValue(DAG, *MFI, VT, RegID);
8540 case Intrinsic::amdgcn_implicitarg_ptr: {
8542 return getImplicitArgPtr(DAG,
DL);
8543 return getPreloadedValue(DAG, *MFI, VT,
8546 case Intrinsic::amdgcn_kernarg_segment_ptr: {
8552 return getPreloadedValue(DAG, *MFI, VT,
8555 case Intrinsic::amdgcn_dispatch_id: {
8558 case Intrinsic::amdgcn_rcp:
8560 case Intrinsic::amdgcn_rsq:
8562 case Intrinsic::amdgcn_rsq_legacy:
8566 case Intrinsic::amdgcn_rcp_legacy:
8570 case Intrinsic::amdgcn_rsq_clamp: {
8584 case Intrinsic::r600_read_ngroups_x:
8588 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8591 case Intrinsic::r600_read_ngroups_y:
8595 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8598 case Intrinsic::r600_read_ngroups_z:
8602 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8605 case Intrinsic::r600_read_global_size_x:
8609 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8612 case Intrinsic::r600_read_global_size_y:
8616 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8619 case Intrinsic::r600_read_global_size_z:
8623 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8626 case Intrinsic::r600_read_local_size_x:
8630 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
8632 case Intrinsic::r600_read_local_size_y:
8636 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
8638 case Intrinsic::r600_read_local_size_z:
8642 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
8644 case Intrinsic::amdgcn_workgroup_id_x:
8645 return getPreloadedValue(DAG, *MFI, VT,
8647 case Intrinsic::amdgcn_workgroup_id_y:
8648 return getPreloadedValue(DAG, *MFI, VT,
8650 case Intrinsic::amdgcn_workgroup_id_z:
8651 return getPreloadedValue(DAG, *MFI, VT,
8653 case Intrinsic::amdgcn_wave_id:
8654 return lowerWaveID(DAG,
Op);
8655 case Intrinsic::amdgcn_lds_kernel_id: {
8657 return getLDSKernelId(DAG,
DL);
8658 return getPreloadedValue(DAG, *MFI, VT,
8661 case Intrinsic::amdgcn_workitem_id_x:
8662 return lowerWorkitemID(DAG,
Op, 0, MFI->getArgInfo().WorkItemIDX);
8663 case Intrinsic::amdgcn_workitem_id_y:
8664 return lowerWorkitemID(DAG,
Op, 1, MFI->getArgInfo().WorkItemIDY);
8665 case Intrinsic::amdgcn_workitem_id_z:
8666 return lowerWorkitemID(DAG,
Op, 2, MFI->getArgInfo().WorkItemIDZ);
8667 case Intrinsic::amdgcn_wavefrontsize:
8670 case Intrinsic::amdgcn_s_buffer_load: {
8671 unsigned CPol =
Op.getConstantOperandVal(3);
8678 return lowerSBuffer(VT,
DL,
Op.getOperand(1),
Op.getOperand(2),
8679 Op.getOperand(3), DAG);
8681 case Intrinsic::amdgcn_fdiv_fast:
8682 return lowerFDIV_FAST(
Op, DAG);
8683 case Intrinsic::amdgcn_sin:
8686 case Intrinsic::amdgcn_cos:
8689 case Intrinsic::amdgcn_mul_u24:
8692 case Intrinsic::amdgcn_mul_i24:
8696 case Intrinsic::amdgcn_log_clamp: {
8702 case Intrinsic::amdgcn_fract:
8705 case Intrinsic::amdgcn_class:
8708 case Intrinsic::amdgcn_div_fmas:
8710 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
8712 case Intrinsic::amdgcn_div_fixup:
8714 Op.getOperand(2),
Op.getOperand(3));
8716 case Intrinsic::amdgcn_div_scale: {
8729 SDValue Src0 =
Param->isAllOnes() ? Numerator : Denominator;
8732 Denominator, Numerator);
8734 case Intrinsic::amdgcn_icmp: {
8736 if (
Op.getOperand(1).getValueType() == MVT::i1 &&
8737 Op.getConstantOperandVal(2) == 0 &&
8742 case Intrinsic::amdgcn_fcmp: {
8745 case Intrinsic::amdgcn_ballot:
8747 case Intrinsic::amdgcn_fmed3:
8749 Op.getOperand(2),
Op.getOperand(3));
8750 case Intrinsic::amdgcn_fdot2:
8752 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
8753 case Intrinsic::amdgcn_fmul_legacy:
8756 case Intrinsic::amdgcn_sffbh:
8758 case Intrinsic::amdgcn_sbfe:
8760 Op.getOperand(2),
Op.getOperand(3));
8761 case Intrinsic::amdgcn_ubfe:
8763 Op.getOperand(2),
Op.getOperand(3));
8764 case Intrinsic::amdgcn_cvt_pkrtz:
8765 case Intrinsic::amdgcn_cvt_pknorm_i16:
8766 case Intrinsic::amdgcn_cvt_pknorm_u16:
8767 case Intrinsic::amdgcn_cvt_pk_i16:
8768 case Intrinsic::amdgcn_cvt_pk_u16: {
8770 EVT VT =
Op.getValueType();
8773 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8775 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8777 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8779 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8785 return DAG.
getNode(Opcode,
DL, VT,
Op.getOperand(1),
Op.getOperand(2));
8788 DAG.
getNode(Opcode,
DL, MVT::i32,
Op.getOperand(1),
Op.getOperand(2));
8791 case Intrinsic::amdgcn_fmad_ftz:
8793 Op.getOperand(2),
Op.getOperand(3));
8795 case Intrinsic::amdgcn_if_break:
8797 Op->getOperand(1),
Op->getOperand(2)),
8800 case Intrinsic::amdgcn_groupstaticsize: {
8812 case Intrinsic::amdgcn_is_shared:
8813 case Intrinsic::amdgcn_is_private: {
8815 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
8818 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
8826 case Intrinsic::amdgcn_perm:
8828 Op.getOperand(2),
Op.getOperand(3));
8829 case Intrinsic::amdgcn_reloc_constant: {
8833 auto *RelocSymbol = cast<GlobalVariable>(
8839 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8840 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8841 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8842 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8843 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8844 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8845 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8846 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8847 if (
Op.getOperand(4).getValueType() == MVT::i32)
8853 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
8854 Op.getOperand(3), IndexKeyi32);
8856 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8857 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8858 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8859 if (
Op.getOperand(6).getValueType() == MVT::i32)
8865 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8866 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
8867 IndexKeyi32, Op.getOperand(7)});
8869 case Intrinsic::amdgcn_addrspacecast_nonnull:
8870 return lowerADDRSPACECAST(
Op, DAG);
8871 case Intrinsic::amdgcn_readlane:
8872 case Intrinsic::amdgcn_readfirstlane:
8873 case Intrinsic::amdgcn_writelane:
8874 case Intrinsic::amdgcn_permlane16:
8875 case Intrinsic::amdgcn_permlanex16:
8876 case Intrinsic::amdgcn_permlane64:
8877 case Intrinsic::amdgcn_set_inactive:
8878 case Intrinsic::amdgcn_set_inactive_chain_arg:
8879 case Intrinsic::amdgcn_mov_dpp8:
8880 case Intrinsic::amdgcn_update_dpp:
8885 return lowerImage(
Op, ImageDimIntr, DAG,
false);
8896 return DAG.
getRegister(AMDGPU::SGPR_NULL, MVT::i32);
8902 unsigned NewOpcode)
const {
8906 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
8907 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
8921 auto *
M = cast<MemSDNode>(
Op);
8925 M->getMemOperand());
8930 unsigned NewOpcode)
const {
8934 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
8935 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
8949 auto *
M = cast<MemSDNode>(
Op);
8953 M->getMemOperand());
unsigned IntrID = Op.getConstantOperandVal(1);
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap: {
  unsigned IndexOperand = M->getConstantOperandVal(7);
  unsigned WaveRelease = M->getConstantOperandVal(8);
  unsigned WaveDone = M->getConstantOperandVal(9);
  unsigned OrderedCountIndex = IndexOperand & 0x3f;
  IndexOperand &= ~0x3f;
  unsigned CountDw = 0;
    CountDw = (IndexOperand >> 24) & 0xf;
    IndexOperand &= ~(0xf << 24);
    if (CountDw < 1 || CountDw > 4) {
          "ds_ordered_count: dword count must be between 1 and 4");
  if (WaveDone && !WaveRelease)
  unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
  unsigned ShaderType =
  unsigned Offset0 = OrderedCountIndex << 2;
  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
    Offset1 |= (CountDw - 1) << 6;
  Offset1 |= ShaderType << 2;
  unsigned Offset = Offset0 | (Offset1 << 8);
                                 M->getVTList(), Ops, M->getMemoryVT(),
                                 M->getMemOperand());
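// Editorial note: a worked example of the offset encoding assembled above
// (values illustrative). For amdgcn.ds.ordered.add with index 2,
// wave_release = 1, wave_done = 0, CountDw = 1 and ShaderType = 0:
//
//   Offset0 = 2 << 2                                       = 0x08
//   Offset1 = 1 | (0 << 1) | (0 << 2) | (0 << 4) | (0 << 6) = 0x01
//   Offset  = Offset0 | (Offset1 << 8)                      = 0x0108
//
// i.e. the ordered-count index lands in bits [7:2] of the DS offset and the
// control bits (wave_release, wave_done, shader type, add/swap, dword count)
// occupy the upper byte.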
9014 case Intrinsic::amdgcn_raw_buffer_load:
9015 case Intrinsic::amdgcn_raw_ptr_buffer_load:
9016 case Intrinsic::amdgcn_raw_atomic_buffer_load:
9017 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
9018 case Intrinsic::amdgcn_raw_buffer_load_format:
9019 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
9020 const bool IsFormat =
9021 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
9022 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
9024 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9025 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
9038 auto *
M = cast<MemSDNode>(
Op);
9039 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
9041 case Intrinsic::amdgcn_struct_buffer_load:
9042 case Intrinsic::amdgcn_struct_ptr_buffer_load:
9043 case Intrinsic::amdgcn_struct_buffer_load_format:
9044 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
9045 case Intrinsic::amdgcn_struct_atomic_buffer_load:
9046 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
9047 const bool IsFormat =
9048 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
9049 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
9051 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9052 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
9065 return lowerIntrinsicLoad(cast<MemSDNode>(
Op), IsFormat, DAG, Ops);
9067 case Intrinsic::amdgcn_raw_tbuffer_load:
9068 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
9070 EVT LoadVT =
Op.getValueType();
9071 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9072 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
9091 Op->getVTList(), Ops, LoadVT,
M->getMemOperand(),
9094 case Intrinsic::amdgcn_struct_tbuffer_load:
9095 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
9097 EVT LoadVT =
Op.getValueType();
9098 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9099 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
9118 Op->getVTList(), Ops, LoadVT,
M->getMemOperand(),
9121 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9122 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9124 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9125 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9126 return lowerStructBufferAtomicIntrin(
Op, DAG,
9128 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9129 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9131 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9132 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9133 return lowerStructBufferAtomicIntrin(
Op, DAG,
9135 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9136 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9138 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9139 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9140 return lowerStructBufferAtomicIntrin(
Op, DAG,
9142 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9143 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9145 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9146 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9148 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9149 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9151 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9152 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9154 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9155 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9157 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9158 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9160 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9161 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9163 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9164 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9166 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9167 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9169 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9170 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9172 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9173 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9175 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9176 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9178 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9179 return lowerRawBufferAtomicIntrin(
Op, DAG,
9181 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9182 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9183 return lowerStructBufferAtomicIntrin(
Op, DAG,
9185 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9186 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9188 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9189 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9191 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9192 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9193 return lowerStructBufferAtomicIntrin(
Op, DAG,
9195 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9196 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9197 return lowerStructBufferAtomicIntrin(
Op, DAG,
9199 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9200 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9201 return lowerStructBufferAtomicIntrin(
Op, DAG,
9203 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9204 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9205 return lowerStructBufferAtomicIntrin(
Op, DAG,
9207 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9208 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9210 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9211 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9213 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9214 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9216 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9217 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9219 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9220 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9222 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9223 return lowerStructBufferAtomicIntrin(
Op, DAG,
9226 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9227 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9228 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(4), DAG);
9229 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
9243 EVT VT =
Op.getValueType();
9244 auto *
M = cast<MemSDNode>(
Op);
9247 Op->getVTList(), Ops, VT,
9248 M->getMemOperand());
9250 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9251 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9252 SDValue Rsrc = bufferRsrcPtrToVector(
Op->getOperand(4), DAG);
9253 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(6), DAG);
9267 EVT VT =
Op.getValueType();
9268 auto *
M = cast<MemSDNode>(
Op);
9271 Op->getVTList(), Ops, VT,
9272 M->getMemOperand());
9274 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9276 SDValue NodePtr =
M->getOperand(2);
9277 SDValue RayExtent =
M->getOperand(3);
9278 SDValue RayOrigin =
M->getOperand(4);
9280 SDValue RayInvDir =
M->getOperand(6);
9298 const unsigned NumVDataDwords = 4;
9299 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9300 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9304 const unsigned BaseOpcodes[2][2] = {
9305 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9306 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9307 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9311 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9312 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9313 : AMDGPU::MIMGEncGfx10NSA,
9314 NumVDataDwords, NumVAddrDwords);
9318 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9319 : AMDGPU::MIMGEncGfx10Default,
9320 NumVDataDwords, NumVAddrDwords);
9326 auto packLanes = [&DAG, &Ops, &
DL](
SDValue Op,
bool IsAligned) {
9329 if (Lanes[0].getValueSizeInBits() == 32) {
9330 for (
unsigned I = 0;
I < 3; ++
I)
9349 if (UseNSA && IsGFX11Plus) {
9357 for (
unsigned I = 0;
I < 3; ++
I) {
9360 {DirLanes[I], InvDirLanes[I]})));
9375 packLanes(RayOrigin,
true);
9376 packLanes(RayDir,
true);
9377 packLanes(RayInvDir,
false);
9382 if (NumVAddrDwords > 12) {
9402 case Intrinsic::amdgcn_global_atomic_fmin_num:
9403 case Intrinsic::amdgcn_global_atomic_fmax_num:
9404 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9405 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9412 unsigned Opcode = 0;
9414 case Intrinsic::amdgcn_global_atomic_fmin_num:
9415 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9419 case Intrinsic::amdgcn_global_atomic_fmax_num:
9420 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9428 Ops,
M->getMemOperand());
9430 case Intrinsic::amdgcn_s_get_barrier_state:
9431 case Intrinsic::amdgcn_s_get_named_barrier_state: {
9436 if (isa<ConstantSDNode>(
Op->getOperand(2))) {
9437 uint64_t BarID = cast<ConstantSDNode>(
Op->getOperand(2))->getZExtValue();
9438 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
9439 BarID = (BarID >> 4) & 0x3F;
9440 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9445 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9446 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
9466 return lowerImage(
Op, ImageDimIntr, DAG,
true);
9474SDValue SITargetLowering::getMemIntrinsicNode(
unsigned Opcode,
const SDLoc &
DL,
9484 bool IsTFE = VTList.
NumVTs == 3;
9487 unsigned NumOpDWords = NumValueDWords + 1;
9492 SDValue Op = getMemIntrinsicNode(Opcode,
DL, OpDWordsVTList, Ops,
9493 OpDWordsVT, OpDWordsMMO, DAG);
9508 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9514 WidenedMemVT, WidenedMMO);
9524 bool ImageStore)
const {
9559 for (
unsigned I = 0;
I < Elts.
size() / 2;
I += 1) {
9565 if ((NumElements % 2) == 1) {
9567 unsigned I = Elts.
size() / 2;
9583 if (NumElements == 3) {
9604 unsigned IntrinsicID =
Op.getConstantOperandVal(1);
9607 switch (IntrinsicID) {
9608 case Intrinsic::amdgcn_exp_compr: {
9612 "intrinsic not supported on subtarget",
DL.getDebugLoc());
9635 unsigned Opc =
Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9638 case Intrinsic::amdgcn_s_barrier:
9639 case Intrinsic::amdgcn_s_barrier_signal:
9640 case Intrinsic::amdgcn_s_barrier_wait: {
9643 unsigned WGSize =
ST.getFlatWorkGroupSizes(MF.
getFunction()).second;
9644 if (WGSize <=
ST.getWavefrontSize()) {
9647 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal)
9648 return Op.getOperand(0);
9651 MVT::Other,
Op.getOperand(0)),
9656 if (
ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
9662 MVT::Other, K,
Op.getOperand(0)),
9674 case Intrinsic::amdgcn_struct_tbuffer_store:
9675 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9679 VData = handleD16VData(VData, DAG);
9680 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9681 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
9699 M->getMemoryVT(),
M->getMemOperand());
9702 case Intrinsic::amdgcn_raw_tbuffer_store:
9703 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9707 VData = handleD16VData(VData, DAG);
9708 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9709 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
9727 M->getMemoryVT(),
M->getMemOperand());
9730 case Intrinsic::amdgcn_raw_buffer_store:
9731 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9732 case Intrinsic::amdgcn_raw_buffer_store_format:
9733 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9734 const bool IsFormat =
9735 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9736 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9743 VData = handleD16VData(VData, DAG);
9753 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9754 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
9774 return handleByteShortBufferStores(DAG, VDataVT,
DL, Ops, M);
9777 M->getMemoryVT(),
M->getMemOperand());
9780 case Intrinsic::amdgcn_struct_buffer_store:
9781 case Intrinsic::amdgcn_struct_ptr_buffer_store:
9782 case Intrinsic::amdgcn_struct_buffer_store_format:
9783 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9784 const bool IsFormat =
9785 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9786 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9794 VData = handleD16VData(VData, DAG);
9804 auto Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9805 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
9826 return handleByteShortBufferStores(DAG, VDataType,
DL, Ops, M);
9829 M->getMemoryVT(),
M->getMemOperand());
9831 case Intrinsic::amdgcn_raw_buffer_load_lds:
9832 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9833 case Intrinsic::amdgcn_struct_buffer_load_lds:
9834 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9838 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9839 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9840 unsigned OpOffset = HasVIndex ? 1 : 0;
9841 SDValue VOffset =
Op.getOperand(5 + OpOffset);
9843 unsigned Size =
Op->getConstantOperandVal(4);
9849 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9850 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9851 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9852 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9855 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9856 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9857 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9858 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9861 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9862 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9863 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9864 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
9869 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
9870 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
9871 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
9872 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
9877 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
9878 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
9879 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
9880 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
9888 if (HasVIndex && HasVOffset)
9894 else if (HasVOffset)
9897 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9902 unsigned Aux =
Op.getConstantOperandVal(8 + OpOffset);
9914 auto *
M = cast<MemSDNode>(
Op);
9941 case Intrinsic::amdgcn_global_load_lds: {
9943 unsigned Size =
Op->getConstantOperandVal(4);
9948 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
9951 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
9954 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
9959 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
9964 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
9968 auto *
M = cast<MemSDNode>(
Op);
9981 if (
LHS->isDivergent())
9985 RHS.getOperand(0).getValueType() == MVT::i32) {
9988 VOffset =
RHS.getOperand(0);
9993 if (!
Addr->isDivergent()) {
10010 LoadPtrI.
Offset =
Op->getConstantOperandVal(5);
10030 case Intrinsic::amdgcn_end_cf:
10032 Op->getOperand(2), Chain),
10034 case Intrinsic::amdgcn_s_barrier_init:
10035 case Intrinsic::amdgcn_s_barrier_signal_var: {
10042 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
10043 ? AMDGPU::S_BARRIER_INIT_M0
10044 : AMDGPU::S_BARRIER_SIGNAL_M0;
10059 constexpr unsigned ShAmt = 16;
10071 case Intrinsic::amdgcn_s_barrier_join:
10072 case Intrinsic::amdgcn_s_wakeup_barrier: {
10079 if (isa<ConstantSDNode>(BarOp)) {
10080 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
10081 switch (IntrinsicID) {
10084 case Intrinsic::amdgcn_s_barrier_join:
10085 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
10087 case Intrinsic::amdgcn_s_wakeup_barrier:
10088 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
10092 unsigned BarID = (BarVal >> 4) & 0x3F;
10097 switch (IntrinsicID) {
10100 case Intrinsic::amdgcn_s_barrier_join:
10101 Opc = AMDGPU::S_BARRIER_JOIN_M0;
10103 case Intrinsic::amdgcn_s_wakeup_barrier:
10104 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
10121 case Intrinsic::amdgcn_s_prefetch_data: {
10124 return Op.getOperand(0);
10127 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
10129 Chain, bufferRsrcPtrToVector(
Op.getOperand(2), DAG),
10136 Op->getVTList(), Ops,
M->getMemoryVT(),
10137 M->getMemOperand());
10142 return lowerImage(
Op, ImageDimIntr, DAG,
true);
std::pair<SDValue, SDValue>
  if ((C1 = dyn_cast<ConstantSDNode>(N0)))
    C1 = cast<ConstantSDNode>(N0.getOperand(1));
    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      Overflow += ImmOffset;
      auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
      SDValue Ops[] = {N0, OverflowVal};
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                        Align Alignment) const {
  if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
    if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
    int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
        TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
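// Editorial note: both helpers above split a combined byte offset into the
// (SOffset, instruction immediate) pair used by MUBUF addressing. A constant
// that fits the immediate field is used directly; otherwise the part that
// does not fit (the Overflow computed above) is moved into SOffset or a
// register add so the immediate stays within the encodable range. Summary
// inferred from the fragments and from splitMUBUFOffset's role.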
SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
    return MaybePointer;
  SDValue NumRecords = Op->getOperand(3);
  auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
  std::optional<uint32_t> ConstStride = std::nullopt;
  if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
    ConstStride = ConstNode->getZExtValue();
  if (!ConstStride || *ConstStride != 0) {
      ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
                                NewHighHalf, NumRecords, Flags);
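// Editorial note: the v4i32 buffer resource assembled above follows the
// usual SRD layout (stated as background, not quoted from this file):
//   dword 0: base address [31:0]
//   dword 1: base address [47:32] in the low bits, stride in bits [29:16]
//            (hence the ConstStride << 16 above)
//   dword 2: num_records
//   dword 3: descriptor flags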
10296 bool IsTFE)
const {
10306 SDValue Op = getMemIntrinsicNode(Opc,
DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
10334 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
10338 Ops[1] = BufferStoreExt;
10343 M->getMemOperand());
10368 DAGCombinerInfo &DCI)
const {
10384 if ((MemVT.
isSimple() && !DCI.isAfterLegalizeDAG()) ||
10391 "unexpected vector extload");
10404 "unexpected fp extload");
10422 DCI.AddToWorklist(Cvt.
getNode());
10427 DCI.AddToWorklist(Cvt.
getNode());
10438 if (
Info.isEntryFunction())
10439 return Info.getUserSGPRInfo().hasFlatScratchInit();
10447 EVT MemVT =
Load->getMemoryVT();
10460 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10488 assert(
Op.getValueType().getVectorElementType() == MVT::i32 &&
10489 "Custom lowering for non-i32 vectors hasn't been implemented.");
10492 unsigned AS =
Load->getAddressSpace();
10516 Alignment >=
Align(4) && NumElements < 32) {
10530 if (NumElements > 4)
10549 if (NumElements > 2)
10554 if (NumElements > 4)
10566 auto Flags =
Load->getMemOperand()->getFlags();
10568 Load->getAlign(), Flags, &
Fast) &&
10577 MemVT, *
Load->getMemOperand())) {
EVT VT = Op.getValueType();
EVT VT = Op.getValueType();
bool AllowInaccurateRcp =
if (!AllowInaccurateRcp && VT != MVT::f16)
  if (CLHS->isExactlyValue(1.0)) {
  if (CLHS->isExactlyValue(-1.0)) {
if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
EVT VT = Op.getValueType();
bool AllowInaccurateDiv =
if (!AllowInaccurateDiv)
return DAG.getNode(Opcode, SL, VT, A, B, Flags);
return DAG.getNode(Opcode, SL, VTList,
return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
return DAG.getNode(Opcode, SL, VTList,
if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
  return FastLowered;
unsigned FMADOpCode =
SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
const APFloat K0Val(0x1p+96f);
const APFloat K1Val(0x1p-32f);
assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
  return FastLowered;
Flags.setNoFPExcept(true);
using namespace AMDGPU::Hwreg;
const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
const bool HasDynamicDenormals =
if (!PreservesDenormals) {
  if (HasDynamicDenormals) {
    SavedDenormMode = SDValue(GetReg, 0);
  const SDValue EnableDenormValue =
  const SDValue EnableDenormValue =
  EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
                                    {EnableDenormValue, BitField, Glue});
                         ApproxRcp, One, NegDivScale0, Flags);
                         ApproxRcp, Fma0, Flags);
                         NumeratorScaled, Mul, Flags);
                         NumeratorScaled, Fma3, Flags);
if (!PreservesDenormals) {
                  DisableDenormValue, Fma4.getValue(2))
  assert(HasDynamicDenormals == (bool)SavedDenormMode);
  const SDValue DisableDenormValue =
      HasDynamicDenormals
          AMDGPU::S_SETREG_B32, SL, MVT::Other,
                           {Fma4, Fma1, Fma3, Scale}, Flags);
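// Editorial note: the f32 path above wraps the DIV_SCALE / FMA / DIV_FMAS /
// DIV_FIXUP sequence with denormal-mode management. When the function does
// not already preserve f32 denormals they are enabled around the core FMA
// sequence -- via S_DENORM_MODE on targets that have it, or S_SETREG_B32 on
// the MODE register otherwise -- and restored afterwards (to the saved
// dynamic mode when "dynamic" denormal handling is in effect). Summary
// inferred from the fragments.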
if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
  return FastLowered;
11044 EVT VT =
Op.getValueType();
11046 if (VT == MVT::f32)
11047 return LowerFDIV32(
Op, DAG);
11049 if (VT == MVT::f64)
11050 return LowerFDIV64(
Op, DAG);
11052 if (VT == MVT::f16)
11053 return LowerFDIV16(
Op, DAG);
11062 EVT ResultExpVT =
Op->getValueType(1);
11063 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
11093 if (VT == MVT::i1) {
11097 Store->getBasePtr(), MVT::i1,
Store->getMemOperand());
11101 Store->getValue().getValueType().getScalarType() == MVT::i32);
11103 unsigned AS =
Store->getAddressSpace();
11122 if (NumElements > 4)
11129 VT, *
Store->getMemOperand()))
11139 if (NumElements > 2)
11143 if (NumElements > 4 ||
11152 auto Flags =
Store->getMemOperand()->getFlags();
11187 MVT VT =
Op.getValueType().getSimpleVT();
11358 EVT VT =
Op.getValueType();
11375 switch (
Op.getOpcode()) {
11402 EVT VT =
Op.getValueType();
11410 Op->getVTList(), Ops, VT,
11419SITargetLowering::performUCharToFloatCombine(
SDNode *
N,
11420 DAGCombinerInfo &DCI)
const {
11421 EVT VT =
N->getValueType(0);
11423 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11430 EVT SrcVT = Src.getValueType();
11436 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11439 DCI.AddToWorklist(Cvt.
getNode());
11442 if (ScalarVT != MVT::f32) {
11454 DAGCombinerInfo &DCI)
const {
11455 SDValue MagnitudeOp =
N->getOperand(0);
11456 SDValue SignOp =
N->getOperand(1);
11512SDValue SITargetLowering::performSHLPtrCombine(
SDNode *
N,
unsigned AddrSpace,
11514 DAGCombinerInfo &DCI)
const {
11544 AM.HasBaseReg =
true;
11545 AM.BaseOffs =
Offset.getSExtValue();
11550 EVT VT =
N->getValueType(0);
11556 Flags.setNoUnsignedWrap(
11557 N->getFlags().hasNoUnsignedWrap() &&
11567 switch (
N->getOpcode()) {
11578 DAGCombinerInfo &DCI)
const {
11587 SDValue NewPtr = performSHLPtrCombine(
Ptr.getNode(),
N->getAddressSpace(),
11588 N->getMemoryVT(), DCI);
11592 NewOps[PtrIdx] = NewPtr;
11601 return (Opc ==
ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11602 (Opc ==
ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11611SDValue SITargetLowering::splitBinaryBitConstantOp(
11612 DAGCombinerInfo &DCI,
const SDLoc &SL,
unsigned Opc,
SDValue LHS,
11632 if (V.getValueType() != MVT::i1)
11634 switch (V.getOpcode()) {
11653 if (!(
C & 0x000000ff))
11654 ZeroByteMask |= 0x000000ff;
11655 if (!(
C & 0x0000ff00))
11656 ZeroByteMask |= 0x0000ff00;
11657 if (!(
C & 0x00ff0000))
11658 ZeroByteMask |= 0x00ff0000;
11659 if (!(
C & 0xff000000))
11660 ZeroByteMask |= 0xff000000;
11661 uint32_t NonZeroByteMask = ~ZeroByteMask;
11662 if ((NonZeroByteMask &
C) != NonZeroByteMask)
11675 assert(V.getValueSizeInBits() == 32);
11677 if (V.getNumOperands() != 2)
11686 switch (V.getOpcode()) {
11691 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
11696 return (0x03020100 & ~ConstMask) | ConstMask;
11703 return uint32_t((0x030201000c0c0c0cull <<
C) >> 32);
11709 return uint32_t(0x0c0c0c0c03020100ull >>
C);
11716 DAGCombinerInfo &DCI)
const {
11717 if (DCI.isBeforeLegalize())
11721 EVT VT =
N->getValueType(0);
11726 if (VT == MVT::i64 && CRHS) {
11732 if (CRHS && VT == MVT::i32) {
11741 if (
auto *CShift = dyn_cast<ConstantSDNode>(
LHS->getOperand(1))) {
11742 unsigned Shift = CShift->getZExtValue();
11744 unsigned Offset = NB + Shift;
11745 if ((
Offset & (Bits - 1)) == 0) {
11763 isa<ConstantSDNode>(
LHS.getOperand(2))) {
11769 Sel = (
LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
11784 if (
Y.getOpcode() !=
ISD::FABS ||
Y.getOperand(0) !=
X ||
11789 if (
X !=
LHS.getOperand(1))
11794 dyn_cast<ConstantFPSDNode>(
RHS.getOperand(1));
11827 (
RHS.getOperand(0) ==
LHS.getOperand(0) &&
11828 LHS.getOperand(0) ==
LHS.getOperand(1))) {
11830 unsigned NewMask = LCC ==
ISD::SETO ?
Mask->getZExtValue() & ~OrdMask
11831 :
Mask->getZExtValue() & OrdMask;
    N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
  if (LHSMask != ~0u && RHSMask != ~0u) {
    if (LHSMask > RHSMask) {
    uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
    uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
    if (!(LHSUsedLanes & RHSUsedLanes) &&
        !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
      for (unsigned I = 0; I < 32; I += 8) {
        if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
          Mask &= (0x0c << I) & 0xffffffff;
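// Editorial note on the V_PERM_B32 masks used above (hardware background,
// not quoted from this file): each byte of the mask selects one byte of the
// {src0, src1} pair -- values 0-3 pick bytes of src1, 4-7 pick bytes of
// src0 -- and the value 0x0c yields a constant 0x00 byte. That is why
// 0x0c0c0c0c marks "unused" byte lanes here: two permutes feeding an AND can
// be folded into a single V_PERM only if their used byte lanes do not
// overlap.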
11940static const std::optional<ByteProvider<SDValue>>
11942 unsigned Depth = 0) {
11945 return std::nullopt;
11947 if (
Op.getValueSizeInBits() < 8)
11948 return std::nullopt;
11950 if (
Op.getValueType().isVector())
11953 switch (
Op->getOpcode()) {
11964 auto *VTSign = cast<VTSDNode>(
Op->getOperand(1));
11965 NarrowVT = VTSign->getVT();
11968 return std::nullopt;
11971 if (SrcIndex >= NarrowByteWidth)
11972 return std::nullopt;
11978 auto *ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
11980 return std::nullopt;
11982 uint64_t BitShift = ShiftOp->getZExtValue();
11984 if (BitShift % 8 != 0)
11985 return std::nullopt;
11987 SrcIndex += BitShift / 8;
12005static const std::optional<ByteProvider<SDValue>>
12007 unsigned StartingIndex = 0) {
12011 return std::nullopt;
12013 unsigned BitWidth =
Op.getScalarValueSizeInBits();
12015 return std::nullopt;
12017 return std::nullopt;
12019 bool IsVec =
Op.getValueType().isVector();
12020 switch (
Op.getOpcode()) {
12023 return std::nullopt;
12028 return std::nullopt;
12032 return std::nullopt;
12035 if (!
LHS->isConstantZero() && !
RHS->isConstantZero())
12036 return std::nullopt;
12037 if (!
LHS ||
LHS->isConstantZero())
12039 if (!
RHS ||
RHS->isConstantZero())
12041 return std::nullopt;
12046 return std::nullopt;
12048 auto *BitMaskOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12050 return std::nullopt;
12052 uint32_t BitMask = BitMaskOp->getZExtValue();
12054 uint32_t IndexMask = 0xFF << (Index * 8);
12056 if ((IndexMask & BitMask) != IndexMask) {
12059 if (IndexMask & BitMask)
12060 return std::nullopt;
12069 return std::nullopt;
12072 auto *ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(2));
12073 if (!ShiftOp ||
Op.getValueType().isVector())
12074 return std::nullopt;
12076 uint64_t BitsProvided =
Op.getValueSizeInBits();
12077 if (BitsProvided % 8 != 0)
12078 return std::nullopt;
12080 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
12082 return std::nullopt;
12084 uint64_t ConcatSizeInBytes = BitsProvided / 4;
12085 uint64_t ByteShift = BitShift / 8;
12087 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
12088 uint64_t BytesProvided = BitsProvided / 8;
12089 SDValue NextOp =
Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
12090 NewIndex %= BytesProvided;
12097 return std::nullopt;
12099 auto *ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12101 return std::nullopt;
12103 uint64_t BitShift = ShiftOp->getZExtValue();
12105 return std::nullopt;
12107 auto BitsProvided =
Op.getScalarValueSizeInBits();
12108 if (BitsProvided % 8 != 0)
12109 return std::nullopt;
12111 uint64_t BytesProvided = BitsProvided / 8;
12112 uint64_t ByteShift = BitShift / 8;
12117 return BytesProvided - ByteShift > Index
12125 return std::nullopt;
12127 auto *ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12129 return std::nullopt;
12131 uint64_t BitShift = ShiftOp->getZExtValue();
12132 if (BitShift % 8 != 0)
12133 return std::nullopt;
12134 uint64_t ByteShift = BitShift / 8;
12140 return Index < ByteShift
12143 Depth + 1, StartingIndex);
12152 return std::nullopt;
12159 auto *VTSign = cast<VTSDNode>(
Op->getOperand(1));
12160 NarrowBitWidth = VTSign->getVT().getSizeInBits();
12162 if (NarrowBitWidth % 8 != 0)
12163 return std::nullopt;
12164 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12166 if (Index >= NarrowByteWidth)
12168 ? std::optional<ByteProvider<SDValue>>(
12176 return std::nullopt;
12180 if (NarrowByteWidth >= Index) {
12185 return std::nullopt;
12192 return std::nullopt;
12196 auto *L = cast<LoadSDNode>(
Op.getNode());
12198 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
12199 if (NarrowBitWidth % 8 != 0)
12200 return std::nullopt;
12201 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12206 if (Index >= NarrowByteWidth) {
12208 ? std::optional<ByteProvider<SDValue>>(
12213 if (NarrowByteWidth > Index) {
12217 return std::nullopt;
12222 return std::nullopt;
12225 Depth + 1, StartingIndex);
12229 auto *IdxOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12231 return std::nullopt;
12232 auto VecIdx = IdxOp->getZExtValue();
12233 auto ScalarSize =
Op.getScalarValueSizeInBits();
12234 if (ScalarSize < 32)
12235 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
12237 StartingIndex, Index);
12242 return std::nullopt;
12244 auto *PermMask = dyn_cast<ConstantSDNode>(
Op->getOperand(2));
12246 return std::nullopt;
12249 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
12250 if (IdxMask > 0x07 && IdxMask != 0x0c)
12251 return std::nullopt;
12253 auto NextOp =
Op.getOperand(IdxMask > 0x03 ? 0 : 1);
12254 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12256 return IdxMask != 0x0c ?
calculateSrcByte(NextOp, StartingIndex, NextIndex)
12262 return std::nullopt;
12277 return !OpVT.
isVector() && OpVT.getSizeInBits() == 16;
12281 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12284 auto MemVT = L->getMemoryVT();
12287 return L->getMemoryVT().getSizeInBits() == 16;
12297 int Low8 = Mask & 0xff;
12298 int Hi8 = (Mask & 0xff00) >> 8;
12300 assert(Low8 < 8 && Hi8 < 8);
12302 bool IsConsecutive = (Hi8 - Low8 == 1);
12307 bool Is16Aligned = !(Low8 % 2);
12309 return IsConsecutive && Is16Aligned;
12317 int Low16 = PermMask & 0xffff;
12318 int Hi16 = (PermMask & 0xffff0000) >> 16;
12328 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12330 if (!OtherOpIs16Bit)
                               unsigned DWordOffset) {
  auto TypeSize = Src.getValueSizeInBits().getFixedValue();
  assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
  if (Src.getValueType().isVector()) {
    auto ScalarTySize = Src.getScalarValueSizeInBits();
    auto ScalarTy = Src.getValueType().getScalarType();
    if (ScalarTySize == 32) {
    if (ScalarTySize > 32) {
          DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
      auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
    assert(ScalarTySize < 32);
    auto NumElements = TypeSize / ScalarTySize;
    auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
    auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
    auto NumElementsIn32 = 32 / ScalarTySize;
    auto NumAvailElements = DWordOffset < Trunc32Elements
                                : NumElements - NormalizedTrunc;
  auto ShiftVal = 32 * DWordOffset;
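// Editorial note: the helper above extracts the DWordOffset'th 32-bit chunk
// of an arbitrarily typed value for the v_perm-based byte combines: 32-bit
// vector elements are extracted directly, wider elements are extracted and
// then shifted/truncated, sub-32-bit element vectors are re-bitcast so a
// whole dword can be pulled out, and scalars fall through to the plain
// shift-by-32*DWordOffset at the end. Summary inferred from the fragments.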
12394 [[maybe_unused]]
EVT VT =
N->getValueType(0);
12399 for (
int i = 0; i < 4; i++) {
12401 std::optional<ByteProvider<SDValue>>
P =
12404 if (!
P ||
P->isConstantZero())
12409 if (PermNodes.
size() != 4)
12412 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12413 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12415 for (
size_t i = 0; i < PermNodes.
size(); i++) {
12416 auto PermOp = PermNodes[i];
12419 int SrcByteAdjust = 4;
12423 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
12424 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12426 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12427 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12431 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12432 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12435 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12437 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12440 SDValue Op = *PermNodes[FirstSrc.first].Src;
12442 assert(
Op.getValueSizeInBits() == 32);
12446 int Low16 = PermMask & 0xffff;
12447 int Hi16 = (PermMask & 0xffff0000) >> 16;
12449 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
12450 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
12453 if (WellFormedLow && WellFormedHi)
12457 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src :
Op;
12466 assert(
Op.getValueType().isByteSized() &&
12484 DAGCombinerInfo &DCI)
const {
12489 EVT VT =
N->getValueType(0);
12490 if (VT == MVT::i1) {
12495 if (Src !=
RHS.getOperand(0))
12500 if (!CLHS || !CRHS)
12504 static const uint32_t MaxMask = 0x3ff;
12519 isa<ConstantSDNode>(
LHS.getOperand(2))) {
12524 Sel |=
LHS.getConstantOperandVal(2);
12533 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12537 auto usesCombinedOperand = [](
SDNode *OrUse) {
12540 !OrUse->getValueType(0).isVector())
12544 for (
auto *VUser : OrUse->users()) {
12545 if (!VUser->getValueType(0).isVector())
12552 if (VUser->getOpcode() == VectorwiseOp)
12558 if (!
any_of(
N->users(), usesCombinedOperand))
12564 if (LHSMask != ~0u && RHSMask != ~0u) {
12567 if (LHSMask > RHSMask) {
12574 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12575 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12578 if (!(LHSUsedLanes & RHSUsedLanes) &&
12581 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12583 LHSMask &= ~RHSUsedLanes;
12584 RHSMask &= ~LHSUsedLanes;
12586 LHSMask |= LHSUsedLanes & 0x04040404;
12596 if (LHSMask == ~0u || RHSMask == ~0u) {
12602 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12617 if (SrcVT == MVT::i32) {
12622 DCI.AddToWorklist(LowOr.
getNode());
12623 DCI.AddToWorklist(HiBits.getNode());
12631 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(
N->getOperand(1));
12634 N->getOperand(0), CRHS))
12642 DAGCombinerInfo &DCI)
const {
12643 if (
SDValue RV = reassociateScalarOps(
N, DCI.DAG))
12652 EVT VT =
N->getValueType(0);
12653 if (CRHS && VT == MVT::i64) {
12675 LHS->getOperand(0), FNegLHS, FNegRHS);
12684 DAGCombinerInfo &DCI)
const {
12689 EVT VT =
N->getValueType(0);
12690 if (VT != MVT::i32)
12694 if (Src.getValueType() != MVT::i16)
12701SITargetLowering::performSignExtendInRegCombine(
SDNode *
N,
12702 DAGCombinerInfo &DCI)
const {
12704 auto *VTSign = cast<VTSDNode>(
N->getOperand(1));
12709 VTSign->getVT() == MVT::i8) ||
12711 VTSign->getVT() == MVT::i16))) {
12713 "s_buffer_load_{u8, i8} are supported "
12714 "in GFX12 (or newer) architectures.");
12715 EVT VT = Src.getValueType();
12720 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12726 auto *
M = cast<MemSDNode>(Src);
12727 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12728 Opc,
DL, ResList, Ops,
M->getMemoryVT(),
M->getMemOperand());
12733 VTSign->getVT() == MVT::i8) ||
12735 VTSign->getVT() == MVT::i16)) &&
12737 auto *
M = cast<MemSDNode>(Src);
12738 SDValue Ops[] = {Src.getOperand(0),
12744 Src.getOperand(6), Src.getOperand(7)};
12747 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
12751 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
12752 Opc,
SDLoc(
N), ResList, Ops,
M->getMemoryVT(),
M->getMemOperand());
12753 return DCI.DAG.getMergeValues(
12760 DAGCombinerInfo &DCI)
const {
12768 if (
N->getOperand(0).isUndef())
12775 DAGCombinerInfo &DCI)
const {
12776 EVT VT =
N->getValueType(0);
12802 unsigned Opcode =
Op.getOpcode();
12806 if (
auto *CFP = dyn_cast<ConstantFPSDNode>(
Op)) {
12807 const auto &
F = CFP->getValueAPF();
12808 if (
F.isNaN() &&
F.isSignaling())
12810 if (!
F.isDenormal())
12873 if (
Op.getValueType() == MVT::i32) {
12878 if (
auto *
RHS = dyn_cast<ConstantSDNode>(
Op.getOperand(1))) {
12879 if (
RHS->getZExtValue() == 0xffff0000) {
12889 return Op.getValueType().getScalarType() != MVT::f16;
12957 if (
Op.getValueType() == MVT::i16) {
12968 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
12970 switch (IntrinsicID) {
12971 case Intrinsic::amdgcn_cvt_pkrtz:
12972 case Intrinsic::amdgcn_cubeid:
12973 case Intrinsic::amdgcn_frexp_mant:
12974 case Intrinsic::amdgcn_fdot2:
12975 case Intrinsic::amdgcn_rcp:
12976 case Intrinsic::amdgcn_rsq:
12977 case Intrinsic::amdgcn_rsq_clamp:
12978 case Intrinsic::amdgcn_rcp_legacy:
12979 case Intrinsic::amdgcn_rsq_legacy:
12980 case Intrinsic::amdgcn_trig_preop:
12981 case Intrinsic::amdgcn_log:
12982 case Intrinsic::amdgcn_exp2:
12983 case Intrinsic::amdgcn_sqrt:
13004 unsigned Opcode =
MI->getOpcode();
13006 if (Opcode == AMDGPU::G_FCANONICALIZE)
13009 std::optional<FPValueAndVReg> FCR;
13012 if (FCR->Value.isSignaling())
13014 if (!FCR->Value.isDenormal())
13025 case AMDGPU::G_FADD:
13026 case AMDGPU::G_FSUB:
13027 case AMDGPU::G_FMUL:
13028 case AMDGPU::G_FCEIL:
13029 case AMDGPU::G_FFLOOR:
13030 case AMDGPU::G_FRINT:
13031 case AMDGPU::G_FNEARBYINT:
13032 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
13033 case AMDGPU::G_INTRINSIC_TRUNC:
13034 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
13035 case AMDGPU::G_FMA:
13036 case AMDGPU::G_FMAD:
13037 case AMDGPU::G_FSQRT:
13038 case AMDGPU::G_FDIV:
13039 case AMDGPU::G_FREM:
13040 case AMDGPU::G_FPOW:
13041 case AMDGPU::G_FPEXT:
13042 case AMDGPU::G_FLOG:
13043 case AMDGPU::G_FLOG2:
13044 case AMDGPU::G_FLOG10:
13045 case AMDGPU::G_FPTRUNC:
13046 case AMDGPU::G_AMDGPU_RCP_IFLAG:
13047 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
13048 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
13049 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
13050 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
13052 case AMDGPU::G_FNEG:
13053 case AMDGPU::G_FABS:
13054 case AMDGPU::G_FCOPYSIGN:
13056 case AMDGPU::G_FMINNUM:
13057 case AMDGPU::G_FMAXNUM:
13058 case AMDGPU::G_FMINNUM_IEEE:
13059 case AMDGPU::G_FMAXNUM_IEEE:
13060 case AMDGPU::G_FMINIMUM:
13061 case AMDGPU::G_FMAXIMUM: {
13069 case AMDGPU::G_BUILD_VECTOR:
13074 case AMDGPU::G_INTRINSIC:
13075 case AMDGPU::G_INTRINSIC_CONVERGENT:
13077 case Intrinsic::amdgcn_fmul_legacy:
13078 case Intrinsic::amdgcn_fmad_ftz:
13079 case Intrinsic::amdgcn_sqrt:
13080 case Intrinsic::amdgcn_fmed3:
13081 case Intrinsic::amdgcn_sin:
13082 case Intrinsic::amdgcn_cos:
13083 case Intrinsic::amdgcn_log:
13084 case Intrinsic::amdgcn_exp2:
13085 case Intrinsic::amdgcn_log_clamp:
13086 case Intrinsic::amdgcn_rcp:
13087 case Intrinsic::amdgcn_rcp_legacy:
13088 case Intrinsic::amdgcn_rsq:
13089 case Intrinsic::amdgcn_rsq_clamp:
13090 case Intrinsic::amdgcn_rsq_legacy:
13091 case Intrinsic::amdgcn_div_scale:
13092 case Intrinsic::amdgcn_div_fmas:
13093 case Intrinsic::amdgcn_div_fixup:
13094 case Intrinsic::amdgcn_fract:
13095 case Intrinsic::amdgcn_cvt_pkrtz:
13096 case Intrinsic::amdgcn_cubeid:
13097 case Intrinsic::amdgcn_cubema:
13098 case Intrinsic::amdgcn_cubesc:
13099 case Intrinsic::amdgcn_cubetc:
13100 case Intrinsic::amdgcn_frexp_mant:
13101 case Intrinsic::amdgcn_fdot2:
13102 case Intrinsic::amdgcn_trig_preop:
13121 if (
C.isDenormal()) {
13135 if (
C.isSignaling()) {
13154 return Op.isUndef() || isa<ConstantFPSDNode>(
Op);
13158SITargetLowering::performFCanonicalizeCombine(
SDNode *
N,
13159 DAGCombinerInfo &DCI)
const {
13162 EVT VT =
N->getValueType(0);
13171 EVT VT =
N->getValueType(0);
13172 return getCanonicalConstantFP(DAG,
SDLoc(
N), VT, CFP->getValueAPF());
13188 EVT EltVT =
Lo.getValueType();
13191 for (
unsigned I = 0;
I != 2; ++
I) {
13195 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
13196 }
else if (
Op.isUndef()) {
13208 if (isa<ConstantFPSDNode>(NewElts[1]))
13209 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
13215 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
if (!MinK || !MaxK)
if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
  return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
if (Info->getMode().DX10Clamp) {
if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
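// Editorial note: the med3 combines above rewrite clamp patterns such as
// min(max(x, K0), K1) with K0 <= K1 into a single v_med3 when the type has a
// med3 instruction -- i32/f32 everywhere, i16/f16 only on subtargets with
// 16-bit med3 (hasMed3_16). The min3/max3 availability checks just above play
// the same role for the three-operand min/max combines. Wording inferred from
// the fragments.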
13381 DAGCombinerInfo &DCI)
const {
13384 EVT VT =
N->getValueType(0);
13385 unsigned Opc =
N->getOpcode();
13414 if (
SDValue Med3 = performIntMed3ImmCombine(
13419 if (
SDValue Med3 = performIntMed3ImmCombine(
13425 if (
SDValue Med3 = performIntMed3ImmCombine(
13430 if (
SDValue Med3 = performIntMed3ImmCombine(
13440 (VT == MVT::f32 || VT == MVT::f64 ||
13444 if (
SDValue Res = performFPMed3ImmCombine(DAG,
SDLoc(
N), Op0, Op1))
13455 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13456 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
13465 DAGCombinerInfo &DCI)
const {
13466 EVT VT =
N->getValueType(0);
13489 if (
Info->getMode().DX10Clamp) {
13492 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13495 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
13498 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13509 DAGCombinerInfo &DCI)
const {
13513 return DCI.DAG.getUNDEF(
N->getValueType(0));
                                     bool IsDivergentIdx,
  unsigned VecSize = EltSize * NumElem;
  if (VecSize <= 64 && EltSize < 32)
  if (IsDivergentIdx)
  unsigned NumInsts = NumElem + ((EltSize + 31) / 32) * NumElem;
  return NumInsts <= 16;
  return NumInsts <= 15;
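// Editorial note (a low-confidence reading of the heuristic above): expanding
// a dynamically indexed extract/insert into a compare + select chain costs
// roughly one compare per element plus one 32-bit select per dword of each
// element, which is what NumInsts estimates; the expansion is only treated as
// profitable below a small instruction budget (the <= 16 / <= 15 thresholds),
// otherwise the indexed access is left for register-indexing lowering.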
13560 if (isa<ConstantSDNode>(Idx))
13574SITargetLowering::performExtractVectorEltCombine(SDNode *N,
13575 DAGCombinerInfo &DCI) const {
13581 EVT ResVT = N->getValueType(0);
13600 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13628 DCI.AddToWorklist(Elt0.getNode());
13629 DCI.AddToWorklist(Elt1.getNode());
13651 if (!DCI.isBeforeLegalize())
13657 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
13658 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
13659 VecSize > 32 && VecSize % 32 == 0 && Idx) {
13662 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
13663 unsigned EltIdx = BitIndex / 32;
13664 unsigned LeftoverBitIdx = BitIndex % 32;
13668 DCI.AddToWorklist(Cast.getNode());
13672 DCI.AddToWorklist(Elt.getNode());
13675 DCI.AddToWorklist(Srl.getNode());
13679 DCI.AddToWorklist(Trunc.getNode());
13681 if (VecEltVT == ResVT) {
13693SITargetLowering::performInsertVectorEltCombine(SDNode *N,
13694 DAGCombinerInfo &DCI) const {
13708 EVT IdxVT = Idx.getValueType();
13725 Src.getOperand(0).getValueType() == MVT::f16) {
13726 return Src.getOperand(0);
13729 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13730 APFloat Val = CFP->getValueAPF();
13731 bool LosesInfo = true;
13741 DAGCombinerInfo &DCI) const {
13743 "combine only useful on gfx8");
13745 SDValue TruncSrc = N->getOperand(0);
13746 EVT VT = N->getValueType(0);
13747 if (VT != MVT::f16)
13785unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
13787 const SDNode *N1) const {
13792 if (((VT == MVT::f32 &&
13794 (VT == MVT::f16 && Subtarget->hasMadF16() &&
13814 EVT VT = N->getValueType(0);
13815 if (VT != MVT::i32 && VT != MVT::i64)
13821 unsigned Opc = N->getOpcode();
13844 return DAG.getNode(Opc, SL, VT, Add1, Op2);
13864 DAGCombinerInfo &DCI) const {
13868 EVT VT = N->getValueType(0);
13878 if (!N->isDivergent() && Subtarget->hasSMulHi())
13882 if (NumBits <= 32 || NumBits > 64)
13894 unsigned NumUsers = 0;
13919 bool MulSignedLo =
false;
13920 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
13929 if (VT != MVT::i64) {
13952 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
13954 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
13955 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
13957 if (!MulLHSUnsigned32) {
13964 if (!MulRHSUnsigned32) {
13975 if (VT != MVT::i64)
13982static std::optional<ByteProvider<SDValue>>
13985 if (!Byte0 || Byte0->isConstantZero()) {
13986 return std::nullopt;
13989 if (Byte1 && !Byte1->isConstantZero()) {
13990 return std::nullopt;
13996 unsigned FirstCs = First & 0x0c0c0c0c;
13997 unsigned SecondCs = Second & 0x0c0c0c0c;
13998 unsigned FirstNoCs = First & ~0x0c0c0c0c;
13999 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
13999 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
14001 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
14002 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
14003 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
14004 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
14006 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
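An illustrative, self-contained sketch of what the mask arithmetic above computes (the helper name below is assumed, not part of the listing): a 0x0c selector byte in a V_PERM-style mask produces a constant-zero byte, and the asserts require that every byte lane is zeroed by at least one of the two inputs, so the real selectors never collide.

#include <cassert>
#include <cstdint>

// Combine two byte-select masks whose non-zero selectors occupy disjoint lanes.
static uint32_t addPermMasksSketch(uint32_t First, uint32_t Second) {
  uint32_t FirstCs = First & 0x0c0c0c0c;   // lanes First forces to zero
  uint32_t SecondCs = Second & 0x0c0c0c0c; // lanes Second forces to zero
  for (int Byte = 0; Byte < 4; ++Byte)     // each lane must be zeroed by one side
    assert(((FirstCs | SecondCs) >> (8 * Byte)) & 0xFF);
  // Keep the real selectors from either side; stay 0x0c only where both sides are 0x0c.
  return (First & ~0x0c0c0c0c) | (Second & ~0x0c0c0c0c) | (FirstCs & SecondCs);
}
// Example: addPermMasksSketch(0x0c0c0100, 0x03020c0c) == 0x03020100.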
14030 for (int BPI = 0; BPI < 2; BPI++) {
14033 BPP = {Src1, Src0};
14035 unsigned ZeroMask = 0x0c0c0c0c;
14036 unsigned FMask = 0xFF << (8 * (3 - Step));
14038 unsigned FirstMask =
14039 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14040 unsigned SecondMask =
14041 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14045 int FirstGroup = -1;
14046 for (int I = 0; I < 2; I++) {
14048 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
14049 return IterElt.SrcOp == *BPP.first.Src &&
14050 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
14060 if (FirstGroup != -1) {
14062 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
14063 return IterElt.SrcOp == *BPP.second.Src &&
14064 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
14070 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
14078 unsigned ZeroMask = 0x0c0c0c0c;
14079 unsigned FMask = 0xFF << (8 * (3 - Step));
14083 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14087 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14096 if (Srcs.size() == 1) {
14097 auto *Elt = Srcs.begin();
14101 if (Elt->PermMask == 0x3020100)
14108 auto *FirstElt = Srcs.begin();
14109 auto *SecondElt = std::next(FirstElt);
14116 auto FirstMask = FirstElt->PermMask;
14117 auto SecondMask = SecondElt->PermMask;
14119 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
14120 unsigned FirstPlusFour = FirstMask | 0x04040404;
14123 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
14135 FirstElt = std::next(SecondElt);
14136 if (FirstElt == Srcs.end())
14139 SecondElt = std::next(FirstElt);
14142 if (SecondElt == Srcs.end()) {
14148 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
14154 return Perms.size() == 2
14160 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
14161 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
14162 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
14163 EntryMask += ZeroMask;
14168 auto Opcode = Op.getOpcode();
14174static std::optional<bool>
14185 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
14188 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
14190 assert(!(S0IsUnsigned && S0IsSigned));
14191 assert(!(S1IsUnsigned && S1IsSigned));
14199 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
14205 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14206 return std::nullopt;
14218 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14219 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14224 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14230 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14231 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14232 return std::nullopt;
14238 DAGCombinerInfo &DCI) const {
14240 EVT VT = N->getValueType(0);
14247 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14252 if (SDValue V = reassociateScalarOps(N, DAG)) {
14259 std::optional<bool> IsSigned;
14265 int ChainLength = 0;
14266 for (int I = 0; I < 4; I++) {
14267 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
14270 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14273 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14278 TempNode->getOperand(MulIdx), *Src0, *Src1,
14279 TempNode->getOperand(MulIdx)->getOperand(0),
14280 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14284 IsSigned = *IterIsSigned;
14285 if (*IterIsSigned != *IsSigned)
14288 auto AddIdx = 1 - MulIdx;
14291 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14292 Src2s.push_back(TempNode->getOperand(AddIdx));
14302 TempNode->getOperand(AddIdx), *Src0, *Src1,
14303 TempNode->getOperand(AddIdx)->getOperand(0),
14304 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14308 if (*IterIsSigned != *IsSigned)
14312 ChainLength = I + 2;
14316 TempNode = TempNode->getOperand(AddIdx);
14318 ChainLength = I + 1;
14319 if (TempNode->getNumOperands() < 2)
14321 LHS = TempNode->getOperand(0);
14322 RHS = TempNode->getOperand(1);
14325 if (ChainLength < 2)
14331 if (ChainLength < 4) {
14341 bool UseOriginalSrc = false;
14342 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
14343 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14344 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14345 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
14347 auto Src0Mask = Src0s.begin()->PermMask;
14348 SrcBytes.push_back(Src0Mask & 0xFF000000);
14349 bool UniqueEntries = true;
14350 for (auto I = 1; I < 4; I++) {
14351 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14354 UniqueEntries = false;
14360 if (UniqueEntries) {
14361 UseOriginalSrc = true;
14363 auto *FirstElt = Src0s.begin();
14367 auto *SecondElt = Src1s.begin();
14369 SecondElt->DWordOffset);
14378 if (!UseOriginalSrc) {
14385 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14388 : Intrinsic::amdgcn_udot4,
14398 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14403 unsigned Opc = LHS.getOpcode();
14408 Opc = RHS.getOpcode();
14415 auto Cond = RHS.getOperand(0);
14423 return DAG.getNode(Opc, SL, VTList, Args);
14437 DAGCombinerInfo &DCI) const {
14439 EVT VT = N->getValueType(0);
14441 if (VT != MVT::i32)
14450 unsigned Opc = RHS.getOpcode();
14457 auto Cond = RHS.getOperand(0);
14465 return DAG.getNode(Opc, SL, VTList, Args);
14480SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
14481 DAGCombinerInfo &DCI) const {
14483 if (N->getValueType(0) != MVT::i32)
14494 unsigned LHSOpc = LHS.getOpcode();
14495 unsigned Opc = N->getOpcode();
14505 DAGCombinerInfo &DCI) const {
14510 EVT VT = N->getValueType(0);
14522 if (A == LHS.getOperand(1)) {
14523 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14524 if (FusedOp != 0) {
14526 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
14534 if (A == RHS.getOperand(1)) {
14535 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14536 if (FusedOp != 0) {
14538 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
14547 DAGCombinerInfo &DCI) const {
14553 EVT VT = N->getValueType(0);
14566 if (A == LHS.getOperand(1)) {
14567 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14568 if (FusedOp != 0) {
14572 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
14581 if (A == RHS.getOperand(1)) {
14582 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14583 if (FusedOp != 0) {
14585 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
14594 DAGCombinerInfo &DCI) const {
14597 EVT VT = N->getValueType(0);
14611 bool IsNegative = false;
14612 if (CLHS->isExactlyValue(1.0) ||
14613 (IsNegative = CLHS->isExactlyValue(-1.0))) {
14629 DAGCombinerInfo &DCI) const {
14631 EVT VT = N->getValueType(0);
14645 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
14660 if (ScalarVT == MVT::f32 &&
14666 if (TrueNodeExpVal == INT_MIN)
14669 if (FalseNodeExpVal == INT_MIN)
14689 DAGCombinerInfo &DCI) const {
14691 EVT VT = N->getValueType(0);
14712 (N->getFlags().hasAllowContract() &&
14713 FMA->getFlags().hasAllowContract())) {
14747 if (Vec1 == Vec2 || Vec3 == Vec4)
14753 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
14762 DAGCombinerInfo &DCI) const {
14768 EVT VT = LHS.getValueType();
14771 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14773 CRHS = dyn_cast<ConstantSDNode>(LHS);
14797 return LHS.getOperand(0);
14803 isa<ConstantSDNode>(LHS.getOperand(1)) &&
14804 isa<ConstantSDNode>(LHS.getOperand(2)) &&
14805 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
14812 const APInt &CT = LHS.getConstantOperandAPInt(1);
14813 const APInt &CF = LHS.getConstantOperandAPInt(2);
14821 return LHS.getOperand(0);
14825 if (VT != MVT::f32 && VT != MVT::f64 &&
14841 const unsigned IsInfMask =
14843 const unsigned IsFiniteMask =
14857SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
14858 DAGCombinerInfo &DCI) const {
14876 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
14880 unsigned ShiftOffset = 8 * Offset;
14882 ShiftOffset -= C->getZExtValue();
14884 ShiftOffset += C->getZExtValue();
14886 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
14888 MVT::f32, Shifted);
14899 DCI.AddToWorklist(N);
14906 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
14912 DAGCombinerInfo &DCI) const {
14922 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
14925 APFloat One(F.getSemantics(), "1.0");
14927 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
14934 switch (N->getOpcode()) {
14950 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
14960 switch (N->getOpcode()) {
14962 return performAddCombine(N, DCI);
14964 return performSubCombine(N, DCI);
14967 return performAddCarrySubCarryCombine(N, DCI);
14969 return performFAddCombine(N, DCI);
14971 return performFSubCombine(N, DCI);
14973 return performFDivCombine(N, DCI);
14975 return performFMulCombine(N, DCI);
14977 return performSetCCCombine(N, DCI);
14990 return performMinMaxCombine(N, DCI);
14992 return performFMACombine(N, DCI);
14994 return performAndCombine(N, DCI);
14996 return performOrCombine(N, DCI);
14999 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
15000 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
15006 return performXorCombine(N, DCI);
15008 return performZeroExtendCombine(N, DCI);
15010 return performSignExtendInRegCombine(N, DCI);
15012 return performClassCombine(N, DCI);
15014 return performFCanonicalizeCombine(N, DCI);
15016 return performRcpCombine(N, DCI);
15031 return performUCharToFloatCombine(N, DCI);
15033 return performFCopySignCombine(N, DCI);
15038 return performCvtF32UByteNCombine(N, DCI);
15040 return performFMed3Combine(N, DCI);
15042 return performCvtPkRTZCombine(N, DCI);
15044 return performClampCombine(N, DCI);
15047 EVT VT = N->getValueType(0);
15050 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
15053 EVT EltVT = Src.getValueType();
15054 if (EltVT != MVT::i16)
15064 return performExtractVectorEltCombine(N, DCI);
15066 return performInsertVectorEltCombine(N, DCI);
15068 return performFPRoundCombine(N, DCI);
15070 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
15076 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
15077 return performMemSDNodeCombine(MemNode, DCI);
15108 unsigned Opcode = Node->getMachineOpcode();
15112 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
15117 unsigned DmaskIdx =
15119 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
15120 unsigned NewDmask = 0;
15123 bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
15124 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
15127 unsigned TFCLane = 0;
15128 bool HasChain = Node->getNumValues() > 1;
15130 if (OldDmask == 0) {
15138 TFCLane = OldBitsSet;
15145 if (Use.getResNo() != 0)
15151 if (!User->isMachineOpcode() ||
15152 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
15164 if (UsesTFC && Lane == TFCLane) {
15169 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
15171 Dmask &= ~(1 << Comp);
15179 NewDmask |= 1 << Comp;
15184 bool NoChannels = !NewDmask;
15191 if (OldBitsSet == 1)
15197 if (NewDmask == OldDmask)
15206 unsigned NewChannels = BitsSet + UsesTFC;
15210 assert(NewOpcode != -1 &&
15211 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
15212 "failed to find equivalent MIMG op");
15220 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
15222 MVT ResultVT = NewChannels == 1
15225 : NewChannels == 5 ? 8
15239 if (NewChannels == 1) {
15249 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
15254 if (i || !NoChannels)
15259 if (NewUser != User) {
15269 Idx = AMDGPU::sub1;
15272 Idx = AMDGPU::sub2;
15275 Idx = AMDGPU::sub3;
15278 Idx = AMDGPU::sub4;
15289 Op = Op.getOperand(0);
15291 return isa<FrameIndexSDNode>(Op);
15301 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
15302 SDValue SrcVal = Node->getOperand(2);
15310 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
15312 SDNode *Glued = Node->getGluedNode();
15314 Node->getOperand(0), SL, VReg, SrcVal,
15320 return ToResultReg.getNode();
15325 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
15333 Node->getOperand(i).getValueType(),
15334 Node->getOperand(i)),
15346 unsigned Opcode = Node->getMachineOpcode();
15348 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
15349 !TII->isGather4(Opcode) &&
15351 return adjustWritemask(Node, DAG);
15354 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
15360 case AMDGPU::V_DIV_SCALE_F32_e64:
15361 case AMDGPU::V_DIV_SCALE_F64_e64: {
15365 SDValue Src0 = Node->getOperand(1);
15366 SDValue Src1 = Node->getOperand(3);
15367 SDValue Src2 = Node->getOperand(5);
15371 (Src0 == Src1 || Src0 == Src2))
15428 unsigned InitIdx = 0;
15430 if (TII->isImage(MI)) {
15438 unsigned TFEVal = TFE ? TFE->getImm() : 0;
15439 unsigned LWEVal = LWE ? LWE->getImm() : 0;
15440 unsigned D16Val = D16 ? D16->getImm() : 0;
15442 if (!TFEVal && !LWEVal)
15453 assert(MO_Dmask && "Expected dmask operand in instruction");
15455 unsigned dmask = MO_Dmask->getImm();
15462 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
15468 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15469 if (DstSize < InitIdx)
15472 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15480 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
15481 unsigned NewDst = 0;
15490 for (; SizeLeft; SizeLeft--, CurrIdx++) {
15491 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
15511 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
15524 if (TII->isVOP3(MI.getOpcode())) {
15526 TII->legalizeOperandsVOP3(MRI, MI);
15531 if (!MI.getDesc().operands().empty()) {
15532 unsigned Opc = MI.getOpcode();
15533 bool HasAGPRs = Info->mayNeedAGPRs();
15541 if ((I == Src2Idx) && (HasAGPRs))
15544 if (!Op.isReg() || !Op.getReg().isVirtual())
15546 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
15547 if (!TRI->hasAGPRs(RC))
15549 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
15550 if (!Src || !Src->isCopy() ||
15551 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
15553 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
15557 MRI.setRegClass(Op.getReg(), NewRC);
15560 if (TII->isMAI(MI)) {
15566 AMDGPU::OpName::scale_src0);
15567 if (Src0Idx != -1) {
15569 AMDGPU::OpName::scale_src1);
15570 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
15571 TII->usesConstantBus(MRI, MI, Src1Idx))
15572 TII->legalizeOpWithMove(MI, Src1Idx);
15580 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
15581 if (Src2->isReg() && Src2->getReg().isVirtual()) {
15582 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
15583 if (TRI->isVectorSuperClass(RC)) {
15584 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
15585 MRI.setRegClass(Src2->getReg(), NewRC);
15586 if (Src2->isTied())
15587 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
15596 if (TII->isImage(MI))
15597 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
15671std::pair<unsigned, const TargetRegisterClass *>
15678 if (Constraint.size() == 1) {
15680 switch (Constraint[0]) {
15687 RC = &AMDGPU::SReg_32RegClass;
15690 RC = &AMDGPU::SGPR_64RegClass;
15695 return std::pair(0U, nullptr);
15702 RC = &AMDGPU::VGPR_32RegClass;
15707 return std::pair(0U, nullptr);
15716 RC = &AMDGPU::AGPR_32RegClass;
15721 return std::pair(0U, nullptr);
15730 return std::pair(0U, RC);
15735 if (RegName.consume_front("v")) {
15736 RC = &AMDGPU::VGPR_32RegClass;
15737 } else if (RegName.consume_front("s")) {
15738 RC = &AMDGPU::SGPR_32RegClass;
15739 } else if (RegName.consume_front("a")) {
15740 RC = &AMDGPU::AGPR_32RegClass;
15745 if (RegName.consume_front("[")) {
15756 return std::pair(0U, nullptr);
15759 RC = TRI->getVGPRClassForBitWidth(Width);
15761 RC = TRI->getSGPRClassForBitWidth(Width);
15763 RC = TRI->getAGPRClassForBitWidth(Width);
15765 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15766 return std::pair(Reg, RC);
15772 return std::pair(0U, nullptr);
15774 if (!Failed && Idx < RC->getNumRegs())
15782 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
15788 if (Constraint.size() == 1) {
15789 switch (Constraint[0]) {
15799 } else if (Constraint == "DA" || Constraint == "DB") {
15807 if (Constraint.size() == 1) {
15808 switch (Constraint[0]) {
15825 Val = Val & maskTrailingOnes<uint64_t>(Size);
15832 std::vector<SDValue> &Ops,
15847 unsigned Size = Op.getScalarValueSizeInBits();
15855 Val = C->getSExtValue();
15859 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15865 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
15868 Val = C->getSExtValue();
15872 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15882 if (Constraint.size() == 1) {
15883 switch (Constraint[0]) {
15887 return isInt<16>(Val);
15891 return isInt<32>(Val);
15898 } else if (Constraint.size() == 2) {
15899 if (Constraint == "DA") {
15900 int64_t HiBits = static_cast<int32_t>(Val >> 32);
15901 int64_t LoBits = static_cast<int32_t>(Val);
15905 if (Constraint == "DB") {
15913 unsigned MaxSize) const {
15914 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
15917 MVT VT = Op.getSimpleValueType();
15942 switch (UnalignedClassID) {
15943 case AMDGPU::VReg_64RegClassID:
15944 return AMDGPU::VReg_64_Align2RegClassID;
15945 case AMDGPU::VReg_96RegClassID:
15946 return AMDGPU::VReg_96_Align2RegClassID;
15947 case AMDGPU::VReg_128RegClassID:
15948 return AMDGPU::VReg_128_Align2RegClassID;
15949 case AMDGPU::VReg_160RegClassID:
15950 return AMDGPU::VReg_160_Align2RegClassID;
15951 case AMDGPU::VReg_192RegClassID:
15952 return AMDGPU::VReg_192_Align2RegClassID;
15953 case AMDGPU::VReg_224RegClassID:
15954 return AMDGPU::VReg_224_Align2RegClassID;
15955 case AMDGPU::VReg_256RegClassID:
15956 return AMDGPU::VReg_256_Align2RegClassID;
15957 case AMDGPU::VReg_288RegClassID:
15958 return AMDGPU::VReg_288_Align2RegClassID;
15959 case AMDGPU::VReg_320RegClassID:
15960 return AMDGPU::VReg_320_Align2RegClassID;
15961 case AMDGPU::VReg_352RegClassID:
15962 return AMDGPU::VReg_352_Align2RegClassID;
15963 case AMDGPU::VReg_384RegClassID:
15964 return AMDGPU::VReg_384_Align2RegClassID;
15965 case AMDGPU::VReg_512RegClassID:
15966 return AMDGPU::VReg_512_Align2RegClassID;
15967 case AMDGPU::VReg_1024RegClassID:
15968 return AMDGPU::VReg_1024_Align2RegClassID;
15969 case AMDGPU::AReg_64RegClassID:
15970 return AMDGPU::AReg_64_Align2RegClassID;
15971 case AMDGPU::AReg_96RegClassID:
15972 return AMDGPU::AReg_96_Align2RegClassID;
15973 case AMDGPU::AReg_128RegClassID:
15974 return AMDGPU::AReg_128_Align2RegClassID;
15975 case AMDGPU::AReg_160RegClassID:
15976 return AMDGPU::AReg_160_Align2RegClassID;
15977 case AMDGPU::AReg_192RegClassID:
15978 return AMDGPU::AReg_192_Align2RegClassID;
15979 case AMDGPU::AReg_256RegClassID:
15980 return AMDGPU::AReg_256_Align2RegClassID;
15981 case AMDGPU::AReg_512RegClassID:
15982 return AMDGPU::AReg_512_Align2RegClassID;
15983 case AMDGPU::AReg_1024RegClassID:
15984 return AMDGPU::AReg_1024_Align2RegClassID;
16000 if (Info->isEntryFunction()) {
16007 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
16009 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
16010 : TRI->getAlignedHighSGPRForRC(MF, 2,
16011 &AMDGPU::SGPR_64RegClass);
16012 Info->setSGPRForEXECCopy(SReg);
16015 Info->getStackPtrOffsetReg()));
16016 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
16017 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
16021 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
16022 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
16024 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
16025 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
16027 Info->limitOccupancy(MF);
16029 if (ST.isWave32() && !MF.empty()) {
16030 for (auto &MBB : MF) {
16031 for (auto &MI : MBB) {
16032 TII->fixImplicitOperands(MI);
16042 if (ST.needsAlignedVGPRs()) {
16043 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
16049 if (NewClassID != -1)
16050 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
16059 const APInt &DemandedElts,
16061 unsigned Depth) const {
16063 unsigned Opc = Op.getOpcode();
16066 unsigned IID = Op.getConstantOperandVal(0);
16068 case Intrinsic::amdgcn_mbcnt_lo:
16069 case Intrinsic::amdgcn_mbcnt_hi: {
16075 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
16085 Op, Known, DemandedElts, DAG, Depth);
16100 unsigned MaxValue =
16109 switch (MI->getOpcode()) {
16110 case AMDGPU::G_INTRINSIC:
16111 case AMDGPU::G_INTRINSIC_CONVERGENT: {
16114 case Intrinsic::amdgcn_workitem_id_x:
16117 case Intrinsic::amdgcn_workitem_id_y:
16120 case Intrinsic::amdgcn_workitem_id_z:
16123 case Intrinsic::amdgcn_mbcnt_lo:
16124 case Intrinsic::amdgcn_mbcnt_hi: {
16136 case Intrinsic::amdgcn_groupstaticsize: {
16147 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
16150 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
16153 case AMDGPU::G_AMDGPU_SMED3:
16154 case AMDGPU::G_AMDGPU_UMED3: {
16155 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
16182 unsigned Depth) const {
16184 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
16190 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
16217 if (Header->getAlignment() != PrefAlign)
16218 return Header->getAlignment();
16220 unsigned LoopSize = 0;
16228 LoopSize += TII->getInstSizeInBytes(MI);
16229 if (LoopSize > 192)
16234 if (LoopSize <= 64)
16237 if (LoopSize <= 128)
16238 return CacheLineAlign;
16244 auto I = Exit->getFirstNonDebugInstr();
16245 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
16246 return CacheLineAlign;
16255 if (PreTerm == Pre->begin() ||
16256 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
16260 auto ExitHead = Exit->getFirstNonDebugInstr();
16261 if (ExitHead == Exit->end() ||
16262 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
16267 return CacheLineAlign;
16275 N = N->getOperand(0).getNode();
16285 switch (N->getOpcode()) {
16293 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
16294 return !TRI->isSGPRReg(MRI, Reg);
16296 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
16300 return !TRI->isSGPRReg(MRI, Reg);
16304 unsigned AS = L->getAddressSpace();
16335 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
16337 return A->readMem() && A->writeMem();
16372 unsigned Depth) const {
16377 if (Info->getMode().DX10Clamp)
16389 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
16409 << "Hardware instruction generated for atomic "
16411 << " operation at memory scope " << MemScope;
16415 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
16416 Type *EltTy = VT->getElementType();
16417 return VT->getNumElements() == 2 &&
16436 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
16437 unsigned BW = IT->getBitWidth();
16438 return BW == 32 || BW == 64;
16450 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
16452 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
16453 return BW == 32 || BW == 64;
16453 return BW == 32 || BW == 64;
16460 return VT->getNumElements() == 2 &&
16461 VT->getElementType()->getPrimitiveSizeInBits() == 16;
16471 bool HasSystemScope) {
16478 if (HasSystemScope) {
16485 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
16498 const MDNode *NoaliasAddrSpaceMD =
16499 I->getMetadata(LLVMContext::MD_noalias_addrspace);
16500 if (!NoaliasAddrSpaceMD)
16503 for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E;
16505 auto *Low = mdconst::extract<ConstantInt>(
16508 auto *High = mdconst::extract<ConstantInt>(
16530 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
16543 bool HasSystemScope =
16730 if (HasSystemScope)
16782 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16783 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
16784 : &AMDGPU::SReg_32RegClass;
16785 if (!TRI->isSGPRClass(RC) && !isDivergent)
16786 return TRI->getEquivalentSGPRClass(RC);
16787 if (TRI->isSGPRClass(RC) && isDivergent)
16788 return TRI->getEquivalentVGPRClass(RC);
16800 unsigned WaveSize) {
16805 if (!IT || IT->getBitWidth() != WaveSize)
16808 if (!isa<Instruction>(V))
16810 if (!Visited.insert(V).second)
16812 bool Result = false;
16813 for (const auto *U : V->users()) {
16814 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
16815 if (V == U->getOperand(1)) {
16816 switch (Intrinsic->getIntrinsicID()) {
16820 case Intrinsic::amdgcn_if_break:
16821 case Intrinsic::amdgcn_if:
16822 case Intrinsic::amdgcn_else:
16827 if (V == U->getOperand(0)) {
16828 switch (Intrinsic->getIntrinsicID()) {
16832 case Intrinsic::amdgcn_end_cf:
16833 case Intrinsic::amdgcn_loop:
16839 Result = hasCFUser(U, Visited, WaveSize);
16848 const Value *V) const {
16849 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
16850 if (CI->isInlineAsm()) {
16859 for (auto &TC : TargetConstraints) {
16901 return MRI.hasOneNonDBGUse(N0);
16908 if (I.getMetadata("amdgpu.noclobber"))
16910 if (I.getMetadata("amdgpu.last.use"))
16920 if (!Def->isMachineOpcode())
16930 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
16931 PhysReg = AMDGPU::SCC;
16933 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
16988 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
16999 Alignment = RMW->getAlign();
17014 RMW->getType()->isFloatTy();
17017 bool ReturnValueIsUsed = !AI->use_empty();
17026 if (FullFlatEmulation) {
17037 std::prev(BB->end())->eraseFromParent();
17040 Value *LoadedShared = nullptr;
17041 if (FullFlatEmulation) {
17043 Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
17044 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
17052 LoadedShared = Clone;
17059 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
17067 Value *LoadedPrivate;
17070 RMW->getType(), CastToPrivate, RMW->getAlign(),
"loaded.private");
17073 LoadedPrivate, RMW->getValOperand());
17077 auto [ResultLoad, Equal] =
17092 if (FullFlatEmulation) {
17102 if (!FullFlatEmulation) {
17107 MDNode *RangeNotPrivate =
17110 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
17118 if (ReturnValueIsUsed) {
17121 if (FullFlatEmulation)
17136 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
17137 ConstVal && ConstVal->isNullValue()) {
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
#define LLVM_ATTRIBUTE_UNUSED
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
Returns the sub type a function will return at a given Idx; should correspond to the result type of an ExtractValue instruction executed with just that one Idx.
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
iv Induction Variable Users
static const unsigned MaxDepth
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
unsigned const TargetRegisterInfo * TRI
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static constexpr Register SPReg
const SmallVectorImpl< MachineOperand > & Cond
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
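A minimal sketch of the masking this helper performs (compare the "Val = Val & maskTrailingOnes<uint64_t>(Size);" statement earlier in the listing); the standalone name below is assumed for illustration only:

#include <cstdint>

// Keep only the low Size bits of an inline-asm immediate; Size >= 64 leaves Val unchanged.
static uint64_t clearUnusedBitsSketch(uint64_t Val, unsigned Size) {
  if (Size >= 64)
    return Val;
  return Val & ((uint64_t(1) << Size) - 1); // same effect as maskTrailingOnes<uint64_t>(Size)
}
// Example: clearUnusedBitsSketch(0xFFFFFFFF12345678, 32) == 0x12345678.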
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static constexpr int Concat[]
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
bool hasCvtPkF16F32Inst() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool hasBF16ConversionInsts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
LLVM_READONLY int getExactLog2Abs() const
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
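A brief usage sketch (illustrative, not taken from the listing): getBitsSet sets the bits from loBit (inclusive) to hiBit (exclusive).

#include "llvm/ADT/APInt.h"

// Bits [8, 16) of a 32-bit value: 0x0000FF00.
llvm::APInt Mask = llvm::APInt::getBitsSet(/*numBits=*/32, /*loBit=*/8, /*hiBit=*/16);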
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
This class represents an incoming formal argument to a Function.
bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
const Function * getParent() const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
Value * getNewValOperand()
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getCompareOperand()
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Min
*p = old <signed v ? old : v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
bool isFPPredicate() const
bool isIntPredicate() const
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
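A short usage sketch of the two DataLayout queries above (illustrative; the concrete sizes in the comments assume a typical 64-bit data layout):

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"

// Query allocation size (including padding) and minimum ABI alignment for <2 x i16>.
void dataLayoutSketch(const llvm::DataLayout &DL, llvm::LLVMContext &Ctx) {
  llvm::Type *V2I16 = llvm::FixedVectorType::get(llvm::Type::getInt16Ty(Ctx), 2);
  llvm::TypeSize AllocBytes = DL.getTypeAllocSize(V2I16); // typically 4 bytes
  llvm::Align ABIAlign = DL.getABITypeAlign(V2I16);       // typically Align(4)
  (void)AllocBytes;
  (void)ABIAlign;
}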
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Argument * getArg(unsigned i) const
bool hasMemoryAtomicFaddF32DenormalSupport() const
bool hasD16Images() const
bool hasMinimum3Maximum3F32() const
bool useVGPRIndexMode() const
bool hasAtomicDsPkAdd16Insts() const
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
bool hasAtomicFMinFMaxF64FlatInsts() const
bool hasDot7Insts() const
bool hasApertureRegs() const
bool hasFlatInstOffsets() const
bool hasAtomicFMinFMaxF32FlatInsts() const
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasBCNT(unsigned Size) const
bool hasLDSLoadB96_B128() const
Returns true if the target supports global_load_lds_dwordx3/global_load_lds_dwordx4 or buffer_load_dw...
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
bool hasMultiDwordFlatScratchAddressing() const
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
bool hasPrivEnabledTrap2NopBug() const
bool hasUnalignedDSAccessEnabled() const
const SIInstrInfo * getInstrInfo() const override
bool hasDot1Insts() const
bool hasAtomicFaddRtnInsts() const
Align getStackAlignment() const
bool hasScalarSubwordLoads() const
bool enableFlatScratch() const
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
bool supportsGetDoorbellID() const
bool hasFlatAtomicFaddF32Inst() const
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasMinimum3Maximum3PKF16() const
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
const SIFrameLowering * getFrameLowering() const override
bool hasMinimum3Maximum3F16() const
bool hasAtomicFMinFMaxF32GlobalInsts() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
bool isTrapHandlerEnabled() const
bool hasLDSFPAtomicAddF64() const
bool hasFlatGlobalInsts() const
bool getScalarizeGlobalBehavior() const
bool hasScalarSMulU64() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
bool hasNSAEncoding() const
bool hasSMemRealTime() const
bool usePRTStrictNull() const
bool hasAtomicFMinFMaxF64GlobalInsts() const
bool hasUnalignedScratchAccessEnabled() const
bool hasAtomicFlatPkAdd16Insts() const
bool hasUnalignedBufferAccessEnabled() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasImageGather4D16Bug() const
bool hasDot10Insts() const
bool supportsMinMaxDenormModes() const
bool hasAtomicFaddInsts() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const
bool hasAtomicBufferPkAddBF16Inst() const
bool hasAtomicFaddNoRtnInsts() const
bool hasFlatBufferGlobalAtomicFaddF64Inst() const
bool hasScalarDwordx3Loads() const
bool hasLDSFPAtomicAddF32() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
bool hasDot8Insts() const
bool hasDS96AndDS128() const
bool useFlatForGlobal() const
Generation getGeneration() const
bool hasAtomicBufferGlobalPkAddF16Insts() const
bool hasScalarAddSub64() const
bool hasUnpackedD16VMem() const
bool hasAtomicGlobalPkAddBF16Inst() const
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
bool hasPackedTID() const
bool hasAddNoCarry() const
bool hasGWSAutoReplay() const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
int64_t getOffset() const
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
BasicBlock::iterator GetInsertPoint() const
BasicBlock * GetInsertBlock() const
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
LLVMContext & getContext() const
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
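A minimal sketch combining the IRBuilder helpers listed above (CreateAddrSpaceCast and CreateAlignedLoad) with PointerType::get from further down this list; the address-space number, 4-byte alignment, and names are illustrative assumptions, not taken from this file:

  // Requires "llvm/IR/IRBuilder.h".
  // Cast a flat pointer into a hypothetical address space 1 and load through it.
  llvm::Value *loadThroughCast(llvm::IRBuilder<> &B, llvm::Value *FlatPtr,
                               llvm::Type *EltTy) {
    llvm::PointerType *DstTy = llvm::PointerType::get(EltTy, /*AddressSpace=*/1);
    llvm::Value *Cast = B.CreateAddrSpaceCast(FlatPtr, DstTy, "as.cast");
    return B.CreateAlignedLoad(EltTy, Cast, llvm::MaybeAlign(4), "val");
  }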
Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
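A minimal sketch of the LLT constructors and queries above; the bit widths and the address space are arbitrary choices for illustration:

  // Requires "llvm/CodeGenTypes/LowLevelType.h" (path differs in older trees).
  llvm::LLT S32 = llvm::LLT::scalar(32);        // a 32-bit "bag of bits"
  llvm::LLT P1  = llvm::LLT::pointer(1, 64);    // a 64-bit pointer in address space 1
  llvm::LLT S64 = S32.changeElementSize(64);    // scalar widened to 64 bits
  bool IsScalar = S32.isScalar();               // true
  llvm::TypeSize Bits = P1.getSizeInBits();     // 64 bits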
This is an important class for using LLVM in a threaded context.
std::optional< StringRef > getSyncScopeName(SyncScope::ID Id) const
getSyncScopeName - Returns the name of a SyncScope::ID registered with LLVMContext,...
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
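A minimal sketch that pairs setAtomic with LLVMContext::getOrInsertSyncScopeID from this list; the existing load LI, the "agent" scope string, and the acquire ordering are assumptions chosen for illustration:

  // Turn an existing LoadInst *LI into an acquire atomic load in a named sync scope.
  llvm::LLVMContext &Ctx = LI->getContext();
  llvm::SyncScope::ID SSID = Ctx.getOrInsertSyncScopeID("agent");
  LI->setAtomic(llvm::AtomicOrdering::Acquire, SSID);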
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
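A minimal sketch of createRange together with Instruction::setMetadata from this list; the [0, 1024) bounds and the load LI are illustrative assumptions:

  // Requires "llvm/IR/MDBuilder.h".
  llvm::MDBuilder MDB(LI->getContext());
  llvm::MDNode *Range = MDB.createRange(llvm::APInt(32, 0), llvm::APInt(32, 1024));
  LI->setMetadata(llvm::LLVMContext::MD_range, Range);  // attaches !range [0, 1024)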
const MDOperand & getOperand(unsigned I) const
unsigned getNumOperands() const
Return number of MDNode operands.
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
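A minimal sketch of the MVT factory and query helpers above (the widths and element counts are arbitrary):

  llvm::MVT Elt = llvm::MVT::getIntegerVT(16);     // MVT::i16
  llvm::MVT Vec = llvm::MVT::getVectorVT(Elt, 4);  // MVT::v4i16
  llvm::MVT Scl = Vec.getScalarType();             // back to MVT::i16
  bool Pow2 = Vec.isPow2VectorType();              // true for 4 elements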
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Align getAlignment() const
Return alignment of the basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
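A minimal sketch of reserving a fixed stack object for an incoming stack argument and forming a frame index for it; MF, DAG, Offset, the 4-byte size, and the 32-bit frame-index type are assumptions for illustration:

  llvm::MachineFrameInfo &MFI = MF.getFrameInfo();
  int FI = MFI.CreateFixedObject(/*Size=*/4, /*SPOffset=*/Offset,
                                 /*IsImmutable=*/true);
  llvm::SDValue FIN = DAG.getFrameIndex(FI, llvm::MVT::i32);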
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
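A minimal sketch of chaining the MachineInstrBuilder helpers above inside a custom inserter; BB, MI, DL, TII, DstReg, SrcReg, and TargetMBB are assumed to exist, and the opcodes are chosen only for illustration:

  // Assuming 'using namespace llvm;' as elsewhere in this file.
  // DstReg = SrcReg + 16, then branch to TargetMBB when SCC is set.
  BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DstReg)
      .addReg(SrcReg)
      .addImm(16);
  BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
      .addMBB(TargetMBB);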
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by this operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
A Module instance is used to store all the information related to an LLVM module.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool hasWorkGroupIDZ() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the given node can be combined to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns whether it is reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
bool getAddrModeArguments(IntrinsicInst *, SmallVectorImpl< Value * > &, Type *&) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
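A minimal sketch using getSetCC together with getSelect (listed further below) to build a max-style pattern in a lowering hook; the i1 condition type and the helper name are illustrative assumptions:

  // Assuming 'using namespace llvm;'.
  SDValue lowerMaxSketch(SelectionDAG &DAG, const SDLoc &DL, SDValue X, SDValue Y) {
    EVT VT = X.getValueType();
    SDValue Cmp = DAG.getSetCC(DL, MVT::i1, X, Y, ISD::SETGT);
    return DAG.getSelect(DL, VT, Cmp, X, Y);
  }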
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getRegister(Register Reg, EVT VT)
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
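A minimal sketch pairing getStore with getLoad and getFrameIndex from this list to round-trip a value through a stack slot; the frame index FI, the 32-bit frame-index type, and the 4-byte alignment are assumptions:

  // Assuming 'using namespace llvm;'.
  SDValue spillAndReload(SelectionDAG &DAG, const SDLoc &DL, SDValue Val, int FI) {
    EVT VT = Val.getValueType();
    SDValue Slot = DAG.getFrameIndex(FI, MVT::i32);
    MachinePointerInfo PtrInfo =
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
    SDValue Store = DAG.getStore(DAG.getEntryNode(), DL, Val, Slot, PtrInfo, Align(4));
    return DAG.getLoad(VT, DL, Store, Slot, PtrInfo, Align(4));
  }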
SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
iterator insert(iterator I, T &&Elt)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
StringRef - Represent a constant reference to a string, i.e.
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
constexpr size_t size() const
size - Get the string size.
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
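A minimal sketch of the StringRef predicates and the StringSwitch pattern above; the intrinsic name and the constraint letters/values are purely illustrative:

  // Requires "llvm/ADT/StringSwitch.h"; assuming 'using namespace llvm;'.
  StringRef Name = "llvm.amdgcn.workitem.id.x";
  bool IsAMDGCN = Name.starts_with("llvm.amdgcn.");   // true
  bool IsX      = Name.ends_with(".x");               // true

  unsigned Kind = StringSwitch<unsigned>("v")
                      .Case("v", 0)   // e.g. a VGPR constraint
                      .Case("s", 1)   // e.g. an SGPR constraint
                      .Case("a", 2)   // e.g. an AGPR constraint
                      .Default(~0u);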
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to make them valid.
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
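A minimal sketch of the usual pattern combining setOperationAction, setTruncStoreAction, and AddPromotedToType from the entries above, as it would appear in a hypothetical target's TargetLowering constructor; the opcodes and types are chosen only for illustration:

  // Inside a TargetLowering-derived constructor; 'using namespace llvm;' assumed.
  setOperationAction(ISD::SELECT, MVT::i64, Custom);   // route through LowerOperation
  setOperationAction(ISD::FSIN,   MVT::f64, Expand);   // let the legalizer expand it
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);     // no native 64->16 truncating store
  setOperationAction(ISD::LOAD, MVT::v2i16, Promote);  // promote, then...
  AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);  // ...operate on i32 instead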
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op. At this point, we know that only the DemandedBits bits of the result of Op are ever used downstream.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
A Use represents the edge between a Value definition and its users.
User * getUser() const
Returns the User that contains this Use.
unsigned getOperandNo() const
Return the operand # of this use in its User.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< use_iterator > uses()
void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
constexpr bool isZero() const
const ParentTy * getParent() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ BUFFER_ATOMIC_COND_SUB_U32
@ TBUFFER_LOAD_FORMAT_D16
@ TBUFFER_STORE_FORMAT_D16
@ BUFFER_STORE_FORMAT_D16
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
@ TowardZeroF32_TowardNegativeF64
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SET_ROUNDING
Set rounding mode.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
@ SMULO
Same for multiplication.
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
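For illustration, assuming N is a SETCC node whose operands LHS and RHS have already been extracted and DAG is in scope, swapping the comparison might look like:
  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
  ISD::CondCode Swapped = ISD::getSetCCSwappedOperands(CC); // e.g. SETLT -> SETGT
  return DAG.getSetCC(SDLoc(N), N->getValueType(0), RHS, LHS, Swapped);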
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
Function * getDeclarationIfExists(Module *M, ID id, ArrayRef< Type * > Tys, FunctionType *FT=nullptr)
This version supports overloaded intrinsics.
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ System
Synchronized with respect to all concurrently executing threads.
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
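For example:
  getICmpCondCode(ICmpInst::ICMP_SLT); // == ISD::SETLT
  getICmpCondCode(ICmpInst::ICMP_UGE); // == ISD::SETUGE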
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for an N-bit signed integer.
int popcount(T Value) noexcept
Count the number of set bits in a value.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
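A small illustrative fold using it inside a DAG combine (N assumed to be the node being combined):
  // Fold (add x, 0) -> x.
  if (N->getOpcode() == ISD::ADD && isNullConstant(N->getOperand(1)))
    return N->getOperand(0);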
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
testing::Matcher< const detail::ErrorHolder & > Failed()
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and congruent to Skew modulo Align.
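Worked examples (values chosen for illustration):
  alignDown(41u, 8u);     // == 40, the largest multiple of 8 that is <= 41
  alignDown(41u, 8u, 3u); // == 35, the largest value <= 41 that is 3 (mod 8)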
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the smallest power of two that is greater than or equal to the given value.
int countr_zero(T Val)
Count the number of 0s from the least significant bit to the most significant bit, stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
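For example:
  isShiftedMask_64(0x00FF0000); // true:  one contiguous run of ones
  isShiftedMask_64(0x00FF00FF); // false: two separate runs
  isShiftedMask_64(0);          // false: the run must be non-empty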
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor of the base-2 logarithm of the specified value, or -1 if the value is zero.
int countl_zero(T Val)
Count the number of 0s from the most significant bit to the least significant bit, stopping at the first 1.
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
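Together with Hi_32 above, for example:
  Hi_32(0x0123456789ABCDEFULL); // == 0x01234567
  Lo_32(0x0123456789ABCDEFULL); // == 0x89ABCDEF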
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
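A hedged sketch of how such an assignment function is consumed when analyzing formal arguments (CallConv, IsVarArg, MF, Ins and DAG assumed in scope; CC_AMDGPU stands in for whichever generated assign function the target selects):
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
  CCInfo.AnalyzeFormalArguments(Ins, CC_AMDGPU);
  // ArgLocs now holds a register or stack location for every incoming argument.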
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns the smallest multiple of A that can hold Size bytes.
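A few of the rounding helpers listed here behave as follows:
  alignTo(10, Align(8)); // == 16
  divideCeil(10, 8);     // == 2
  PowerOf2Ceil(10);      // == 16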
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
int64_t minIntN(int64_t N)
Gets the minimum value for an N-bit signed integer.
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
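For example:
  commonAlignment(Align(16), 8);  // == Align(8): an offset of 8 halves the guarantee
  commonAlignment(Align(16), 32); // == Align(16): 32 is already 16-byte aligned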
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
static const fltSemantics & IEEEhalf() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector of NumElements elements, each of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has fewer bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the vector's number of elements is a power of 2.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
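A short sketch of building ad-hoc EVTs (Ctx is assumed to be an LLVMContext in scope):
  EVT I48   = EVT::getIntegerVT(Ctx, 48);          // extended type, !isSimple()
  EVT V3F32 = EVT::getVectorVT(Ctx, MVT::f32, 3);  // maps to the simple MVT::v3f32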
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
bool isUnknown() const
Returns true if we don't know any bits.
void resetAll()
Resets the known state of all bits.
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute the known bits resulting from the addition of LHS and RHS.
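A hedged sketch of propagating known bits through an addition:
  KnownBits LHS(32), RHS(32);
  LHS.Zero.setHighBits(16); // LHS known to fit in the low 16 bits
  RHS.Zero.setHighBits(16); // RHS likewise
  KnownBits Sum = KnownBits::add(LHS, RHS);
  // Sum.countMinLeadingZeros() should report at least 15 (a carry may reach bit 16).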
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const