#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "si-lower"

    cl::desc("Do not align and prefetch loops"),

    "amdgpu-use-divergent-register-indexing", cl::Hidden,
    cl::desc("Use indirect register addressing for divergent indexes"),

  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
      return AMDGPU::SGPR0 + Reg;
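// The loop above appears to scan the 32-bit SGPR class for the first register
// that is still free and return it as an absolute SGPR; the allocation check
// guarding the early return sits on an elided line, so treat this as the
// skeleton of the helper only.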
{MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
 MVT::i1, MVT::v32i32},
{MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
 MVT::i1, MVT::v32i32},
{MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
{MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
{MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
{MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
 MVT::v3i16, MVT::v4i16, MVT::Other},
{MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
{MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
{MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
{MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
{MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
{MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
{MVT::f32, MVT::f64}, Legal);
{MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
{MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
{MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
{MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
{MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
{MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
{MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
{MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
 MVT::v32f16, MVT::v32bf16},
{MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
{MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
{MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
{MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
{MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
                                   EVT DestVT, EVT SrcVT) const {
                                   LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
  return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
  return VT.isInteger() ? MVT::i32 : MVT::f32;
  return (NumElts + 1) / 2;
  return NumElts * ((Size + 31) / 32);
    unsigned &NumIntermediates, MVT &RegisterVT) const {
  if (ScalarVT == MVT::bf16) {
    RegisterVT = MVT::i32;
    IntermediateVT = MVT::v2bf16;
    RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
    IntermediateVT = RegisterVT;
    NumIntermediates = (NumElts + 1) / 2;
    return NumIntermediates;
    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts;
    return NumIntermediates;
  if (Size < 16 && Subtarget->has16BitInsts()) {
    RegisterVT = MVT::i16;
    IntermediateVT = ScalarVT;
    NumIntermediates = NumElts;
    return NumIntermediates;
    RegisterVT = MVT::i32;
    IntermediateVT = ScalarVT;
    NumIntermediates = NumElts;
    return NumIntermediates;
    RegisterVT = MVT::i32;
    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts * ((Size + 31) / 32);
    return NumIntermediates;
      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
unsigned MaxNumLanes) {
  assert(MaxNumLanes != 0);
  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
unsigned MaxNumLanes) {
  auto *ST = dyn_cast<StructType>(Ty);
  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));
    DL.getPointerSizeInBits(AS) == 192)
    DL.getPointerSizeInBits(AS) == 160) ||
    DL.getPointerSizeInBits(AS) == 192))
unsigned IntrID) const {
  if (CI.hasMetadata(LLVMContext::MD_invariant_load))
  if (RsrcIntr->IsImage) {
    if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
      Info.ptrVal = RsrcArg;
  bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
    if (RsrcIntr->IsImage) {
      unsigned MaxNumLanes = 4;
          std::numeric_limits<unsigned>::max());
    if (RsrcIntr->IsImage) {
      unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
    if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
      Info.memVT = MVT::i32;
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
        std::numeric_limits<unsigned>::max());
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.ptrVal = nullptr;
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
  case Intrinsic::amdgcn_global_atomic_csub: {
  case Intrinsic::amdgcn_image_bvh_intersect_ray: {
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_atomic_cond_sub_u32: {
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64: {
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
    Info.memVT = MVT::i32;
    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
  case Intrinsic::amdgcn_global_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
  case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
    Info.memVT = MVT::i32;
  case Intrinsic::amdgcn_s_prefetch_data: {
  case Intrinsic::amdgcn_addrspacecast_nonnull: {
    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();
Type *&AccessTy) const {
  switch (II->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_csub:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
    Ptr = II->getArgOperand(0);
  case Intrinsic::amdgcn_global_load_lds:
    Ptr = II->getArgOperand(1);
  AccessTy = II->getType();
unsigned AddrSpace) const {
  return AM.Scale == 0 &&
             AM.BaseOffs, AddrSpace, FlatVariant));
  return isLegalMUBUFAddressingMode(AM);
bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
  if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
  if (AM.HasBaseReg) {
  return isLegalMUBUFAddressingMode(AM);
  if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
             : isLegalMUBUFAddressingMode(AM);
    unsigned Size, unsigned AddrSpace, Align Alignment,
    Align RequiredAlignment(
        Alignment < RequiredAlignment)
      RequiredAlignment = Align(4);
        *IsFast = (Alignment >= RequiredAlignment) ? 64
                  : (Alignment < Align(4))         ? 32
        *IsFast = (Alignment >= RequiredAlignment) ? 96
                  : (Alignment < Align(4))         ? 32
      RequiredAlignment = Align(8);
        *IsFast = (Alignment >= RequiredAlignment) ? 128
                  : (Alignment < Align(4))         ? 32
      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
    return Alignment >= RequiredAlignment ||
    bool AlignedBy4 = Alignment >= Align(4);
      *IsFast = AlignedBy4;
    return Alignment >= Align(4) ||
  return Size >= 32 && Alignment >= Align(4);
unsigned *IsFast) const {
                                  Alignment, Flags, IsFast);
  if (Op.size() >= 16 &&
  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
  const MemSDNode *MemNode = cast<MemSDNode>(N);
unsigned DestAS) const {
  return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
unsigned Index) const {
  auto [InputPtrReg, RC, ArgTy] =
      Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
const SDLoc &SL) const {
const SDLoc &SL) const {
  std::optional<uint32_t> KnownSize =
  if (KnownSize.has_value())
    Val = getFPExtOrFPRound(DAG, Val, SL, VT);
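// lowerKernargMemParameter (below) reads a kernel argument out of the kernarg
// segment: for sub-dword arguments the byte offset is aligned down to a dword
// boundary, the dword is loaded through lowerKernArgParameterPtr, and
// convertArgType plus the recorded OffsetDiff recover the requested MemVT.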
SDValue SITargetLowering::lowerKernargMemParameter(
  int64_t OffsetDiff = Offset - AlignDownOffset;
  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
  ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
      ExtType, SL, VA.getLocVT(), Chain, FIN,
SDValue SITargetLowering::getPreloadedValue(
    Reg = &WorkGroupIDX;
    RC = &AMDGPU::SReg_32RegClass;
    Reg = &WorkGroupIDY;
    RC = &AMDGPU::SReg_32RegClass;
    Reg = &WorkGroupIDZ;
    RC = &AMDGPU::SReg_32RegClass;
  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
           "vector type argument should have been split");
      bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
             "unexpected vector split in ps argument type");
      Info->markPSInputAllocated(PSInputNum);
        Info->markPSInputEnabled(PSInputNum);
  if (Info.hasWorkItemIDX()) {
  if (Info.hasWorkItemIDY()) {
      Info.setWorkItemIDY(
      unsigned Reg = AMDGPU::VGPR1;
  if (Info.hasWorkItemIDZ()) {
      Info.setWorkItemIDZ(
      unsigned Reg = AMDGPU::VGPR2;
  if (RegIdx == ArgVGPRs.size()) {
  unsigned Reg = ArgVGPRs[RegIdx];
  assert(Reg != AMDGPU::NoRegister);
unsigned NumArgRegs) {
  if (RegIdx == ArgSGPRs.size())
  unsigned Reg = ArgSGPRs[RegIdx];
  assert(Reg != AMDGPU::NoRegister);
  assert(Reg != AMDGPU::NoRegister);
  const unsigned Mask = 0x3ff;
  if (Info.hasWorkItemIDX()) {
    Info.setWorkItemIDX(Arg);
  if (Info.hasWorkItemIDY()) {
    Info.setWorkItemIDY(Arg);
  if (Info.hasWorkItemIDZ())
  const unsigned Mask = 0x3ff;
  if (Info.hasImplicitArgPtr())
  if (Info.hasWorkGroupIDX())
  if (Info.hasWorkGroupIDY())
  if (Info.hasWorkGroupIDZ())
  if (Info.hasLDSKernelId())
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
  bool InPreloadSequence = true;
  bool AlignedForImplictArgs = false;
  unsigned ImplicitArgOffset = 0;
  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())
    unsigned ArgIdx = Arg.getArgNo();
    if (InIdx < Ins.size() &&
        (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           Ins[InIdx].getOrigArgIndex() == ArgIdx;
      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];
      unsigned ArgOffset = ArgLoc.getLocMemOffset();
      unsigned NumAllocSGPRs =
          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
      if (Arg.hasAttribute("amdgpu-hidden-argument")) {
        if (!AlignedForImplictArgs) {
              alignTo(LastExplicitArgOffset,
              LastExplicitArgOffset;
          AlignedForImplictArgs = true;
        ArgOffset += ImplicitArgOffset;
      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        assert(InIdx >= 1 && "No previous SGPR");
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
        InPreloadSequence = false;
          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {
      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
  if (Info.hasLDSKernelId()) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
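// System-SGPR setup follows: workgroup ID X/Y/Z, workgroup info, and the
// private-segment wave byte offset are added as machine-function live-ins in
// SGPR_32 and reserved in CCInfo whenever SIMachineFunctionInfo reports that
// the corresponding input is required.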
bool IsShader) const {
    assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
  unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
  unsigned NumRequiredSystemSGPRs =
      Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
      Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
  for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
    if (Info.hasWorkGroupIDY()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
    if (Info.hasWorkGroupIDZ()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
  if (Info.hasWorkGroupInfo()) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
  if (Info.hasPrivateSegmentWaveByteOffset()) {
    unsigned PrivateSegmentWaveByteOffsetReg;
      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
         Info.getNumPreloadedSGPRs() >= 16);
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);
    HasStackObjects = true;
  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
      Info.setScratchRSrcReg(ReservedBufferReg);
    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
        if (!MRI.isLiveIn(Reg)) {
          Info.setStackPtrOffsetReg(Reg);
  if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);
  return !Info->isEntryFunction();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;
    Entry->addLiveIn(*I);
    for (auto *Exit : Exits)
              TII->get(TargetOpcode::COPY), *I)
        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
            !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
            !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
           !Info->hasWorkGroupIDZ());
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);
    unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
    if ((PsInputBits & 0x7F) == 0 ||
        ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
  } else if (IsKernel) {
    Splits.append(Ins.begin(), Ins.end());
  } else if (!IsGraphics) {
  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
    if (IsEntryFunc && VA.isMemLoc()) {
      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
          int64_t OffsetDiff = Offset - AlignDownOffset;
              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
          NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
                                  Ins[i].Flags.isSExt(), &Ins[i]);
              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
          if (PreloadRegs.size() == 1) {
            Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
                             TRI->getRegSizeInBits(*RC)));
            for (auto Reg : PreloadRegs) {
                             PreloadRegs.size()),
          NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
                                  Ins[i].Flags.isSExt(), &Ins[i]);
              "hidden argument in kernel signature was not preloaded",
          lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
                                   Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
          dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
    if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
      if (AMDGPU::VGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::VGPR_32RegClass;
      else if (AMDGPU::SGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::SGPR_32RegClass;
  Info->setBytesInStackArgArea(StackArgSize);
  return Chains.empty() ? Chain
  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
    unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
    for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
      if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
  bool IsWaveEnd = Info->returnsVoid() && IsShader;
  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {
    SDValue Arg = OutVals[RealRVLocIdx];
  if (!Info->isEntryFunction()) {
      if (AMDGPU::SReg_64RegClass.contains(*I))
      else if (AMDGPU::SReg_32RegClass.contains(*I))
  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
    auto &ArgUsageInfo =
    CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
    const auto [OutgoingArg, ArgRC, ArgTy] =
    const auto [IncomingArg, IncomingArgRC, Ty] =
    assert(IncomingArgRC == ArgRC);
    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
      InputReg = getImplicitArgPtr(DAG, DL);
      std::optional<uint32_t> Id =
      if (Id.has_value()) {
    if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
      unsigned SpecialArgOffset =
  auto [OutgoingArg, ArgRC, Ty] =
    std::tie(OutgoingArg, ArgRC, Ty) =
    std::tie(OutgoingArg, ArgRC, Ty) =
  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
                     : IncomingArgY ? *IncomingArgY
  if (OutgoingArg->isRegister()) {
    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
  if (Callee->isDivergent())
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (!CallerPreserved)
  bool CCMatch = CallerCC == CalleeCC;
    if (Arg.hasByValAttr())
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
  for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
    if (!CCVA.isRegLoc())
    if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
          dbgs() << "Cannot tail call due to divergent outgoing argument in "
  if (IsChainCallConv) {
    RequestedExec = CLI.Args.back();
    assert(RequestedExec.Node && "No node for EXEC");
    assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
    CLI.Outs.pop_back();
      assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
      CLI.Outs.pop_back();
           "Haven't popped all the pieces of the EXEC mask");
  bool IsSibCall = false;
        "unsupported call to variadic function ");
          "unsupported required tail call to function ");
        Outs, OutVals, Ins, DAG);
           "site marked musttail or on llvm.amdgcn.cs.chain");
  if (!TailCallOpt && IsTailCall)
  if (!IsSibCall || IsChainCallConv) {
    RegsToPass.emplace_back(IsChainCallConv
                                ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                                : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
  const unsigned NumSpecialInputs = RegsToPass.size();
  MVT PtrVT = MVT::i32;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
      RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
      int32_t Offset = LocMemOffset;
      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
                            ? Flags.getNonZeroByValAlign()
      if (Outs[i].Flags.isByVal()) {
            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
            Outs[i].Flags.getNonZeroByValAlign(),
            nullptr, std::nullopt, DstInfo,
            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
  if (!MemOpChains.empty())
  unsigned ArgIdx = 0;
  for (auto [Reg, Val] : RegsToPass) {
    if (ArgIdx++ >= NumSpecialInputs &&
        (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
  if (IsTailCall && !IsSibCall) {
  std::vector<SDValue> Ops({Chain});
    Ops.push_back(Callee);
    Ops.push_back(Callee);
  if (IsChainCallConv)
    Ops.push_back(RequestedExec.Node);
  for (auto &[Reg, Val] : RegsToPass)
  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");
                            MVT::Glue, GlueOps),
    Ops.push_back(InGlue);
    return DAG.getNode(OPC, DL, MVT::Other, Ops);
  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);
  uint64_t CalleePopBytes = NumBytes;
  EVT VT = Op.getValueType();
  Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
         "Stack grows upwards for AMDGPU");
  Chain = BaseAddr.getValue(1);
  if (Alignment > StackAlign) {
    uint64_t StackAlignMask = ScaledAlignment - 1;
  assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
  if (isa<ConstantSDNode>(Size)) {
  if (Op.getValueType() != MVT::i32)
  assert(Op.getValueType() == MVT::i32);
                  Op.getOperand(0), IntrinID, GetRoundBothImm);
  SDValue RoundModeTimesNumBits =
                                   TableEntry, EnumOffset);
  if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
        static_cast<uint32_t>(ConstMode->getZExtValue()),
    if (UseReducedTable) {
      SDValue RoundModeTimesNumBits =
      SDValue RoundModeTimesNumBits =
      NewMode = TruncTable;
                          ReadFirstLaneID, NewMode);
                               IntrinID, RoundBothImm, NewMode);
  if (Op->isDivergent())
  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = Src.getValueType();
  EVT DstVT = Op.getValueType();
  if (Op.getValueType() != MVT::i64)
                  Op.getOperand(0), IntrinID, ModeHwRegImm);
                  Op.getOperand(0), IntrinID, TrapHwRegImm);
  if (Op.getOperand(1).getValueType() != MVT::i64)
                            ReadFirstLaneID, NewModeReg);
                            ReadFirstLaneID, NewTrapReg);
  unsigned ModeHwReg =
  unsigned TrapHwReg =
                     IntrinID, ModeHwRegImm, NewModeReg);
                     IntrinID, TrapHwRegImm, NewTrapReg);
          .Case("m0", AMDGPU::M0)
          .Case("exec", AMDGPU::EXEC)
          .Case("exec_lo", AMDGPU::EXEC_LO)
          .Case("exec_hi", AMDGPU::EXEC_HI)
          .Case("flat_scratch", AMDGPU::FLAT_SCR)
          .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
          .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
  if (Reg == AMDGPU::NoRegister) {
        "\" for subtarget."));
  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:
  case AMDGPU::FLAT_SCR:
  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
static std::pair<MachineBasicBlock *, MachineBasicBlock *>
  auto Next = std::next(I);
  return std::pair(LoopBB, RemainderBB);
  auto I = MI.getIterator();
  auto E = std::next(I);
    Src->setIsKill(false);
  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
                     unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
                     unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
  Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);
  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
          TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
                                 : AMDGPU::S_AND_SAVEEXEC_B64),
  MRI.setSimpleHint(NewExec, CondReg);
  if (UseGPRIdxMode) {
      SGPRIdxReg = CurrentIdxReg;
      SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
          TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
                                 : AMDGPU::S_XOR_B64_term),
                  unsigned InitResultReg, unsigned PhiReg, int Offset,
                  bool UseGPRIdxMode, Register &SGPRIdxReg) {
  const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode, SGPRIdxReg);
  LoopBB->removeSuccessor(RemainderBB);
  LoopBB->addSuccessor(LandingPad);
static std::pair<unsigned, int>
  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
    return std::pair(AMDGPU::sub0, Offset);
  assert(Idx->getReg() != AMDGPU::NoRegister);
    return Idx->getReg();
  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  const bool UseGPRIdxMode = ST.useVGPRIndexMode();
  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
    MI.eraseFromParent();
  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
                                      UseGPRIdxMode, SGPRIdxReg);
  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
  MI.eraseFromParent();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  const bool UseGPRIdxMode = ST.useVGPRIndexMode();
  if (Idx->getReg() == AMDGPU::NoRegister) {
    MI.eraseFromParent();
  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
      const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
          TRI.getRegSizeInBits(*VecRC), 32, false);
    MI.eraseFromParent();
  Register PhiReg = MRI.createVirtualRegister(VecRC);
                                      UseGPRIdxMode, SGPRIdxReg);
  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
        TRI.getRegSizeInBits(*VecRC), 32, false);
    BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
  MI.eraseFromParent();
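// Next fragment: the scalar wave-reduction loop. S_FF1 finds the lowest set
// bit of the active-lane mask, V_READLANE_B32 pulls that lane's value, the
// reduction opcode folds it into the accumulator, and S_BITSET0 clears the
// lane; S_CMP_LG / S_CBRANCH_SCC1 repeat the loop while active lanes remain.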
  bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
    Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
    Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
    Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
    Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
    Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
    Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
    Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
    bool IsWave32 = ST.isWave32();
    unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
        (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
    I = ComputeLoop->end();
        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
            .addReg(TmpSReg->getOperand(0).getReg())
    unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
    auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
                   .addReg(ActiveBits->getOperand(0).getReg());
    auto LaneValue = BuildMI(*ComputeLoop, I, DL,
                             TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
                         .addReg(FF1->getOperand(0).getReg());
    auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
                              .addReg(LaneValue->getOperand(0).getReg());
    unsigned BITSETOpc =
        IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
    auto NewActiveBits =
        BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
            .addReg(FF1->getOperand(0).getReg())
            .addReg(ActiveBits->getOperand(0).getReg());
    Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
        .addMBB(ComputeLoop);
    ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
        .addMBB(ComputeLoop);
    unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
        .addReg(NewActiveBits->getOperand(0).getReg())
    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
  MI.eraseFromParent();
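// The switch below is the custom-inserter style expansion of SI pseudos:
// carry-propagating 64-bit add/sub, V_CNDMASK_B64, S_SETREG mode handling,
// indirect source/dest accesses, and call/trap pseudos are rewritten into
// real MachineInstrs and the originating pseudo is erased.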
  switch (MI.getOpcode()) {
  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
  case AMDGPU::S_UADDO_PSEUDO:
  case AMDGPU::S_USUBO_PSEUDO: {
    unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
                       : AMDGPU::S_SUB_I32;
    MI.eraseFromParent();
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO: {
    bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
      unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
    MI.eraseFromParent();
  case AMDGPU::V_ADD_U64_PSEUDO:
  case AMDGPU::V_SUB_U64_PSEUDO: {
    bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
    if (IsAdd && ST.hasLshlAddB64()) {
      TII->legalizeOperands(*Add);
      MI.eraseFromParent();
    const auto *CarryRC = TRI->getWaveMaskRegClass();
    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register CarryReg = MRI.createVirtualRegister(CarryRC);
    Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
                             : &AMDGPU::VReg_64RegClass;
                             : &AMDGPU::VReg_64RegClass;
        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    TII->legalizeOperands(*LoHalf);
    TII->legalizeOperands(*HiHalf);
    MI.eraseFromParent();
  case AMDGPU::S_ADD_CO_PSEUDO:
  case AMDGPU::S_SUB_CO_PSEUDO: {
    unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
                       ? AMDGPU::S_ADDC_U32
                       : AMDGPU::S_SUBB_U32;
      Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
      Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
    Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
    unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
    assert(WaveSize == 64 || WaveSize == 32);
    if (WaveSize == 64) {
      if (ST.hasScalarCompareEq64()) {
            TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
            MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
            MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
        Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
        (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
    MI.eraseFromParent();
  case AMDGPU::SI_INIT_M0: {
            TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .add(MI.getOperand(0));
    MI.eraseFromParent();
  case AMDGPU::GET_GROUPSTATICSIZE: {
        .add(MI.getOperand(0))
    MI.eraseFromParent();
  case AMDGPU::GET_SHADERCYCLESHILO: {
    using namespace AMDGPU::Hwreg;
    Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
    Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .add(MI.getOperand(0))
    MI.eraseFromParent();
  case AMDGPU::SI_INDIRECT_SRC_V1:
  case AMDGPU::SI_INDIRECT_SRC_V2:
  case AMDGPU::SI_INDIRECT_SRC_V4:
  case AMDGPU::SI_INDIRECT_SRC_V8:
  case AMDGPU::SI_INDIRECT_SRC_V9:
  case AMDGPU::SI_INDIRECT_SRC_V10:
  case AMDGPU::SI_INDIRECT_SRC_V11:
  case AMDGPU::SI_INDIRECT_SRC_V12:
  case AMDGPU::SI_INDIRECT_SRC_V16:
  case AMDGPU::SI_INDIRECT_SRC_V32:
  case AMDGPU::SI_INDIRECT_DST_V1:
  case AMDGPU::SI_INDIRECT_DST_V2:
  case AMDGPU::SI_INDIRECT_DST_V4:
  case AMDGPU::SI_INDIRECT_DST_V8:
  case AMDGPU::SI_INDIRECT_DST_V9:
  case AMDGPU::SI_INDIRECT_DST_V10:
  case AMDGPU::SI_INDIRECT_DST_V11:
  case AMDGPU::SI_INDIRECT_DST_V12:
  case AMDGPU::SI_INDIRECT_DST_V16:
  case AMDGPU::SI_INDIRECT_DST_V32:
  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
  case AMDGPU::SI_KILL_I1_PSEUDO:
  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    Register SrcCond = MI.getOperand(3).getReg();
    Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    const auto *CondRC = TRI->getWaveMaskRegClass();
    Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
                             : &AMDGPU::VReg_64RegClass;
                             : &AMDGPU::VReg_64RegClass;
        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
    MI.eraseFromParent();
  case AMDGPU::SI_BR_UNDEF: {
        .add(MI.getOperand(0));
    MI.eraseFromParent();
  case AMDGPU::ADJCALLSTACKUP:
  case AMDGPU::ADJCALLSTACKDOWN: {
  case AMDGPU::SI_CALL_ISEL: {
    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
    MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
    MI.eraseFromParent();
  case AMDGPU::V_ADD_CO_U32_e32:
  case AMDGPU::V_SUB_CO_U32_e32:
  case AMDGPU::V_SUBREV_CO_U32_e32: {
    unsigned Opc = MI.getOpcode();
    bool NeedClampOperand = false;
    if (TII->pseudoToMCOpcode(Opc) == -1) {
      NeedClampOperand = true;
    if (TII->isVOP3(*I)) {
    I.add(MI.getOperand(1)).add(MI.getOperand(2));
    if (NeedClampOperand)
    TII->legalizeOperands(*I);
    MI.eraseFromParent();
  case AMDGPU::V_ADDC_U32_e32:
  case AMDGPU::V_SUBB_U32_e32:
  case AMDGPU::V_SUBBREV_U32_e32:
    TII->legalizeOperands(MI);
  case AMDGPU::DS_GWS_INIT:
  case AMDGPU::DS_GWS_SEMA_BR:
  case AMDGPU::DS_GWS_BARRIER:
    TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
  case AMDGPU::DS_GWS_SEMA_V:
  case AMDGPU::DS_GWS_SEMA_P:
  case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
  case AMDGPU::S_SETREG_B32: {
    const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
    const unsigned SetMask = WidthMask << Offset;
    unsigned SetDenormOp = 0;
    unsigned SetRoundOp = 0;
      SetRoundOp = AMDGPU::S_ROUND_MODE;
      SetDenormOp = AMDGPU::S_DENORM_MODE;
      SetRoundOp = AMDGPU::S_ROUND_MODE;
      SetDenormOp = AMDGPU::S_DENORM_MODE;
    if (SetRoundOp || SetDenormOp) {
      if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
        unsigned ImmVal = Def->getOperand(1).getImm();
          MI.eraseFromParent();
    MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
  case AMDGPU::S_INVERSE_BALLOT_U32:
  case AMDGPU::S_INVERSE_BALLOT_U64:
    MI.setDesc(TII->get(AMDGPU::COPY));
  case AMDGPU::ENDPGM_TRAP: {
      MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
    MI.eraseFromParent();
  case AMDGPU::SIMULATED_TRAP: {
    TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
    MI.eraseFromParent();
  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
  EVT VT = N->getValueType(0);
  if (VT == MVT::f16) {
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
         VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
         VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
      DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
      DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
         VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
         VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
         VT == MVT::v32bf16);
          : std::pair(Op0, Op0);
      DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
      DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
  switch (Op.getOpcode()) {
    return LowerBRCOND(Op, DAG);
    return LowerRETURNADDR(Op, DAG);
    assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    EVT VT = Op.getValueType();
      return lowerFSQRTF32(Op, DAG);
      return lowerFSQRTF64(Op, DAG);
    return LowerTrig(Op, DAG);
    return LowerSELECT(Op, DAG);
    return LowerFDIV(Op, DAG);
    return LowerFFREXP(Op, DAG);
    return LowerATOMIC_CMP_SWAP(Op, DAG);
    return LowerSTORE(Op, DAG);
    return LowerGlobalAddress(MFI, Op, DAG);
    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
    return LowerINTRINSIC_W_CHAIN(Op, DAG);
    return LowerINTRINSIC_VOID(Op, DAG);
    return lowerADDRSPACECAST(Op, DAG);
    return lowerINSERT_SUBVECTOR(Op, DAG);
    return lowerINSERT_VECTOR_ELT(Op, DAG);
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
    return lowerVECTOR_SHUFFLE(Op, DAG);
    return lowerSCALAR_TO_VECTOR(Op, DAG);
    return lowerBUILD_VECTOR(Op, DAG);
    return lowerFP_ROUND(Op, DAG);
    return lowerTRAP(Op, DAG);
    return lowerDEBUGTRAP(Op, DAG);
    return lowerFMINNUM_FMAXNUM(Op, DAG);
    return lowerFLDEXP(Op, DAG);
    return lowerMUL(Op, DAG);
    return lowerXMULO(Op, DAG);
    return lowerXMUL_LOHI(Op, DAG);
  EVT FittingLoadVT = LoadVT;
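// adjustLoadValueType (below) rewrites an intrinsic load so its result uses an
// equivalently sized legal type: the original memory VT and memory operand are
// preserved, the node is rebuilt via getMemIntrinsicNode with the cast VT, and
// sub-dword buffer loads are routed through handleByteShortBufferLoads.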
SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
                                              bool IsIntrinsic) const {
  EVT LoadVT = M->getValueType(0);
  EVT EquivLoadVT = LoadVT;
                  M->getMemoryVT(), M->getMemOperand());
  EVT LoadVT = M->getValueType(0);
  assert(M->getNumValues() == 2 || M->getNumValues() == 3);
  bool IsTFE = M->getNumValues() == 3;
    return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
  return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
                             M->getMemOperand(), DAG);
  SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
                                        M->getMemOperand(), DAG);
  EVT VT = N->getValueType(0);
  unsigned CondCode = N->getConstantOperandVal(3);
  EVT CmpVT = LHS.getValueType();
  if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
    unsigned PromoteOp =
  EVT VT = N->getValueType(0);
  unsigned CondCode = N->getConstantOperandVal(3);
  if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
  EVT VT = N->getValueType(0);
                     Src.getOperand(1), Src.getOperand(2));
    Exec = AMDGPU::EXEC_LO;
    Exec = AMDGPU::EXEC;
  EVT VT = N->getValueType(0);
  unsigned IID = N->getConstantOperandVal(0);
  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;
  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;
  unsigned SplitSize = 32;
  if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
      ST->hasDPALU_DPP() &&
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16:
    case Intrinsic::amdgcn_update_dpp:
    case Intrinsic::amdgcn_writelane:
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_set_inactive:
    case Intrinsic::amdgcn_set_inactive_chain_arg:
    case Intrinsic::amdgcn_mov_dpp8:
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_permlane64:
  if (SDNode *GL = N->getGluedNode()) {
    GL = GL->getOperand(0).getNode();
  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IID == Intrinsic::amdgcn_mov_dpp8 ||
      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    Src1 = N->getOperand(2);
    if (IID == Intrinsic::amdgcn_writelane ||
        IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
      Src2 = N->getOperand(3);
  if (ValSize == SplitSize) {
    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    if (IID == Intrinsic::amdgcn_writelane) {
    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
    return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
  if (ValSize % SplitSize != 0)
    EVT VT = N->getValueType(0);
    unsigned NumOperands = N->getNumOperands();
    SDNode *GL = N->getGluedNode();
    for (unsigned i = 0; i != NE; ++i) {
      for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
        SDValue Operand = N->getOperand(j);
  if (SplitSize == 32) {
    return unrollLaneOp(LaneOp.getNode());
  unsigned SubVecNumElt =
  SDValue Src0SubVec, Src1SubVec, Src2SubVec;
  for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
    if (IID == Intrinsic::amdgcn_writelane)
        IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
            ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
            : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
    EltIdx += SubVecNumElt;
  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
  if (IID == Intrinsic::amdgcn_writelane)
  SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
  switch (N->getOpcode()) {
    unsigned IID = N->getConstantOperandVal(0);
    case Intrinsic::amdgcn_make_buffer_rsrc:
      Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
    case Intrinsic::amdgcn_cvt_pkrtz: {
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16: {
      if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
      else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
      else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
      EVT VT = N->getValueType(0);
    case Intrinsic::amdgcn_s_buffer_load: {
      EVT VT = Op.getValueType();
      assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
      if (!Offset->isDivergent()) {
        LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
      for (unsigned I = 0; I < Res.getNumOperands(); I++) {
        Results.push_back(Res.getOperand(I));
      Results.push_back(Res.getValue(1));
    EVT VT = N->getValueType(0);
    EVT SelectVT = NewVT;
    if (NewVT.bitsLT(MVT::i32)) {
      SelectVT = MVT::i32;
    if (NewVT != SelectVT)
    if (N->getValueType(0) != MVT::v2f16)
    if (N->getValueType(0) != MVT::v2f16)
    if (N->getValueType(0) != MVT::f16)
    if (U.get() != Value)
    if (U.getUser()->getOpcode() == Opcode)
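// isCFIntrinsic (below) recognizes the structured control-flow intrinsics
// (amdgcn_if / else / loop / end_cf); the BRCOND lowering that follows uses
// the mapped opcode to rebuild the branch around the intrinsic's outputs.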
unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
    switch (Intr->getConstantOperandVal(1)) {
    case Intrinsic::amdgcn_if:
    case Intrinsic::amdgcn_else:
    case Intrinsic::amdgcn_loop:
    case Intrinsic::amdgcn_end_cf:
  SDNode *Intr = BRCOND.getOperand(1).getNode();
    assert(BR && "brcond missing unconditional branch user");
    Target = BR->getOperand(1);
  unsigned CFNode = isCFIntrinsic(Intr);
    Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
                     Intr->getOperand(0));
  MVT VT = Op.getSimpleValueType();
  if (Op.getConstantOperandVal(0) != 0)
  if (Info->isEntryFunction())
  return Op.getValueType().bitsLE(VT)
  assert(Op.getValueType() == MVT::f16 &&
         "Do not know how to custom lower FP_ROUND for non-f16 type");
  EVT SrcVT = Src.getValueType();
  if (SrcVT != MVT::f64)
  EVT VT = Op.getValueType();
  bool IsIEEEMode = Info->getMode().IEEE;
  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
  EVT VT = Op.getValueType();
  EVT ExpVT = Exp.getValueType();
  if (ExpVT == MVT::i16)
                     {Op.getOperand(0), Op.getOperand(1), TruncExp});
  switch (Op->getOpcode()) {
                                              DAGCombinerInfo &DCI) const {
  const unsigned Opc = Op.getOpcode();
                  : Op->getOperand(0).getValueType();
  if (DCI.isBeforeLegalizeOps() ||
  auto &DAG = DCI.DAG;
    LHS = Op->getOperand(1);
    RHS = Op->getOperand(2);
    LHS = Op->getOperand(0);
    RHS = Op->getOperand(1);
  EVT VT = Op.getValueType();
  assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
  if (Op->isDivergent())
  if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
        DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
  if (Op0SignBits >= 33 && Op1SignBits >= 33)
        DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
  EVT VT = Op.getValueType();
    const APInt &C = RHSC->getAPIntValue();
    if (C.isPowerOf2()) {
      bool UseArithShift = isSigned && !C.isMinSignedValue();
  if (Op->isDivergent()) {
    return lowerTrapEndpgm(Op, DAG);
             : lowerTrapHsaQueuePtr(Op, DAG);
SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
                                             ImplicitParameter Param) const {
    loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
  if (UserSGPR == AMDGPU::NoRegister) {
        "debugtrap handler not supported",
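// getSegmentAperture (below) materializes the shared or private aperture base:
// when the aperture registers exist it reads SRC_SHARED_BASE/SRC_PRIVATE_BASE
// and takes the high 32 bits of the 64-bit value; otherwise the aperture is
// loaded from the implicit kernel arguments or via the queue pointer SGPR.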
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
          ? AMDGPU::SRC_SHARED_BASE
          : AMDGPU::SRC_PRIVATE_BASE;
        {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
    return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
  if (UserSGPR == AMDGPU::NoRegister) {
  if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
      isa<BasicBlockSDNode>(Val))
  if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
    return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
  unsigned DestAS, SrcAS;
  bool IsNonNull = false;
  if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
    SrcAS = ASC->getSrcAddressSpace();
    Src = ASC->getOperand(0);
    DestAS = ASC->getDestAddressSpace();
           Op.getConstantOperandVal(0) ==
               Intrinsic::amdgcn_addrspacecast_nonnull);
    Src = Op->getOperand(1);
    SrcAS = Op->getConstantOperandVal(2);
    DestAS = Op->getConstantOperandVal(3);
    unsigned NullVal = TM.getNullPointerValue(DestAS);
    SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
      unsigned NullVal = TM.getNullPointerValue(SrcAS);
      Op.getValueType() == MVT::i64) {
      Src.getValueType() == MVT::i64)
  EVT InsVT = Ins.getValueType();
    unsigned IdxVal = Idx->getAsZExtVal();
    assert(InsNumElts % 2 == 0 && "expect legal vector types");
    EVT NewInsVT = InsNumElts == 2 ? MVT::i32
                                     MVT::i32, InsNumElts / 2);
    for (unsigned I = 0; I != InsNumElts / 2; ++I) {
      if (InsNumElts == 2) {
    for (unsigned I = 0; I != InsNumElts; ++I) {
  auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
  if (NumElts == 4 && EltSize == 16 && KIdx) {
    unsigned Idx = KIdx->getZExtValue();
    bool InsertLo = Idx < 2;
  if (isa<ConstantSDNode>(Idx))
  assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
  const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
  EVT ResultVT = Op.getValueType();
  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
  if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
    if (VecSize == 128) {
    } else if (VecSize == 256) {
      for (unsigned P = 0; P < 4; ++P) {
                                Parts[0], Parts[1]));
                                Parts[2], Parts[3]));
      for (unsigned P = 0; P < 8; ++P) {
                                Parts[0], Parts[1], Parts[2], Parts[3]));
                                Parts[4], Parts[5], Parts[6], Parts[7]));
    EVT IdxVT = Idx.getValueType();
  Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
  if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
    return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7580 EVT ResultVT =
Op.getValueType();
7584 int SrcNumElts =
Op.getOperand(0).getValueType().getVectorNumElements();
7600 int VecIdx =
Idx < SrcNumElts ? 0 : 1;
7601 int EltIdx =
Idx < SrcNumElts ?
Idx :
Idx - SrcNumElts;
7609 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
7610 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
7611 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
7612 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
7631 EVT ResultVT =
Op.getValueType();
7647 EVT VT =
Op.getValueType();
7649 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
7684 for (
unsigned P = 0;
P < NumParts; ++
P) {
7686 PartVT, SL, {
Op.getOperand(
P * 2),
Op.getOperand(
P * 2 + 1)});
7719 assert(isInt<32>(
Offset + 4) &&
"32-bit offset is expected!");
7757 EVT PtrVT =
Op.getValueType();
7773 assert(PtrVT == MVT::i32 &&
"32-bit pointer is expected.");
7845 SDValue Param = lowerKernargMemParameter(
7855 "non-hsa intrinsic with hsa target",
7864 "intrinsic not supported on subtarget",
7874 unsigned NumElts = Elts.
size();
7876 if (NumElts <= 12) {
7885 for (
unsigned i = 0; i < Elts.
size(); ++i) {
7891 for (
unsigned i = Elts.
size(); i < NumElts; ++i)
7892 VecElts[i] = DAG.
getUNDEF(MVT::f32);
7901 EVT SrcVT = Src.getValueType();
7922                               bool Unpacked, bool IsD16, int DMaskPop,
7923                               int NumVDataDwords, bool IsAtomicPacked16Bit,
7927   EVT ReqRetVT = ResultTypes[0];
7929   int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
7930                           ? (ReqRetNumElts + 1) / 2
                                : ReqRetNumElts;
7933   int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
7944   if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
7955   if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
7957                        NumDataDwords - MaskPopDwords);
7962   EVT LegalReqRetVT = ReqRetVT;
7964   if (!Data.getValueType().isInteger())
7966                        Data.getValueType().changeTypeToInteger(), Data);
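  // Dword accounting for D16 image loads: when the target keeps 16-bit return
  // data packed (and for packed 16-bit atomics), two elements share one VGPR,
  // so the instruction returns (ReqRetNumElts + 1) / 2 dwords and the dmask
  // pops (DMaskPop + 1) / 2 dwords, as computed above. For example, a d16
  // load with dmask 0x7 (three components) packs into 2 dwords, while an
  // unpacked-d16 target still uses one dword per component.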
7987 if (Result->getNumValues() == 1)
7994                          SDValue *LWE, bool &IsTexFail) {
7995   auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
8014                             unsigned DimIdx, unsigned EndIdx,
8015                             unsigned NumGradients) {
8017   for (unsigned I = DimIdx; I < EndIdx; I++) {
8025     if (((I + 1) >= EndIdx) ||
8026         ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
8027                                          I == DimIdx + NumGradients - 1))) {
8028       if (Addr.getValueType() != MVT::i16)
8049   unsigned IntrOpcode = Intr->BaseOpcode;
8060   int NumVDataDwords = 0;
8061   bool AdjustRetType = false;
8062   bool IsAtomicPacked16Bit = false;
8065 const unsigned ArgOffset = WithChain ? 2 : 1;
8068 unsigned DMaskLanes = 0;
8070 if (BaseOpcode->Atomic) {
8071     VData = Op.getOperand(2);
8073     IsAtomicPacked16Bit =
8074         (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
8075          Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
8078 if (BaseOpcode->AtomicX2) {
8085 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
8086 DMask = Is64Bit ? 0xf : 0x3;
8087 NumVDataDwords = Is64Bit ? 4 : 2;
8089 DMask = Is64Bit ? 0x3 : 0x1;
8090 NumVDataDwords = Is64Bit ? 2 : 1;
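    // AtomicX2 (compare-and-swap style) image atomics carry both the compare
    // and the swap value, so the data footprint above doubles relative to the
    // plain atomics in the else branch: dmask 0x3 / 2 dwords for 32-bit data
    // and dmask 0xf / 4 dwords for 64-bit data, versus 0x1 / 1 and 0x3 / 2.
    // The original memory value comes back in the low half of the result pair
    // (see the AtomicX2 extraction after the instruction is built, further
    // down).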
8093     DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
8096     if (BaseOpcode->Store) {
8097       VData = Op.getOperand(2);
8105         VData = handleD16VData(VData, DAG, true);
8109     } else if (!BaseOpcode->NoReturn) {
8122         (!LoadVT.isVector() && DMaskLanes > 1))
8130       NumVDataDwords = (DMaskLanes + 1) / 2;
8132       NumVDataDwords = DMaskLanes;
8134       AdjustRetType = true;
8138   unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
8143       Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
8145   MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8146   IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8148   VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
8150   MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8151   IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8154   for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
8155     if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
8156       assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
8161           {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
8165 "Bias needs to be converted to 16 bit in A16 mode");
8170   if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
8174         dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
8175                   "require 16 bit args for both gradients and addresses");
8180     if (!ST->hasA16()) {
8181       LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
8182                            "support 16 bit addresses\n");
8192   if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
8196     IntrOpcode = G16MappingInfo->G16;
8204                             ArgOffset + Intr->GradientStart,
8205                             ArgOffset + Intr->CoordStart, Intr->NumGradients);
8207     for (unsigned I = ArgOffset + Intr->GradientStart;
8208          I < ArgOffset + Intr->CoordStart; I++)
8215                             ArgOffset + Intr->CoordStart, VAddrEnd,
8219     for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
8237   const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
8238   const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
8239   const bool UseNSA = ST->hasNSAEncoding() &&
8240                       VAddrs.size() >= ST->getNSAThreshold(MF) &&
8241                       (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
8242   const bool UsePartialNSA =
8243       UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
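  // Address-operand encoding choice: with NSA (non-sequential address)
  // encoding the VAddr operands can live in scattered VGPRs, but only up to
  // NSAMaxSize of them. UseNSA therefore requires the encoding, enough
  // addresses to be worth it (the per-function NSA threshold), and either a
  // small enough count or partial-NSA support. With UsePartialNSA, the first
  // NSAMaxSize - 1 addresses stay scattered and the tail is packed into one
  // contiguous VGPR tuple (the ArrayRef(VAddrs).drop_front(NSAMaxSize - 1)
  // just below); with !UseNSA everything goes into a single tuple.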
8246 if (UsePartialNSA) {
8248 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
8249   } else if (!UseNSA) {
8256 if (!BaseOpcode->Sampler) {
8260         Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
8262     Unorm = UnormConst ? True : False;
8267   SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
8268   bool IsTexFail = false;
8269   if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
8280     NumVDataDwords += 1;
8281     AdjustRetType = true;
8286   if (AdjustRetType) {
8289     if (DMaskLanes == 0 && !BaseOpcode->Store) {
8292       if (isa<MemSDNode>(Op))
8298 MVT::i32, NumVDataDwords)
8301 ResultTypes[0] = NewVT;
8302 if (ResultTypes.size() == 3) {
8306 ResultTypes.erase(&ResultTypes[1]);
8310   unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
8311 if (BaseOpcode->Atomic)
8318 if (BaseOpcode->Store || BaseOpcode->Atomic)
8320 if (UsePartialNSA) {
8329 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
8332 if (BaseOpcode->Sampler) {
8341 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8345 ST->hasFeature(AMDGPU::FeatureR128A16)
8355 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8359 if (BaseOpcode->HasD16)
8361   if (isa<MemSDNode>(Op))
8364   int NumVAddrDwords =
8370                                    NumVDataDwords, NumVAddrDwords);
8371   } else if (IsGFX11Plus) {
8373                                    UseNSA ? AMDGPU::MIMGEncGfx11NSA
8374                                           : AMDGPU::MIMGEncGfx11Default,
8375                                    NumVDataDwords, NumVAddrDwords);
8376   } else if (IsGFX10Plus) {
8378 UseNSA ? AMDGPU::MIMGEncGfx10NSA
8379 : AMDGPU::MIMGEncGfx10Default,
8380 NumVDataDwords, NumVAddrDwords);
8384 NumVDataDwords, NumVAddrDwords);
8387 "requested image instruction is not supported on this GPU");
8392 NumVDataDwords, NumVAddrDwords);
8395 NumVDataDwords, NumVAddrDwords);
8401   if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
8406   if (BaseOpcode->AtomicX2) {
8411   if (BaseOpcode->NoReturn)
8415                            NumVDataDwords, IsAtomicPacked16Bit, DL);
8433   if (!Offset->isDivergent()) {
8478     return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
8482 unsigned NumLoads = 1;
8488 if (NumElts == 8 || NumElts == 16) {
8489 NumLoads = NumElts / 4;
8497     setBufferOffsets(Offset, DAG, &Ops[3],
8498                      NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
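    // Wide scalar buffer loads are split here: an 8- or 16-element result is
    // fetched as NumElts / 4 consecutive 4-dword loads. The shared offset is
    // aligned for the whole group (Align(16 * NumLoads) above), and each trip
    // of the loop below issues one sub-load (stepping the offset by 16 bytes
    // in the elided lines) before the pieces are concatenated back into the
    // requested vector type.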
8501     for (unsigned i = 0; i < NumLoads; ++i) {
8507     if (NumElts == 8 || NumElts == 16)
8554   EVT VT = Op.getValueType();
8556   unsigned IntrinsicID = Op.getConstantOperandVal(0);
8560 switch (IntrinsicID) {
8561 case Intrinsic::amdgcn_implicit_buffer_ptr: {
8564 return getPreloadedValue(DAG, *MFI, VT,
8567 case Intrinsic::amdgcn_dispatch_ptr:
8568 case Intrinsic::amdgcn_queue_ptr: {
8571           MF.getFunction(), "unsupported hsa intrinsic without hsa target",
8577     auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
8580     return getPreloadedValue(DAG, *MFI, VT, RegID);
8582   case Intrinsic::amdgcn_implicitarg_ptr: {
8584       return getImplicitArgPtr(DAG, DL);
8585     return getPreloadedValue(DAG, *MFI, VT,
8588 case Intrinsic::amdgcn_kernarg_segment_ptr: {
8594 return getPreloadedValue(DAG, *MFI, VT,
8597 case Intrinsic::amdgcn_dispatch_id: {
8600 case Intrinsic::amdgcn_rcp:
8602 case Intrinsic::amdgcn_rsq:
8604 case Intrinsic::amdgcn_rsq_legacy:
8608 case Intrinsic::amdgcn_rcp_legacy:
8612 case Intrinsic::amdgcn_rsq_clamp: {
8626   case Intrinsic::r600_read_ngroups_x:
8630     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8633   case Intrinsic::r600_read_ngroups_y:
8637     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8640   case Intrinsic::r600_read_ngroups_z:
8644     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8647   case Intrinsic::r600_read_global_size_x:
8651     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8654   case Intrinsic::r600_read_global_size_y:
8658     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8661   case Intrinsic::r600_read_global_size_z:
8665     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8668   case Intrinsic::r600_read_local_size_x:
8672     return lowerImplicitZextParam(DAG, Op, MVT::i16,
8674   case Intrinsic::r600_read_local_size_y:
8678     return lowerImplicitZextParam(DAG, Op, MVT::i16,
8680   case Intrinsic::r600_read_local_size_z:
8684     return lowerImplicitZextParam(DAG, Op, MVT::i16,
8686 case Intrinsic::amdgcn_workgroup_id_x:
8687 return getPreloadedValue(DAG, *MFI, VT,
8689 case Intrinsic::amdgcn_workgroup_id_y:
8690 return getPreloadedValue(DAG, *MFI, VT,
8692 case Intrinsic::amdgcn_workgroup_id_z:
8693 return getPreloadedValue(DAG, *MFI, VT,
8695   case Intrinsic::amdgcn_wave_id:
8696     return lowerWaveID(DAG, Op);
8697   case Intrinsic::amdgcn_lds_kernel_id: {
8699       return getLDSKernelId(DAG, DL);
8700     return getPreloadedValue(DAG, *MFI, VT,
8703   case Intrinsic::amdgcn_workitem_id_x:
8704     return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
8705   case Intrinsic::amdgcn_workitem_id_y:
8706     return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
8707   case Intrinsic::amdgcn_workitem_id_z:
8708     return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
8709   case Intrinsic::amdgcn_wavefrontsize:
8712   case Intrinsic::amdgcn_s_buffer_load: {
8713     unsigned CPol = Op.getConstantOperandVal(3);
8720     return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
8721                         Op.getOperand(3), DAG);
8723   case Intrinsic::amdgcn_fdiv_fast:
8724     return lowerFDIV_FAST(Op, DAG);
8725 case Intrinsic::amdgcn_sin:
8728 case Intrinsic::amdgcn_cos:
8731 case Intrinsic::amdgcn_mul_u24:
8734 case Intrinsic::amdgcn_mul_i24:
8738 case Intrinsic::amdgcn_log_clamp: {
8744 case Intrinsic::amdgcn_fract:
8747 case Intrinsic::amdgcn_class:
8750 case Intrinsic::amdgcn_div_fmas:
8752 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
8754 case Intrinsic::amdgcn_div_fixup:
8756 Op.getOperand(2),
Op.getOperand(3));
8758 case Intrinsic::amdgcn_div_scale: {
8771 SDValue Src0 =
Param->isAllOnes() ? Numerator : Denominator;
8774 Denominator, Numerator);
8776 case Intrinsic::amdgcn_icmp: {
8778 if (
Op.getOperand(1).getValueType() == MVT::i1 &&
8779 Op.getConstantOperandVal(2) == 0 &&
8784 case Intrinsic::amdgcn_fcmp: {
8787 case Intrinsic::amdgcn_ballot:
8789 case Intrinsic::amdgcn_fmed3:
8791 Op.getOperand(2),
Op.getOperand(3));
8792 case Intrinsic::amdgcn_fdot2:
8794 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
8795 case Intrinsic::amdgcn_fmul_legacy:
8798 case Intrinsic::amdgcn_sffbh:
8800 case Intrinsic::amdgcn_sbfe:
8802 Op.getOperand(2),
Op.getOperand(3));
8803 case Intrinsic::amdgcn_ubfe:
8805 Op.getOperand(2),
Op.getOperand(3));
8806 case Intrinsic::amdgcn_cvt_pkrtz:
8807 case Intrinsic::amdgcn_cvt_pknorm_i16:
8808 case Intrinsic::amdgcn_cvt_pknorm_u16:
8809 case Intrinsic::amdgcn_cvt_pk_i16:
8810 case Intrinsic::amdgcn_cvt_pk_u16: {
8812 EVT VT =
Op.getValueType();
8815 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8817 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8819 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8821 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8827 return DAG.
getNode(Opcode,
DL, VT,
Op.getOperand(1),
Op.getOperand(2));
8830 DAG.
getNode(Opcode,
DL, MVT::i32,
Op.getOperand(1),
Op.getOperand(2));
8833 case Intrinsic::amdgcn_fmad_ftz:
8835 Op.getOperand(2),
Op.getOperand(3));
8837 case Intrinsic::amdgcn_if_break:
8839 Op->getOperand(1),
Op->getOperand(2)),
8842 case Intrinsic::amdgcn_groupstaticsize: {
8854 case Intrinsic::amdgcn_is_shared:
8855 case Intrinsic::amdgcn_is_private: {
8857 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
8860 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
8868 case Intrinsic::amdgcn_perm:
8870 Op.getOperand(2),
Op.getOperand(3));
8871 case Intrinsic::amdgcn_reloc_constant: {
8875 auto *RelocSymbol = cast<GlobalVariable>(
8881 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8882 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8883 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8884 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8885 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8886 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8887 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8888 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8889 if (
Op.getOperand(4).getValueType() == MVT::i32)
8895 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
8896 Op.getOperand(3), IndexKeyi32);
8898 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8899 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8900 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8901 if (
Op.getOperand(6).getValueType() == MVT::i32)
8907 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8908 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
8909 IndexKeyi32, Op.getOperand(7)});
8911 case Intrinsic::amdgcn_addrspacecast_nonnull:
8912 return lowerADDRSPACECAST(
Op, DAG);
8913 case Intrinsic::amdgcn_readlane:
8914 case Intrinsic::amdgcn_readfirstlane:
8915 case Intrinsic::amdgcn_writelane:
8916 case Intrinsic::amdgcn_permlane16:
8917 case Intrinsic::amdgcn_permlanex16:
8918 case Intrinsic::amdgcn_permlane64:
8919 case Intrinsic::amdgcn_set_inactive:
8920 case Intrinsic::amdgcn_set_inactive_chain_arg:
8921 case Intrinsic::amdgcn_mov_dpp8:
8922 case Intrinsic::amdgcn_update_dpp:
8927 return lowerImage(
Op, ImageDimIntr, DAG,
false);
8938 return DAG.
getRegister(AMDGPU::SGPR_NULL, MVT::i32);
8944 unsigned NewOpcode)
const {
8948 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
8949 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
8963 auto *
M = cast<MemSDNode>(
Op);
8967 M->getMemOperand());
8972 unsigned NewOpcode)
const {
8976 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
8977 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
8991 auto *
M = cast<MemSDNode>(
Op);
8995 M->getMemOperand());
9000   unsigned IntrID = Op.getConstantOperandVal(1);
9004   case Intrinsic::amdgcn_ds_ordered_add:
9005   case Intrinsic::amdgcn_ds_ordered_swap: {
9010     unsigned IndexOperand = M->getConstantOperandVal(7);
9011     unsigned WaveRelease = M->getConstantOperandVal(8);
9012     unsigned WaveDone = M->getConstantOperandVal(9);
9014     unsigned OrderedCountIndex = IndexOperand & 0x3f;
9015     IndexOperand &= ~0x3f;
9016     unsigned CountDw = 0;
9019       CountDw = (IndexOperand >> 24) & 0xf;
9020       IndexOperand &= ~(0xf << 24);
9022       if (CountDw < 1 || CountDw > 4) {
9024             "ds_ordered_count: dword count must be between 1 and 4");
9031     if (WaveDone && !WaveRelease)
9034     unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
9035     unsigned ShaderType =
9037     unsigned Offset0 = OrderedCountIndex << 2;
9038     unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
9041       Offset1 |= (CountDw - 1) << 6;
9044       Offset1 |= ShaderType << 2;
9046     unsigned Offset = Offset0 | (Offset1 << 8);
9053                                    M->getVTList(), Ops, M->getMemoryVT(),
9054                                    M->getMemOperand());
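    // The ds_ordered_count offset field is assembled from the operands decoded
    // above:
    //
    //   Offset0 = OrderedCountIndex << 2
    //   Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
    //             (Instruction << 4) | ((CountDw - 1) << 6)  // count on gfx10+
    //   Offset  = Offset0 | (Offset1 << 8)
    //
    // e.g. ds_ordered_add (Instruction = 0) with index 1, wave_release = 1,
    // wave_done = 0, shader type 0 and one dword gives Offset0 = 0x4,
    // Offset1 = 0x1, so Offset = 0x104.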
9056 case Intrinsic::amdgcn_raw_buffer_load:
9057 case Intrinsic::amdgcn_raw_ptr_buffer_load:
9058 case Intrinsic::amdgcn_raw_atomic_buffer_load:
9059 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
9060 case Intrinsic::amdgcn_raw_buffer_load_format:
9061 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
9062 const bool IsFormat =
9063 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
9064 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
9066 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9067 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
9080 auto *
M = cast<MemSDNode>(
Op);
9081 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
9083 case Intrinsic::amdgcn_struct_buffer_load:
9084 case Intrinsic::amdgcn_struct_ptr_buffer_load:
9085 case Intrinsic::amdgcn_struct_buffer_load_format:
9086 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
9087 case Intrinsic::amdgcn_struct_atomic_buffer_load:
9088 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
9089 const bool IsFormat =
9090 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
9091 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
9093 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9094 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
9107 return lowerIntrinsicLoad(cast<MemSDNode>(
Op), IsFormat, DAG, Ops);
9109 case Intrinsic::amdgcn_raw_tbuffer_load:
9110 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
9112 EVT LoadVT =
Op.getValueType();
9113 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9114 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
9133 Op->getVTList(), Ops, LoadVT,
M->getMemOperand(),
9136 case Intrinsic::amdgcn_struct_tbuffer_load:
9137 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
9139 EVT LoadVT =
Op.getValueType();
9140 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9141 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
9160 Op->getVTList(), Ops, LoadVT,
M->getMemOperand(),
9163 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9164 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9166 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9167 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9168 return lowerStructBufferAtomicIntrin(
Op, DAG,
9170 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9171 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9173 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9174 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9175 return lowerStructBufferAtomicIntrin(
Op, DAG,
9177 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9178 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9180 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9181 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9182 return lowerStructBufferAtomicIntrin(
Op, DAG,
9184 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9185 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9187 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9188 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9190 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9191 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9193 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9194 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9196 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9197 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9199 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9200 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9202 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9203 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9205 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9206 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9208 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9209 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9211 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9212 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9214 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9215 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9217 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9218 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9220 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9221 return lowerRawBufferAtomicIntrin(
Op, DAG,
9223 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9224 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9225 return lowerStructBufferAtomicIntrin(
Op, DAG,
9227 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9228 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9230 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9231 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9233 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9234 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9235 return lowerStructBufferAtomicIntrin(
Op, DAG,
9237 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9238 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9239 return lowerStructBufferAtomicIntrin(
Op, DAG,
9241 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9242 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9243 return lowerStructBufferAtomicIntrin(
Op, DAG,
9245 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9246 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9247 return lowerStructBufferAtomicIntrin(
Op, DAG,
9249 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9250 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9252 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9253 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9255 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9256 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9258 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9259 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9261 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9262 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9264 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9265 return lowerStructBufferAtomicIntrin(
Op, DAG,
9268 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9269 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9270 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(4), DAG);
9271 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
9285 EVT VT =
Op.getValueType();
9286 auto *
M = cast<MemSDNode>(
Op);
9289 Op->getVTList(), Ops, VT,
9290 M->getMemOperand());
9292 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9293 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9294 SDValue Rsrc = bufferRsrcPtrToVector(
Op->getOperand(4), DAG);
9295 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(6), DAG);
9309 EVT VT =
Op.getValueType();
9310 auto *
M = cast<MemSDNode>(
Op);
9313 Op->getVTList(), Ops, VT,
9314 M->getMemOperand());
9316 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9318     SDValue NodePtr = M->getOperand(2);
9319     SDValue RayExtent = M->getOperand(3);
9320     SDValue RayOrigin = M->getOperand(4);
9322     SDValue RayInvDir = M->getOperand(6);
9340 const unsigned NumVDataDwords = 4;
9341 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9342 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9346 const unsigned BaseOpcodes[2][2] = {
9347 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9348 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9349 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9353 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9354 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9355 : AMDGPU::MIMGEncGfx10NSA,
9356 NumVDataDwords, NumVAddrDwords);
9360 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9361 : AMDGPU::MIMGEncGfx10Default,
9362 NumVDataDwords, NumVAddrDwords);
9368     auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
9371       if (Lanes[0].getValueSizeInBits() == 32) {
9372         for (unsigned I = 0; I < 3; ++I)
9391     if (UseNSA && IsGFX11Plus) {
9399       for (unsigned I = 0; I < 3; ++I) {
9402                                  {DirLanes[I], InvDirLanes[I]})));
9417       packLanes(RayOrigin, true);
9418       packLanes(RayDir, true);
9419       packLanes(RayInvDir, false);
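    // Ray operand layout: NodePtr (1 or 2 dwords), RayExtent (1), then origin,
    // direction and inverse direction. With full-precision rays those are
    // 3 dwords each, which is where the 11/12 VAddr dwords computed above come
    // from; with a16 rays the direction and inverse direction are f16 triples
    // that share dwords, giving 8/9. packLanes' IsAligned flag tracks whether
    // a triple starts on a dword boundary (origin and direction do, the
    // inverse direction starts mid-dword in the a16 case). On gfx11+ NSA forms
    // the DirLanes/InvDirLanes loop above instead interleaves dir.x with
    // invdir.x (and so on) so each packed register holds matching components.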
9424 if (NumVAddrDwords > 12) {
9444 case Intrinsic::amdgcn_global_atomic_fmin_num:
9445 case Intrinsic::amdgcn_global_atomic_fmax_num:
9446 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9447 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9454 unsigned Opcode = 0;
9456 case Intrinsic::amdgcn_global_atomic_fmin_num:
9457 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9461 case Intrinsic::amdgcn_global_atomic_fmax_num:
9462 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9470 Ops,
M->getMemOperand());
9472 case Intrinsic::amdgcn_s_get_barrier_state:
9473 case Intrinsic::amdgcn_s_get_named_barrier_state: {
9478 if (isa<ConstantSDNode>(
Op->getOperand(2))) {
9479 uint64_t BarID = cast<ConstantSDNode>(
Op->getOperand(2))->getZExtValue();
9480 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
9481 BarID = (BarID >> 4) & 0x3F;
9482 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9487 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9488 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
9508 return lowerImage(
Op, ImageDimIntr, DAG,
true);
9516 SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
9526   bool IsTFE = VTList.NumVTs == 3;
9529 unsigned NumOpDWords = NumValueDWords + 1;
9534 SDValue Op = getMemIntrinsicNode(Opcode,
DL, OpDWordsVTList, Ops,
9535 OpDWordsVT, OpDWordsMMO, DAG);
9550 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9556 WidenedMemVT, WidenedMMO);
9566 bool ImageStore)
const {
9601 for (
unsigned I = 0;
I < Elts.
size() / 2;
I += 1) {
9607 if ((NumElements % 2) == 1) {
9609 unsigned I = Elts.
size() / 2;
9625 if (NumElements == 3) {
9646 unsigned IntrinsicID =
Op.getConstantOperandVal(1);
9649 switch (IntrinsicID) {
9650 case Intrinsic::amdgcn_exp_compr: {
9654 "intrinsic not supported on subtarget",
DL.getDebugLoc());
9677 unsigned Opc =
Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9680 case Intrinsic::amdgcn_s_barrier:
9681 case Intrinsic::amdgcn_s_barrier_signal:
9682 case Intrinsic::amdgcn_s_barrier_wait: {
9685 unsigned WGSize =
ST.getFlatWorkGroupSizes(MF.
getFunction()).second;
9686 if (WGSize <=
ST.getWavefrontSize()) {
9689 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal)
9690 return Op.getOperand(0);
9693 MVT::Other,
Op.getOperand(0)),
9698 if (
ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
9704 MVT::Other, K,
Op.getOperand(0)),
9716 case Intrinsic::amdgcn_struct_tbuffer_store:
9717 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9721 VData = handleD16VData(VData, DAG);
9722 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9723 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
9741 M->getMemoryVT(),
M->getMemOperand());
9744 case Intrinsic::amdgcn_raw_tbuffer_store:
9745 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9749 VData = handleD16VData(VData, DAG);
9750 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9751 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
9769 M->getMemoryVT(),
M->getMemOperand());
9772 case Intrinsic::amdgcn_raw_buffer_store:
9773 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9774 case Intrinsic::amdgcn_raw_buffer_store_format:
9775 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9776 const bool IsFormat =
9777 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9778 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9785 VData = handleD16VData(VData, DAG);
9795 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9796 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
9816 return handleByteShortBufferStores(DAG, VDataVT,
DL, Ops, M);
9819 M->getMemoryVT(),
M->getMemOperand());
9822 case Intrinsic::amdgcn_struct_buffer_store:
9823 case Intrinsic::amdgcn_struct_ptr_buffer_store:
9824 case Intrinsic::amdgcn_struct_buffer_store_format:
9825 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9826 const bool IsFormat =
9827 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9828 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9836 VData = handleD16VData(VData, DAG);
9846 auto Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9847 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
9868 return handleByteShortBufferStores(DAG, VDataType,
DL, Ops, M);
9871 M->getMemoryVT(),
M->getMemOperand());
9873 case Intrinsic::amdgcn_raw_buffer_load_lds:
9874 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9875 case Intrinsic::amdgcn_struct_buffer_load_lds:
9876 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9880 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9881 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9882 unsigned OpOffset = HasVIndex ? 1 : 0;
9883 SDValue VOffset =
Op.getOperand(5 + OpOffset);
9885 unsigned Size =
Op->getConstantOperandVal(4);
9891 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9892 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9893 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9894 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9897 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9898 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9899 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9900 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9903 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9904 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9905 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9906 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
9911 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
9912 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
9913 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
9914 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
9919 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
9920 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
9921 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
9922 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
9930 if (HasVIndex && HasVOffset)
9936 else if (HasVOffset)
9939 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9944 unsigned Aux =
Op.getConstantOperandVal(8 + OpOffset);
9956 auto *
M = cast<MemSDNode>(
Op);
9983 case Intrinsic::amdgcn_global_load_lds: {
9985 unsigned Size =
Op->getConstantOperandVal(4);
9990 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
9993 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
9996 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
10001 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
10006 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
10010 auto *
M = cast<MemSDNode>(
Op);
10023 if (
LHS->isDivergent())
10027 RHS.getOperand(0).getValueType() == MVT::i32) {
10030 VOffset =
RHS.getOperand(0);
10035 if (!
Addr->isDivergent()) {
10052 LoadPtrI.
Offset =
Op->getConstantOperandVal(5);
10072 case Intrinsic::amdgcn_end_cf:
10074 Op->getOperand(2), Chain),
10076 case Intrinsic::amdgcn_s_barrier_init:
10077 case Intrinsic::amdgcn_s_barrier_signal_var: {
10084 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
10085 ? AMDGPU::S_BARRIER_INIT_M0
10086 : AMDGPU::S_BARRIER_SIGNAL_M0;
10101 constexpr unsigned ShAmt = 16;
10113 case Intrinsic::amdgcn_s_barrier_join: {
10120 if (isa<ConstantSDNode>(BarOp)) {
10121 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
10122 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
10125 unsigned BarID = (BarVal >> 4) & 0x3F;
10130 Opc = AMDGPU::S_BARRIER_JOIN_M0;
10146 case Intrinsic::amdgcn_s_prefetch_data: {
10149 return Op.getOperand(0);
10152 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
10154 Chain, bufferRsrcPtrToVector(
Op.getOperand(2), DAG),
10161 Op->getVTList(), Ops,
M->getMemoryVT(),
10162 M->getMemOperand());
10167 return lowerImage(
Op, ImageDimIntr, DAG,
true);
10180 std::pair<SDValue, SDValue>
10187   if ((C1 = dyn_cast<ConstantSDNode>(N0)))
10190     C1 = cast<ConstantSDNode>(N0.getOperand(1));
10204   unsigned Overflow = ImmOffset & ~MaxImm;
10205   ImmOffset -= Overflow;
10206   if ((int32_t)Overflow < 0) {
10207     Overflow += ImmOffset;
10212     auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
10216     SDValue Ops[] = {N0, OverflowVal};
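  // The combined offset is split so that as much as possible stays in the
  // MUBUF immediate field and the remainder ("Overflow") is materialized into
  // the register offset: with a 4095-byte immediate limit, for example, a
  // combined offset of 5000 splits into Overflow = 5000 & ~4095 = 4096 and
  // ImmOffset = 904. The (int32_t)Overflow < 0 check above folds everything
  // back into the register part when the masked-off portion is too large to
  // be a positive 32-bit add, and the {N0, OverflowVal} operands built here
  // feed that add.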
10231 void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
10233                                         Align Alignment) const {
10236   if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
10239     if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10250     int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10252         TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
10269 SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
10272     return MaybePointer;
10286   SDValue NumRecords = Op->getOperand(3);
10289   auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
10292   std::optional<uint32_t> ConstStride = std::nullopt;
10293   if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
10294     ConstStride = ConstNode->getZExtValue();
10297   if (!ConstStride || *ConstStride != 0) {
10300     ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
10311 NewHighHalf, NumRecords, Flags);
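  // The four dwords built here follow the buffer resource (V#) layout: word 0
  // is the low half of the base address, word 1 is the high base bits with the
  // stride field in its upper half (hence ShiftedStride = ConstStride << 16
  // above), word 2 is NumRecords, and word 3 is the flags/format operand. A
  // known-zero stride skips the merge and keeps the original high half.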
10321 bool IsTFE)
const {
10331 SDValue Op = getMemIntrinsicNode(Opc,
DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
10359 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
10363 Ops[1] = BufferStoreExt;
10368 M->getMemOperand());
10393 DAGCombinerInfo &DCI)
const {
10409 if ((MemVT.
isSimple() && !DCI.isAfterLegalizeDAG()) ||
10416 "unexpected vector extload");
10429 "unexpected fp extload");
10447 DCI.AddToWorklist(Cvt.
getNode());
10452 DCI.AddToWorklist(Cvt.
getNode());
10463 if (
Info.isEntryFunction())
10464 return Info.getUserSGPRInfo().hasFlatScratchInit();
10472 EVT MemVT =
Load->getMemoryVT();
10485 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10513 assert(
Op.getValueType().getVectorElementType() == MVT::i32 &&
10514 "Custom lowering for non-i32 vectors hasn't been implemented.");
10517 unsigned AS =
Load->getAddressSpace();
10541 Alignment >=
Align(4) && NumElements < 32) {
10555 if (NumElements > 4)
10574 if (NumElements > 2)
10579 if (NumElements > 4)
10591 auto Flags =
Load->getMemOperand()->getFlags();
10593 Load->getAlign(), Flags, &
Fast) &&
10602 MemVT, *
Load->getMemOperand())) {
10611 EVT VT =
Op.getValueType();
10648 EVT VT =
Op.getValueType();
10651 bool AllowInaccurateRcp =
10658 if (!AllowInaccurateRcp && VT != MVT::f16)
10661 if (CLHS->isExactlyValue(1.0)) {
10678 if (CLHS->isExactlyValue(-1.0)) {
10687 if (!AllowInaccurateRcp && (VT != MVT::f16 || !
Flags.hasAllowReciprocal()))
10701 EVT VT =
Op.getValueType();
10704 bool AllowInaccurateDiv =
10706 if (!AllowInaccurateDiv)
10727 return DAG.
getNode(Opcode, SL, VT,
A,
B, Flags);
10741 return DAG.
getNode(Opcode, SL, VTList,
10750 return DAG.
getNode(Opcode, SL, VT, {
A,
B,
C}, Flags);
10764 return DAG.
getNode(Opcode, SL, VTList,
10770 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
10771 return FastLowered;
10791 unsigned FMADOpCode =
10801 SDValue Err = DAG.
getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
10803 Quot = DAG.
getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot,
Op->getFlags());
10804 Err = DAG.
getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
10827 const APFloat K0Val(0x1p+96f);
10830 const APFloat K1Val(0x1p-32f);
10857   assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
10858   uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
10859   uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
10864   if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10865     return FastLowered;
10872   Flags.setNoFPExcept(true);
10893   using namespace AMDGPU::Hwreg;
10894   const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
10902   const bool HasDynamicDenormals =
10908   if (!PreservesDenormals) {
10916     if (HasDynamicDenormals) {
10920       SavedDenormMode = SDValue(GetReg, 0);
10928       const SDValue EnableDenormValue =
10935       const SDValue EnableDenormValue =
10937       EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
10938                                         {EnableDenormValue, BitField, Glue});
10948                              ApproxRcp, One, NegDivScale0, Flags);
10951                              ApproxRcp, Fma0, Flags);
10957                              NumeratorScaled, Mul, Flags);
10963                              NumeratorScaled, Fma3, Flags);
10965   if (!PreservesDenormals) {
10973                             DisableDenormValue, Fma4.getValue(2))
10976     assert(HasDynamicDenormals == (bool)SavedDenormMode);
10977     const SDValue DisableDenormValue =
10978         HasDynamicDenormals
10983             AMDGPU::S_SETREG_B32, SL, MVT::Other,
10994                      {Fma4, Fma1, Fma3, Scale}, Flags);
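  // This is the standard f32 division expansion: div_scale moves both operands
  // into a safe exponent range, the reciprocal estimate of the scaled
  // denominator is refined with fused multiply-add Newton-Raphson steps
  // (Fma0/Fma1), the quotient estimate is corrected against the scaled
  // numerator (the Mul, Fma3 and Fma4 values above), and DIV_FMAS plus
  // DIV_FIXUP undo the scaling and patch up zeros, infinities and NaNs. The
  // S_SETREG / S_DENORM_MODE bracketing only exists to keep f32 denormals
  // enabled while the refinement runs, since the intermediate fmas rely on
  // denormal results, and to restore the function's original mode afterwards.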
11000 if (
SDValue FastLowered = lowerFastUnsafeFDIV64(
Op, DAG))
11001 return FastLowered;
11069 EVT VT =
Op.getValueType();
11071 if (VT == MVT::f32)
11072 return LowerFDIV32(
Op, DAG);
11074 if (VT == MVT::f64)
11075 return LowerFDIV64(
Op, DAG);
11077 if (VT == MVT::f16)
11078 return LowerFDIV16(
Op, DAG);
11087 EVT ResultExpVT =
Op->getValueType(1);
11088 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
11118 if (VT == MVT::i1) {
11122 Store->getBasePtr(), MVT::i1,
Store->getMemOperand());
11126 Store->getValue().getValueType().getScalarType() == MVT::i32);
11128 unsigned AS =
Store->getAddressSpace();
11147 if (NumElements > 4)
11154 VT, *
Store->getMemOperand()))
11164 if (NumElements > 2)
11168 if (NumElements > 4 ||
11177 auto Flags =
Store->getMemOperand()->getFlags();
11212 MVT VT =
Op.getValueType().getSimpleVT();
11383 EVT VT =
Op.getValueType();
11400 switch (
Op.getOpcode()) {
11427 EVT VT =
Op.getValueType();
11435 Op->getVTList(), Ops, VT,
11444 SITargetLowering::performUCharToFloatCombine(SDNode *N,
11445                                              DAGCombinerInfo &DCI) const {
11446   EVT VT = N->getValueType(0);
11448 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11455 EVT SrcVT = Src.getValueType();
11461 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11464 DCI.AddToWorklist(Cvt.
getNode());
11467 if (ScalarVT != MVT::f32) {
11479 DAGCombinerInfo &DCI)
const {
11480 SDValue MagnitudeOp =
N->getOperand(0);
11481 SDValue SignOp =
N->getOperand(1);
11537 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
11539                                                DAGCombinerInfo &DCI) const {
11569 AM.HasBaseReg =
true;
11570 AM.BaseOffs =
Offset.getSExtValue();
11575 EVT VT =
N->getValueType(0);
11581 Flags.setNoUnsignedWrap(
11582 N->getFlags().hasNoUnsignedWrap() &&
11592 switch (
N->getOpcode()) {
11603 DAGCombinerInfo &DCI)
const {
11612 SDValue NewPtr = performSHLPtrCombine(
Ptr.getNode(),
N->getAddressSpace(),
11613 N->getMemoryVT(), DCI);
11617 NewOps[PtrIdx] = NewPtr;
11626 return (Opc ==
ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11627 (Opc ==
ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11636 SDValue SITargetLowering::splitBinaryBitConstantOp(
11637     DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
11657 if (V.getValueType() != MVT::i1)
11659 switch (V.getOpcode()) {
11678 if (!(
C & 0x000000ff))
11679 ZeroByteMask |= 0x000000ff;
11680 if (!(
C & 0x0000ff00))
11681 ZeroByteMask |= 0x0000ff00;
11682 if (!(
C & 0x00ff0000))
11683 ZeroByteMask |= 0x00ff0000;
11684 if (!(
C & 0xff000000))
11685 ZeroByteMask |= 0xff000000;
11686 uint32_t NonZeroByteMask = ~ZeroByteMask;
11687 if ((NonZeroByteMask &
C) != NonZeroByteMask)
11700 assert(V.getValueSizeInBits() == 32);
11702 if (V.getNumOperands() != 2)
11711 switch (V.getOpcode()) {
11716 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
11721 return (0x03020100 & ~ConstMask) | ConstMask;
11728 return uint32_t((0x030201000c0c0c0cull <<
C) >> 32);
11734 return uint32_t(0x0c0c0c0c03020100ull >>
C);
11741 DAGCombinerInfo &DCI)
const {
11742 if (DCI.isBeforeLegalize())
11746 EVT VT =
N->getValueType(0);
11751 if (VT == MVT::i64 && CRHS) {
11757 if (CRHS && VT == MVT::i32) {
11766 if (
auto *CShift = dyn_cast<ConstantSDNode>(
LHS->getOperand(1))) {
11767 unsigned Shift = CShift->getZExtValue();
11769 unsigned Offset = NB + Shift;
11770 if ((
Offset & (Bits - 1)) == 0) {
11788 isa<ConstantSDNode>(
LHS.getOperand(2))) {
11794 Sel = (
LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
11809 if (
Y.getOpcode() !=
ISD::FABS ||
Y.getOperand(0) !=
X ||
11814 if (
X !=
LHS.getOperand(1))
11819 dyn_cast<ConstantFPSDNode>(
RHS.getOperand(1));
11852 (
RHS.getOperand(0) ==
LHS.getOperand(0) &&
11853 LHS.getOperand(0) ==
LHS.getOperand(1))) {
11855 unsigned NewMask = LCC ==
ISD::SETO ?
Mask->getZExtValue() & ~OrdMask
11856 :
Mask->getZExtValue() & OrdMask;
11877       N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
11880     if (LHSMask != ~0u && RHSMask != ~0u) {
11883       if (LHSMask > RHSMask) {
11890       uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11891       uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11894       if (!(LHSUsedLanes & RHSUsedLanes) &&
11897           !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
11904         for (unsigned I = 0; I < 32; I += 8) {
11906           if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
11907             Mask &= (0x0c << I) & 0xffffffff;
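          // v_perm_b32 background for this combine: each byte of the mask
          // selects one byte of the result, with values 0-7 indexing into the
          // eight bytes of the two source registers and the value 0x0c
          // producing a constant zero byte. A mask of 0x0c0c0c0c therefore
          // means "all zero", and the LHSUsedLanes / RHSUsedLanes computations
          // above mark which result bytes each existing perm actually
          // populates, so the two masks are merged only when those byte sets
          // are disjoint.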
11965 static const std::optional<ByteProvider<SDValue>>
11967                  unsigned Depth = 0) {
11970 return std::nullopt;
11972 if (
Op.getValueSizeInBits() < 8)
11973 return std::nullopt;
11975 if (
Op.getValueType().isVector())
11978 switch (
Op->getOpcode()) {
11989 auto *VTSign = cast<VTSDNode>(
Op->getOperand(1));
11990 NarrowVT = VTSign->getVT();
11993 return std::nullopt;
11996 if (SrcIndex >= NarrowByteWidth)
11997 return std::nullopt;
12003 auto *ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12005 return std::nullopt;
12007 uint64_t BitShift = ShiftOp->getZExtValue();
12009 if (BitShift % 8 != 0)
12010 return std::nullopt;
12012 SrcIndex += BitShift / 8;
12030 static const std::optional<ByteProvider<SDValue>>
12032                     unsigned StartingIndex = 0) {
12036 return std::nullopt;
12038 unsigned BitWidth =
Op.getScalarValueSizeInBits();
12040 return std::nullopt;
12042 return std::nullopt;
12044 bool IsVec =
Op.getValueType().isVector();
12045 switch (
Op.getOpcode()) {
12048 return std::nullopt;
12053 return std::nullopt;
12057 return std::nullopt;
12060 if (!
LHS->isConstantZero() && !
RHS->isConstantZero())
12061 return std::nullopt;
12062 if (!
LHS ||
LHS->isConstantZero())
12064 if (!
RHS ||
RHS->isConstantZero())
12066 return std::nullopt;
12071 return std::nullopt;
12073 auto *BitMaskOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12075 return std::nullopt;
12077 uint32_t BitMask = BitMaskOp->getZExtValue();
12079 uint32_t IndexMask = 0xFF << (Index * 8);
12081 if ((IndexMask & BitMask) != IndexMask) {
12084 if (IndexMask & BitMask)
12085 return std::nullopt;
12094 return std::nullopt;
12097 auto *ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(2));
12098 if (!ShiftOp ||
Op.getValueType().isVector())
12099 return std::nullopt;
12101 uint64_t BitsProvided =
Op.getValueSizeInBits();
12102 if (BitsProvided % 8 != 0)
12103 return std::nullopt;
12105 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
12107 return std::nullopt;
12109 uint64_t ConcatSizeInBytes = BitsProvided / 4;
12110 uint64_t ByteShift = BitShift / 8;
12112 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
12113 uint64_t BytesProvided = BitsProvided / 8;
12114 SDValue NextOp =
Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
12115 NewIndex %= BytesProvided;
12122 return std::nullopt;
12124 auto *ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12126 return std::nullopt;
12128 uint64_t BitShift = ShiftOp->getZExtValue();
12130 return std::nullopt;
12132 auto BitsProvided =
Op.getScalarValueSizeInBits();
12133 if (BitsProvided % 8 != 0)
12134 return std::nullopt;
12136 uint64_t BytesProvided = BitsProvided / 8;
12137 uint64_t ByteShift = BitShift / 8;
12142 return BytesProvided - ByteShift > Index
12150 return std::nullopt;
12152 auto *ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12154 return std::nullopt;
12156 uint64_t BitShift = ShiftOp->getZExtValue();
12157 if (BitShift % 8 != 0)
12158 return std::nullopt;
12159 uint64_t ByteShift = BitShift / 8;
12165 return Index < ByteShift
12168 Depth + 1, StartingIndex);
12177 return std::nullopt;
12184 auto *VTSign = cast<VTSDNode>(
Op->getOperand(1));
12185 NarrowBitWidth = VTSign->getVT().getSizeInBits();
12187 if (NarrowBitWidth % 8 != 0)
12188 return std::nullopt;
12189 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12191 if (Index >= NarrowByteWidth)
12193 ? std::optional<ByteProvider<SDValue>>(
12201 return std::nullopt;
12205 if (NarrowByteWidth >= Index) {
12210 return std::nullopt;
12217 return std::nullopt;
12221 auto *L = cast<LoadSDNode>(
Op.getNode());
12223 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
12224 if (NarrowBitWidth % 8 != 0)
12225 return std::nullopt;
12226 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12231 if (Index >= NarrowByteWidth) {
12233 ? std::optional<ByteProvider<SDValue>>(
12238 if (NarrowByteWidth > Index) {
12242 return std::nullopt;
12247 return std::nullopt;
12250 Depth + 1, StartingIndex);
12254 auto *IdxOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12256 return std::nullopt;
12257 auto VecIdx = IdxOp->getZExtValue();
12258 auto ScalarSize =
Op.getScalarValueSizeInBits();
12259 if (ScalarSize < 32)
12260 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
12262 StartingIndex, Index);
12267 return std::nullopt;
12269 auto *PermMask = dyn_cast<ConstantSDNode>(
Op->getOperand(2));
12271 return std::nullopt;
12274 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
12275 if (IdxMask > 0x07 && IdxMask != 0x0c)
12276 return std::nullopt;
12278 auto NextOp =
Op.getOperand(IdxMask > 0x03 ? 0 : 1);
12279 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12281 return IdxMask != 0x0c ?
calculateSrcByte(NextOp, StartingIndex, NextIndex)
12287 return std::nullopt;
12302 return !OpVT.
isVector() && OpVT.getSizeInBits() == 16;
12306 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12309 auto MemVT = L->getMemoryVT();
12312 return L->getMemoryVT().getSizeInBits() == 16;
12322 int Low8 = Mask & 0xff;
12323 int Hi8 = (Mask & 0xff00) >> 8;
12325 assert(Low8 < 8 && Hi8 < 8);
12327 bool IsConsecutive = (Hi8 - Low8 == 1);
12332 bool Is16Aligned = !(Low8 % 2);
12334 return IsConsecutive && Is16Aligned;
12342 int Low16 = PermMask & 0xffff;
12343 int Hi16 = (PermMask & 0xffff0000) >> 16;
12353 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12355 if (!OtherOpIs16Bit)
12363 unsigned DWordOffset) {
12366 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12368 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12373 if (Src.getValueType().isVector()) {
12374 auto ScalarTySize = Src.getScalarValueSizeInBits();
12375 auto ScalarTy = Src.getValueType().getScalarType();
12376 if (ScalarTySize == 32) {
12380 if (ScalarTySize > 32) {
12383 DAG.
getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12384 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12391 assert(ScalarTySize < 32);
12392 auto NumElements =
TypeSize / ScalarTySize;
12393 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12394 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12395 auto NumElementsIn32 = 32 / ScalarTySize;
12396 auto NumAvailElements = DWordOffset < Trunc32Elements
12398 : NumElements - NormalizedTrunc;
12411 auto ShiftVal = 32 * DWordOffset;
12419 [[maybe_unused]]
EVT VT =
N->getValueType(0);
12424 for (
int i = 0; i < 4; i++) {
12426 std::optional<ByteProvider<SDValue>>
P =
12429 if (!
P ||
P->isConstantZero())
12434 if (PermNodes.
size() != 4)
12437 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12438 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12440 for (
size_t i = 0; i < PermNodes.
size(); i++) {
12441 auto PermOp = PermNodes[i];
12444 int SrcByteAdjust = 4;
12448 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
12449 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12451 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12452 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12456 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12457 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12460 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12462 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12465 SDValue Op = *PermNodes[FirstSrc.first].Src;
12467 assert(
Op.getValueSizeInBits() == 32);
12471 int Low16 = PermMask & 0xffff;
12472 int Hi16 = (PermMask & 0xffff0000) >> 16;
12474 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
12475 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
12478 if (WellFormedLow && WellFormedHi)
12482 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src :
Op;
12491 assert(
Op.getValueType().isByteSized() &&
12509 DAGCombinerInfo &DCI)
const {
12514 EVT VT =
N->getValueType(0);
12515 if (VT == MVT::i1) {
12520 if (Src !=
RHS.getOperand(0))
12525 if (!CLHS || !CRHS)
12529 static const uint32_t MaxMask = 0x3ff;
12544 isa<ConstantSDNode>(
LHS.getOperand(2))) {
12549 Sel |=
LHS.getConstantOperandVal(2);
12558 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12562 auto usesCombinedOperand = [](
SDNode *OrUse) {
12565 !OrUse->getValueType(0).isVector())
12569 for (
auto *VUser : OrUse->users()) {
12570 if (!VUser->getValueType(0).isVector())
12577 if (VUser->getOpcode() == VectorwiseOp)
12583 if (!
any_of(
N->users(), usesCombinedOperand))
12589 if (LHSMask != ~0u && RHSMask != ~0u) {
12592 if (LHSMask > RHSMask) {
12599 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12600 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12603 if (!(LHSUsedLanes & RHSUsedLanes) &&
12606 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12608 LHSMask &= ~RHSUsedLanes;
12609 RHSMask &= ~LHSUsedLanes;
12611 LHSMask |= LHSUsedLanes & 0x04040404;
12621 if (LHSMask == ~0u || RHSMask == ~0u) {
12627 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12642 if (SrcVT == MVT::i32) {
12647 DCI.AddToWorklist(LowOr.
getNode());
12648 DCI.AddToWorklist(HiBits.getNode());
12656 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(
N->getOperand(1));
12659 N->getOperand(0), CRHS))
12667 DAGCombinerInfo &DCI)
const {
12668 if (
SDValue RV = reassociateScalarOps(
N, DCI.DAG))
12677 EVT VT =
N->getValueType(0);
12678 if (CRHS && VT == MVT::i64) {
12700 LHS->getOperand(0), FNegLHS, FNegRHS);
12709 DAGCombinerInfo &DCI)
const {
12714 EVT VT =
N->getValueType(0);
12715 if (VT != MVT::i32)
12719 if (Src.getValueType() != MVT::i16)
12726 SITargetLowering::performSignExtendInRegCombine(SDNode *N,
12727                                                 DAGCombinerInfo &DCI) const {
12729 auto *VTSign = cast<VTSDNode>(
N->getOperand(1));
12734 VTSign->getVT() == MVT::i8) ||
12736 VTSign->getVT() == MVT::i16))) {
12738 "s_buffer_load_{u8, i8} are supported "
12739 "in GFX12 (or newer) architectures.");
12740 EVT VT = Src.getValueType();
12745 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12751 auto *
M = cast<MemSDNode>(Src);
12752 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12753 Opc,
DL, ResList, Ops,
M->getMemoryVT(),
M->getMemOperand());
12758 VTSign->getVT() == MVT::i8) ||
12760 VTSign->getVT() == MVT::i16)) &&
12762 auto *
M = cast<MemSDNode>(Src);
12763 SDValue Ops[] = {Src.getOperand(0),
12769 Src.getOperand(6), Src.getOperand(7)};
12772 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
12776 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
12777 Opc,
SDLoc(
N), ResList, Ops,
M->getMemoryVT(),
M->getMemOperand());
12778 return DCI.DAG.getMergeValues(
12785 DAGCombinerInfo &DCI)
const {
12793 if (
N->getOperand(0).isUndef())
12800 DAGCombinerInfo &DCI)
const {
12801 EVT VT =
N->getValueType(0);
12826 unsigned MaxDepth)
const {
12827 unsigned Opcode =
Op.getOpcode();
12831 if (
auto *CFP = dyn_cast<ConstantFPSDNode>(
Op)) {
12832 const auto &
F = CFP->getValueAPF();
12833 if (
F.isNaN() &&
F.isSignaling())
12835 if (!
F.isDenormal())
12898 if (
Op.getValueType() == MVT::i32) {
12903 if (
auto *
RHS = dyn_cast<ConstantSDNode>(
Op.getOperand(1))) {
12904 if (
RHS->getZExtValue() == 0xffff0000) {
12914 return Op.getValueType().getScalarType() != MVT::f16;
12982 if (
Op.getValueType() == MVT::i16) {
12993 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
12995 switch (IntrinsicID) {
12996 case Intrinsic::amdgcn_cvt_pkrtz:
12997 case Intrinsic::amdgcn_cubeid:
12998 case Intrinsic::amdgcn_frexp_mant:
12999 case Intrinsic::amdgcn_fdot2:
13000 case Intrinsic::amdgcn_rcp:
13001 case Intrinsic::amdgcn_rsq:
13002 case Intrinsic::amdgcn_rsq_clamp:
13003 case Intrinsic::amdgcn_rcp_legacy:
13004 case Intrinsic::amdgcn_rsq_legacy:
13005 case Intrinsic::amdgcn_trig_preop:
13006 case Intrinsic::amdgcn_log:
13007 case Intrinsic::amdgcn_exp2:
13008 case Intrinsic::amdgcn_sqrt:
13026 unsigned MaxDepth)
const {
13029 unsigned Opcode =
MI->getOpcode();
13031 if (Opcode == AMDGPU::G_FCANONICALIZE)
13034 std::optional<FPValueAndVReg> FCR;
13037 if (FCR->Value.isSignaling())
13039 if (!FCR->Value.isDenormal())
13050 case AMDGPU::G_FADD:
13051 case AMDGPU::G_FSUB:
13052 case AMDGPU::G_FMUL:
13053 case AMDGPU::G_FCEIL:
13054 case AMDGPU::G_FFLOOR:
13055 case AMDGPU::G_FRINT:
13056 case AMDGPU::G_FNEARBYINT:
13057 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
13058 case AMDGPU::G_INTRINSIC_TRUNC:
13059 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
13060 case AMDGPU::G_FMA:
13061 case AMDGPU::G_FMAD:
13062 case AMDGPU::G_FSQRT:
13063 case AMDGPU::G_FDIV:
13064 case AMDGPU::G_FREM:
13065 case AMDGPU::G_FPOW:
13066 case AMDGPU::G_FPEXT:
13067 case AMDGPU::G_FLOG:
13068 case AMDGPU::G_FLOG2:
13069 case AMDGPU::G_FLOG10:
13070 case AMDGPU::G_FPTRUNC:
13071 case AMDGPU::G_AMDGPU_RCP_IFLAG:
13072 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
13073 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
13074 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
13075 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
13077 case AMDGPU::G_FNEG:
13078 case AMDGPU::G_FABS:
13079 case AMDGPU::G_FCOPYSIGN:
13081 case AMDGPU::G_FMINNUM:
13082 case AMDGPU::G_FMAXNUM:
13083 case AMDGPU::G_FMINNUM_IEEE:
13084 case AMDGPU::G_FMAXNUM_IEEE:
13085 case AMDGPU::G_FMINIMUM:
13086 case AMDGPU::G_FMAXIMUM: {
13094 case AMDGPU::G_BUILD_VECTOR:
13099 case AMDGPU::G_INTRINSIC:
13100 case AMDGPU::G_INTRINSIC_CONVERGENT:
13102 case Intrinsic::amdgcn_fmul_legacy:
13103 case Intrinsic::amdgcn_fmad_ftz:
13104 case Intrinsic::amdgcn_sqrt:
13105 case Intrinsic::amdgcn_fmed3:
13106 case Intrinsic::amdgcn_sin:
13107 case Intrinsic::amdgcn_cos:
13108 case Intrinsic::amdgcn_log:
13109 case Intrinsic::amdgcn_exp2:
13110 case Intrinsic::amdgcn_log_clamp:
13111 case Intrinsic::amdgcn_rcp:
13112 case Intrinsic::amdgcn_rcp_legacy:
13113 case Intrinsic::amdgcn_rsq:
13114 case Intrinsic::amdgcn_rsq_clamp:
13115 case Intrinsic::amdgcn_rsq_legacy:
13116 case Intrinsic::amdgcn_div_scale:
13117 case Intrinsic::amdgcn_div_fmas:
13118 case Intrinsic::amdgcn_div_fixup:
13119 case Intrinsic::amdgcn_fract:
13120 case Intrinsic::amdgcn_cvt_pkrtz:
13121 case Intrinsic::amdgcn_cubeid:
13122 case Intrinsic::amdgcn_cubema:
13123 case Intrinsic::amdgcn_cubesc:
13124 case Intrinsic::amdgcn_cubetc:
13125 case Intrinsic::amdgcn_frexp_mant:
13126 case Intrinsic::amdgcn_fdot2:
13127 case Intrinsic::amdgcn_trig_preop:
13146 if (
C.isDenormal()) {
13160 if (
C.isSignaling()) {
13179 return Op.isUndef() || isa<ConstantFPSDNode>(
Op);
13183 SITargetLowering::performFCanonicalizeCombine(SDNode *N,
13184                                               DAGCombinerInfo &DCI) const {
13187 EVT VT =
N->getValueType(0);
13196 EVT VT =
N->getValueType(0);
13197 return getCanonicalConstantFP(DAG,
SDLoc(
N), VT, CFP->getValueAPF());
13213 EVT EltVT =
Lo.getValueType();
13216 for (
unsigned I = 0;
I != 2; ++
I) {
13220 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
13221 }
else if (
Op.isUndef()) {
13233 if (isa<ConstantFPSDNode>(NewElts[1]))
13234 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
13240 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
13292 if (!MinK || !MaxK)
13305 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->
hasMed3_16()))
13306 return DAG.
getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
13347 if (
Info->getMode().DX10Clamp) {
13356 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->
hasMed3_16())) {
13388 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.
hasMin3Max3_16());
13397 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.
hasMin3Max3_16());
13406 DAGCombinerInfo &DCI)
const {
13409 EVT VT =
N->getValueType(0);
13410 unsigned Opc =
N->getOpcode();
13439 if (
SDValue Med3 = performIntMed3ImmCombine(
13444 if (
SDValue Med3 = performIntMed3ImmCombine(
13450 if (
SDValue Med3 = performIntMed3ImmCombine(
13455 if (
SDValue Med3 = performIntMed3ImmCombine(
13465 (VT == MVT::f32 || VT == MVT::f64 ||
13469 if (
SDValue Res = performFPMed3ImmCombine(DAG,
SDLoc(
N), Op0, Op1))
13480 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13481 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
13490 DAGCombinerInfo &DCI)
const {
13491 EVT VT =
N->getValueType(0);
13514 if (
Info->getMode().DX10Clamp) {
13517 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13520 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
13523 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13534 DAGCombinerInfo &DCI)
const {
13538 return DCI.DAG.getUNDEF(
N->getValueType(0));
13546 bool IsDivergentIdx,
13551 unsigned VecSize = EltSize * NumElem;
13554 if (VecSize <= 64 && EltSize < 32)
13563 if (IsDivergentIdx)
13567 unsigned NumInsts = NumElem +
13568 ((EltSize + 31) / 32) * NumElem ;
13573 return NumInsts <= 16;
13578 return NumInsts <= 15;
13585 if (isa<ConstantSDNode>(
Idx))
13599 SITargetLowering::performExtractVectorEltCombine(SDNode *N,
13600                                                  DAGCombinerInfo &DCI) const {
13606 EVT ResVT =
N->getValueType(0);
13625 if (Vec.
hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13653 DCI.AddToWorklist(Elt0.
getNode());
13654 DCI.AddToWorklist(Elt1.
getNode());
13676 if (!DCI.isBeforeLegalize())
13682 auto *
Idx = dyn_cast<ConstantSDNode>(
N->getOperand(1));
13683 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.
isByteSized() &&
13684 VecSize > 32 && VecSize % 32 == 0 &&
Idx) {
13687 unsigned BitIndex =
Idx->getZExtValue() * VecEltSize;
13688 unsigned EltIdx = BitIndex / 32;
13689 unsigned LeftoverBitIdx = BitIndex % 32;
13693 DCI.AddToWorklist(Cast.
getNode());
13697 DCI.AddToWorklist(Elt.
getNode());
13700 DCI.AddToWorklist(Srl.
getNode());
13704 DCI.AddToWorklist(Trunc.
getNode());
13706 if (VecEltVT == ResVT) {
13718SITargetLowering::performInsertVectorEltCombine(
SDNode *
N,
13719 DAGCombinerInfo &DCI)
const {
13733 EVT IdxVT =
Idx.getValueType();
13750 Src.getOperand(0).getValueType() == MVT::f16) {
13751 return Src.getOperand(0);
13754 if (
auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13755 APFloat Val = CFP->getValueAPF();
13756 bool LosesInfo =
true;
13766 DAGCombinerInfo &DCI)
const {
13768 "combine only useful on gfx8");
13770 SDValue TruncSrc =
N->getOperand(0);
13771 EVT VT =
N->getValueType(0);
13772 if (VT != MVT::f16)
13810unsigned SITargetLowering::getFusedOpcode(
const SelectionDAG &DAG,
13812 const SDNode *N1)
const {
13817 if (((VT == MVT::f32 &&
13819 (VT == MVT::f16 && Subtarget->
hasMadF16() &&
13839 EVT VT =
N->getValueType(0);
13840 if (VT != MVT::i32 && VT != MVT::i64)
13846 unsigned Opc =
N->getOpcode();
13869 return DAG.
getNode(Opc, SL, VT, Add1, Op2);
13889 DAGCombinerInfo &DCI)
const {
13893 EVT VT =
N->getValueType(0);
13903 if (!
N->isDivergent() && Subtarget->
hasSMulHi())
13907 if (NumBits <= 32 || NumBits > 64)
13919 unsigned NumUsers = 0;
13944 bool MulSignedLo =
false;
13945 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
13954 if (VT != MVT::i64) {
13977 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
13979 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
13980 auto [AccumLo, AccumHi] = DAG.
SplitScalar(Accum, SL, MVT::i32, MVT::i32);
13982 if (!MulLHSUnsigned32) {
13989 if (!MulRHSUnsigned32) {
14000 if (VT != MVT::i64)
14006SITargetLowering::foldAddSub64WithZeroLowBitsTo32(
SDNode *
N,
14007 DAGCombinerInfo &DCI)
const {
14009 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14033 DAG.
getNode(
N->getOpcode(), SL, MVT::i32,
Hi, ConstHi32,
N->getFlags());
14044static std::optional<ByteProvider<SDValue>>
14047 if (!Byte0 || Byte0->isConstantZero()) {
14048 return std::nullopt;
14051 if (Byte1 && !Byte1->isConstantZero()) {
14052 return std::nullopt;
14058 unsigned FirstCs =
First & 0x0c0c0c0c;
14059 unsigned SecondCs = Second & 0x0c0c0c0c;
14060 unsigned FirstNoCs =
First & ~0x0c0c0c0c;
14061 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
14063 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
14064 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
14065 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
14066 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
14068 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
14092 for (
int BPI = 0; BPI < 2; BPI++) {
14095 BPP = {Src1, Src0};
14097 unsigned ZeroMask = 0x0c0c0c0c;
14098 unsigned FMask = 0xFF << (8 * (3 - Step));
14100 unsigned FirstMask =
14101 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14102 unsigned SecondMask =
14103 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14107 int FirstGroup = -1;
14108 for (
int I = 0;
I < 2;
I++) {
14110 auto MatchesFirst = [&BPP](
DotSrc &IterElt) {
14111 return IterElt.SrcOp == *BPP.first.Src &&
14112 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
14122 if (FirstGroup != -1) {
14124 auto MatchesSecond = [&BPP](
DotSrc &IterElt) {
14125 return IterElt.SrcOp == *BPP.second.Src &&
14126 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
14132 Srcs.
push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
14140 unsigned ZeroMask = 0x0c0c0c0c;
14141 unsigned FMask = 0xFF << (8 * (3 - Step));
14145 ((Src0.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14149 ((Src1.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14158 if (Srcs.
size() == 1) {
14159 auto *Elt = Srcs.
begin();
14163 if (Elt->PermMask == 0x3020100)
14170 auto *FirstElt = Srcs.
begin();
14171 auto *SecondElt = std::next(FirstElt);
14178 auto FirstMask = FirstElt->PermMask;
14179 auto SecondMask = SecondElt->PermMask;
14181 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
14182 unsigned FirstPlusFour = FirstMask | 0x04040404;
14185 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
14197 FirstElt = std::next(SecondElt);
14198 if (FirstElt == Srcs.
end())
14201 SecondElt = std::next(FirstElt);
14204 if (SecondElt == Srcs.
end()) {
14210 DAG.
getConstant(FirstElt->PermMask, SL, MVT::i32)));
14216 return Perms.
size() == 2
14222 for (
auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
14223 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
14224 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
14225 EntryMask += ZeroMask;
14230 auto Opcode =
Op.getOpcode();
14236static std::optional<bool>
14247 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
14250 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
14252 assert(!(S0IsUnsigned && S0IsSigned));
14253 assert(!(S1IsUnsigned && S1IsSigned));
14261 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
14267 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14268 return std::nullopt;
14280 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14281 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14286 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14292 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14293 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14294 return std::nullopt;
14300 DAGCombinerInfo &DCI)
const {
14302 EVT VT =
N->getValueType(0);
14309 if (
SDValue Folded = tryFoldToMad64_32(
N, DCI))
14314 if (
SDValue V = reassociateScalarOps(
N, DAG)) {
14318 if (VT == MVT::i64) {
14319 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
14326 std::optional<bool> IsSigned;
14332 int ChainLength = 0;
14333 for (
int I = 0;
I < 4;
I++) {
14334 auto MulIdx =
isMul(LHS) ? 0 :
isMul(RHS) ? 1 : -1;
14337 auto Src0 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14340 auto Src1 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14345 TempNode->getOperand(MulIdx), *Src0, *Src1,
14346 TempNode->getOperand(MulIdx)->getOperand(0),
14347 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14351 IsSigned = *IterIsSigned;
14352 if (*IterIsSigned != *IsSigned)
14355 auto AddIdx = 1 - MulIdx;
14358 if (
I == 2 &&
isMul(TempNode->getOperand(AddIdx))) {
14359 Src2s.
push_back(TempNode->getOperand(AddIdx));
14369 TempNode->getOperand(AddIdx), *Src0, *Src1,
14370 TempNode->getOperand(AddIdx)->getOperand(0),
14371 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14375 if (*IterIsSigned != *IsSigned)
14379 ChainLength =
I + 2;
14383 TempNode = TempNode->getOperand(AddIdx);
14385 ChainLength =
I + 1;
14386 if (TempNode->getNumOperands() < 2)
14388 LHS = TempNode->getOperand(0);
14389 RHS = TempNode->getOperand(1);
14392 if (ChainLength < 2)
14398 if (ChainLength < 4) {
14408 bool UseOriginalSrc =
false;
14409 if (ChainLength == 4 && Src0s.
size() == 1 && Src1s.
size() == 1 &&
14410 Src0s.
begin()->PermMask == Src1s.
begin()->PermMask &&
14411 Src0s.
begin()->SrcOp.getValueSizeInBits() >= 32 &&
14412 Src1s.
begin()->SrcOp.getValueSizeInBits() >= 32) {
14414 auto Src0Mask = Src0s.
begin()->PermMask;
14415 SrcBytes.
push_back(Src0Mask & 0xFF000000);
14416 bool UniqueEntries =
true;
14417 for (
auto I = 1;
I < 4;
I++) {
14418 auto NextByte = Src0Mask & (0xFF << ((3 -
I) * 8));
14421 UniqueEntries =
false;
14427 if (UniqueEntries) {
14428 UseOriginalSrc =
true;
14430 auto *FirstElt = Src0s.
begin();
14434 auto *SecondElt = Src1s.
begin();
14436 SecondElt->DWordOffset);
14445 if (!UseOriginalSrc) {
14452 DAG.
getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14455 : Intrinsic::amdgcn_udot4,
14465 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14470 unsigned Opc =
LHS.getOpcode();
14475 Opc =
RHS.getOpcode();
14482 auto Cond =
RHS.getOperand(0);
14490 return DAG.
getNode(Opc, SL, VTList, Args);
14504 DAGCombinerInfo &DCI)
const {
14506 EVT VT =
N->getValueType(0);
14508 if (VT == MVT::i64) {
14509 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
14513 if (VT != MVT::i32)
14522 unsigned Opc =
RHS.getOpcode();
14529 auto Cond =
RHS.getOperand(0);
14537 return DAG.
getNode(Opc, SL, VTList, Args);
14552SITargetLowering::performAddCarrySubCarryCombine(
SDNode *
N,
14553 DAGCombinerInfo &DCI)
const {
14555 if (
N->getValueType(0) != MVT::i32)
14566 unsigned LHSOpc =
LHS.getOpcode();
14567 unsigned Opc =
N->getOpcode();
14577 DAGCombinerInfo &DCI)
const {
14582 EVT VT =
N->getValueType(0);
14594 if (
A ==
LHS.getOperand(1)) {
14595 unsigned FusedOp = getFusedOpcode(DAG,
N,
LHS.getNode());
14596 if (FusedOp != 0) {
14598 return DAG.
getNode(FusedOp, SL, VT,
A, Two, RHS);
14606 if (
A ==
RHS.getOperand(1)) {
14607 unsigned FusedOp = getFusedOpcode(DAG,
N,
RHS.getNode());
14608 if (FusedOp != 0) {
14610 return DAG.
getNode(FusedOp, SL, VT,
A, Two, LHS);
14619 DAGCombinerInfo &DCI)
const {
14625 EVT VT =
N->getValueType(0);
14638 if (
A ==
LHS.getOperand(1)) {
14639 unsigned FusedOp = getFusedOpcode(DAG,
N,
LHS.getNode());
14640 if (FusedOp != 0) {
14644 return DAG.
getNode(FusedOp, SL, VT,
A, Two, NegRHS);
14653 if (
A ==
RHS.getOperand(1)) {
14654 unsigned FusedOp = getFusedOpcode(DAG,
N,
RHS.getNode());
14655 if (FusedOp != 0) {
14657 return DAG.
getNode(FusedOp, SL, VT,
A, NegTwo, LHS);
14666 DAGCombinerInfo &DCI)
const {
14669 EVT VT =
N->getValueType(0);
14683 bool IsNegative =
false;
14684 if (CLHS->isExactlyValue(1.0) ||
14685 (IsNegative = CLHS->isExactlyValue(-1.0))) {
14701 DAGCombinerInfo &DCI)
const {
14703 EVT VT =
N->getValueType(0);
14717 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
14732 if (ScalarVT == MVT::f32 &&
14738 if (TrueNodeExpVal == INT_MIN)
14741 if (FalseNodeExpVal == INT_MIN)
14761 DAGCombinerInfo &DCI)
const {
14763 EVT VT =
N->getValueType(0);
14784 (
N->getFlags().hasAllowContract() &&
14785 FMA->getFlags().hasAllowContract())) {
14819 if (Vec1 == Vec2 || Vec3 == Vec4)
14825 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
14834 DAGCombinerInfo &DCI)
const {
14840 EVT VT =
LHS.getValueType();
14843 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14845 CRHS = dyn_cast<ConstantSDNode>(LHS);
14869 return LHS.getOperand(0);
14875 isa<ConstantSDNode>(
LHS.getOperand(1)) &&
14876 isa<ConstantSDNode>(
LHS.getOperand(2)) &&
14877 LHS.getConstantOperandVal(1) !=
LHS.getConstantOperandVal(2) &&
14884 const APInt &CT =
LHS.getConstantOperandAPInt(1);
14885 const APInt &CF =
LHS.getConstantOperandAPInt(2);
14893 return LHS.getOperand(0);
14897 if (VT != MVT::f32 && VT != MVT::f64 &&
14913 const unsigned IsInfMask =
14915 const unsigned IsFiniteMask =
14929SITargetLowering::performCvtF32UByteNCombine(
SDNode *
N,
14930 DAGCombinerInfo &DCI)
const {
14948 if (
auto *
C = dyn_cast<ConstantSDNode>(Shift.
getOperand(1))) {
14952 unsigned ShiftOffset = 8 *
Offset;
14954 ShiftOffset -=
C->getZExtValue();
14956 ShiftOffset +=
C->getZExtValue();
14958 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
14960 MVT::f32, Shifted);
14971 DCI.AddToWorklist(
N);
14978 return DAG.
getNode(
N->getOpcode(), SL, MVT::f32, DemandedSrc);
14984 DAGCombinerInfo &DCI)
const {
14994 return DCI.DAG.getConstantFP(Zero,
SDLoc(
N),
N->getValueType(0));
14997 APFloat One(
F.getSemantics(),
"1.0");
14999 return DCI.DAG.getConstantFP(One,
SDLoc(
N),
N->getValueType(0));
15006 switch (
N->getOpcode()) {
15022 if (
auto Res = promoteUniformOpToI32(
SDValue(
N, 0), DCI))
15032 switch (
N->getOpcode()) {
15034 return performAddCombine(
N, DCI);
15036 return performSubCombine(
N, DCI);
15039 return performAddCarrySubCarryCombine(
N, DCI);
15041 return performFAddCombine(
N, DCI);
15043 return performFSubCombine(
N, DCI);
15045 return performFDivCombine(
N, DCI);
15047 return performFMulCombine(
N, DCI);
15049 return performSetCCCombine(
N, DCI);
15062 return performMinMaxCombine(
N, DCI);
15064 return performFMACombine(
N, DCI);
15066 return performAndCombine(
N, DCI);
15068 return performOrCombine(
N, DCI);
15071 if (
N->getValueType(0) == MVT::i32 &&
N->isDivergent() &&
15072 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
15078 return performXorCombine(
N, DCI);
15080 return performZeroExtendCombine(
N, DCI);
15082 return performSignExtendInRegCombine(
N, DCI);
15084 return performClassCombine(
N, DCI);
15086 return performFCanonicalizeCombine(
N, DCI);
15088 return performRcpCombine(
N, DCI);
15103 return performUCharToFloatCombine(
N, DCI);
15105 return performFCopySignCombine(
N, DCI);
15110 return performCvtF32UByteNCombine(
N, DCI);
15112 return performFMed3Combine(
N, DCI);
15114 return performCvtPkRTZCombine(
N, DCI);
15116 return performClampCombine(
N, DCI);
15119 EVT VT =
N->getValueType(0);
15122 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
15125 EVT EltVT = Src.getValueType();
15126 if (EltVT != MVT::i16)
15136 return performExtractVectorEltCombine(
N, DCI);
15138 return performInsertVectorEltCombine(
N, DCI);
15140 return performFPRoundCombine(
N, DCI);
15142 if (
SDValue Widened = widenLoad(cast<LoadSDNode>(
N), DCI))
15148 if (
MemSDNode *MemNode = dyn_cast<MemSDNode>(
N))
15149 return performMemSDNodeCombine(MemNode, DCI);
15180 unsigned Opcode =
Node->getMachineOpcode();
15184 if (D16Idx >= 0 &&
Node->getConstantOperandVal(D16Idx))
15189 unsigned DmaskIdx =
15191 unsigned OldDmask =
Node->getConstantOperandVal(DmaskIdx);
15192 unsigned NewDmask = 0;
15195 bool UsesTFC = ((int(TFEIdx) >= 0 &&
Node->getConstantOperandVal(TFEIdx)) ||
15196 (
int(LWEIdx) >= 0 &&
Node->getConstantOperandVal(LWEIdx)))
15199 unsigned TFCLane = 0;
15200 bool HasChain =
Node->getNumValues() > 1;
15202 if (OldDmask == 0) {
15210 TFCLane = OldBitsSet;
15217 if (
Use.getResNo() != 0)
15223 if (!
User->isMachineOpcode() ||
15224 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
15236 if (UsesTFC && Lane == TFCLane) {
15241 for (
unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
15243 Dmask &= ~(1 << Comp);
15251 NewDmask |= 1 << Comp;
15256 bool NoChannels = !NewDmask;
15263 if (OldBitsSet == 1)
15269 if (NewDmask == OldDmask)
15278 unsigned NewChannels = BitsSet + UsesTFC;
15282 assert(NewOpcode != -1 &&
15283 NewOpcode !=
static_cast<int>(
Node->getMachineOpcode()) &&
15284 "failed to find equivalent MIMG op");
15292 MVT SVT =
Node->getValueType(0).getVectorElementType().getSimpleVT();
15294 MVT ResultVT = NewChannels == 1
15297 : NewChannels == 5 ? 8
15311 if (NewChannels == 1) {
15321 for (
unsigned i = 0,
Idx = AMDGPU::sub0; i < 5; ++i) {
15326 if (i || !NoChannels)
15331 if (NewUser !=
User) {
15341 Idx = AMDGPU::sub1;
15344 Idx = AMDGPU::sub2;
15347 Idx = AMDGPU::sub3;
15350 Idx = AMDGPU::sub4;
15361 Op =
Op.getOperand(0);
15363 return isa<FrameIndexSDNode>(
Op);
15373 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
15374 SDValue SrcVal = Node->getOperand(2);
15382 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
15384 SDNode *Glued = Node->getGluedNode();
15386 Node->getOperand(0), SL, VReg, SrcVal,
15392 return ToResultReg.
getNode();
15397 for (
unsigned i = 0; i < Node->getNumOperands(); ++i) {
15405 Node->getOperand(i).getValueType(),
15406 Node->getOperand(i)),
15418 unsigned Opcode = Node->getMachineOpcode();
15420 if (
TII->isImage(Opcode) && !
TII->get(Opcode).mayStore() &&
15421 !
TII->isGather4(Opcode) &&
15423 return adjustWritemask(Node, DAG);
15426 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
15432 case AMDGPU::V_DIV_SCALE_F32_e64:
15433 case AMDGPU::V_DIV_SCALE_F64_e64: {
15437 SDValue Src0 = Node->getOperand(1);
15438 SDValue Src1 = Node->getOperand(3);
15439 SDValue Src2 = Node->getOperand(5);
15443 (Src0 == Src1 || Src0 == Src2))
15500 unsigned InitIdx = 0;
15502 if (
TII->isImage(
MI)) {
15510 unsigned TFEVal = TFE ? TFE->
getImm() : 0;
15511 unsigned LWEVal = LWE ? LWE->
getImm() : 0;
15512 unsigned D16Val = D16 ? D16->getImm() : 0;
15514 if (!TFEVal && !LWEVal)
15525 assert(MO_Dmask &&
"Expected dmask operand in instruction");
15527 unsigned dmask = MO_Dmask->
getImm();
15534 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
15540 TRI.getRegSizeInBits(*
TII->getOpRegClass(
MI, DstIdx)) / 32;
15541 if (DstSize < InitIdx)
15544 InitIdx =
TRI.getRegSizeInBits(*
TII->getOpRegClass(
MI, DstIdx)) / 32;
15552 Register PrevDst =
MRI.cloneVirtualRegister(
MI.getOperand(DstIdx).getReg());
15553 unsigned NewDst = 0;
15562 for (; SizeLeft; SizeLeft--, CurrIdx++) {
15563 NewDst =
MRI.createVirtualRegister(
TII->getOpRegClass(
MI, DstIdx));
15583 MI.tieOperands(DstIdx,
MI.getNumOperands() - 1);
15596 if (
TII->isVOP3(
MI.getOpcode())) {
15598 TII->legalizeOperandsVOP3(
MRI,
MI);
15603 if (!
MI.getDesc().operands().empty()) {
15604 unsigned Opc =
MI.getOpcode();
15605 bool HasAGPRs =
Info->mayNeedAGPRs();
15613 if ((
I == Src2Idx) && (HasAGPRs))
15616 if (!
Op.isReg() || !
Op.getReg().isVirtual())
15618 auto *RC =
TRI->getRegClassForReg(
MRI,
Op.getReg());
15619 if (!
TRI->hasAGPRs(RC))
15621 auto *Src =
MRI.getUniqueVRegDef(
Op.getReg());
15622 if (!Src || !Src->isCopy() ||
15623 !
TRI->isSGPRReg(
MRI, Src->getOperand(1).getReg()))
15625 auto *NewRC =
TRI->getEquivalentVGPRClass(RC);
15629 MRI.setRegClass(
Op.getReg(), NewRC);
15632 if (
TII->isMAI(
MI)) {
15638 AMDGPU::OpName::scale_src0);
15639 if (Src0Idx != -1) {
15641 AMDGPU::OpName::scale_src1);
15642 if (
TII->usesConstantBus(
MRI,
MI, Src0Idx) &&
15643 TII->usesConstantBus(
MRI,
MI, Src1Idx))
15644 TII->legalizeOpWithMove(
MI, Src1Idx);
15652 if (
auto *Src2 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src2)) {
15653 if (Src2->isReg() && Src2->getReg().isVirtual()) {
15654 auto *RC =
TRI->getRegClassForReg(
MRI, Src2->getReg());
15655 if (
TRI->isVectorSuperClass(RC)) {
15656 auto *NewRC =
TRI->getEquivalentAGPRClass(RC);
15657 MRI.setRegClass(Src2->getReg(), NewRC);
15658 if (Src2->isTied())
15659 MRI.setRegClass(
MI.getOperand(0).getReg(), NewRC);
15668 if (
TII->isImage(
MI))
15669 TII->enforceOperandRCAlignment(
MI, AMDGPU::OpName::vaddr);
15743std::pair<unsigned, const TargetRegisterClass *>
15750 if (Constraint.
size() == 1) {
15752 switch (Constraint[0]) {
15759 RC = &AMDGPU::SReg_32RegClass;
15762 RC = &AMDGPU::SGPR_64RegClass;
15767 return std::pair(0U,
nullptr);
15774 RC = &AMDGPU::VGPR_32RegClass;
15779 return std::pair(0U,
nullptr);
15788 RC = &AMDGPU::AGPR_32RegClass;
15793 return std::pair(0U,
nullptr);
15802 return std::pair(0U, RC);
15807 if (
RegName.consume_front(
"v")) {
15808 RC = &AMDGPU::VGPR_32RegClass;
15809 }
else if (
RegName.consume_front(
"s")) {
15810 RC = &AMDGPU::SGPR_32RegClass;
15811 }
else if (
RegName.consume_front(
"a")) {
15812 RC = &AMDGPU::AGPR_32RegClass;
15817 if (
RegName.consume_front(
"[")) {
15828 return std::pair(0U,
nullptr);
15831 RC =
TRI->getVGPRClassForBitWidth(Width);
15833 RC =
TRI->getSGPRClassForBitWidth(Width);
15835 RC =
TRI->getAGPRClassForBitWidth(Width);
15837 Reg =
TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15838 return std::pair(Reg, RC);
15844 return std::pair(0U,
nullptr);
15846 if (!
Failed && Idx < RC->getNumRegs())
15854 Ret.second =
TRI->getPhysRegBaseClass(Ret.first);
15860 if (Constraint.
size() == 1) {
15861 switch (Constraint[0]) {
15871 }
else if (Constraint ==
"DA" || Constraint ==
"DB") {
15879 if (Constraint.
size() == 1) {
15880 switch (Constraint[0]) {
15897 Val = Val & maskTrailingOnes<uint64_t>(
Size);
15904 std::vector<SDValue> &Ops,
15919 unsigned Size =
Op.getScalarValueSizeInBits();
15927 Val =
C->getSExtValue();
15931 Val =
C->getValueAPF().bitcastToAPInt().getSExtValue();
15937 if (
Op.getOperand(0).isUndef() ||
Op.getOperand(1).isUndef())
15940 Val =
C->getSExtValue();
15944 Val =
C->getValueAPF().bitcastToAPInt().getSExtValue();
15954 if (Constraint.
size() == 1) {
15955 switch (Constraint[0]) {
15959 return isInt<16>(Val);
15963 return isInt<32>(Val);
15970 }
else if (Constraint.
size() == 2) {
15971 if (Constraint ==
"DA") {
15972 int64_t HiBits =
static_cast<int32_t
>(Val >> 32);
15973 int64_t LoBits =
static_cast<int32_t
>(Val);
15977 if (Constraint ==
"DB") {
15985 unsigned MaxSize)
const {
15986 unsigned Size = std::min<unsigned>(
Op.getScalarValueSizeInBits(), MaxSize);
15989 MVT VT =
Op.getSimpleValueType();
16014 switch (UnalignedClassID) {
16015 case AMDGPU::VReg_64RegClassID:
16016 return AMDGPU::VReg_64_Align2RegClassID;
16017 case AMDGPU::VReg_96RegClassID:
16018 return AMDGPU::VReg_96_Align2RegClassID;
16019 case AMDGPU::VReg_128RegClassID:
16020 return AMDGPU::VReg_128_Align2RegClassID;
16021 case AMDGPU::VReg_160RegClassID:
16022 return AMDGPU::VReg_160_Align2RegClassID;
16023 case AMDGPU::VReg_192RegClassID:
16024 return AMDGPU::VReg_192_Align2RegClassID;
16025 case AMDGPU::VReg_224RegClassID:
16026 return AMDGPU::VReg_224_Align2RegClassID;
16027 case AMDGPU::VReg_256RegClassID:
16028 return AMDGPU::VReg_256_Align2RegClassID;
16029 case AMDGPU::VReg_288RegClassID:
16030 return AMDGPU::VReg_288_Align2RegClassID;
16031 case AMDGPU::VReg_320RegClassID:
16032 return AMDGPU::VReg_320_Align2RegClassID;
16033 case AMDGPU::VReg_352RegClassID:
16034 return AMDGPU::VReg_352_Align2RegClassID;
16035 case AMDGPU::VReg_384RegClassID:
16036 return AMDGPU::VReg_384_Align2RegClassID;
16037 case AMDGPU::VReg_512RegClassID:
16038 return AMDGPU::VReg_512_Align2RegClassID;
16039 case AMDGPU::VReg_1024RegClassID:
16040 return AMDGPU::VReg_1024_Align2RegClassID;
16041 case AMDGPU::AReg_64RegClassID:
16042 return AMDGPU::AReg_64_Align2RegClassID;
16043 case AMDGPU::AReg_96RegClassID:
16044 return AMDGPU::AReg_96_Align2RegClassID;
16045 case AMDGPU::AReg_128RegClassID:
16046 return AMDGPU::AReg_128_Align2RegClassID;
16047 case AMDGPU::AReg_160RegClassID:
16048 return AMDGPU::AReg_160_Align2RegClassID;
16049 case AMDGPU::AReg_192RegClassID:
16050 return AMDGPU::AReg_192_Align2RegClassID;
16051 case AMDGPU::AReg_256RegClassID:
16052 return AMDGPU::AReg_256_Align2RegClassID;
16053 case AMDGPU::AReg_512RegClassID:
16054 return AMDGPU::AReg_512_Align2RegClassID;
16055 case AMDGPU::AReg_1024RegClassID:
16056 return AMDGPU::AReg_1024_Align2RegClassID;
16072 if (
Info->isEntryFunction()) {
16079 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
16081 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
16082 :
TRI->getAlignedHighSGPRForRC(MF, 2,
16083 &AMDGPU::SGPR_64RegClass);
16084 Info->setSGPRForEXECCopy(SReg);
16087 Info->getStackPtrOffsetReg()));
16088 if (
Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
16089 MRI.replaceRegWith(AMDGPU::SP_REG,
Info->getStackPtrOffsetReg());
16093 if (
Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
16094 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG,
Info->getScratchRSrcReg());
16096 if (
Info->getFrameOffsetReg() != AMDGPU::FP_REG)
16097 MRI.replaceRegWith(AMDGPU::FP_REG,
Info->getFrameOffsetReg());
16099 Info->limitOccupancy(MF);
16101 if (ST.isWave32() && !MF.
empty()) {
16102 for (
auto &
MBB : MF) {
16103 for (
auto &
MI :
MBB) {
16104 TII->fixImplicitOperands(
MI);
16114 if (ST.needsAlignedVGPRs()) {
16115 for (
unsigned I = 0, E =
MRI.getNumVirtRegs();
I != E; ++
I) {
16121 if (NewClassID != -1)
16122 MRI.setRegClass(Reg,
TRI->getRegClass(NewClassID));
16131 const APInt &DemandedElts,
16133 unsigned Depth)
const {
16135 unsigned Opc =
Op.getOpcode();
16138 unsigned IID =
Op.getConstantOperandVal(0);
16140 case Intrinsic::amdgcn_mbcnt_lo:
16141 case Intrinsic::amdgcn_mbcnt_hi: {
16147 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
16157 Op, Known, DemandedElts, DAG,
Depth);
16172 unsigned MaxValue =
16181 switch (
MI->getOpcode()) {
16182 case AMDGPU::G_INTRINSIC:
16183 case AMDGPU::G_INTRINSIC_CONVERGENT: {
16186 case Intrinsic::amdgcn_workitem_id_x:
16189 case Intrinsic::amdgcn_workitem_id_y:
16192 case Intrinsic::amdgcn_workitem_id_z:
16195 case Intrinsic::amdgcn_mbcnt_lo:
16196 case Intrinsic::amdgcn_mbcnt_hi: {
16208 case Intrinsic::amdgcn_groupstaticsize: {
16219 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
16222 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
16225 case AMDGPU::G_AMDGPU_SMED3:
16226 case AMDGPU::G_AMDGPU_UMED3: {
16227 auto [Dst, Src0, Src1, Src2] =
MI->getFirst4Regs();
16254 unsigned Depth)
const {
16256 if (
auto *GI = dyn_cast<GIntrinsic>(
MI)) {
16262 if (
MaybeAlign RetAlign = Attrs.getRetAlignment())
16289 if (Header->getAlignment() != PrefAlign)
16290 return Header->getAlignment();
16292 unsigned LoopSize = 0;
16300 LoopSize +=
TII->getInstSizeInBytes(
MI);
16301 if (LoopSize > 192)
16306 if (LoopSize <= 64)
16309 if (LoopSize <= 128)
16310 return CacheLineAlign;
16316 auto I = Exit->getFirstNonDebugInstr();
16317 if (
I != Exit->end() &&
I->getOpcode() == AMDGPU::S_INST_PREFETCH)
16318 return CacheLineAlign;
16327 if (PreTerm == Pre->
begin() ||
16328 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
16332 auto ExitHead = Exit->getFirstNonDebugInstr();
16333 if (ExitHead == Exit->end() ||
16334 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
16339 return CacheLineAlign;
16347 N =
N->getOperand(0).getNode();
16357 switch (
N->getOpcode()) {
16365 if (Reg.isPhysical() ||
MRI.isLiveIn(Reg))
16366 return !
TRI->isSGPRReg(
MRI, Reg);
16368 if (
const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
16372 return !
TRI->isSGPRReg(
MRI, Reg);
16376 unsigned AS = L->getAddressSpace();
16407 if (
auto *
A = dyn_cast<AtomicSDNode>(
N)) {
16409 return A->readMem() &&
A->writeMem();
16444 unsigned Depth)
const {
16449 if (
Info->getMode().DX10Clamp)
16461 if (RMW->
hasMetadata(
"amdgpu.ignore.denormal.mode"))
16481 <<
"Hardware instruction generated for atomic "
16483 <<
" operation at memory scope " << MemScope;
16487 if (
auto *VT = dyn_cast<FixedVectorType>(Ty)) {
16488 Type *EltTy = VT->getElementType();
16489 return VT->getNumElements() == 2 &&
16508 if (
auto *
IT = dyn_cast<IntegerType>(Ty)) {
16509 unsigned BW =
IT->getBitWidth();
16510 return BW == 32 || BW == 64;
16522 if (
PointerType *PT = dyn_cast<PointerType>(Ty)) {
16524 unsigned BW =
DL.getPointerSizeInBits(PT->getAddressSpace());
16525 return BW == 32 || BW == 64;
16532 return VT->getNumElements() == 2 &&
16533 VT->getElementType()->getPrimitiveSizeInBits() == 16;
16543 bool HasSystemScope) {
16550 if (HasSystemScope) {
16557 return RMW->
hasMetadata(
"amdgpu.no.fine.grained.memory");
16570 const MDNode *NoaliasAddrSpaceMD =
16571 I->getMetadata(LLVMContext::MD_noalias_addrspace);
16572 if (!NoaliasAddrSpaceMD)
16575 for (
unsigned I = 0, E = NoaliasAddrSpaceMD->
getNumOperands() / 2;
I != E;
16577 auto *
Low = mdconst::extract<ConstantInt>(
16580 auto *
High = mdconst::extract<ConstantInt>(
16602 DL.getTypeSizeInBits(RMW->
getType()) == 64 &&
16615 bool HasSystemScope =
16802 if (HasSystemScope)
16854 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16855 return Subtarget->
isWave64() ? &AMDGPU::SReg_64RegClass
16856 : &AMDGPU::SReg_32RegClass;
16857 if (!
TRI->isSGPRClass(RC) && !isDivergent)
16858 return TRI->getEquivalentSGPRClass(RC);
16859 if (
TRI->isSGPRClass(RC) && isDivergent)
16860 return TRI->getEquivalentVGPRClass(RC);
16872 unsigned WaveSize) {
16877 if (!
IT ||
IT->getBitWidth() != WaveSize)
16880 if (!isa<Instruction>(V))
16882 if (!Visited.
insert(V).second)
16884 bool Result =
false;
16885 for (
const auto *U : V->users()) {
16886 if (
const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
16887 if (V == U->getOperand(1)) {
16888 switch (Intrinsic->getIntrinsicID()) {
16892 case Intrinsic::amdgcn_if_break:
16893 case Intrinsic::amdgcn_if:
16894 case Intrinsic::amdgcn_else:
16899 if (V == U->getOperand(0)) {
16900 switch (Intrinsic->getIntrinsicID()) {
16904 case Intrinsic::amdgcn_end_cf:
16905 case Intrinsic::amdgcn_loop:
16911 Result =
hasCFUser(U, Visited, WaveSize);
16920 const Value *V)
const {
16921 if (
const CallInst *CI = dyn_cast<CallInst>(V)) {
16922 if (CI->isInlineAsm()) {
16931 for (
auto &TC : TargetConstraints) {
16973 return MRI.hasOneNonDBGUse(N0);
16980 if (
I.getMetadata(
"amdgpu.noclobber"))
16982 if (
I.getMetadata(
"amdgpu.last.use"))
16992 if (!Def->isMachineOpcode())
17002 if (
II.isCompare() &&
II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
17003 PhysReg = AMDGPU::SCC;
17005 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
17014 if (!
I->hasOneUse())
17020 switch (
I->getOpcode()) {
17021 case Instruction::FMul: {
17022 if (
User->getOpcode() != Instruction::FSub &&
17023 User->getOpcode() != Instruction::FAdd)
17028 return ((!
I->hasAllowContract() || !
User->hasAllowContract()) &&
17087 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
17098 Alignment = RMW->getAlign();
17113 RMW->getType()->isFloatTy();
17116 bool ReturnValueIsUsed = !AI->
use_empty();
17125 if (FullFlatEmulation) {
17136 std::prev(BB->
end())->eraseFromParent();
17139 Value *LoadedShared =
nullptr;
17140 if (FullFlatEmulation) {
17142 Intrinsic::amdgcn_is_shared, {}, {
Addr},
nullptr,
"is.shared");
17143 Builder.
CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
17151 LoadedShared = Clone;
17158 Intrinsic::amdgcn_is_private, {}, {
Addr},
nullptr,
"is.private");
17166 Value *LoadedPrivate;
17169 RMW->getType(), CastToPrivate, RMW->getAlign(),
"loaded.private");
17172 LoadedPrivate, RMW->getValOperand());
17176 auto [ResultLoad, Equal] =
17191 if (FullFlatEmulation) {
17201 if (!FullFlatEmulation) {
17206 MDNode *RangeNotPrivate =
17209 LoadedGlobal->
setMetadata(LLVMContext::MD_noalias_addrspace,
17217 if (ReturnValueIsUsed) {
17220 if (FullFlatEmulation)
17235 if (
const auto *ConstVal = dyn_cast<Constant>(AI->
getValOperand());
17236 ConstVal && ConstVal->isNullValue()) {
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
#define LLVM_ATTRIBUTE_UNUSED
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
iv Induction Variable Users
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
unsigned const TargetRegisterInfo * TRI
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static constexpr Register SPReg
const SmallVectorImpl< MachineOperand > & Cond
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static constexpr int Concat[]
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
bool hasCvtPkF16F32Inst() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool hasBF16ConversionInsts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
LLVM_READONLY int getExactLog2Abs() const
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
This class represents an incoming formal argument to a Function.
bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
const Function * getParent() const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
Value * getNewValOperand()
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getCompareOperand()
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Min
*p = old <signed v ? old : v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
bool isFPPredicate() const
bool isIntPredicate() const
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Argument * getArg(unsigned i) const
bool hasMemoryAtomicFaddF32DenormalSupport() const
bool hasD16Images() const
bool hasMinimum3Maximum3F32() const
bool useVGPRIndexMode() const
bool hasAtomicDsPkAdd16Insts() const
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
bool hasAtomicFMinFMaxF64FlatInsts() const
bool hasDot7Insts() const
bool hasApertureRegs() const
bool hasFlatInstOffsets() const
bool hasAtomicFMinFMaxF32FlatInsts() const
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasBCNT(unsigned Size) const
bool hasLDSLoadB96_B128() const
Returns true if the target supports global_load_lds_dwordx3/global_load_lds_dwordx4 or buffer_load_dw...
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
bool hasMultiDwordFlatScratchAddressing() const
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
bool hasPrivEnabledTrap2NopBug() const
bool hasUnalignedDSAccessEnabled() const
const SIInstrInfo * getInstrInfo() const override
bool hasDot1Insts() const
bool hasAtomicFaddRtnInsts() const
Align getStackAlignment() const
bool hasScalarSubwordLoads() const
bool enableFlatScratch() const
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
bool supportsGetDoorbellID() const
bool hasFlatAtomicFaddF32Inst() const
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasMinimum3Maximum3PKF16() const
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
const SIFrameLowering * getFrameLowering() const override
bool hasMinimum3Maximum3F16() const
bool hasAtomicFMinFMaxF32GlobalInsts() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
bool isTrapHandlerEnabled() const
bool hasLDSFPAtomicAddF64() const
bool hasFlatGlobalInsts() const
bool getScalarizeGlobalBehavior() const
bool hasScalarSMulU64() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
bool hasNSAEncoding() const
bool hasSMemRealTime() const
bool usePRTStrictNull() const
bool hasAtomicFMinFMaxF64GlobalInsts() const
bool hasUnalignedScratchAccessEnabled() const
bool hasAtomicFlatPkAdd16Insts() const
bool hasUnalignedBufferAccessEnabled() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasImageGather4D16Bug() const
bool hasDot10Insts() const
bool supportsMinMaxDenormModes() const
bool hasAtomicFaddInsts() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const
bool hasAtomicBufferPkAddBF16Inst() const
bool hasAtomicFaddNoRtnInsts() const
bool hasFlatBufferGlobalAtomicFaddF64Inst() const
bool hasScalarDwordx3Loads() const
bool hasLDSFPAtomicAddF32() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
bool hasDot8Insts() const
bool hasDS96AndDS128() const
bool useFlatForGlobal() const
Generation getGeneration() const
bool hasAtomicBufferGlobalPkAddF16Insts() const
bool hasScalarAddSub64() const
bool hasUnpackedD16VMem() const
bool hasAtomicGlobalPkAddBF16Inst() const
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
bool hasPackedTID() const
bool hasAddNoCarry() const
bool hasGWSAutoReplay() const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
int64_t getOffset() const
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
BasicBlock::iterator GetInsertPoint() const
BasicBlock * GetInsertBlock() const
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
LLVMContext & getContext() const
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
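The IRBuilder entries above (CreateCondBr, CreateBr, CreatePHI, SetInsertPoint, ...) are typically combined as below; this is a hedged sketch with an invented helper, assuming the entry block has no terminator yet.
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;
static Value *emitDiamond(Function &F, Value *Cond, Value *TrueV, Value *FalseV) {
  LLVMContext &Ctx = F.getContext();
  BasicBlock *Entry = &F.getEntryBlock();
  BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then", &F);
  BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else", &F);
  BasicBlock *JoinBB = BasicBlock::Create(Ctx, "join", &F);
  IRBuilder<> B(Entry);                 // appends to the end of Entry
  B.CreateCondBr(Cond, ThenBB, ElseBB); // conditional branch out of Entry
  B.SetInsertPoint(ThenBB);
  B.CreateBr(JoinBB);                   // unconditional branch to the join
  B.SetInsertPoint(ElseBB);
  B.CreateBr(JoinBB);
  B.SetInsertPoint(JoinBB);
  PHINode *Phi = B.CreatePHI(TrueV->getType(), /*NumReservedValues=*/2);
  Phi->addIncoming(TrueV, ThenBB);      // merge the two incoming values
  Phi->addIncoming(FalseV, ElseBB);
  return Phi;
}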
Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
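A small, hedged illustration of the LLT constructors and queries above (the header path is the one used in recent LLVM trees; older trees place it elsewhere):
#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;
static void lltExamples() {
  LLT S32 = LLT::scalar(32);            // 32-bit "bag of bits"
  LLT P1 = LLT::pointer(1, 64);         // 64-bit pointer in addrspace(1)
  (void)S32.getSizeInBits();            // TypeSize of 32 bits
  (void)S32.getScalarSizeInBits();      // 32
  LLT S16 = S32.changeElementSize(16);  // scalar -> 16-bit scalar
  (void)P1;
  (void)S16;
}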
This is an important class for using LLVM in a threaded context.
std::optional< StringRef > getSyncScopeName(SyncScope::ID Id) const
getSyncScopeName - Returns the name of a SyncScope::ID registered with LLVMContext,...
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
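A hedged sketch combining the LLVMContext and LoadInst entries above: register (or look up) a named synchronization scope and make an existing load atomic within it ("agent" is just an example scope name).
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/AtomicOrdering.h"
using namespace llvm;
static void makeAcquireInAgentScope(LoadInst *LI) {
  LLVMContext &Ctx = LI->getContext();
  SyncScope::ID Agent = Ctx.getOrInsertSyncScopeID("agent"); // named scope
  LI->setAtomic(AtomicOrdering::Acquire, Agent);
}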
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
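Sketch (assumed context) of attaching !range metadata with MDBuilder::createRange and Instruction::setMetadata, both listed on this page:
#include "llvm/ADT/APInt.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
using namespace llvm;
static void annotateRange(LoadInst *LI, uint64_t Lo, uint64_t Hi) {
  unsigned BW = LI->getType()->getScalarSizeInBits();
  MDBuilder MDB(LI->getContext());
  MDNode *Range = MDB.createRange(APInt(BW, Lo), APInt(BW, Hi)); // [Lo, Hi)
  LI->setMetadata(LLVMContext::MD_range, Range);
}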
const MDOperand & getOperand(unsigned I) const
unsigned getNumOperands() const
Return number of MDNode operands.
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the vector's element count is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
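A hedged helper using the MVT queries above, for example to halve a vector type during legalization (illustrative only; header path as in recent trees):
#include "llvm/CodeGenTypes/MachineValueType.h"
using namespace llvm;
static MVT halfWidthVector(MVT VT) {
  // v4i32 -> v2i32, etc.; assumes an even, power-of-two element count.
  if (!VT.isVector() || !VT.isPow2VectorType())
    return VT;
  return MVT::getVectorVT(VT.getScalarType(), VT.getVectorNumElements() / 2);
}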
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor bloc...
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into two pieces at SplitInst.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Align getAlignment() const
Return alignment of the basic block.
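The MachineBasicBlock/MachineFunction entries here describe the usual custom-inserter block-splitting pattern; a hedged sketch (helper name invented) follows:
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include <iterator>
using namespace llvm;
static MachineBasicBlock *splitBlockAfter(MachineInstr &MI) {
  MachineBasicBlock *BB = MI.getParent();
  MachineFunction *MF = BB->getParent();
  MachineBasicBlock *NewBB = MF->CreateMachineBasicBlock(BB->getBasicBlock());
  MF->insert(std::next(MachineFunction::iterator(BB)), NewBB);
  // Move everything after MI into the new block, then rewire control flow.
  NewBB->splice(NewBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)),
                BB->end());
  NewBB->transferSuccessorsAndUpdatePHIs(BB);
  BB->addSuccessor(NewBB);
  return NewBB;
}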
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
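Sketch (assumed values) of allocating a MachineMemOperand as described above, here for a naturally aligned 4-byte load from a fixed stack object:
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;
static MachineMemOperand *frameLoadMMO(MachineFunction &MF, int FI) {
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  return MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
                                 LLT::scalar(32), Align(4));
}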
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
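The MachineInstrBuilder entries above chain together in the usual BuildMI pattern; a hedged sketch with placeholder operands:
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;
static void emitPlaceholder(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator I, const DebugLoc &DL,
                            const MCInstrDesc &MCID, Register DstReg,
                            Register SrcReg, MachineBasicBlock *Target) {
  BuildMI(MBB, I, DL, MCID, DstReg) // defines DstReg
      .addReg(SrcReg)               // register use
      .addImm(0)                    // immediate operand
      .addMBB(Target);              // basic-block operand
}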
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by the operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
A Module instance is used to store all the information related to an LLVM module.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
Get the SDNode which holds the desired result.
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns whether Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool hasWorkGroupIDZ() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if N can be combined with another node to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0-terminated array of rounding control registers that can be attached to a strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns whether Op is known to never be any NaN; if SNaN is true, whether it is known to never be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion using a target-specific method.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns whether it is reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist an instruction from a then/else block into the if block.
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getRegister(Register Reg, EVT VT)
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
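A hedged sketch combining several SelectionDAG helpers listed here (getConstant, getNode, getSetCC, getSelect) to build an integer abs; real code would query the target's SETCC result type instead of assuming i1:
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
static SDValue emitIAbs(SelectionDAG &DAG, const SDLoc &DL, SDValue X) {
  EVT VT = X.getValueType();
  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, Zero, X);           // 0 - X
  SDValue IsNeg = DAG.getSetCC(DL, MVT::i1, X, Zero, ISD::SETLT); // X < 0 ?
  return DAG.getSelect(DL, VT, IsNeg, Neg, X);                    // max(X, -X)
}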
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
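Hedged sketch of the known-bits queries above: checking that the low two bits of a value are known zero, both via computeKnownBits and the equivalent MaskedValueIsZero form:
#include "llvm/ADT/APInt.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;
static bool lowTwoBitsZero(SelectionDAG &DAG, SDValue V) {
  unsigned BW = V.getValueSizeInBits().getFixedValue();
  KnownBits Known = DAG.computeKnownBits(V);
  bool ViaMask = DAG.MaskedValueIsZero(V, APInt::getLowBitsSet(BW, 2));
  return Known.countMinTrailingZeros() >= 2 && ViaMask;
}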
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
iterator insert(iterator I, T &&Elt)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
StringRef - Represent a constant reference to a string, i.e.
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
constexpr size_t size() const
size - Get the string size.
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
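A hedged sketch of the StringRef/StringSwitch pattern documented above (the constraint strings and return codes are made up):
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
using namespace llvm;
static unsigned classifyConstraint(StringRef C) {
  if (C.starts_with("{") && C.ends_with("}"))
    return 0;                            // register-name style constraint
  return StringSwitch<unsigned>(C)
      .Case("v", 1)
      .Case("s", 2)
      .Case("a", 3)
      .Default(~0u);
}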
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction in which the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a type is legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
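The protected TargetLoweringBase setters above are normally invoked from a target's lowering constructor; a hedged sketch for a hypothetical target (MyTargetLowering and GPR32RC are invented names):
#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;
class MyTargetLowering : public TargetLowering {
public:
  MyTargetLowering(const TargetMachine &TM, const TargetRegisterInfo *TRI,
                   const TargetRegisterClass *GPR32RC)
      : TargetLowering(TM) {
    addRegisterClass(MVT::i32, GPR32RC);             // i32 is legal in GPR32
    setOperationAction(ISD::CTPOP, MVT::i32, Legal);
    setOperationAction(ISD::SDIV, MVT::i32, Expand);
    setTruncStoreAction(MVT::i64, MVT::i8, Expand);
    setBooleanContents(ZeroOrOneBooleanContent);
    setSchedulingPreference(Sched::RegPressure);
    computeRegisterProperties(TRI);                  // derive remaining info
  }
};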
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op and attempt to simplify it, given that only the DemandedBits bits of its result are used downstream.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
A Use represents the edge between a Value definition and its users.
User * getUser() const
Returns the User that contains this Use.
unsigned getOperandNo() const
Return the operand # of this use in its User.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< use_iterator > uses()
void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
constexpr bool isZero() const
const ParentTy * getParent() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
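The enumerators above name the backend's IR address spaces. Purely as an illustration (this helper is not part of this file), code typically compares a pointer's address space against them like this:
  // Illustrative only: memory spaces that flat (generic) addressing can reach.
  static bool isFlatReachable(unsigned AS) {
    return AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS ||
           AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS;
  }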
@ BUFFER_ATOMIC_COND_SUB_U32
@ TBUFFER_LOAD_FORMAT_D16
@ TBUFFER_STORE_FORMAT_D16
@ BUFFER_STORE_FORMAT_D16
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
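For concreteness, and assuming the usual AMDGPU inline-constant range of [-16, 64] for integers (an assumption, not text from this file):
  assert(AMDGPU::isInlinableIntLiteral(0));
  assert(AMDGPU::isInlinableIntLiteral(64));    // upper end of the inline range
  assert(AMDGPU::isInlinableIntLiteral(-16));   // lower end of the inline range
  assert(!AMDGPU::isInlinableIntLiteral(65));   // would need a real literal operand
  assert(!AMDGPU::isInlinableIntLiteral(-17));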
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
@ TowardZeroF32_TowardNegativeF64
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SET_ROUNDING
Set rounding mode.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
@ SMULO
Same as the [SU]ADDO/[SU]SUBO overflow nodes, but for multiplication.
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
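A short, hedged sketch of how a couple of the opcodes above are materialized through SelectionDAG::getNode; the function, types, and operands are placeholders:
  // Extract element 0 of a v4i32 value and zero-extend it to i64 (illustrative).
  SDValue extractAndExtend(SelectionDAG &DAG, const SDLoc &DL, SDValue Vec) {
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Vec,
                              DAG.getVectorIdxConstant(0, DL));
    return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Elt);
  }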
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
Function * getDeclarationIfExists(Module *M, ID id, ArrayRef< Type * > Tys, FunctionType *FT=nullptr)
This version supports overloaded intrinsics.
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ System
Synchronized with respect to all concurrently executing threads.
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
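A few worked values (illustrative; these helpers live in llvm/Support/MathExtras.h):
  assert(llvm::maxIntN(8)  == 127);     // 2^7 - 1
  assert(llvm::minIntN(8)  == -128);    // -2^7
  assert(llvm::maxIntN(16) == 32767);   // 2^15 - 1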
int popcount(T Value) noexcept
Count the number of set bits in a value.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
testing::Matcher< const detail::ErrorHolder & > Failed()
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and congruent to Skew modulo Align.
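Worked values under that definition (illustrative):
  assert(llvm::alignDown(77, 16)    == 64);   // largest multiple of 16 that is <= 77
  assert(llvm::alignDown(77, 16, 5) == 69);   // largest value <= 77 congruent to 5 (mod 16)
  assert(llvm::alignDown(64, 16)    == 64);   // already-aligned values are unchanged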
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
int countr_zero(T Val)
Count the number of zero bits from the least significant bit upward, stopping at the first set bit.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
int countl_zero(T Val)
Count the number of zero bits from the most significant bit downward, stopping at the first set bit.
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
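A few worked values tying several of the bit utilities above together (illustrative; from llvm/Support/MathExtras.h and llvm/ADT/bit.h):
  uint64_t V = 0x200000010ULL;                           // bits 4 and 33 set
  assert(llvm::Hi_32(V) == 2 && llvm::Lo_32(V) == 16);   // split into 32-bit halves
  assert(llvm::popcount(V) == 2);                        // two set bits
  assert(llvm::countr_zero(V) == 4);                     // lowest set bit is bit 4
  assert(llvm::Log2_32(16u) == 4);                       // floor log base 2
  assert(llvm::isPowerOf2_32(16u) && !llvm::isPowerOf2_32(18u));
  assert(llvm::PowerOf2Ceil(18) == 32);                  // round up to a power of two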
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns the smallest multiple of A that is large enough to hold Size bytes.
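Worked values for the rounding helpers (illustrative):
  assert(llvm::alignTo(77, llvm::Align(16)) == 80);   // next multiple of 16 at or above 77
  assert(llvm::alignTo(64, llvm::Align(16)) == 64);   // already a multiple
  assert(llvm::divideCeil(77, 16) == 5);              // ceil(77 / 16)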
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
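Worked values (illustrative): the result is the largest alignment still guaranteed at the given byte offset from a pointer with the original alignment.
  assert(llvm::commonAlignment(llvm::Align(16), 8)  == llvm::Align(8));
  assert(llvm::commonAlignment(llvm::Align(16), 32) == llvm::Align(16));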
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
static const fltSemantics & IEEEhalf() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector of NumElements elements, each of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the vector type has a power-of-two number of elements.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
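A brief sketch of the two EVT factory functions above; the context and the chosen widths are placeholders:
  LLVMContext Ctx;
  EVT I48   = EVT::getIntegerVT(Ctx, 48);           // not a simple (MVT-backed) type
  EVT V3I32 = EVT::getVectorVT(Ctx, MVT::i32, 3);   // <3 x i32>
  assert(!I48.isSimple() && I48.getSizeInBits() == 48);
  assert(V3I32.isVector() && V3I32.getVectorElementType() == MVT::i32);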
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
bool isUnknown() const
Returns true if we don't know any bits.
void resetAll()
Resets the known state of all bits.
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
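A small illustration of the two KnownBits entries above: two 8-bit operands whose upper halves are known zero can only sum to 30 or less, so the result carries at least three known leading zeros.
  KnownBits L(8), R(8);
  L.Zero.setHighBits(4);                      // operand value is in [0, 15]
  R.Zero.setHighBits(4);                      // operand value is in [0, 15]
  KnownBits Sum = KnownBits::add(L, R);
  assert(Sum.countMinLeadingZeros() >= 3);    // bits 7..5 of the sum are known zero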
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const