39#include "llvm/IR/IntrinsicsAMDGPU.h"
40#include "llvm/IR/IntrinsicsR600.h"
50#define DEBUG_TYPE "si-lower"
56 cl::desc(
"Do not align and prefetch loops"),
60 "amdgpu-use-divergent-register-indexing",
cl::Hidden,
61 cl::desc(
"Use indirect register addressing for divergent indexes"),
75 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
76 for (
unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
78 return AMDGPU::SGPR0 + Reg;
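  // Note: this fragment appears to come from a findFirstFreeSGPR-style helper
  // that scans SGPR0..SGPR<N> and returns the first register the calling
  // convention state has not allocated yet; the enclosing declaration and the
  // allocation check are elided in this excerpt.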
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},

                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},

                     {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);

                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
                     {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
                      MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
                      MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},

                     {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
                      MVT::v3i16, MVT::v4i16, MVT::Other},

                     {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
       {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
        MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
        MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
        MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
        MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
        MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
        MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
        MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
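  // The five loop headers above (their bodies are elided in this excerpt)
  // iterate over the 64-bit-element vector types; in SITargetLowering these
  // loops typically mark operations on such types for expansion into 32-bit
  // pieces, since there are no native 64-bit vector ALU instructions.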
                     {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},

                     {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},

                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);

                     {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},

                     {MVT::f32, MVT::f64}, Legal);
       {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
        MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
        MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {

                     {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
                     {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

       {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
        MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {

                     {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);

                     {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
                      MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
                      MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},

    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})

                     {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
                     {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
                      MVT::v32f16, MVT::v32bf16},

                     {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);

                     {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

                     {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
                      MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,

                     {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
                      MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
                      MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
                      MVT::i16, MVT::bf16, MVT::i8, MVT::i128},

                     {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
                      MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
                      MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
  static const MCPhysReg RCRegs[] = {AMDGPU::MODE};

                                            EVT DestVT, EVT SrcVT) const {

                                            LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&

      return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
    return VT.isInteger() ? MVT::i32 : MVT::f32;

      return (NumElts + 1) / 2;
    return NumElts * ((Size + 31) / 32);

    unsigned &NumIntermediates, MVT &RegisterVT) const {
    if (ScalarVT == MVT::bf16) {
      RegisterVT = MVT::i32;
      IntermediateVT = MVT::v2bf16;
      RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
      IntermediateVT = RegisterVT;
    NumIntermediates = (NumElts + 1) / 2;
    return NumIntermediates;

    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts;
    return NumIntermediates;

  if (Size < 16 && Subtarget->has16BitInsts()) {
    RegisterVT = MVT::i16;
    IntermediateVT = ScalarVT;
    NumIntermediates = NumElts;
    return NumIntermediates;

    RegisterVT = MVT::i32;
    IntermediateVT = ScalarVT;
    NumIntermediates = NumElts;
    return NumIntermediates;

    RegisterVT = MVT::i32;
    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts * ((Size + 31) / 32);
    return NumIntermediates;

      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
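  // Worked example of the breakdown arithmetic above (illustrative only):
  // for a 16-bit-element vector such as v5f16, NumIntermediates is
  // (NumElts + 1) / 2 = (5 + 1) / 2 = 3 packed v2f16 registers; for scalar
  // elements wider than 32 bits, each element needs (Size + 31) / 32 i32
  // registers, e.g. an f64 element occupies 2.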
                                            unsigned MaxNumLanes) {
  assert(MaxNumLanes != 0);

  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());

                                            unsigned MaxNumLanes) {
  auto *ST = dyn_cast<StructType>(Ty);
  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));

      DL.getPointerSizeInBits(AS) == 192)
       DL.getPointerSizeInBits(AS) == 160) ||
      DL.getPointerSizeInBits(AS) == 192))
                                            unsigned IntrID) const {
  if (CI.hasMetadata(LLVMContext::MD_invariant_load))

  if (RsrcIntr->IsImage) {
    if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
      Info.ptrVal = RsrcArg;

    bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
      if (RsrcIntr->IsImage) {
        unsigned MaxNumLanes = 4;
              std::numeric_limits<unsigned>::max());
      if (RsrcIntr->IsImage) {
        unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
    if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
      Info.memVT = MVT::i32;
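  // These fragments are from the getTgtMemIntrinsic hook, which fills in an
  // IntrinsicInfo record (memVT, ptrVal, access flags) for buffer and image
  // intrinsics so the generic DAG machinery can model their memory behaviour;
  // the dmask / MaxNumLanes logic narrows memVT to the lanes the intrinsic
  // actually touches.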
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
        std::numeric_limits<unsigned>::max());
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.ptrVal = nullptr;
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
  case Intrinsic::amdgcn_global_atomic_csub: {
  case Intrinsic::amdgcn_image_bvh_intersect_ray: {
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_atomic_cond_sub_u32: {
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64: {
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
    Info.memVT = MVT::i32;
    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
  case Intrinsic::amdgcn_global_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
  case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
    Info.memVT = MVT::i32;
  case Intrinsic::amdgcn_s_prefetch_data: {
  case Intrinsic::amdgcn_addrspacecast_nonnull: {
    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();
                                            Type *&AccessTy) const {
  switch (II->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_csub:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
    Ptr = II->getArgOperand(0);
  case Intrinsic::amdgcn_global_load_lds:
    Ptr = II->getArgOperand(1);
  AccessTy = II->getType();
                                            unsigned AddrSpace) const {
  return AM.Scale == 0 &&
                               AM.BaseOffs, AddrSpace, FlatVariant));

    return isLegalMUBUFAddressingMode(AM);

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
  if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
  if (AM.HasBaseReg) {
    return isLegalMUBUFAddressingMode(AM);
  if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
             : isLegalMUBUFAddressingMode(AM);
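// In rough terms, isLegalMUBUFAddressingMode accepts the addressing modes a
// MUBUF instruction can encode: an optional base register plus an immediate
// offset that TII->isLegalMUBUFImmOffset accepts, with no scaled index;
// anything else has to be materialized with separate address arithmetic.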
                                            unsigned Size, unsigned AddrSpace,
                                            Align Alignment,
    Align RequiredAlignment(
        Alignment < RequiredAlignment)
      RequiredAlignment = Align(4);
        *IsFast = (Alignment >= RequiredAlignment) ? 64
                  : (Alignment < Align(4))         ? 32
        *IsFast = (Alignment >= RequiredAlignment) ? 96
                  : (Alignment < Align(4))         ? 32
      RequiredAlignment = Align(8);
        *IsFast = (Alignment >= RequiredAlignment) ? 128
                  : (Alignment < Align(4))         ? 32
      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
    return Alignment >= RequiredAlignment ||

    bool AlignedBy4 = Alignment >= Align(4);
      *IsFast = AlignedBy4;
    return Alignment >= Align(4) ||

  return Size >= 32 && Alignment >= Align(4);

                                                 unsigned *IsFast) const {
                                            Alignment, Flags, IsFast);
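// Judging by the values assigned above, *IsFast is used here as a speed
// proxy rather than a plain boolean: well-aligned DS accesses report the
// access width they can service in one go (64/96/128 bits, or Size), while
// 32 or 0 indicates the access would be split into dword pieces or is slow.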
  if (Op.size() >= 16 &&
  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))

  const MemSDNode *MemNode = cast<MemSDNode>(N);

                                           unsigned DestAS) const {
  return TM.isNoopAddrSpaceCast(SrcAS, DestAS);

                                                     unsigned Index) const {
  auto [InputPtrReg, RC, ArgTy] =
      Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

                                          const SDLoc &SL) const {
                                          const SDLoc &SL) const {
  std::optional<uint32_t> KnownSize =
  if (KnownSize.has_value())
    Val = getFPExtOrFPRound(DAG, Val, SL, VT);

SDValue SITargetLowering::lowerKernargMemParameter(
    int64_t OffsetDiff = Offset - AlignDownOffset;
    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
      ExtType, SL, VA.getLocVT(), Chain, FIN,
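// Fragment of lowerKernargMemParameter: judging by AlignDownOffset and
// OffsetDiff, sub-dword kernel arguments are loaded as the enclosing aligned
// dword and then presumably shifted/truncated by OffsetDiff bytes before
// convertArgType extends or rounds the value to the declared argument type.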
SDValue SITargetLowering::getPreloadedValue(
    Reg = &WorkGroupIDX;
    RC = &AMDGPU::SReg_32RegClass;
    Reg = &WorkGroupIDY;
    RC = &AMDGPU::SReg_32RegClass;
    Reg = &WorkGroupIDZ;
    RC = &AMDGPU::SReg_32RegClass;

  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
           "vector type argument should have been split");
      bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
               "unexpected vector split in ps argument type");
        Info->markPSInputAllocated(PSInputNum);
          Info->markPSInputEnabled(PSInputNum);

  if (Info.hasWorkItemIDX()) {
  if (Info.hasWorkItemIDY()) {
      Info.setWorkItemIDY(
      unsigned Reg = AMDGPU::VGPR1;
  if (Info.hasWorkItemIDZ()) {
      Info.setWorkItemIDZ(
      unsigned Reg = AMDGPU::VGPR2;
  if (RegIdx == ArgVGPRs.size()) {
  unsigned Reg = ArgVGPRs[RegIdx];
  assert(Reg != AMDGPU::NoRegister);

                                 unsigned NumArgRegs) {
  if (RegIdx == ArgSGPRs.size())
  unsigned Reg = ArgSGPRs[RegIdx];
  assert(Reg != AMDGPU::NoRegister);
  assert(Reg != AMDGPU::NoRegister);

  const unsigned Mask = 0x3ff;
  if (Info.hasWorkItemIDX()) {
    Info.setWorkItemIDX(Arg);
  if (Info.hasWorkItemIDY()) {
    Info.setWorkItemIDY(Arg);
  if (Info.hasWorkItemIDZ())
  const unsigned Mask = 0x3ff;
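  // The 0x3ff mask reflects the packed work-item ID layout used when the
  // three IDs share a single VGPR: each component occupies 10 bits (X in
  // [9:0], Y in [19:10], Z in [29:20]), so the X/Y/Z arguments above are
  // described as masked and shifted views of that one register.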
  if (Info.hasImplicitArgPtr())
  if (Info.hasWorkGroupIDX())
  if (Info.hasWorkGroupIDY())
  if (Info.hasWorkGroupIDZ())
  if (Info.hasLDSKernelId())

    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
  bool InPreloadSequence = true;
  bool AlignedForImplictArgs = false;
  unsigned ImplicitArgOffset = 0;
  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())

    unsigned ArgIdx = Arg.getArgNo();
    if (InIdx < Ins.size() &&
        (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))

    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           Ins[InIdx].getOrigArgIndex() == ArgIdx;
      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];
      unsigned ArgOffset = ArgLoc.getLocMemOffset();
      unsigned NumAllocSGPRs =
          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;

      if (Arg.hasAttribute("amdgpu-hidden-argument")) {
        if (!AlignedForImplictArgs) {
              alignTo(LastExplicitArgOffset,
              LastExplicitArgOffset;
          AlignedForImplictArgs = true;
        ArgOffset += ImplicitArgOffset;

      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        assert(InIdx >= 1 && "No previous SGPR");
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);

      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
        InPreloadSequence = false;
          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {
      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;

  if (Info.hasLDSKernelId()) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
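// This loop implements kernel-argument preloading: arguments marked inreg
// that sit contiguously at the start of the kernarg segment are assigned to
// user SGPRs (NumAllocSGPRs each, with PaddingSGPRs covering alignment gaps);
// the first argument that breaks the pattern ends the preload sequence, and
// everything after it falls back to ordinary kernarg-segment loads.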
                                 bool IsShader) const {
  assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");

  unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
  unsigned NumRequiredSystemSGPRs =
      Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
      Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
  for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
    if (Info.hasWorkGroupIDY()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
    if (Info.hasWorkGroupIDZ()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupInfo()) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    unsigned PrivateSegmentWaveByteOffsetReg;
      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);

         Info.getNumPreloadedSGPRs() >= 16);
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);
    HasStackObjects = true;

  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
      Info.setScratchRSrcReg(ReservedBufferReg);

    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
        if (!MRI.isLiveIn(Reg)) {
          Info.setStackPtrOffsetReg(Reg);
    if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)

  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);

  return !Info->isEntryFunction();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;
    Entry->addLiveIn(*I);
    for (auto *Exit : Exits)
              TII->get(TargetOpcode::COPY), *I)
        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());

          !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
          !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
           !Info->hasWorkGroupIDZ());

    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);

    unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
    if ((PsInputBits & 0x7F) == 0 ||
        ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
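    // A fragment of the pixel-shader input bookkeeping: if none of the low
    // interpolation bits of PSInputAddr/PSInputEnable are set (mask 0x7F,
    // plus the 0xF / input-11 special case above), input 0 is force-allocated
    // and enabled, since the hardware expects at least one enabled PS input.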
  } else if (IsKernel) {
    Splits.append(Ins.begin(), Ins.end());
  } else if (!IsGraphics) {
  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
    if (IsEntryFunc && VA.isMemLoc()) {
      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
          int64_t OffsetDiff = Offset - AlignDownOffset;
              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
          NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
                                  Ins[i].Flags.isSExt(), &Ins[i]);
              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
          if (PreloadRegs.size() == 1) {
            Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
                                        TRI->getRegSizeInBits(*RC)));
            for (auto Reg : PreloadRegs) {
                                       PreloadRegs.size()),
          NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
                                  Ins[i].Flags.isSExt(), &Ins[i]);
            "hidden argument in kernel signature was not preloaded",
            lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
                                     Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
          dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));

    if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
      if (AMDGPU::VGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::VGPR_32RegClass;
      else if (AMDGPU::SGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::SGPR_32RegClass;

  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain
  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);

    unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
    for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
      if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))

  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {
    SDValue Arg = OutVals[RealRVLocIdx];

  if (!Info->isEntryFunction()) {
      if (AMDGPU::SReg_64RegClass.contains(*I))
      else if (AMDGPU::SReg_32RegClass.contains(*I))

  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
  auto &ArgUsageInfo =
  CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);

    const auto [OutgoingArg, ArgRC, ArgTy] =
    const auto [IncomingArg, IncomingArgRC, Ty] =
    assert(IncomingArgRC == ArgRC);

    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
      InputReg = getImplicitArgPtr(DAG, DL);
      std::optional<uint32_t> Id =
      if (Id.has_value()) {

    if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
      unsigned SpecialArgOffset =

  auto [OutgoingArg, ArgRC, Ty] =
    std::tie(OutgoingArg, ArgRC, Ty) =
    std::tie(OutgoingArg, ArgRC, Ty) =

  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");

  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
                              : IncomingArgY ? *IncomingArgY

  if (OutgoingArg->isRegister()) {
    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
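  // When lowering a call, each special input the callee needs (implicit arg
  // pointer, workgroup/workitem IDs, ...) is either copied from the caller's
  // own preloaded register or rematerialized, then appended to RegsToPass or
  // placed at a fixed stack offset, so the callee's ABI expectations are met
  // even across amdgpu-no-* attribute boundaries.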
  if (Callee->isDivergent())

  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (!CallerPreserved)
  bool CCMatch = CallerCC == CalleeCC;

    if (Arg.hasByValAttr())

    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))

  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

  for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
    if (!CCVA.isRegLoc())
    if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
          dbgs() << "Cannot tail call due to divergent outgoing argument in "

  if (IsChainCallConv) {
    RequestedExec = CLI.Args.back();
    assert(RequestedExec.Node && "No node for EXEC");

    assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
    CLI.Outs.pop_back();
      assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
      CLI.Outs.pop_back();
           "Haven't popped all the pieces of the EXEC mask");
  bool IsSibCall = false;

                      "unsupported call to variadic function ");
                      "unsupported required tail call to function ");
                            Outs, OutVals, Ins, DAG);
                        "site marked musttail or on llvm.amdgcn.cs.chain");

  if (!TailCallOpt && IsTailCall)

  if (!IsSibCall || IsChainCallConv) {
    RegsToPass.emplace_back(IsChainCallConv
                                ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                                : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,

  const unsigned NumSpecialInputs = RegsToPass.size();

  MVT PtrVT = MVT::i32;

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
      RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
      int32_t Offset = LocMemOffset;
      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
                             ? Flags.getNonZeroByValAlign()
      if (Outs[i].Flags.isByVal()) {
            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
                          Outs[i].Flags.getNonZeroByValAlign(),
                          nullptr, std::nullopt, DstInfo,
            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);

  if (!MemOpChains.empty())

  unsigned ArgIdx = 0;
  for (auto [Reg, Val] : RegsToPass) {
    if (ArgIdx++ >= NumSpecialInputs &&
        (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {

  if (IsTailCall && !IsSibCall) {

  std::vector<SDValue> Ops({Chain});
    Ops.push_back(Callee);
    Ops.push_back(Callee);

  if (IsChainCallConv)
    Ops.push_back(RequestedExec.Node);

  for (auto &[Reg, Val] : RegsToPass)

  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");

                               MVT::Glue, GlueOps),
    Ops.push_back(InGlue);

    return DAG.getNode(OPC, DL, MVT::Other, Ops);

  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);

  uint64_t CalleePopBytes = NumBytes;
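  // For the actual call sequence, what appears to be the scratch/private
  // segment buffer descriptor is forwarded in SGPR0-SGPR3 for ordinary calls
  // and in SGPR48-SGPR51 for the amdgpu_cs_chain convention, and these
  // special inputs are recorded ahead of the normal argument registers,
  // hence the NumSpecialInputs bookkeeping above.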
4030 EVT VT =
Op.getValueType();
4040 Align Alignment = cast<ConstantSDNode>(
Op.getOperand(2))->getAlignValue();
4044 "Stack grows upwards for AMDGPU");
4046 Chain = BaseAddr.getValue(1);
4048 if (Alignment > StackAlign) {
4051 uint64_t StackAlignMask = ScaledAlignment - 1;
4058 assert(
Size.getValueType() == MVT::i32 &&
"Size must be 32-bit");
4060 if (isa<ConstantSDNode>(
Size)) {
4091 if (
Op.getValueType() != MVT::i32)
4110 assert(
Op.getValueType() == MVT::i32);
4119 Op.getOperand(0), IntrinID, GetRoundBothImm);
4153 SDValue RoundModeTimesNumBits =
4173 TableEntry, EnumOffset);
4187 if (
auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4189 static_cast<uint32_t>(ConstMode->getZExtValue()),
4201 if (UseReducedTable) {
4207 SDValue RoundModeTimesNumBits =
4227 SDValue RoundModeTimesNumBits =
4236 NewMode = TruncTable;
4245 ReadFirstLaneID, NewMode);
4258 IntrinID, RoundBothImm, NewMode);
4264 if (
Op->isDivergent())
4283 SDValue Src =
Op.getOperand(IsStrict ? 1 : 0);
4284 EVT SrcVT = Src.getValueType();
4293 EVT DstVT =
Op.getValueType();
4302 if (
Op.getValueType() != MVT::i64)
4316 Op.getOperand(0), IntrinID, ModeHwRegImm);
4318 Op.getOperand(0), IntrinID, TrapHwRegImm);
4332 if (
Op.getOperand(1).getValueType() != MVT::i64)
4344 ReadFirstLaneID, NewModeReg);
4346 ReadFirstLaneID, NewTrapReg);
4348 unsigned ModeHwReg =
4351 unsigned TrapHwReg =
4359 IntrinID, ModeHwRegImm, NewModeReg);
4362 IntrinID, TrapHwRegImm, NewTrapReg);
          .Case("m0", AMDGPU::M0)
          .Case("exec", AMDGPU::EXEC)
          .Case("exec_lo", AMDGPU::EXEC_LO)
          .Case("exec_hi", AMDGPU::EXEC_HI)
          .Case("flat_scratch", AMDGPU::FLAT_SCR)
          .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
          .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)

  if (Reg == AMDGPU::NoRegister) {
                                       "\" for subtarget."));

  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:
  case AMDGPU::FLAT_SCR:

  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4427static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4449 auto Next = std::next(
I);
4462 return std::pair(LoopBB, RemainderBB);
4469 auto I =
MI.getIterator();
4470 auto E = std::next(
I);
4492 Src->setIsKill(
false);
4502 BuildMI(*LoopBB, LoopBB->begin(),
DL,
TII->get(AMDGPU::S_SETREG_IMM32_B32))
4508 Register Reg =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4511 BuildMI(*LoopBB,
I,
DL,
TII->get(AMDGPU::S_GETREG_B32), Reg)
    unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
    unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,

  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
  Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)

          TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
                                 : AMDGPU::S_AND_SAVEEXEC_B64),

  MRI.setSimpleHint(NewExec, CondReg);

  if (UseGPRIdxMode) {
      SGPRIdxReg = CurrentIdxReg;
      SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)

  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
          TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
                                 : AMDGPU::S_XOR_B64_term),
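  // This is the classic "waterfall" loop for a divergent index: each
  // iteration uses V_READFIRSTLANE_B32 to pick one lane's index, compares it
  // against every lane's index, masks execution (S_AND_SAVEEXEC) to the lanes
  // that match, performs the indexed access for them, and then XORs those
  // lanes out of the exec mask until no active lanes remain.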
4630 unsigned InitResultReg,
unsigned PhiReg,
int Offset,
4631 bool UseGPRIdxMode,
Register &SGPRIdxReg) {
4639 const auto *BoolXExecRC =
TRI->getWaveMaskRegClass();
4641 Register SaveExec =
MRI.createVirtualRegister(BoolXExecRC);
4642 Register TmpExec =
MRI.createVirtualRegister(BoolXExecRC);
4643 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4644 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4659 InitResultReg, DstReg, PhiReg, TmpExec,
4660 Offset, UseGPRIdxMode, SGPRIdxReg);
4666 LoopBB->removeSuccessor(RemainderBB);
4668 LoopBB->addSuccessor(LandingPad);
4679static std::pair<unsigned, int>
4683 int NumElts =
TRI.getRegSizeInBits(*SuperRC) / 32;
4688 return std::pair(AMDGPU::sub0,
Offset);
4702 assert(
Idx->getReg() != AMDGPU::NoRegister);
4726 return Idx->getReg();
4728 Register Tmp =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4745 Register SrcReg =
TII->getNamedOperand(
MI, AMDGPU::OpName::src)->getReg();
4746 int Offset =
TII->getNamedOperand(
MI, AMDGPU::OpName::offset)->getImm();
4755 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4758 if (
TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4762 if (UseGPRIdxMode) {
4769 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
true);
4782 MI.eraseFromParent();
4791 Register PhiReg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4792 Register InitReg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4798 UseGPRIdxMode, SGPRIdxReg);
4802 if (UseGPRIdxMode) {
4804 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
true);
4806 BuildMI(*LoopBB, InsPt,
DL, GPRIDXDesc, Dst)
4811 BuildMI(*LoopBB, InsPt,
DL,
TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4816 MI.eraseFromParent();
4833 int Offset =
TII->getNamedOperand(
MI, AMDGPU::OpName::offset)->getImm();
4843 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4845 if (
Idx->getReg() == AMDGPU::NoRegister) {
4856 MI.eraseFromParent();
4861 if (
TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4865 if (UseGPRIdxMode) {
4869 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
false);
4878 const MCInstrDesc &MovRelDesc =
TII->getIndirectRegWriteMovRelPseudo(
4879 TRI.getRegSizeInBits(*VecRC), 32,
false);
4885 MI.eraseFromParent();
4895 Register PhiReg =
MRI.createVirtualRegister(VecRC);
4899 UseGPRIdxMode, SGPRIdxReg);
4902 if (UseGPRIdxMode) {
4904 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
false);
4906 BuildMI(*LoopBB, InsPt,
DL, GPRIDXDesc, Dst)
4912 const MCInstrDesc &MovRelDesc =
TII->getIndirectRegWriteMovRelPseudo(
4913 TRI.getRegSizeInBits(*VecRC), 32,
false);
4914 BuildMI(*LoopBB, InsPt,
DL, MovRelDesc, Dst)
4920 MI.eraseFromParent();
  bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));

    Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
    Register InitalValReg = MRI.createVirtualRegister(DstRegClass);

    Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
    Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
    Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);

    Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
    Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);

    bool IsWave32 = ST.isWave32();
    unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

        (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;

    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)

    I = ComputeLoop->end();
        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
            .addReg(TmpSReg->getOperand(0).getReg())

    unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
    auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
                   .addReg(ActiveBits->getOperand(0).getReg());
    auto LaneValue = BuildMI(*ComputeLoop, I, DL,
                             TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
                         .addReg(FF1->getOperand(0).getReg());
    auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
                              .addReg(LaneValue->getOperand(0).getReg());

    unsigned BITSETOpc =
        IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
    auto NewActiveBits =
        BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
            .addReg(FF1->getOperand(0).getReg())
            .addReg(ActiveBits->getOperand(0).getReg());

    Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
        .addMBB(ComputeLoop);
    ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
        .addMBB(ComputeLoop);

    unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
        .addReg(NewActiveBits->getOperand(0).getReg())
    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))

  MI.eraseFromParent();
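  // Wave-reduction expansion for a divergent (VGPR) source: starting from the
  // identity value (UINT_MAX for a umin, 0 otherwise), the loop finds the
  // lowest set bit of the remaining exec mask with S_FF1, reads that lane's
  // value with V_READLANE_B32, folds it into the scalar accumulator, clears
  // the bit with S_BITSET0, and branches back while any active lanes remain.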
  switch (MI.getOpcode()) {
  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
  case AMDGPU::S_UADDO_PSEUDO:
  case AMDGPU::S_USUBO_PSEUDO: {
    unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
                       : AMDGPU::S_SUB_I32;
    MI.eraseFromParent();
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO: {
    bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
      unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
    MI.eraseFromParent();
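  // Without a native S_ADD_U64/S_SUB_U64, the 64-bit scalar pseudo is split
  // into sub0/sub1 halves: the low halves go through S_ADD_U32/S_SUB_U32
  // (which set SCC as the carry/borrow) and the high halves consume it via
  // S_ADDC_U32/S_SUBB_U32 before the pieces are recombined.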
5133 case AMDGPU::V_ADD_U64_PSEUDO:
5134 case AMDGPU::V_SUB_U64_PSEUDO: {
5140 bool IsAdd = (
MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5146 if (IsAdd && ST.hasLshlAddB64()) {
5152 TII->legalizeOperands(*
Add);
5153 MI.eraseFromParent();
5157 const auto *CarryRC =
TRI->getWaveMaskRegClass();
5159 Register DestSub0 =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5160 Register DestSub1 =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5162 Register CarryReg =
MRI.createVirtualRegister(CarryRC);
5163 Register DeadCarryReg =
MRI.createVirtualRegister(CarryRC);
5167 : &AMDGPU::VReg_64RegClass;
5170 : &AMDGPU::VReg_64RegClass;
5173 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5175 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5178 MI,
MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5180 MI,
MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5183 MI,
MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5185 MI,
MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5188 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5195 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5209 TII->legalizeOperands(*LoHalf);
5210 TII->legalizeOperands(*HiHalf);
5211 MI.eraseFromParent();
5214 case AMDGPU::S_ADD_CO_PSEUDO:
5215 case AMDGPU::S_SUB_CO_PSEUDO: {
5229 unsigned Opc = (
MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5230 ? AMDGPU::S_ADDC_U32
5231 : AMDGPU::S_SUBB_U32;
5233 Register RegOp0 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5234 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5239 Register RegOp1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5240 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5244 Register RegOp2 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5246 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5252 unsigned WaveSize =
TRI->getRegSizeInBits(*Src2RC);
5253 assert(WaveSize == 64 || WaveSize == 32);
5255 if (WaveSize == 64) {
5256 if (ST.hasScalarCompareEq64()) {
5262 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5264 MII,
MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5266 MII,
MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5267 Register Src2_32 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5269 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::S_OR_B32), Src2_32)
5290 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5296 MI.eraseFromParent();
5299 case AMDGPU::SI_INIT_M0: {
5301 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5302 .
add(
MI.getOperand(0));
5303 MI.eraseFromParent();
5306 case AMDGPU::GET_GROUPSTATICSIZE: {
5311 .
add(
MI.getOperand(0))
5313 MI.eraseFromParent();
5316 case AMDGPU::GET_SHADERCYCLESHILO: {
5330 using namespace AMDGPU::Hwreg;
5331 Register RegHi1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5333 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5334 Register RegLo1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5336 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
5337 Register RegHi2 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5339 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5343 Register RegLo =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5348 .
add(
MI.getOperand(0))
5353 MI.eraseFromParent();
5356 case AMDGPU::SI_INDIRECT_SRC_V1:
5357 case AMDGPU::SI_INDIRECT_SRC_V2:
5358 case AMDGPU::SI_INDIRECT_SRC_V4:
5359 case AMDGPU::SI_INDIRECT_SRC_V8:
5360 case AMDGPU::SI_INDIRECT_SRC_V9:
5361 case AMDGPU::SI_INDIRECT_SRC_V10:
5362 case AMDGPU::SI_INDIRECT_SRC_V11:
5363 case AMDGPU::SI_INDIRECT_SRC_V12:
5364 case AMDGPU::SI_INDIRECT_SRC_V16:
5365 case AMDGPU::SI_INDIRECT_SRC_V32:
5367 case AMDGPU::SI_INDIRECT_DST_V1:
5368 case AMDGPU::SI_INDIRECT_DST_V2:
5369 case AMDGPU::SI_INDIRECT_DST_V4:
5370 case AMDGPU::SI_INDIRECT_DST_V8:
5371 case AMDGPU::SI_INDIRECT_DST_V9:
5372 case AMDGPU::SI_INDIRECT_DST_V10:
5373 case AMDGPU::SI_INDIRECT_DST_V11:
5374 case AMDGPU::SI_INDIRECT_DST_V12:
5375 case AMDGPU::SI_INDIRECT_DST_V16:
5376 case AMDGPU::SI_INDIRECT_DST_V32:
5378 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5379 case AMDGPU::SI_KILL_I1_PSEUDO:
5381 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5390 Register SrcCond =
MI.getOperand(3).getReg();
5392 Register DstLo =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5393 Register DstHi =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5394 const auto *CondRC =
TRI->getWaveMaskRegClass();
5395 Register SrcCondCopy =
MRI.createVirtualRegister(CondRC);
5399 : &AMDGPU::VReg_64RegClass;
5402 : &AMDGPU::VReg_64RegClass;
5405 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5407 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5410 MI,
MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5412 MI,
MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5415 MI,
MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5417 MI,
MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5438 MI.eraseFromParent();
5441 case AMDGPU::SI_BR_UNDEF: {
5445 .
add(
MI.getOperand(0));
5447 MI.eraseFromParent();
5450 case AMDGPU::ADJCALLSTACKUP:
5451 case AMDGPU::ADJCALLSTACKDOWN: {
5458 case AMDGPU::SI_CALL_ISEL: {
5462 unsigned ReturnAddrReg =
TII->getRegisterInfo().getReturnAddressReg(*MF);
5465 MIB =
BuildMI(*BB,
MI,
DL,
TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5471 MI.eraseFromParent();
5474 case AMDGPU::V_ADD_CO_U32_e32:
5475 case AMDGPU::V_SUB_CO_U32_e32:
5476 case AMDGPU::V_SUBREV_CO_U32_e32: {
5479 unsigned Opc =
MI.getOpcode();
5481 bool NeedClampOperand =
false;
5482 if (
TII->pseudoToMCOpcode(Opc) == -1) {
5484 NeedClampOperand =
true;
5488 if (
TII->isVOP3(*
I)) {
5493 I.add(
MI.getOperand(1)).add(
MI.getOperand(2));
5494 if (NeedClampOperand)
5497 TII->legalizeOperands(*
I);
5499 MI.eraseFromParent();
5502 case AMDGPU::V_ADDC_U32_e32:
5503 case AMDGPU::V_SUBB_U32_e32:
5504 case AMDGPU::V_SUBBREV_U32_e32:
5507 TII->legalizeOperands(
MI);
5509 case AMDGPU::DS_GWS_INIT:
5510 case AMDGPU::DS_GWS_SEMA_BR:
5511 case AMDGPU::DS_GWS_BARRIER:
5512 TII->enforceOperandRCAlignment(
MI, AMDGPU::OpName::data0);
5514 case AMDGPU::DS_GWS_SEMA_V:
5515 case AMDGPU::DS_GWS_SEMA_P:
5516 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5524 case AMDGPU::S_SETREG_B32: {
5539 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5540 const unsigned SetMask = WidthMask <<
Offset;
5543 unsigned SetDenormOp = 0;
5544 unsigned SetRoundOp = 0;
5552 SetRoundOp = AMDGPU::S_ROUND_MODE;
5553 SetDenormOp = AMDGPU::S_DENORM_MODE;
5555 SetRoundOp = AMDGPU::S_ROUND_MODE;
5557 SetDenormOp = AMDGPU::S_DENORM_MODE;
5560 if (SetRoundOp || SetDenormOp) {
5563 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5564 unsigned ImmVal = Def->getOperand(1).getImm();
5578 MI.eraseFromParent();
5587 MI.setDesc(
TII->get(AMDGPU::S_SETREG_B32_mode));
5591 case AMDGPU::S_INVERSE_BALLOT_U32:
5592 case AMDGPU::S_INVERSE_BALLOT_U64:
5595 MI.setDesc(
TII->get(AMDGPU::COPY));
5597 case AMDGPU::ENDPGM_TRAP: {
5600 MI.setDesc(
TII->get(AMDGPU::S_ENDPGM));
5620 MI.eraseFromParent();
5623 case AMDGPU::SIMULATED_TRAP: {
5627 TII->insertSimulatedTrap(
MRI, *BB,
MI,
MI.getDebugLoc());
5628 MI.eraseFromParent();
  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;

  EVT VT = N->getValueType(0);
  if (VT == MVT::f16) {

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
         VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
         VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);

      DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
      DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
         VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
         VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
         VT == MVT::v32bf16);
                            : std::pair(Op0, Op0);

      DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
      DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
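// These look like the splitUnaryVectorOp / splitBinaryVectorOp /
// splitTernaryVectorOp family of helpers: wide packed vectors are handled by
// splitting the operands into low and high halves, emitting the same opcode
// on each half, and concatenating the results, which keeps each piece within
// what the packed 16-bit / 32-bit instructions can encode.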
  switch (Op.getOpcode()) {
    return LowerBRCOND(Op, DAG);
    return LowerRETURNADDR(Op, DAG);
    assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    EVT VT = Op.getValueType();
      return lowerFSQRTF32(Op, DAG);
      return lowerFSQRTF64(Op, DAG);
    return LowerTrig(Op, DAG);
    return LowerSELECT(Op, DAG);
    return LowerFDIV(Op, DAG);
    return LowerFFREXP(Op, DAG);
    return LowerATOMIC_CMP_SWAP(Op, DAG);
    return LowerSTORE(Op, DAG);
    return LowerGlobalAddress(MFI, Op, DAG);
    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
    return LowerINTRINSIC_W_CHAIN(Op, DAG);
    return LowerINTRINSIC_VOID(Op, DAG);
    return lowerADDRSPACECAST(Op, DAG);
    return lowerINSERT_SUBVECTOR(Op, DAG);
    return lowerINSERT_VECTOR_ELT(Op, DAG);
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
    return lowerVECTOR_SHUFFLE(Op, DAG);
    return lowerSCALAR_TO_VECTOR(Op, DAG);
    return lowerBUILD_VECTOR(Op, DAG);
    return lowerFP_ROUND(Op, DAG);
    return lowerTRAP(Op, DAG);
    return lowerDEBUGTRAP(Op, DAG);
    return lowerFMINNUM_FMAXNUM(Op, DAG);
    return lowerFLDEXP(Op, DAG);
    return lowerMUL(Op, DAG);
    return lowerXMULO(Op, DAG);
    return lowerXMUL_LOHI(Op, DAG);

  EVT FittingLoadVT = LoadVT;
6049SDValue SITargetLowering::adjustLoadValueType(
unsigned Opcode,
MemSDNode *M,
6052 bool IsIntrinsic)
const {
6056 EVT LoadVT =
M->getValueType(0);
6058 EVT EquivLoadVT = LoadVT;
6076 M->getMemoryVT(),
M->getMemOperand());
6087 EVT LoadVT =
M->getValueType(0);
6093 assert(
M->getNumValues() == 2 ||
M->getNumValues() == 3);
6094 bool IsTFE =
M->getNumValues() == 3;
6107 return handleByteShortBufferLoads(DAG, LoadVT,
DL, Ops,
M->getMemOperand(),
6111 return getMemIntrinsicNode(Opc,
DL,
M->getVTList(), Ops, IntVT,
6112 M->getMemOperand(), DAG);
6117 SDValue MemNode = getMemIntrinsicNode(Opc,
DL, VTList, Ops, CastVT,
6118 M->getMemOperand(), DAG);
6126 EVT VT =
N->getValueType(0);
6127 unsigned CondCode =
N->getConstantOperandVal(3);
6138 EVT CmpVT =
LHS.getValueType();
6139 if (CmpVT == MVT::i16 && !TLI.
isTypeLegal(MVT::i16)) {
6140 unsigned PromoteOp =
6160 EVT VT =
N->getValueType(0);
6162 unsigned CondCode =
N->getConstantOperandVal(3);
6171 if (CmpVT == MVT::f16 && !TLI.
isTypeLegal(CmpVT)) {
6189 EVT VT =
N->getValueType(0);
6196 Src.getOperand(1), Src.getOperand(2));
6207 Exec = AMDGPU::EXEC_LO;
6209 Exec = AMDGPU::EXEC;
6226 EVT VT =
N->getValueType(0);
6228 unsigned IID =
N->getConstantOperandVal(0);
6229 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6230 IID == Intrinsic::amdgcn_permlanex16;
6231 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6232 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6236 unsigned SplitSize = 32;
6237 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
6238 ST->hasDPALU_DPP() &&
6246 case Intrinsic::amdgcn_permlane16:
6247 case Intrinsic::amdgcn_permlanex16:
6248 case Intrinsic::amdgcn_update_dpp:
6253 case Intrinsic::amdgcn_writelane:
6256 case Intrinsic::amdgcn_readlane:
6257 case Intrinsic::amdgcn_set_inactive:
6258 case Intrinsic::amdgcn_set_inactive_chain_arg:
6259 case Intrinsic::amdgcn_mov_dpp8:
6262 case Intrinsic::amdgcn_readfirstlane:
6263 case Intrinsic::amdgcn_permlane64:
6273 if (
SDNode *GL =
N->getGluedNode()) {
6275 GL = GL->getOperand(0).getNode();
6285 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6286 IID == Intrinsic::amdgcn_mov_dpp8 ||
6287 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6288 Src1 =
N->getOperand(2);
6289 if (IID == Intrinsic::amdgcn_writelane ||
6290 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
6291 Src2 =
N->getOperand(3);
6294 if (ValSize == SplitSize) {
6304 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6309 if (IID == Intrinsic::amdgcn_writelane) {
6314 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
6316 return IsFloat ? DAG.
getBitcast(VT, Trunc) : Trunc;
6319 if (ValSize % SplitSize != 0)
6323 EVT VT =
N->getValueType(0);
6327 unsigned NumOperands =
N->getNumOperands();
6329 SDNode *GL =
N->getGluedNode();
6334 for (
unsigned i = 0; i != NE; ++i) {
6335 for (
unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
6337 SDValue Operand =
N->getOperand(j);
6367 if (SplitSize == 32) {
6369 return unrollLaneOp(LaneOp.
getNode());
6375 unsigned SubVecNumElt =
6379 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
6380 for (
unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
6384 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
6389 if (IID == Intrinsic::amdgcn_writelane)
6394 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
6395 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
6396 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
6397 EltIdx += SubVecNumElt;
6411 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6414 if (IID == Intrinsic::amdgcn_writelane)
6417 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
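  // For lane intrinsics (readlane, writelane, permlane*, update_dpp, ...)
  // operating on values wider than 32 bits, the value is recast as a vector
  // of 32-bit (or v2i16-sized) pieces, the lane operation is emitted per
  // piece, and the pieces are reassembled and bitcast back to the original
  // type; 32-bit values take the direct single-op path above.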
6425 switch (
N->getOpcode()) {
6437 unsigned IID =
N->getConstantOperandVal(0);
6439 case Intrinsic::amdgcn_make_buffer_rsrc:
6440 Results.push_back(lowerPointerAsRsrcIntrin(
N, DAG));
6442 case Intrinsic::amdgcn_cvt_pkrtz: {
6451 case Intrinsic::amdgcn_cvt_pknorm_i16:
6452 case Intrinsic::amdgcn_cvt_pknorm_u16:
6453 case Intrinsic::amdgcn_cvt_pk_i16:
6454 case Intrinsic::amdgcn_cvt_pk_u16: {
6460 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6462 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6464 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6469 EVT VT =
N->getValueType(0);
6478 case Intrinsic::amdgcn_s_buffer_load: {
6490 EVT VT =
Op.getValueType();
6491 assert(VT == MVT::i8 &&
"Expected 8-bit s_buffer_load intrinsics.\n");
6503 if (!
Offset->isDivergent()) {
6522 LoadVal = handleByteShortBufferLoads(DAG, VT,
DL, Ops, MMO);
6534 for (
unsigned I = 0;
I < Res.getNumOperands();
I++) {
6535 Results.push_back(Res.getOperand(
I));
6539 Results.push_back(Res.getValue(1));
6548 EVT VT =
N->getValueType(0);
6553 EVT SelectVT = NewVT;
6554 if (NewVT.
bitsLT(MVT::i32)) {
6557 SelectVT = MVT::i32;
6563 if (NewVT != SelectVT)
6569 if (
N->getValueType(0) != MVT::v2f16)
6581 if (
N->getValueType(0) != MVT::v2f16)
6593 if (
N->getValueType(0) != MVT::f16)
6608 if (U.get() !=
Value)
6611 if (U.getUser()->getOpcode() == Opcode)
6617unsigned SITargetLowering::isCFIntrinsic(
const SDNode *
Intr)
const {
6619 switch (
Intr->getConstantOperandVal(1)) {
6620 case Intrinsic::amdgcn_if:
6622 case Intrinsic::amdgcn_else:
6624 case Intrinsic::amdgcn_loop:
6626 case Intrinsic::amdgcn_end_cf:
6673 SDNode *
Intr = BRCOND.getOperand(1).getNode();
6686 assert(BR &&
"brcond missing unconditional branch user");
6687 Target = BR->getOperand(1);
6690 unsigned CFNode = isCFIntrinsic(
Intr);
6709 Ops.
append(
Intr->op_begin() + (HaveChain ? 2 : 1),
Intr->op_end());
6733 for (
unsigned i = 1, e =
Intr->getNumValues() - 1; i != e; ++i) {
6746 Intr->getOperand(0));
6752 MVT VT =
Op.getSimpleValueType();
6755 if (
Op.getConstantOperandVal(0) != 0)
6761 if (
Info->isEntryFunction())
6778 return Op.getValueType().bitsLE(VT)
6785 assert(
Op.getValueType() == MVT::f16 &&
6786 "Do not know how to custom lower FP_ROUND for non-f16 type");
6789 EVT SrcVT = Src.getValueType();
6790 if (SrcVT != MVT::f64)
6806 EVT VT =
Op.getValueType();
6809 bool IsIEEEMode =
Info->getMode().IEEE;
6818 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
6826 EVT VT =
Op.getValueType();
6830 EVT ExpVT =
Exp.getValueType();
6831 if (ExpVT == MVT::i16)
6852 {
Op.getOperand(0),
Op.getOperand(1), TruncExp});
6859 switch (
Op->getOpcode()) {
6889 DAGCombinerInfo &DCI)
const {
6890 const unsigned Opc =
Op.getOpcode();
6898 :
Op->getOperand(0).getValueType();
6901 if (DCI.isBeforeLegalizeOps() ||
6905 auto &DAG = DCI.DAG;
6911 LHS =
Op->getOperand(1);
6912 RHS =
Op->getOperand(2);
6914 LHS =
Op->getOperand(0);
6915 RHS =
Op->getOperand(1);
6946 EVT VT =
Op.getValueType();
6952 assert(VT == MVT::i64 &&
"The following code is a special for s_mul_u64");
6979 if (
Op->isDivergent())
6992 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
6994 DAG.
getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
6997 if (Op0SignBits >= 33 && Op1SignBits >= 33)
6999 DAG.
getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
7005 EVT VT =
Op.getValueType();
7012 const APInt &
C = RHSC->getAPIntValue();
7014 if (
C.isPowerOf2()) {
7016 bool UseArithShift =
isSigned && !
C.isMinSignedValue();
7043 if (
Op->isDivergent()) {
7060 return lowerTrapEndpgm(
Op, DAG);
7063 : lowerTrapHsaQueuePtr(
Op, DAG);
7073SITargetLowering::loadImplicitKernelArgument(
SelectionDAG &DAG,
MVT VT,
7075 ImplicitParameter Param)
const {
7095 loadImplicitKernelArgument(DAG, MVT::i64, SL,
Align(8),
QUEUE_PTR);
7101 if (UserSGPR == AMDGPU::NoRegister) {
7143 "debugtrap handler not supported",
7156SDValue SITargetLowering::getSegmentAperture(
unsigned AS,
const SDLoc &
DL,
7160 ? AMDGPU::SRC_SHARED_BASE
7161 : AMDGPU::SRC_PRIVATE_BASE;
7184 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
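  // The aperture is the high half of the 64-bit SRC_SHARED_BASE /
  // SRC_PRIVATE_BASE register, hence the shift right by 32 above; targets
  // without those registers appear to fall back to loading the aperture from
  // an implicit kernel argument or the queue pointer instead.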
7193 return loadImplicitKernelArgument(DAG, MVT::i32,
DL,
Align(4), Param);
7199 if (UserSGPR == AMDGPU::NoRegister) {
7229 if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
7230 isa<BasicBlockSDNode>(Val))
7233 if (
auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
7234 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
7248 unsigned DestAS, SrcAS;
7250 bool IsNonNull =
false;
7251 if (
const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(
Op)) {
7252 SrcAS = ASC->getSrcAddressSpace();
7253 Src = ASC->getOperand(0);
7254 DestAS = ASC->getDestAddressSpace();
7257 Op.getConstantOperandVal(0) ==
7258 Intrinsic::amdgcn_addrspacecast_nonnull);
7259 Src =
Op->getOperand(1);
7260 SrcAS =
Op->getConstantOperandVal(2);
7261 DestAS =
Op->getConstantOperandVal(3);
7276 unsigned NullVal =
TM.getNullPointerValue(DestAS);
7290 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
7298 unsigned NullVal =
TM.getNullPointerValue(SrcAS);
7310 Op.getValueType() == MVT::i64) {
7319 Src.getValueType() == MVT::i64)
7343 EVT InsVT =
Ins.getValueType();
7346 unsigned IdxVal =
Idx->getAsZExtVal();
7351 assert(InsNumElts % 2 == 0 &&
"expect legal vector types");
7356 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
7358 MVT::i32, InsNumElts / 2);
7363 for (
unsigned I = 0;
I != InsNumElts / 2; ++
I) {
7365 if (InsNumElts == 2) {
7378 for (
unsigned I = 0;
I != InsNumElts; ++
I) {
7400 auto *KIdx = dyn_cast<ConstantSDNode>(
Idx);
7401 if (NumElts == 4 && EltSize == 16 && KIdx) {
7412 unsigned Idx = KIdx->getZExtValue();
7413 bool InsertLo =
Idx < 2;
7430 if (isa<ConstantSDNode>(
Idx))
7436 assert(VecSize <= 64 &&
"Expected target vector size to be <= 64 bits");
7442 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
7471 EVT ResultVT =
Op.getValueType();
7484 if (
SDValue Combined = performExtractVectorEltCombine(
Op.getNode(), DCI))
7487 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
7491 if (VecSize == 128) {
7499 }
else if (VecSize == 256) {
7502 for (
unsigned P = 0;
P < 4; ++
P) {
7508 Parts[0], Parts[1]));
7510 Parts[2], Parts[3]));
7516 for (
unsigned P = 0;
P < 8; ++
P) {
7523 Parts[0], Parts[1], Parts[2], Parts[3]));
7526 Parts[4], Parts[5], Parts[6], Parts[7]));
7529 EVT IdxVT =
Idx.getValueType();
7546 Src = DAG.
getBitcast(Src.getValueType().changeTypeToInteger(), Src);
7561 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
7571 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7577 EVT ResultVT =
Op.getValueType();
7581 int SrcNumElts =
Op.getOperand(0).getValueType().getVectorNumElements();
7597 int VecIdx =
Idx < SrcNumElts ? 0 : 1;
7598 int EltIdx =
Idx < SrcNumElts ?
Idx :
Idx - SrcNumElts;
7606 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
7607 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
7608 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
7609 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
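    // Shuffle lowering works on 16-bit element pairs: when two adjacent mask
    // entries pick consecutive, even-aligned elements (the Mask[Elt] + 1
    // check above), the pair is handled as one packed 32-bit piece; otherwise
    // each element is extracted individually from source 0 or 1 depending on
    // whether its index falls below SrcNumElts.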
7628 EVT ResultVT =
Op.getValueType();
7644 EVT VT =
Op.getValueType();
7646 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
7681 for (
unsigned P = 0;
P < NumParts; ++
P) {
7683 PartVT, SL, {
Op.getOperand(
P * 2),
Op.getOperand(
P * 2 + 1)});
7716 assert(isInt<32>(
Offset + 4) &&
"32-bit offset is expected!");
7754 EVT PtrVT =
Op.getValueType();
7770 assert(PtrVT == MVT::i32 &&
"32-bit pointer is expected.");
7842 SDValue Param = lowerKernargMemParameter(
7852 "non-hsa intrinsic with hsa target",
7861 "intrinsic not supported on subtarget",
7871 unsigned NumElts = Elts.
size();
7873 if (NumElts <= 12) {
7882 for (
unsigned i = 0; i < Elts.
size(); ++i) {
7888 for (
unsigned i = Elts.
size(); i < NumElts; ++i)
7889 VecElts[i] = DAG.
getUNDEF(MVT::f32);
7898 EVT SrcVT = Src.getValueType();
7919                                 bool Unpacked, bool IsD16, int DMaskPop,
7920                                 int NumVDataDwords, bool IsAtomicPacked16Bit,
7924  EVT ReqRetVT = ResultTypes[0];
7926  int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
7927                          ? (ReqRetNumElts + 1) / 2
7930  int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
7941  if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
7952  if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
7954                       NumDataDwords - MaskPopDwords);
7959  EVT LegalReqRetVT = ReqRetVT;
7961  if (!Data.getValueType().isInteger())
7963                       Data.getValueType().changeTypeToInteger(), Data);
7984  if (Result->getNumValues() == 1)
7991                          SDValue *LWE, bool &IsTexFail) {
7992  auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
8011                                   unsigned DimIdx, unsigned EndIdx,
8012                                   unsigned NumGradients) {
8014  for (unsigned I = DimIdx; I < EndIdx; I++) {
8022    if (((I + 1) >= EndIdx) ||
8023        ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
8024         I == DimIdx + NumGradients - 1))) {
8025      if (Addr.getValueType() != MVT::i16)
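      // 16-bit image addresses are packed two-to-a-dword (v2f16 / v2i16);
      // when a gradient or coordinate group has an odd element count, the
      // trailing element is paired with an undef lane so every packed operand
      // still fills a full 32-bit register.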
8046  unsigned IntrOpcode = Intr->BaseOpcode;
8057  int NumVDataDwords = 0;
8058  bool AdjustRetType = false;
8059  bool IsAtomicPacked16Bit = false;
8062  const unsigned ArgOffset = WithChain ? 2 : 1;
8065  unsigned DMaskLanes = 0;
8067  if (BaseOpcode->Atomic) {
8068    VData = Op.getOperand(2);
8070    IsAtomicPacked16Bit =
8071        (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
8072         Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
8075    if (BaseOpcode->AtomicX2) {
8082      ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
8083      DMask = Is64Bit ? 0xf : 0x3;
8084      NumVDataDwords = Is64Bit ? 4 : 2;
8086      DMask = Is64Bit ? 0x3 : 0x1;
8087      NumVDataDwords = Is64Bit ? 2 : 1;
8090    DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
8093    if (BaseOpcode->Store) {
8094      VData = Op.getOperand(2);
8102        VData = handleD16VData(VData, DAG, true);
8106    } else if (!BaseOpcode->NoReturn) {
8119          (!LoadVT.isVector() && DMaskLanes > 1))
8127        NumVDataDwords = (DMaskLanes + 1) / 2;
8129        NumVDataDwords = DMaskLanes;
8131      AdjustRetType = true;
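      // DMaskLanes is the population count of the dmask: the number of result
      // components the instruction actually returns. With packed D16 results
      // two 16-bit components share a dword, so NumVDataDwords is halved
      // (rounding up); AdjustRetType records that the declared result type no
      // longer matches what the instruction returns, so it is reconciled
      // before the final node is built.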
8135  unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
8140      Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
8142  MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8143  IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8145  VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
8147  MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8148  IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8151  for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
8152    if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
8153      assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
8158                    {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
8162             "Bias needs to be converted to 16 bit in A16 mode");
8167  if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
8171      dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
8172                 "require 16 bit args for both gradients and addresses");
8177    if (!ST->hasA16()) {
8178      LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
8179                           "support 16 bit addresses\n");
8189 if (BaseOpcode->Gradients && IsG16 &&
ST->hasG16()) {
8193 IntrOpcode = G16MappingInfo->
G16;
8201 ArgOffset +
Intr->GradientStart,
8202 ArgOffset +
Intr->CoordStart,
Intr->NumGradients);
8204 for (
unsigned I = ArgOffset +
Intr->GradientStart;
8205 I < ArgOffset + Intr->CoordStart;
I++)
8212 ArgOffset +
Intr->CoordStart, VAddrEnd,
8216 for (
unsigned I = ArgOffset +
Intr->CoordStart;
I < VAddrEnd;
I++)
8234  const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
8235  const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
8236  const bool UseNSA = ST->hasNSAEncoding() &&
8237                      VAddrs.size() >= ST->getNSAThreshold(MF) &&
8238                      (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
8239  const bool UsePartialNSA =
8240      UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
8243  if (UsePartialNSA) {
8245                  ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
8246  } else if (!UseNSA) {
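    // With the non-sequential-address (NSA) encoding each address component
    // may live in its own VGPR; without it the whole address must be one
    // contiguous VGPR tuple. Partial NSA keeps the first NSAMaxSize - 1
    // components separate and concatenates only the tail into a single
    // operand, which is why the leftovers are merged with drop_front above.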
8253 if (!BaseOpcode->Sampler) {
8257 Op.getConstantOperandVal(ArgOffset +
Intr->UnormIndex);
8259 Unorm = UnormConst ? True : False;
8264 SDValue TexFail =
Op.getOperand(ArgOffset +
Intr->TexFailCtrlIndex);
8265 bool IsTexFail =
false;
8266 if (!
parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
8277 NumVDataDwords += 1;
8278 AdjustRetType =
true;
8283 if (AdjustRetType) {
8286 if (DMaskLanes == 0 && !BaseOpcode->Store) {
8289 if (isa<MemSDNode>(
Op))
8295 MVT::i32, NumVDataDwords)
8298 ResultTypes[0] = NewVT;
8299 if (ResultTypes.size() == 3) {
8303 ResultTypes.erase(&ResultTypes[1]);
8307 unsigned CPol =
Op.getConstantOperandVal(ArgOffset +
Intr->CachePolicyIndex);
8308 if (BaseOpcode->Atomic)
8315 if (BaseOpcode->Store || BaseOpcode->Atomic)
8317 if (UsePartialNSA) {
8326 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
8329 if (BaseOpcode->Sampler) {
8338 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8342 ST->hasFeature(AMDGPU::FeatureR128A16)
8352 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8356 if (BaseOpcode->HasD16)
8358 if (isa<MemSDNode>(
Op))
8361 int NumVAddrDwords =
8367 NumVDataDwords, NumVAddrDwords);
8368 }
else if (IsGFX11Plus) {
8370 UseNSA ? AMDGPU::MIMGEncGfx11NSA
8371 : AMDGPU::MIMGEncGfx11Default,
8372 NumVDataDwords, NumVAddrDwords);
8373 }
else if (IsGFX10Plus) {
8375 UseNSA ? AMDGPU::MIMGEncGfx10NSA
8376 : AMDGPU::MIMGEncGfx10Default,
8377 NumVDataDwords, NumVAddrDwords);
8381 NumVDataDwords, NumVAddrDwords);
8384 "requested image instruction is not supported on this GPU");
8389 NumVDataDwords, NumVAddrDwords);
8392 NumVDataDwords, NumVAddrDwords);
8398 if (
auto *
MemOp = dyn_cast<MemSDNode>(
Op)) {
8403 if (BaseOpcode->AtomicX2) {
8408 if (BaseOpcode->NoReturn)
8412 NumVDataDwords, IsAtomicPacked16Bit,
DL);
8430  if (!Offset->isDivergent()) {
8475    return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
8479  unsigned NumLoads = 1;
8485  if (NumElts == 8 || NumElts == 16) {
8486    NumLoads = NumElts / 4;
8494  setBufferOffsets(Offset, DAG, &Ops[3],
8495                   NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
8498  for (unsigned i = 0; i < NumLoads; ++i) {
8504  if (NumElts == 8 || NumElts == 16)
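  // s_buffer_load results wider than four dwords are assembled from several
  // 4-dword loads: an 8- or 16-element request becomes NumElts / 4 loads at
  // increasing offsets, and the pieces are concatenated back into the
  // requested vector type.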
8551 EVT VT =
Op.getValueType();
8553 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
8557 switch (IntrinsicID) {
8558 case Intrinsic::amdgcn_implicit_buffer_ptr: {
8561 return getPreloadedValue(DAG, *MFI, VT,
8564 case Intrinsic::amdgcn_dispatch_ptr:
8565 case Intrinsic::amdgcn_queue_ptr: {
8568 MF.
getFunction(),
"unsupported hsa intrinsic without hsa target",
8574 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
8577 return getPreloadedValue(DAG, *MFI, VT, RegID);
8579 case Intrinsic::amdgcn_implicitarg_ptr: {
8581 return getImplicitArgPtr(DAG,
DL);
8582 return getPreloadedValue(DAG, *MFI, VT,
8585 case Intrinsic::amdgcn_kernarg_segment_ptr: {
8591 return getPreloadedValue(DAG, *MFI, VT,
8594 case Intrinsic::amdgcn_dispatch_id: {
8597 case Intrinsic::amdgcn_rcp:
8599 case Intrinsic::amdgcn_rsq:
8601 case Intrinsic::amdgcn_rsq_legacy:
8605 case Intrinsic::amdgcn_rcp_legacy:
8609 case Intrinsic::amdgcn_rsq_clamp: {
8623 case Intrinsic::r600_read_ngroups_x:
8627 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8630 case Intrinsic::r600_read_ngroups_y:
8634 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8637 case Intrinsic::r600_read_ngroups_z:
8641 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8644 case Intrinsic::r600_read_global_size_x:
8648 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8651 case Intrinsic::r600_read_global_size_y:
8655 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8658 case Intrinsic::r600_read_global_size_z:
8662 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8665 case Intrinsic::r600_read_local_size_x:
8669 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
8671 case Intrinsic::r600_read_local_size_y:
8675 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
8677 case Intrinsic::r600_read_local_size_z:
8681 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
8683 case Intrinsic::amdgcn_workgroup_id_x:
8684 return getPreloadedValue(DAG, *MFI, VT,
8686 case Intrinsic::amdgcn_workgroup_id_y:
8687 return getPreloadedValue(DAG, *MFI, VT,
8689 case Intrinsic::amdgcn_workgroup_id_z:
8690 return getPreloadedValue(DAG, *MFI, VT,
8692 case Intrinsic::amdgcn_wave_id:
8693 return lowerWaveID(DAG,
Op);
8694 case Intrinsic::amdgcn_lds_kernel_id: {
8696 return getLDSKernelId(DAG,
DL);
8697 return getPreloadedValue(DAG, *MFI, VT,
8700 case Intrinsic::amdgcn_workitem_id_x:
8701 return lowerWorkitemID(DAG,
Op, 0, MFI->getArgInfo().WorkItemIDX);
8702 case Intrinsic::amdgcn_workitem_id_y:
8703 return lowerWorkitemID(DAG,
Op, 1, MFI->getArgInfo().WorkItemIDY);
8704 case Intrinsic::amdgcn_workitem_id_z:
8705 return lowerWorkitemID(DAG,
Op, 2, MFI->getArgInfo().WorkItemIDZ);
8706 case Intrinsic::amdgcn_wavefrontsize:
8709 case Intrinsic::amdgcn_s_buffer_load: {
8710 unsigned CPol =
Op.getConstantOperandVal(3);
8717 return lowerSBuffer(VT,
DL,
Op.getOperand(1),
Op.getOperand(2),
8718 Op.getOperand(3), DAG);
8720 case Intrinsic::amdgcn_fdiv_fast:
8721 return lowerFDIV_FAST(
Op, DAG);
8722 case Intrinsic::amdgcn_sin:
8725 case Intrinsic::amdgcn_cos:
8728 case Intrinsic::amdgcn_mul_u24:
8731 case Intrinsic::amdgcn_mul_i24:
8735 case Intrinsic::amdgcn_log_clamp: {
8741 case Intrinsic::amdgcn_fract:
8744 case Intrinsic::amdgcn_class:
8747 case Intrinsic::amdgcn_div_fmas:
8749 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
8751 case Intrinsic::amdgcn_div_fixup:
8753 Op.getOperand(2),
Op.getOperand(3));
8755 case Intrinsic::amdgcn_div_scale: {
8768 SDValue Src0 =
Param->isAllOnes() ? Numerator : Denominator;
8771 Denominator, Numerator);
8773 case Intrinsic::amdgcn_icmp: {
8775 if (
Op.getOperand(1).getValueType() == MVT::i1 &&
8776 Op.getConstantOperandVal(2) == 0 &&
8781 case Intrinsic::amdgcn_fcmp: {
8784 case Intrinsic::amdgcn_ballot:
8786 case Intrinsic::amdgcn_fmed3:
8788 Op.getOperand(2),
Op.getOperand(3));
8789 case Intrinsic::amdgcn_fdot2:
8791 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
8792 case Intrinsic::amdgcn_fmul_legacy:
8795 case Intrinsic::amdgcn_sffbh:
8797 case Intrinsic::amdgcn_sbfe:
8799 Op.getOperand(2),
Op.getOperand(3));
8800 case Intrinsic::amdgcn_ubfe:
8802 Op.getOperand(2),
Op.getOperand(3));
8803 case Intrinsic::amdgcn_cvt_pkrtz:
8804 case Intrinsic::amdgcn_cvt_pknorm_i16:
8805 case Intrinsic::amdgcn_cvt_pknorm_u16:
8806 case Intrinsic::amdgcn_cvt_pk_i16:
8807 case Intrinsic::amdgcn_cvt_pk_u16: {
8809 EVT VT =
Op.getValueType();
8812 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8814 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8816 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8818 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8824 return DAG.
getNode(Opcode,
DL, VT,
Op.getOperand(1),
Op.getOperand(2));
8827 DAG.
getNode(Opcode,
DL, MVT::i32,
Op.getOperand(1),
Op.getOperand(2));
8830 case Intrinsic::amdgcn_fmad_ftz:
8832 Op.getOperand(2),
Op.getOperand(3));
8834 case Intrinsic::amdgcn_if_break:
8836 Op->getOperand(1),
Op->getOperand(2)),
8839 case Intrinsic::amdgcn_groupstaticsize: {
8851 case Intrinsic::amdgcn_is_shared:
8852 case Intrinsic::amdgcn_is_private: {
8854 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
8857 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
8865 case Intrinsic::amdgcn_perm:
8867 Op.getOperand(2),
Op.getOperand(3));
8868 case Intrinsic::amdgcn_reloc_constant: {
8872 auto *RelocSymbol = cast<GlobalVariable>(
8878 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8879 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8880 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8881 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8882 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8883 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8884 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8885 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8886 if (
Op.getOperand(4).getValueType() == MVT::i32)
8892 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
8893 Op.getOperand(3), IndexKeyi32);
8895 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8896 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8897 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8898 if (
Op.getOperand(6).getValueType() == MVT::i32)
8904 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8905 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
8906 IndexKeyi32, Op.getOperand(7)});
8908 case Intrinsic::amdgcn_addrspacecast_nonnull:
8909 return lowerADDRSPACECAST(
Op, DAG);
8910 case Intrinsic::amdgcn_readlane:
8911 case Intrinsic::amdgcn_readfirstlane:
8912 case Intrinsic::amdgcn_writelane:
8913 case Intrinsic::amdgcn_permlane16:
8914 case Intrinsic::amdgcn_permlanex16:
8915 case Intrinsic::amdgcn_permlane64:
8916 case Intrinsic::amdgcn_set_inactive:
8917 case Intrinsic::amdgcn_set_inactive_chain_arg:
8918 case Intrinsic::amdgcn_mov_dpp8:
8919 case Intrinsic::amdgcn_update_dpp:
8924 return lowerImage(
Op, ImageDimIntr, DAG,
false);
8935 return DAG.
getRegister(AMDGPU::SGPR_NULL, MVT::i32);
8941 unsigned NewOpcode)
const {
8945 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
8946 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
8960 auto *
M = cast<MemSDNode>(
Op);
8964 M->getMemOperand());
8969 unsigned NewOpcode)
const {
8973 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
8974 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
8988 auto *
M = cast<MemSDNode>(
Op);
8992 M->getMemOperand());
8997  unsigned IntrID = Op.getConstantOperandVal(1);
9001  case Intrinsic::amdgcn_ds_ordered_add:
9002  case Intrinsic::amdgcn_ds_ordered_swap: {
9007    unsigned IndexOperand = M->getConstantOperandVal(7);
9008    unsigned WaveRelease = M->getConstantOperandVal(8);
9009    unsigned WaveDone = M->getConstantOperandVal(9);
9011    unsigned OrderedCountIndex = IndexOperand & 0x3f;
9012    IndexOperand &= ~0x3f;
9013    unsigned CountDw = 0;
9016      CountDw = (IndexOperand >> 24) & 0xf;
9017      IndexOperand &= ~(0xf << 24);
9019      if (CountDw < 1 || CountDw > 4) {
9021            "ds_ordered_count: dword count must be between 1 and 4");
9028    if (WaveDone && !WaveRelease)
9031    unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
9032    unsigned ShaderType =
9034    unsigned Offset0 = OrderedCountIndex << 2;
9035    unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
9038      Offset1 |= (CountDw - 1) << 6;
9041      Offset1 |= ShaderType << 2;
9043    unsigned Offset = Offset0 | (Offset1 << 8);
9050                                   M->getVTList(), Ops, M->getMemoryVT(),
9051                                   M->getMemOperand());
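    // The ds_ordered_count offset field is a packed descriptor rather than a
    // byte offset: bits [7:2] of the low byte hold the ordered-count index,
    // and the upper byte carries wave_release, wave_done, the shader type,
    // the add/swap selector and (dword count - 1), exactly as assembled into
    // Offset0 and Offset1 above.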
9053 case Intrinsic::amdgcn_raw_buffer_load:
9054 case Intrinsic::amdgcn_raw_ptr_buffer_load:
9055 case Intrinsic::amdgcn_raw_atomic_buffer_load:
9056 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
9057 case Intrinsic::amdgcn_raw_buffer_load_format:
9058 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
9059 const bool IsFormat =
9060 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
9061 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
9063 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9064 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
9077 auto *
M = cast<MemSDNode>(
Op);
9078 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
9080 case Intrinsic::amdgcn_struct_buffer_load:
9081 case Intrinsic::amdgcn_struct_ptr_buffer_load:
9082 case Intrinsic::amdgcn_struct_buffer_load_format:
9083 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
9084 case Intrinsic::amdgcn_struct_atomic_buffer_load:
9085 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
9086 const bool IsFormat =
9087 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
9088 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
9090 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9091 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
9104 return lowerIntrinsicLoad(cast<MemSDNode>(
Op), IsFormat, DAG, Ops);
9106 case Intrinsic::amdgcn_raw_tbuffer_load:
9107 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
9109 EVT LoadVT =
Op.getValueType();
9110 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9111 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
9130 Op->getVTList(), Ops, LoadVT,
M->getMemOperand(),
9133 case Intrinsic::amdgcn_struct_tbuffer_load:
9134 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
9136 EVT LoadVT =
Op.getValueType();
9137 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9138 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
9157 Op->getVTList(), Ops, LoadVT,
M->getMemOperand(),
9160 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9161 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9163 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9164 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9165 return lowerStructBufferAtomicIntrin(
Op, DAG,
9167 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9168 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9170 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9171 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9172 return lowerStructBufferAtomicIntrin(
Op, DAG,
9174 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9175 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9177 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9178 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9179 return lowerStructBufferAtomicIntrin(
Op, DAG,
9181 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9182 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9184 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9185 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9187 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9188 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9190 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9191 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9193 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9194 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9196 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9197 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9199 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9200 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9202 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9203 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9205 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9206 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9208 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9209 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9211 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9212 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9214 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9215 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9217 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9218 return lowerRawBufferAtomicIntrin(
Op, DAG,
9220 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9221 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9222 return lowerStructBufferAtomicIntrin(
Op, DAG,
9224 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9225 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9227 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9228 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9230 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9231 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9232 return lowerStructBufferAtomicIntrin(
Op, DAG,
9234 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9235 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9236 return lowerStructBufferAtomicIntrin(
Op, DAG,
9238 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9239 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9240 return lowerStructBufferAtomicIntrin(
Op, DAG,
9242 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9243 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9244 return lowerStructBufferAtomicIntrin(
Op, DAG,
9246 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9247 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9249 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9250 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9252 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9253 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9255 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9256 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9258 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9259 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9261 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9262 return lowerStructBufferAtomicIntrin(
Op, DAG,
9265 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9266 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9267 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(4), DAG);
9268 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
9282 EVT VT =
Op.getValueType();
9283 auto *
M = cast<MemSDNode>(
Op);
9286 Op->getVTList(), Ops, VT,
9287 M->getMemOperand());
9289 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9290 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9291 SDValue Rsrc = bufferRsrcPtrToVector(
Op->getOperand(4), DAG);
9292 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(6), DAG);
9306 EVT VT =
Op.getValueType();
9307 auto *
M = cast<MemSDNode>(
Op);
9310 Op->getVTList(), Ops, VT,
9311 M->getMemOperand());
9313  case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9315    SDValue NodePtr = M->getOperand(2);
9316    SDValue RayExtent = M->getOperand(3);
9317    SDValue RayOrigin = M->getOperand(4);
9319    SDValue RayInvDir = M->getOperand(6);
9337    const unsigned NumVDataDwords = 4;
9338    const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9339    const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9343 const unsigned BaseOpcodes[2][2] = {
9344 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9345 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9346 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9350 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9351 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9352 : AMDGPU::MIMGEncGfx10NSA,
9353 NumVDataDwords, NumVAddrDwords);
9357 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9358 : AMDGPU::MIMGEncGfx10Default,
9359 NumVDataDwords, NumVAddrDwords);
9365 auto packLanes = [&DAG, &Ops, &
DL](
SDValue Op,
bool IsAligned) {
9368 if (Lanes[0].getValueSizeInBits() == 32) {
9369 for (
unsigned I = 0;
I < 3; ++
I)
9388 if (UseNSA && IsGFX11Plus) {
9396 for (
unsigned I = 0;
I < 3; ++
I) {
9399 {DirLanes[I], InvDirLanes[I]})));
9414 packLanes(RayOrigin,
true);
9415 packLanes(RayDir,
true);
9416 packLanes(RayInvDir,
false);
9421 if (NumVAddrDwords > 12) {
9441 case Intrinsic::amdgcn_global_atomic_fmin_num:
9442 case Intrinsic::amdgcn_global_atomic_fmax_num:
9443 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9444 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9451 unsigned Opcode = 0;
9453 case Intrinsic::amdgcn_global_atomic_fmin_num:
9454 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9458 case Intrinsic::amdgcn_global_atomic_fmax_num:
9459 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9467 Ops,
M->getMemOperand());
9469 case Intrinsic::amdgcn_s_get_barrier_state:
9470 case Intrinsic::amdgcn_s_get_named_barrier_state: {
9475 if (isa<ConstantSDNode>(
Op->getOperand(2))) {
9476 uint64_t BarID = cast<ConstantSDNode>(
Op->getOperand(2))->getZExtValue();
9477 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
9478 BarID = (BarID >> 4) & 0x3F;
9479 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9484 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9485 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
9505 return lowerImage(
Op, ImageDimIntr, DAG,
true);
9513SDValue SITargetLowering::getMemIntrinsicNode(
unsigned Opcode,
const SDLoc &
DL,
9523 bool IsTFE = VTList.
NumVTs == 3;
9526 unsigned NumOpDWords = NumValueDWords + 1;
9531 SDValue Op = getMemIntrinsicNode(Opcode,
DL, OpDWordsVTList, Ops,
9532 OpDWordsVT, OpDWordsMMO, DAG);
9547 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9553 WidenedMemVT, WidenedMMO);
9563 bool ImageStore)
const {
9598 for (
unsigned I = 0;
I < Elts.
size() / 2;
I += 1) {
9604 if ((NumElements % 2) == 1) {
9606 unsigned I = Elts.
size() / 2;
9622 if (NumElements == 3) {
9643 unsigned IntrinsicID =
Op.getConstantOperandVal(1);
9646 switch (IntrinsicID) {
9647 case Intrinsic::amdgcn_exp_compr: {
9651 "intrinsic not supported on subtarget",
DL.getDebugLoc());
9674 unsigned Opc =
Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9677 case Intrinsic::amdgcn_s_barrier:
9678 case Intrinsic::amdgcn_s_barrier_signal:
9679 case Intrinsic::amdgcn_s_barrier_wait: {
9682 unsigned WGSize =
ST.getFlatWorkGroupSizes(MF.
getFunction()).second;
9683 if (WGSize <=
ST.getWavefrontSize()) {
9686 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal)
9687 return Op.getOperand(0);
9690 MVT::Other,
Op.getOperand(0)),
9695 if (
ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
9701 MVT::Other, K,
Op.getOperand(0)),
9713 case Intrinsic::amdgcn_struct_tbuffer_store:
9714 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9718 VData = handleD16VData(VData, DAG);
9719 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9720 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
9738 M->getMemoryVT(),
M->getMemOperand());
9741 case Intrinsic::amdgcn_raw_tbuffer_store:
9742 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9746 VData = handleD16VData(VData, DAG);
9747 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9748 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
9766 M->getMemoryVT(),
M->getMemOperand());
9769 case Intrinsic::amdgcn_raw_buffer_store:
9770 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9771 case Intrinsic::amdgcn_raw_buffer_store_format:
9772 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9773 const bool IsFormat =
9774 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9775 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9782 VData = handleD16VData(VData, DAG);
9792 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9793 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
9813 return handleByteShortBufferStores(DAG, VDataVT,
DL, Ops, M);
9816 M->getMemoryVT(),
M->getMemOperand());
9819 case Intrinsic::amdgcn_struct_buffer_store:
9820 case Intrinsic::amdgcn_struct_ptr_buffer_store:
9821 case Intrinsic::amdgcn_struct_buffer_store_format:
9822 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9823 const bool IsFormat =
9824 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9825 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9833 VData = handleD16VData(VData, DAG);
9843 auto Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9844 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
9865 return handleByteShortBufferStores(DAG, VDataType,
DL, Ops, M);
9868 M->getMemoryVT(),
M->getMemOperand());
9870 case Intrinsic::amdgcn_raw_buffer_load_lds:
9871 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9872 case Intrinsic::amdgcn_struct_buffer_load_lds:
9873 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9877 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9878 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9879 unsigned OpOffset = HasVIndex ? 1 : 0;
9880 SDValue VOffset =
Op.getOperand(5 + OpOffset);
9882 unsigned Size =
Op->getConstantOperandVal(4);
9888 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9889 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9890 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9891 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9894 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9895 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9896 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9897 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9900 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9901 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9902 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9903 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
9908 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
9909 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
9910 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
9911 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
9916 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
9917 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
9918 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
9919 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
9927 if (HasVIndex && HasVOffset)
9933 else if (HasVOffset)
9936 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9941 unsigned Aux =
Op.getConstantOperandVal(8 + OpOffset);
9953 auto *
M = cast<MemSDNode>(
Op);
9980 case Intrinsic::amdgcn_global_load_lds: {
9982 unsigned Size =
Op->getConstantOperandVal(4);
9987 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
9990 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
9993 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
9998 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
10003 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
10007 auto *
M = cast<MemSDNode>(
Op);
10020 if (
LHS->isDivergent())
10024 RHS.getOperand(0).getValueType() == MVT::i32) {
10027 VOffset =
RHS.getOperand(0);
10032 if (!
Addr->isDivergent()) {
10049 LoadPtrI.
Offset =
Op->getConstantOperandVal(5);
10069 case Intrinsic::amdgcn_end_cf:
10071 Op->getOperand(2), Chain),
10073 case Intrinsic::amdgcn_s_barrier_init:
10074 case Intrinsic::amdgcn_s_barrier_signal_var: {
10081 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
10082 ? AMDGPU::S_BARRIER_INIT_M0
10083 : AMDGPU::S_BARRIER_SIGNAL_M0;
10098 constexpr unsigned ShAmt = 16;
10110 case Intrinsic::amdgcn_s_barrier_join: {
10117 if (isa<ConstantSDNode>(BarOp)) {
10118 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
10119 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
10122 unsigned BarID = (BarVal >> 4) & 0x3F;
10127 Opc = AMDGPU::S_BARRIER_JOIN_M0;
10143 case Intrinsic::amdgcn_s_prefetch_data: {
10146 return Op.getOperand(0);
10149 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
10151 Chain, bufferRsrcPtrToVector(
Op.getOperand(2), DAG),
10158 Op->getVTList(), Ops,
M->getMemoryVT(),
10159 M->getMemOperand());
10164 return lowerImage(
Op, ImageDimIntr, DAG,
true);
10177 std::pair<SDValue, SDValue>
10184   if ((C1 = dyn_cast<ConstantSDNode>(N0)))
10187     C1 = cast<ConstantSDNode>(N0.getOperand(1));
10201     unsigned Overflow = ImmOffset & ~MaxImm;
10202     ImmOffset -= Overflow;
10203     if ((int32_t)Overflow < 0) {
10204       Overflow += ImmOffset;
10209     auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
10213     SDValue Ops[] = {N0, OverflowVal};
10228 void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
10230                                         Align Alignment) const {
10233   if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
10236     if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10247     int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10249         TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
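  // Buffer addressing splits a combined byte offset into an SGPR soffset plus
  // a small immediate. splitMUBUFOffset enforces the hardware immediate
  // range; anything that does not fit (the Overflow computed above) is moved
  // into the register operand so the immediate stays encodable.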
10266 SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
10269   return MaybePointer;
10283   SDValue NumRecords = Op->getOperand(3);
10286   auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
10289   std::optional<uint32_t> ConstStride = std::nullopt;
10290   if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
10291     ConstStride = ConstNode->getZExtValue();
10294   if (!ConstStride || *ConstStride != 0) {
10297       ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
10308                             NewHighHalf, NumRecords, Flags);
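  // The v4i32 buffer resource is rebuilt from the 160-bit fat pointer:
  // dword 0 is the low half of the base, dword 1 is the high half of the base
  // combined with the stride shifted into its upper 16 bits, dword 2 is
  // NumRecords and dword 3 the descriptor flags.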
10318 bool IsTFE)
const {
10328 SDValue Op = getMemIntrinsicNode(Opc,
DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
10356 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
10360 Ops[1] = BufferStoreExt;
10365 M->getMemOperand());
10390 DAGCombinerInfo &DCI)
const {
10406 if ((MemVT.
isSimple() && !DCI.isAfterLegalizeDAG()) ||
10413 "unexpected vector extload");
10426 "unexpected fp extload");
10444 DCI.AddToWorklist(Cvt.
getNode());
10449 DCI.AddToWorklist(Cvt.
getNode());
10460 if (
Info.isEntryFunction())
10461 return Info.getUserSGPRInfo().hasFlatScratchInit();
10469 EVT MemVT =
Load->getMemoryVT();
10482 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10510 assert(
Op.getValueType().getVectorElementType() == MVT::i32 &&
10511 "Custom lowering for non-i32 vectors hasn't been implemented.");
10514 unsigned AS =
Load->getAddressSpace();
10538 Alignment >=
Align(4) && NumElements < 32) {
10552 if (NumElements > 4)
10571 if (NumElements > 2)
10576 if (NumElements > 4)
10588 auto Flags =
Load->getMemOperand()->getFlags();
10590 Load->getAlign(), Flags, &
Fast) &&
10599 MemVT, *
Load->getMemOperand())) {
10608 EVT VT =
Op.getValueType();
10645 EVT VT =
Op.getValueType();
10648 bool AllowInaccurateRcp =
10655 if (!AllowInaccurateRcp && VT != MVT::f16)
10658 if (CLHS->isExactlyValue(1.0)) {
10675 if (CLHS->isExactlyValue(-1.0)) {
10684 if (!AllowInaccurateRcp && (VT != MVT::f16 || !
Flags.hasAllowReciprocal()))
10698 EVT VT =
Op.getValueType();
10701 bool AllowInaccurateDiv =
10703 if (!AllowInaccurateDiv)
10724 return DAG.
getNode(Opcode, SL, VT,
A,
B, Flags);
10738 return DAG.
getNode(Opcode, SL, VTList,
10747 return DAG.
getNode(Opcode, SL, VT, {
A,
B,
C}, Flags);
10761 return DAG.
getNode(Opcode, SL, VTList,
10767 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
10768 return FastLowered;
10788 unsigned FMADOpCode =
10798 SDValue Err = DAG.
getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
10800 Quot = DAG.
getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot,
Op->getFlags());
10801 Err = DAG.
getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
10824   const APFloat K0Val(0x1p+96f);
10827   const APFloat K1Val(0x1p-32f);
10854   assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
10855   uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
10856   uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
10861   if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10862     return FastLowered;
10869   Flags.setNoFPExcept(true);
10890   using namespace AMDGPU::Hwreg;
10891   const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
10899   const bool HasDynamicDenormals =
10905   if (!PreservesDenormals) {
10913     if (HasDynamicDenormals) {
10917       SavedDenormMode = SDValue(GetReg, 0);
10925       const SDValue EnableDenormValue =
10932       const SDValue EnableDenormValue =
10934       EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
10935                                         {EnableDenormValue, BitField, Glue});
10945                              ApproxRcp, One, NegDivScale0, Flags);
10948                              ApproxRcp, Fma0, Flags);
10954                              NumeratorScaled, Mul, Flags);
10960                              NumeratorScaled, Fma3, Flags);
10962   if (!PreservesDenormals) {
10970                            DisableDenormValue, Fma4.getValue(2))
10973     assert(HasDynamicDenormals == (bool)SavedDenormMode);
10974     const SDValue DisableDenormValue =
10975         HasDynamicDenormals
10980             AMDGPU::S_SETREG_B32, SL, MVT::Other,
10991                              {Fma4, Fma1, Fma3, Scale}, Flags);
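  // f32 division follows the div_scale / rcp / FMA-refinement / div_fmas /
  // div_fixup sequence. The FMA steps only round correctly with
  // single-precision denormals enabled, so when the function's mode flushes
  // them the code temporarily enables denormals around the core sequence,
  // using S_DENORM_MODE where available and S_SETREG_B32 on the MODE register
  // otherwise, then restores the saved mode afterwards.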
10997 if (
SDValue FastLowered = lowerFastUnsafeFDIV64(
Op, DAG))
10998 return FastLowered;
11066 EVT VT =
Op.getValueType();
11068 if (VT == MVT::f32)
11069 return LowerFDIV32(
Op, DAG);
11071 if (VT == MVT::f64)
11072 return LowerFDIV64(
Op, DAG);
11074 if (VT == MVT::f16)
11075 return LowerFDIV16(
Op, DAG);
11084 EVT ResultExpVT =
Op->getValueType(1);
11085 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
11115 if (VT == MVT::i1) {
11119 Store->getBasePtr(), MVT::i1,
Store->getMemOperand());
11123 Store->getValue().getValueType().getScalarType() == MVT::i32);
11125 unsigned AS =
Store->getAddressSpace();
11144 if (NumElements > 4)
11151 VT, *
Store->getMemOperand()))
11161 if (NumElements > 2)
11165 if (NumElements > 4 ||
11174 auto Flags =
Store->getMemOperand()->getFlags();
11209 MVT VT =
Op.getValueType().getSimpleVT();
11380 EVT VT =
Op.getValueType();
11397 switch (
Op.getOpcode()) {
11424 EVT VT =
Op.getValueType();
11432 Op->getVTList(), Ops, VT,
11441SITargetLowering::performUCharToFloatCombine(
SDNode *
N,
11442 DAGCombinerInfo &DCI)
const {
11443 EVT VT =
N->getValueType(0);
11445 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11452 EVT SrcVT = Src.getValueType();
11458 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11461 DCI.AddToWorklist(Cvt.
getNode());
11464 if (ScalarVT != MVT::f32) {
11476 DAGCombinerInfo &DCI)
const {
11477 SDValue MagnitudeOp =
N->getOperand(0);
11478 SDValue SignOp =
N->getOperand(1);
11534SDValue SITargetLowering::performSHLPtrCombine(
SDNode *
N,
unsigned AddrSpace,
11536 DAGCombinerInfo &DCI)
const {
11566 AM.HasBaseReg =
true;
11567 AM.BaseOffs =
Offset.getSExtValue();
11572 EVT VT =
N->getValueType(0);
11578 Flags.setNoUnsignedWrap(
11579 N->getFlags().hasNoUnsignedWrap() &&
11589 switch (
N->getOpcode()) {
11600 DAGCombinerInfo &DCI)
const {
11609 SDValue NewPtr = performSHLPtrCombine(
Ptr.getNode(),
N->getAddressSpace(),
11610 N->getMemoryVT(), DCI);
11614 NewOps[PtrIdx] = NewPtr;
11623 return (Opc ==
ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11624 (Opc ==
ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11633SDValue SITargetLowering::splitBinaryBitConstantOp(
11634 DAGCombinerInfo &DCI,
const SDLoc &SL,
unsigned Opc,
SDValue LHS,
11654 if (V.getValueType() != MVT::i1)
11656 switch (V.getOpcode()) {
11675 if (!(
C & 0x000000ff))
11676 ZeroByteMask |= 0x000000ff;
11677 if (!(
C & 0x0000ff00))
11678 ZeroByteMask |= 0x0000ff00;
11679 if (!(
C & 0x00ff0000))
11680 ZeroByteMask |= 0x00ff0000;
11681 if (!(
C & 0xff000000))
11682 ZeroByteMask |= 0xff000000;
11683 uint32_t NonZeroByteMask = ~ZeroByteMask;
11684 if ((NonZeroByteMask &
C) != NonZeroByteMask)
11697 assert(V.getValueSizeInBits() == 32);
11699 if (V.getNumOperands() != 2)
11708 switch (V.getOpcode()) {
11713 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
11718 return (0x03020100 & ~ConstMask) | ConstMask;
11725 return uint32_t((0x030201000c0c0c0cull <<
C) >> 32);
11731 return uint32_t(0x0c0c0c0c03020100ull >>
C);
11738 DAGCombinerInfo &DCI)
const {
11739 if (DCI.isBeforeLegalize())
11743 EVT VT =
N->getValueType(0);
11748 if (VT == MVT::i64 && CRHS) {
11754 if (CRHS && VT == MVT::i32) {
11763 if (
auto *CShift = dyn_cast<ConstantSDNode>(
LHS->getOperand(1))) {
11764 unsigned Shift = CShift->getZExtValue();
11766 unsigned Offset = NB + Shift;
11767 if ((
Offset & (Bits - 1)) == 0) {
11785 isa<ConstantSDNode>(
LHS.getOperand(2))) {
11791 Sel = (
LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
11806 if (
Y.getOpcode() !=
ISD::FABS ||
Y.getOperand(0) !=
X ||
11811 if (
X !=
LHS.getOperand(1))
11816 dyn_cast<ConstantFPSDNode>(
RHS.getOperand(1));
11849 (
RHS.getOperand(0) ==
LHS.getOperand(0) &&
11850 LHS.getOperand(0) ==
LHS.getOperand(1))) {
11852 unsigned NewMask = LCC ==
ISD::SETO ?
Mask->getZExtValue() & ~OrdMask
11853 :
Mask->getZExtValue() & OrdMask;
11874 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
11877 if (LHSMask != ~0u && RHSMask != ~0u) {
11880 if (LHSMask > RHSMask) {
11887 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11888 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11891 if (!(LHSUsedLanes & RHSUsedLanes) &&
11894 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
11901 for (
unsigned I = 0;
I < 32;
I += 8) {
11903 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
11904 Mask &= (0x0c <<
I) & 0xffffffff;
11962static const std::optional<ByteProvider<SDValue>>
11964 unsigned Depth = 0) {
11967 return std::nullopt;
11969 if (
Op.getValueSizeInBits() < 8)
11970 return std::nullopt;
11972 if (
Op.getValueType().isVector())
11975 switch (
Op->getOpcode()) {
11986 auto *VTSign = cast<VTSDNode>(
Op->getOperand(1));
11987 NarrowVT = VTSign->getVT();
11990 return std::nullopt;
11993 if (SrcIndex >= NarrowByteWidth)
11994 return std::nullopt;
12000 auto *ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12002 return std::nullopt;
12004 uint64_t BitShift = ShiftOp->getZExtValue();
12006 if (BitShift % 8 != 0)
12007 return std::nullopt;
12009 SrcIndex += BitShift / 8;
12027static const std::optional<ByteProvider<SDValue>>
12029 unsigned StartingIndex = 0) {
12033 return std::nullopt;
12035 unsigned BitWidth =
Op.getScalarValueSizeInBits();
12037 return std::nullopt;
12039 return std::nullopt;
12041 bool IsVec =
Op.getValueType().isVector();
12042 switch (
Op.getOpcode()) {
12045 return std::nullopt;
12050 return std::nullopt;
12054 return std::nullopt;
12057 if (!
LHS->isConstantZero() && !
RHS->isConstantZero())
12058 return std::nullopt;
12059 if (!
LHS ||
LHS->isConstantZero())
12061 if (!
RHS ||
RHS->isConstantZero())
12063 return std::nullopt;
12068 return std::nullopt;
12070 auto *BitMaskOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12072 return std::nullopt;
12074 uint32_t BitMask = BitMaskOp->getZExtValue();
12076 uint32_t IndexMask = 0xFF << (Index * 8);
12078 if ((IndexMask & BitMask) != IndexMask) {
12081 if (IndexMask & BitMask)
12082 return std::nullopt;
12091 return std::nullopt;
12094 auto *ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(2));
12095 if (!ShiftOp ||
Op.getValueType().isVector())
12096 return std::nullopt;
12098 uint64_t BitsProvided =
Op.getValueSizeInBits();
12099 if (BitsProvided % 8 != 0)
12100 return std::nullopt;
12102 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
12104 return std::nullopt;
12106 uint64_t ConcatSizeInBytes = BitsProvided / 4;
12107 uint64_t ByteShift = BitShift / 8;
12109 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
12110 uint64_t BytesProvided = BitsProvided / 8;
12111 SDValue NextOp =
Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
12112 NewIndex %= BytesProvided;
12119 return std::nullopt;
12121 auto *ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12123 return std::nullopt;
12125 uint64_t BitShift = ShiftOp->getZExtValue();
12127 return std::nullopt;
12129 auto BitsProvided =
Op.getScalarValueSizeInBits();
12130 if (BitsProvided % 8 != 0)
12131 return std::nullopt;
12133 uint64_t BytesProvided = BitsProvided / 8;
12134 uint64_t ByteShift = BitShift / 8;
12139 return BytesProvided - ByteShift > Index
12147 return std::nullopt;
12149 auto *ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12151 return std::nullopt;
12153 uint64_t BitShift = ShiftOp->getZExtValue();
12154 if (BitShift % 8 != 0)
12155 return std::nullopt;
12156 uint64_t ByteShift = BitShift / 8;
12162 return Index < ByteShift
12165 Depth + 1, StartingIndex);
12174 return std::nullopt;
12181 auto *VTSign = cast<VTSDNode>(
Op->getOperand(1));
12182 NarrowBitWidth = VTSign->getVT().getSizeInBits();
12184 if (NarrowBitWidth % 8 != 0)
12185 return std::nullopt;
12186 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12188 if (Index >= NarrowByteWidth)
12190 ? std::optional<ByteProvider<SDValue>>(
12198 return std::nullopt;
12202 if (NarrowByteWidth >= Index) {
12207 return std::nullopt;
12214 return std::nullopt;
12218 auto *L = cast<LoadSDNode>(
Op.getNode());
12220 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
12221 if (NarrowBitWidth % 8 != 0)
12222 return std::nullopt;
12223 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12228 if (Index >= NarrowByteWidth) {
12230 ? std::optional<ByteProvider<SDValue>>(
12235 if (NarrowByteWidth > Index) {
12239 return std::nullopt;
12244 return std::nullopt;
12247 Depth + 1, StartingIndex);
12251 auto *IdxOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12253 return std::nullopt;
12254 auto VecIdx = IdxOp->getZExtValue();
12255 auto ScalarSize =
Op.getScalarValueSizeInBits();
12256 if (ScalarSize < 32)
12257 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
12259 StartingIndex, Index);
12264     return std::nullopt;
12266   auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12268     return std::nullopt;
12271       (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
12272   if (IdxMask > 0x07 && IdxMask != 0x0c)
12273     return std::nullopt;
12275   auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
12276   auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12278   return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
12284   return std::nullopt;
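  // Each byte of a v_perm_b32 selector picks one byte out of the concatenated
  // {src0, src1} pair: selectors 0-3 address the second operand, 4-7 the
  // first, and 0x0c produces a constant zero byte. Selectors above 0x07 other
  // than 0x0c cannot be traced back to a source byte, so the walk gives up on
  // them.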
12299 return !OpVT.
isVector() && OpVT.getSizeInBits() == 16;
12303 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12306 auto MemVT = L->getMemoryVT();
12309 return L->getMemoryVT().getSizeInBits() == 16;
12319 int Low8 = Mask & 0xff;
12320 int Hi8 = (Mask & 0xff00) >> 8;
12322 assert(Low8 < 8 && Hi8 < 8);
12324 bool IsConsecutive = (Hi8 - Low8 == 1);
12329 bool Is16Aligned = !(Low8 % 2);
12331 return IsConsecutive && Is16Aligned;
12339 int Low16 = PermMask & 0xffff;
12340 int Hi16 = (PermMask & 0xffff0000) >> 16;
12350 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12352 if (!OtherOpIs16Bit)
12360 unsigned DWordOffset) {
12363 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12365 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12370 if (Src.getValueType().isVector()) {
12371 auto ScalarTySize = Src.getScalarValueSizeInBits();
12372 auto ScalarTy = Src.getValueType().getScalarType();
12373 if (ScalarTySize == 32) {
12377 if (ScalarTySize > 32) {
12380 DAG.
getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12381 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12388 assert(ScalarTySize < 32);
12389 auto NumElements =
TypeSize / ScalarTySize;
12390 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12391 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12392 auto NumElementsIn32 = 32 / ScalarTySize;
12393 auto NumAvailElements = DWordOffset < Trunc32Elements
12395 : NumElements - NormalizedTrunc;
12408 auto ShiftVal = 32 * DWordOffset;
12416 [[maybe_unused]]
EVT VT =
N->getValueType(0);
12421 for (
int i = 0; i < 4; i++) {
12423 std::optional<ByteProvider<SDValue>>
P =
12426 if (!
P ||
P->isConstantZero())
12431 if (PermNodes.
size() != 4)
12434 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12435 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12437 for (
size_t i = 0; i < PermNodes.
size(); i++) {
12438 auto PermOp = PermNodes[i];
12441 int SrcByteAdjust = 4;
12445 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
12446 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12448 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12449 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12453 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12454 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12457 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12459 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12462 SDValue Op = *PermNodes[FirstSrc.first].Src;
12464 assert(
Op.getValueSizeInBits() == 32);
12468 int Low16 = PermMask & 0xffff;
12469 int Hi16 = (PermMask & 0xffff0000) >> 16;
12471 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
12472 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
12475 if (WellFormedLow && WellFormedHi)
12479 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src :
Op;
12488 assert(
Op.getValueType().isByteSized() &&
12506 DAGCombinerInfo &DCI)
const {
12511 EVT VT =
N->getValueType(0);
12512 if (VT == MVT::i1) {
12517 if (Src !=
RHS.getOperand(0))
12522 if (!CLHS || !CRHS)
12526 static const uint32_t MaxMask = 0x3ff;
12541 isa<ConstantSDNode>(
LHS.getOperand(2))) {
12546 Sel |=
LHS.getConstantOperandVal(2);
12555 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12559 auto usesCombinedOperand = [](
SDNode *OrUse) {
12562 !OrUse->getValueType(0).isVector())
12566 for (
auto *VUser : OrUse->users()) {
12567 if (!VUser->getValueType(0).isVector())
12574 if (VUser->getOpcode() == VectorwiseOp)
12580 if (!
any_of(
N->users(), usesCombinedOperand))
12586 if (LHSMask != ~0u && RHSMask != ~0u) {
12589 if (LHSMask > RHSMask) {
12596 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12597 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12600 if (!(LHSUsedLanes & RHSUsedLanes) &&
12603 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12605 LHSMask &= ~RHSUsedLanes;
12606 RHSMask &= ~LHSUsedLanes;
12608 LHSMask |= LHSUsedLanes & 0x04040404;
12618 if (LHSMask == ~0u || RHSMask == ~0u) {
12624 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12639 if (SrcVT == MVT::i32) {
12644 DCI.AddToWorklist(LowOr.
getNode());
12645 DCI.AddToWorklist(HiBits.getNode());
12653 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(
N->getOperand(1));
12656 N->getOperand(0), CRHS))
12664 DAGCombinerInfo &DCI)
const {
12665 if (
SDValue RV = reassociateScalarOps(
N, DCI.DAG))
12674 EVT VT =
N->getValueType(0);
12675 if (CRHS && VT == MVT::i64) {
12697 LHS->getOperand(0), FNegLHS, FNegRHS);
12706 DAGCombinerInfo &DCI)
const {
12711 EVT VT =
N->getValueType(0);
12712 if (VT != MVT::i32)
12716 if (Src.getValueType() != MVT::i16)
12723SITargetLowering::performSignExtendInRegCombine(
SDNode *
N,
12724 DAGCombinerInfo &DCI)
const {
12726 auto *VTSign = cast<VTSDNode>(
N->getOperand(1));
12731 VTSign->getVT() == MVT::i8) ||
12733 VTSign->getVT() == MVT::i16))) {
12735 "s_buffer_load_{u8, i8} are supported "
12736 "in GFX12 (or newer) architectures.");
12737 EVT VT = Src.getValueType();
12742 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12748 auto *
M = cast<MemSDNode>(Src);
12749 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12750 Opc,
DL, ResList, Ops,
M->getMemoryVT(),
M->getMemOperand());
12755 VTSign->getVT() == MVT::i8) ||
12757 VTSign->getVT() == MVT::i16)) &&
12759 auto *
M = cast<MemSDNode>(Src);
12760 SDValue Ops[] = {Src.getOperand(0),
12766 Src.getOperand(6), Src.getOperand(7)};
12769 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
12773 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
12774 Opc,
SDLoc(
N), ResList, Ops,
M->getMemoryVT(),
M->getMemOperand());
12775 return DCI.DAG.getMergeValues(
12782 DAGCombinerInfo &DCI)
const {
12790 if (
N->getOperand(0).isUndef())
12797 DAGCombinerInfo &DCI)
const {
12798 EVT VT =
N->getValueType(0);
12823 unsigned MaxDepth)
const {
12824 unsigned Opcode =
Op.getOpcode();
12828 if (
auto *CFP = dyn_cast<ConstantFPSDNode>(
Op)) {
12829 const auto &
F = CFP->getValueAPF();
12830 if (
F.isNaN() &&
F.isSignaling())
12832 if (!
F.isDenormal())
12895 if (
Op.getValueType() == MVT::i32) {
12900 if (
auto *
RHS = dyn_cast<ConstantSDNode>(
Op.getOperand(1))) {
12901 if (
RHS->getZExtValue() == 0xffff0000) {
12911 return Op.getValueType().getScalarType() != MVT::f16;
12979 if (
Op.getValueType() == MVT::i16) {
12990 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
12992 switch (IntrinsicID) {
12993 case Intrinsic::amdgcn_cvt_pkrtz:
12994 case Intrinsic::amdgcn_cubeid:
12995 case Intrinsic::amdgcn_frexp_mant:
12996 case Intrinsic::amdgcn_fdot2:
12997 case Intrinsic::amdgcn_rcp:
12998 case Intrinsic::amdgcn_rsq:
12999 case Intrinsic::amdgcn_rsq_clamp:
13000 case Intrinsic::amdgcn_rcp_legacy:
13001 case Intrinsic::amdgcn_rsq_legacy:
13002 case Intrinsic::amdgcn_trig_preop:
13003 case Intrinsic::amdgcn_log:
13004 case Intrinsic::amdgcn_exp2:
13005 case Intrinsic::amdgcn_sqrt:
13023 unsigned MaxDepth)
const {
13026 unsigned Opcode =
MI->getOpcode();
13028 if (Opcode == AMDGPU::G_FCANONICALIZE)
13031 std::optional<FPValueAndVReg> FCR;
13034 if (FCR->Value.isSignaling())
13036 if (!FCR->Value.isDenormal())
13047 case AMDGPU::G_FADD:
13048 case AMDGPU::G_FSUB:
13049 case AMDGPU::G_FMUL:
13050 case AMDGPU::G_FCEIL:
13051 case AMDGPU::G_FFLOOR:
13052 case AMDGPU::G_FRINT:
13053 case AMDGPU::G_FNEARBYINT:
13054 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
13055 case AMDGPU::G_INTRINSIC_TRUNC:
13056 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
13057 case AMDGPU::G_FMA:
13058 case AMDGPU::G_FMAD:
13059 case AMDGPU::G_FSQRT:
13060 case AMDGPU::G_FDIV:
13061 case AMDGPU::G_FREM:
13062 case AMDGPU::G_FPOW:
13063 case AMDGPU::G_FPEXT:
13064 case AMDGPU::G_FLOG:
13065 case AMDGPU::G_FLOG2:
13066 case AMDGPU::G_FLOG10:
13067 case AMDGPU::G_FPTRUNC:
13068 case AMDGPU::G_AMDGPU_RCP_IFLAG:
13069 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
13070 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
13071 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
13072 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
13074 case AMDGPU::G_FNEG:
13075 case AMDGPU::G_FABS:
13076 case AMDGPU::G_FCOPYSIGN:
13078 case AMDGPU::G_FMINNUM:
13079 case AMDGPU::G_FMAXNUM:
13080 case AMDGPU::G_FMINNUM_IEEE:
13081 case AMDGPU::G_FMAXNUM_IEEE:
13082 case AMDGPU::G_FMINIMUM:
13083 case AMDGPU::G_FMAXIMUM: {
13091 case AMDGPU::G_BUILD_VECTOR:
13096 case AMDGPU::G_INTRINSIC:
13097 case AMDGPU::G_INTRINSIC_CONVERGENT:
13099 case Intrinsic::amdgcn_fmul_legacy:
13100 case Intrinsic::amdgcn_fmad_ftz:
13101 case Intrinsic::amdgcn_sqrt:
13102 case Intrinsic::amdgcn_fmed3:
13103 case Intrinsic::amdgcn_sin:
13104 case Intrinsic::amdgcn_cos:
13105 case Intrinsic::amdgcn_log:
13106 case Intrinsic::amdgcn_exp2:
13107 case Intrinsic::amdgcn_log_clamp:
13108 case Intrinsic::amdgcn_rcp:
13109 case Intrinsic::amdgcn_rcp_legacy:
13110 case Intrinsic::amdgcn_rsq:
13111 case Intrinsic::amdgcn_rsq_clamp:
13112 case Intrinsic::amdgcn_rsq_legacy:
13113 case Intrinsic::amdgcn_div_scale:
13114 case Intrinsic::amdgcn_div_fmas:
13115 case Intrinsic::amdgcn_div_fixup:
13116 case Intrinsic::amdgcn_fract:
13117 case Intrinsic::amdgcn_cvt_pkrtz:
13118 case Intrinsic::amdgcn_cubeid:
13119 case Intrinsic::amdgcn_cubema:
13120 case Intrinsic::amdgcn_cubesc:
13121 case Intrinsic::amdgcn_cubetc:
13122 case Intrinsic::amdgcn_frexp_mant:
13123 case Intrinsic::amdgcn_fdot2:
13124 case Intrinsic::amdgcn_trig_preop:
13143 if (
C.isDenormal()) {
13157 if (
C.isSignaling()) {
13176 return Op.isUndef() || isa<ConstantFPSDNode>(
Op);
13180SITargetLowering::performFCanonicalizeCombine(
SDNode *
N,
13181 DAGCombinerInfo &DCI)
const {
13184 EVT VT =
N->getValueType(0);
13193 EVT VT =
N->getValueType(0);
13194 return getCanonicalConstantFP(DAG,
SDLoc(
N), VT, CFP->getValueAPF());
13210 EVT EltVT =
Lo.getValueType();
13213 for (
unsigned I = 0;
I != 2; ++
I) {
13217 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
13218 }
else if (
Op.isUndef()) {
13230 if (isa<ConstantFPSDNode>(NewElts[1]))
13231 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
13237 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
13289 if (!MinK || !MaxK)
13302 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->
hasMed3_16()))
13303 return DAG.
getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
13344 if (
Info->getMode().DX10Clamp) {
13353 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->
hasMed3_16())) {
13385 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.
hasMin3Max3_16());
13394 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.
hasMin3Max3_16());
13403 DAGCombinerInfo &DCI)
const {
13406 EVT VT =
N->getValueType(0);
13407 unsigned Opc =
N->getOpcode();
13436 if (
SDValue Med3 = performIntMed3ImmCombine(
13441 if (
SDValue Med3 = performIntMed3ImmCombine(
13447 if (
SDValue Med3 = performIntMed3ImmCombine(
13452 if (
SDValue Med3 = performIntMed3ImmCombine(
13462 (VT == MVT::f32 || VT == MVT::f64 ||
13466 if (
SDValue Res = performFPMed3ImmCombine(DAG,
SDLoc(
N), Op0, Op1))
13477 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13478 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
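  // min(max(x, K0), K1) folds to a single med3 when K0 <= K1 and the element
  // type has a med3 instruction (i32/f32, or 16-bit types with med3_16). The
  // 0.0/1.0 check above recognises fmed3 with constants 0.0 and 1.0, which is
  // the pattern that maps onto the hardware clamp modifier instead.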
13487 DAGCombinerInfo &DCI)
const {
13488 EVT VT =
N->getValueType(0);
13511 if (
Info->getMode().DX10Clamp) {
13514 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13517 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
13520 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13531 DAGCombinerInfo &DCI)
const {
13535 return DCI.DAG.getUNDEF(
N->getValueType(0));
13543 bool IsDivergentIdx,
13548 unsigned VecSize = EltSize * NumElem;
13551 if (VecSize <= 64 && EltSize < 32)
13560 if (IsDivergentIdx)
13564 unsigned NumInsts = NumElem +
13565 ((EltSize + 31) / 32) * NumElem ;
13570 return NumInsts <= 16;
13575 return NumInsts <= 15;
13582 if (isa<ConstantSDNode>(
Idx))
13596 SITargetLowering::performExtractVectorEltCombine(SDNode *N,
13597                                                  DAGCombinerInfo &DCI) const {
13603   EVT ResVT = N->getValueType(0);
13622   if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13650     DCI.AddToWorklist(Elt0.getNode());
13651     DCI.AddToWorklist(Elt1.getNode());
13673   if (!DCI.isBeforeLegalize())
13679   auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
13680   if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
13681       VecSize > 32 && VecSize % 32 == 0 && Idx) {
13684     unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
13685     unsigned EltIdx = BitIndex / 32;
13686     unsigned LeftoverBitIdx = BitIndex % 32;
13690     DCI.AddToWorklist(Cast.getNode());
13694     DCI.AddToWorklist(Elt.getNode());
13697     DCI.AddToWorklist(Srl.getNode());
13701     DCI.AddToWorklist(Trunc.getNode());
13703 if (VecEltVT == ResVT) {
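// [Illustrative sketch] The bit arithmetic at 13684-13686 selects which
// 32-bit word of the bitcast vector holds a small (<= 16-bit) element and the
// bit offset inside that word; the DAG then extracts the word, shifts it
// right and truncates. A standalone scalar model of the same indexing
// (assumes little-endian packing, matching the BitIndex computation):
static unsigned extractSubDwordElt(const unsigned *Words, unsigned Idx,
                                   unsigned EltBits) {
  unsigned BitIndex = Idx * EltBits;
  unsigned WordIdx = BitIndex / 32;          // which 32-bit chunk
  unsigned Shift = BitIndex % 32;            // leftover bit offset
  unsigned Mask = EltBits >= 32 ? ~0u : ((1u << EltBits) - 1u);
  return (Words[WordIdx] >> Shift) & Mask;   // srl + trunc in the DAG version
}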
13715 SITargetLowering::performInsertVectorEltCombine(SDNode *N,
13716                                                 DAGCombinerInfo &DCI) const {
13730   EVT IdxVT = Idx.getValueType();
13747       Src.getOperand(0).getValueType() == MVT::f16) {
13748     return Src.getOperand(0);
13751   if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13752     APFloat Val = CFP->getValueAPF();
13753     bool LosesInfo = true;
13763                                                   DAGCombinerInfo &DCI) const {
13765          "combine only useful on gfx8");
13767   SDValue TruncSrc = N->getOperand(0);
13768   EVT VT = N->getValueType(0);
13769 if (VT != MVT::f16)
13807 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
13809                                           const SDNode *N1) const {
13814   if (((VT == MVT::f32 &&
13816        (VT == MVT::f16 && Subtarget->hasMadF16() &&
13836   EVT VT = N->getValueType(0);
13837   if (VT != MVT::i32 && VT != MVT::i64)
13843   unsigned Opc = N->getOpcode();
13866   return DAG.getNode(Opc, SL, VT, Add1, Op2);
13886                                            DAGCombinerInfo &DCI) const {
13890   EVT VT = N->getValueType(0);
13900   if (!N->isDivergent() && Subtarget->hasSMulHi())
13904   if (NumBits <= 32 || NumBits > 64)
13916   unsigned NumUsers = 0;
13941   bool MulSignedLo = false;
13942   if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
13951   if (VT != MVT::i64) {
13974       getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
13976   if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
13977     auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
13979 if (!MulLHSUnsigned32) {
13986 if (!MulRHSUnsigned32) {
13997 if (VT != MVT::i64)
14003 SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
14004                                                   DAGCombinerInfo &DCI) const {
14006   auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14030       DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags());
14041 static std::optional<ByteProvider<SDValue>>
14044   if (!Byte0 || Byte0->isConstantZero()) {
14045     return std::nullopt;
14048   if (Byte1 && !Byte1->isConstantZero()) {
14049     return std::nullopt;
14055   unsigned FirstCs = First & 0x0c0c0c0c;
14056   unsigned SecondCs = Second & 0x0c0c0c0c;
14057   unsigned FirstNoCs = First & ~0x0c0c0c0c;
14058 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
14060 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
14061 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
14062 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
14063 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
14065 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
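// [Illustrative worked example, not part of the source] In a v_perm_b32
// select mask each byte chooses one source byte, and the value 0x0c means
// "constant zero" for that lane. The merge above keeps the real selector from
// whichever input mask provides one, e.g.
//   addPermMasks(0x0c0c0504, 0x07060c0c) == 0x07060504
// Lanes where both masks say 0x0c stay 0x0c; the asserts check that no byte
// lane has a real selector in both masks at once.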
14089   for (int BPI = 0; BPI < 2; BPI++) {
14092       BPP = {Src1, Src0};
14094     unsigned ZeroMask = 0x0c0c0c0c;
14095     unsigned FMask = 0xFF << (8 * (3 - Step));
14097     unsigned FirstMask =
14098         (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14099     unsigned SecondMask =
14100         (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14104     int FirstGroup = -1;
14105     for (int I = 0; I < 2; I++) {
14107       auto MatchesFirst = [&BPP](DotSrc &IterElt) {
14108         return IterElt.SrcOp == *BPP.first.Src &&
14109                (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
14119     if (FirstGroup != -1) {
14121       auto MatchesSecond = [&BPP](DotSrc &IterElt) {
14122         return IterElt.SrcOp == *BPP.second.Src &&
14123                (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
14129         Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
14137   unsigned ZeroMask = 0x0c0c0c0c;
14138   unsigned FMask = 0xFF << (8 * (3 - Step));
14142        ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14146        ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14155   if (Srcs.size() == 1) {
14156     auto *Elt = Srcs.begin();
14160     if (Elt->PermMask == 0x3020100)
14167   auto *FirstElt = Srcs.begin();
14168   auto *SecondElt = std::next(FirstElt);
14175     auto FirstMask = FirstElt->PermMask;
14176     auto SecondMask = SecondElt->PermMask;
14178     unsigned FirstCs = FirstMask & 0x0c0c0c0c;
14179     unsigned FirstPlusFour = FirstMask | 0x04040404;
14182     FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
14194     FirstElt = std::next(SecondElt);
14195     if (FirstElt == Srcs.end())
14198     SecondElt = std::next(FirstElt);
14201   if (SecondElt == Srcs.end()) {
14207                   DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
14213   return Perms.size() == 2
14219   for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
14220     EntryMask = EntryMask >> ((4 - ChainLength) * 8);
14221     auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
14222     EntryMask += ZeroMask;
14227   auto Opcode = Op.getOpcode();
14233static std::optional<bool>
14244 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
14247 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
14249 assert(!(S0IsUnsigned && S0IsSigned));
14250 assert(!(S1IsUnsigned && S1IsSigned));
14258 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
14264 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14265 return std::nullopt;
14277 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14278 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14283 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14289 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14290 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14291 return std::nullopt;
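// [Illustrative note] The conditions above classify each dot4 multiply
// operand from its KnownBits as known-unsigned (leading zeros), known-signed
// (leading ones), or unknown, and then decide whether a single signedness
// fits both operands. From the visible checks: mixing a known-signed with a
// known-unsigned operand bails out with std::nullopt, as does known-unsigned
// paired with unknown; the remaining combinations resolve to one signedness
// (the exact return values between the checks are elided in this listing).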
14297                                            DAGCombinerInfo &DCI) const {
14299   EVT VT = N->getValueType(0);
14306     if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14311   if (SDValue V = reassociateScalarOps(N, DAG)) {
14315   if (VT == MVT::i64) {
14316     if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
14323     std::optional<bool> IsSigned;
14329     int ChainLength = 0;
14330     for (int I = 0; I < 4; I++) {
14331       auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
14334       auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14337       auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14342           TempNode->getOperand(MulIdx), *Src0, *Src1,
14343           TempNode->getOperand(MulIdx)->getOperand(0),
14344           TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14348         IsSigned = *IterIsSigned;
14349       if (*IterIsSigned != *IsSigned)
14352       auto AddIdx = 1 - MulIdx;
14355       if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14356         Src2s.push_back(TempNode->getOperand(AddIdx));
14366             TempNode->getOperand(AddIdx), *Src0, *Src1,
14367             TempNode->getOperand(AddIdx)->getOperand(0),
14368             TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14372         if (*IterIsSigned != *IsSigned)
14376         ChainLength = I + 2;
14380       TempNode = TempNode->getOperand(AddIdx);
14382       ChainLength = I + 1;
14383       if (TempNode->getNumOperands() < 2)
14385       LHS = TempNode->getOperand(0);
14386       RHS = TempNode->getOperand(1);
14389     if (ChainLength < 2)
14395     if (ChainLength < 4) {
14405     bool UseOriginalSrc = false;
14406     if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
14407         Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14408         Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14409         Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
14411       auto Src0Mask = Src0s.begin()->PermMask;
14412       SrcBytes.push_back(Src0Mask & 0xFF000000);
14413       bool UniqueEntries = true;
14414       for (auto I = 1; I < 4; I++) {
14415         auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14418           UniqueEntries = false;
14424       if (UniqueEntries) {
14425         UseOriginalSrc = true;
14427         auto *FirstElt = Src0s.begin();
14431         auto *SecondElt = Src1s.begin();
14433                                     SecondElt->DWordOffset);
14442     if (!UseOriginalSrc) {
14449         DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14452                             : Intrinsic::amdgcn_udot4,
14462   if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14467   unsigned Opc = LHS.getOpcode();
14472     Opc = RHS.getOpcode();
14479     auto Cond = RHS.getOperand(0);
14487     return DAG.getNode(Opc, SL, VTList, Args);
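// [Illustrative sketch, not part of the source] The chain walk above gathers
// up to four byte-sized multiplies feeding an add chain so the whole tree can
// be selected to amdgcn_sdot4 / amdgcn_udot4. A self-contained scalar
// reference for what the unsigned form computes:
static unsigned udot4_reference(unsigned A, unsigned B, unsigned Acc) {
  unsigned Sum = Acc;
  for (unsigned I = 0; I < 4; ++I) {
    unsigned ABy = (A >> (8 * I)) & 0xFF;  // I-th byte of A
    unsigned BBy = (B >> (8 * I)) & 0xFF;  // I-th byte of B
    Sum += ABy * BBy;                      // byte-wise multiply-accumulate
  }
  return Sum;
}
// The perm masks built by placeSources/fixMasks above exist to pack the four
// matched bytes of each operand into one dword so they line up with this form.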
14501                                            DAGCombinerInfo &DCI) const {
14503   EVT VT = N->getValueType(0);
14505   if (VT == MVT::i64) {
14506     if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
14510   if (VT != MVT::i32)
14519   unsigned Opc = RHS.getOpcode();
14526     auto Cond = RHS.getOperand(0);
14534     return DAG.getNode(Opc, SL, VTList, Args);
14549 SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
14550                                                  DAGCombinerInfo &DCI) const {
14552   if (N->getValueType(0) != MVT::i32)
14563   unsigned LHSOpc = LHS.getOpcode();
14564   unsigned Opc = N->getOpcode();
14574                                             DAGCombinerInfo &DCI) const {
14579   EVT VT = N->getValueType(0);
14591     if (A == LHS.getOperand(1)) {
14592       unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14593       if (FusedOp != 0) {
14595         return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
14603     if (A == RHS.getOperand(1)) {
14604       unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14605       if (FusedOp != 0) {
14607         return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
14616                                             DAGCombinerInfo &DCI) const {
14622   EVT VT = N->getValueType(0);
14635     if (A == LHS.getOperand(1)) {
14636       unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14637       if (FusedOp != 0) {
14641         return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
14650     if (A == RHS.getOperand(1)) {
14651       unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14652       if (FusedOp != 0) {
14654         return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
14663                                             DAGCombinerInfo &DCI) const {
14666   EVT VT = N->getValueType(0);
14680   bool IsNegative = false;
14681 if (CLHS->isExactlyValue(1.0) ||
14682 (IsNegative = CLHS->isExactlyValue(-1.0))) {
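// [Illustrative note] The check above recognizes an fdiv whose numerator is
// the constant 1.0 or -1.0; under suitable fast-math / denormal-mode
// conditions such a division can be folded to a (possibly negated) reciprocal
// node instead of a full division expansion, i.e.
//   1.0 / x  ->  rcp(x)      and      -1.0 / x  ->  -rcp(x)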
14698                                             DAGCombinerInfo &DCI) const {
14700   EVT VT = N->getValueType(0);
14714   if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
14729   if (ScalarVT == MVT::f32 &&
14735     if (TrueNodeExpVal == INT_MIN)
14738     if (FalseNodeExpVal == INT_MIN)
14758                                            DAGCombinerInfo &DCI) const {
14760   EVT VT = N->getValueType(0);
14781         (N->getFlags().hasAllowContract() &&
14782          FMA->getFlags().hasAllowContract())) {
14816     if (Vec1 == Vec2 || Vec3 == Vec4)
14822     if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
14831                                              DAGCombinerInfo &DCI) const {
14837   EVT VT = LHS.getValueType();
14840   auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14842     CRHS = dyn_cast<ConstantSDNode>(LHS);
14866         return LHS.getOperand(0);
14872       isa<ConstantSDNode>(LHS.getOperand(1)) &&
14873       isa<ConstantSDNode>(LHS.getOperand(2)) &&
14874       LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
14881     const APInt &CT = LHS.getConstantOperandAPInt(1);
14882     const APInt &CF = LHS.getConstantOperandAPInt(2);
14890 return LHS.getOperand(0);
14894 if (VT != MVT::f32 && VT != MVT::f64 &&
14910 const unsigned IsInfMask =
14912 const unsigned IsFiniteMask =
14926 SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
14927                                               DAGCombinerInfo &DCI) const {
14945   if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
14949     unsigned ShiftOffset = 8 * Offset;
14951       ShiftOffset -= C->getZExtValue();
14953       ShiftOffset += C->getZExtValue();
14955     if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
14957                          MVT::f32, Shifted);
14968     DCI.AddToWorklist(N);
14975     return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
14981                                             DAGCombinerInfo &DCI) const {
14991       return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
14994       APFloat One(F.getSemantics(), "1.0");
14996       return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
15003   switch (N->getOpcode()) {
15019     if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
15029   switch (N->getOpcode()) {
15031     return performAddCombine(N, DCI);
15033     return performSubCombine(N, DCI);
15036     return performAddCarrySubCarryCombine(N, DCI);
15038     return performFAddCombine(N, DCI);
15040     return performFSubCombine(N, DCI);
15042     return performFDivCombine(N, DCI);
15044     return performFMulCombine(N, DCI);
15046     return performSetCCCombine(N, DCI);
15059     return performMinMaxCombine(N, DCI);
15061     return performFMACombine(N, DCI);
15063     return performAndCombine(N, DCI);
15065     return performOrCombine(N, DCI);
15068     if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
15069         TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
15075     return performXorCombine(N, DCI);
15077     return performZeroExtendCombine(N, DCI);
15079     return performSignExtendInRegCombine(N, DCI);
15081     return performClassCombine(N, DCI);
15083     return performFCanonicalizeCombine(N, DCI);
15085     return performRcpCombine(N, DCI);
15100     return performUCharToFloatCombine(N, DCI);
15102     return performFCopySignCombine(N, DCI);
15107     return performCvtF32UByteNCombine(N, DCI);
15109     return performFMed3Combine(N, DCI);
15111     return performCvtPkRTZCombine(N, DCI);
15113     return performClampCombine(N, DCI);
15116     EVT VT = N->getValueType(0);
15119     if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
15122       EVT EltVT = Src.getValueType();
15123       if (EltVT != MVT::i16)
15133     return performExtractVectorEltCombine(N, DCI);
15135     return performInsertVectorEltCombine(N, DCI);
15137     return performFPRoundCombine(N, DCI);
15139     if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
15145     if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
15146 return performMemSDNodeCombine(MemNode, DCI);
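// [Illustrative sketch] The dispatch above has the usual shape of a target
// PerformDAGCombine hook: switch on the node's opcode and forward to a
// per-opcode helper, returning an empty SDValue when nothing matched.
// Schematically (MyCombiner and the helper are hypothetical stand-ins):
//
//   SDValue MyCombiner(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
//     switch (N->getOpcode()) {
//     case ISD::ADD:
//       return performAddCombine(N, DCI); // non-null result replaces the node
//     default:
//       return SDValue();                 // no combine performed
//     }
//   }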
15177   unsigned Opcode = Node->getMachineOpcode();
15181   if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
15186   unsigned DmaskIdx =
15188   unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
15189   unsigned NewDmask = 0;
15192   bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
15193                   (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
15196   unsigned TFCLane = 0;
15197   bool HasChain = Node->getNumValues() > 1;
15199   if (OldDmask == 0) {
15207     TFCLane = OldBitsSet;
15214     if (Use.getResNo() != 0)
15220     if (!User->isMachineOpcode() ||
15221         User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
15233     if (UsesTFC && Lane == TFCLane) {
15238     for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
15240       Dmask &= ~(1 << Comp);
15248     NewDmask |= 1 << Comp;
15253   bool NoChannels = !NewDmask;
15260     if (OldBitsSet == 1)
15266   if (NewDmask == OldDmask)
15275   unsigned NewChannels = BitsSet + UsesTFC;
15279   assert(NewOpcode != -1 &&
15280          NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
15281          "failed to find equivalent MIMG op");
15289   MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
15291   MVT ResultVT = NewChannels == 1
15294                      : NewChannels == 5 ? 8
15308   if (NewChannels == 1) {
15318   for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
15323     if (i || !NoChannels)
15328     if (NewUser != User) {
15338       Idx = AMDGPU::sub1;
15341       Idx = AMDGPU::sub2;
15344       Idx = AMDGPU::sub3;
15347       Idx = AMDGPU::sub4;
15358     Op = Op.getOperand(0);
15360   return isa<FrameIndexSDNode>(Op);
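// [Illustrative sketch] The loop at 15238-15248 above rebuilds the image
// instruction's dmask from the result lanes that are actually extracted: for
// each used lane, find the corresponding still-set component of the old dmask
// and set it in the new mask. A standalone model of that bit manipulation:
static unsigned remapDmaskForUsedLanes(unsigned OldDmask,
                                       const bool UsedLane[4]) {
  unsigned NewDmask = 0;
  unsigned Dmask = OldDmask;
  for (unsigned Lane = 0; Lane < 4 && Dmask; ++Lane) {
    unsigned Comp = 0;
    while (!(Dmask & (1u << Comp)))   // index of the Lane-th set dmask bit
      ++Comp;
    Dmask &= ~(1u << Comp);
    if (UsedLane[Lane])
      NewDmask |= 1u << Comp;         // keep only the channels still read
  }
  return NewDmask;
}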
15370 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
15371 SDValue SrcVal = Node->getOperand(2);
15379 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
15381 SDNode *Glued = Node->getGluedNode();
15383 Node->getOperand(0), SL, VReg, SrcVal,
15389   return ToResultReg.getNode();
15394   for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
15402                                Node->getOperand(i).getValueType(),
15403                                Node->getOperand(i)),
15415   unsigned Opcode = Node->getMachineOpcode();
15417   if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
15418       !TII->isGather4(Opcode) &&
15420     return adjustWritemask(Node, DAG);
15423 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
15429 case AMDGPU::V_DIV_SCALE_F32_e64:
15430 case AMDGPU::V_DIV_SCALE_F64_e64: {
15434 SDValue Src0 = Node->getOperand(1);
15435 SDValue Src1 = Node->getOperand(3);
15436 SDValue Src2 = Node->getOperand(5);
15440 (Src0 == Src1 || Src0 == Src2))
15497 unsigned InitIdx = 0;
15499   if (TII->isImage(MI)) {
15507     unsigned TFEVal = TFE ? TFE->getImm() : 0;
15508     unsigned LWEVal = LWE ? LWE->getImm() : 0;
15509     unsigned D16Val = D16 ? D16->getImm() : 0;
15511     if (!TFEVal && !LWEVal)
15522     assert(MO_Dmask && "Expected dmask operand in instruction");
15524     unsigned dmask = MO_Dmask->getImm();
15531     InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
15537         TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15538     if (DstSize < InitIdx)
15541     InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15549   Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
15550   unsigned NewDst = 0;
15559   for (; SizeLeft; SizeLeft--, CurrIdx++) {
15560     NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
15580   MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
15593   if (TII->isVOP3(MI.getOpcode())) {
15595     TII->legalizeOperandsVOP3(MRI, MI);
15600   if (!MI.getDesc().operands().empty()) {
15601     unsigned Opc = MI.getOpcode();
15602     bool HasAGPRs = Info->mayNeedAGPRs();
15610       if ((I == Src2Idx) && (HasAGPRs))
15613       if (!Op.isReg() || !Op.getReg().isVirtual())
15615       auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
15616       if (!TRI->hasAGPRs(RC))
15618       auto *Src = MRI.getUniqueVRegDef(Op.getReg());
15619       if (!Src || !Src->isCopy() ||
15620           !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
15622       auto *NewRC = TRI->getEquivalentVGPRClass(RC);
15626       MRI.setRegClass(Op.getReg(), NewRC);
15629     if (TII->isMAI(MI)) {
15635                                           AMDGPU::OpName::scale_src0);
15636       if (Src0Idx != -1) {
15638                                             AMDGPU::OpName::scale_src1);
15639         if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
15640             TII->usesConstantBus(MRI, MI, Src1Idx))
15641           TII->legalizeOpWithMove(MI, Src1Idx);
15649     if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
15650       if (Src2->isReg() && Src2->getReg().isVirtual()) {
15651         auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
15652         if (TRI->isVectorSuperClass(RC)) {
15653           auto *NewRC = TRI->getEquivalentAGPRClass(RC);
15654           MRI.setRegClass(Src2->getReg(), NewRC);
15655           if (Src2->isTied())
15656             MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
15665   if (TII->isImage(MI))
15666     TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
15740std::pair<unsigned, const TargetRegisterClass *>
15747   if (Constraint.size() == 1) {
15749     switch (Constraint[0]) {
15756         RC = &AMDGPU::SReg_32RegClass;
15759         RC = &AMDGPU::SGPR_64RegClass;
15764         return std::pair(0U, nullptr);
15771         RC = &AMDGPU::VGPR_32RegClass;
15776         return std::pair(0U, nullptr);
15785         RC = &AMDGPU::AGPR_32RegClass;
15790         return std::pair(0U, nullptr);
15799     return std::pair(0U, RC);
15804     if (RegName.consume_front("v")) {
15805       RC = &AMDGPU::VGPR_32RegClass;
15806     } else if (RegName.consume_front("s")) {
15807       RC = &AMDGPU::SGPR_32RegClass;
15808     } else if (RegName.consume_front("a")) {
15809       RC = &AMDGPU::AGPR_32RegClass;
15814       if (RegName.consume_front("[")) {
15825           return std::pair(0U, nullptr);
15828           RC = TRI->getVGPRClassForBitWidth(Width);
15830           RC = TRI->getSGPRClassForBitWidth(Width);
15832           RC = TRI->getAGPRClassForBitWidth(Width);
15834         Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15835         return std::pair(Reg, RC);
15841         return std::pair(0U, nullptr);
15843       if (!Failed && Idx < RC->getNumRegs())
15851     Ret.second = TRI->getPhysRegBaseClass(Ret.first);
15857   if (Constraint.size() == 1) {
15858     switch (Constraint[0]) {
15868   } else if (Constraint == "DA" || Constraint == "DB") {
15876   if (Constraint.size() == 1) {
15877     switch (Constraint[0]) {
15894   Val = Val & maskTrailingOnes<uint64_t>(Size);
15901                                                      std::vector<SDValue> &Ops,
15916   unsigned Size = Op.getScalarValueSizeInBits();
15924       Val = C->getSExtValue();
15928       Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15934     if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
15937       Val = C->getSExtValue();
15941       Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15951   if (Constraint.size() == 1) {
15952     switch (Constraint[0]) {
15956       return isInt<16>(Val);
15960       return isInt<32>(Val);
15967   } else if (Constraint.size() == 2) {
15968     if (Constraint == "DA") {
15969       int64_t HiBits = static_cast<int32_t>(Val >> 32);
15970       int64_t LoBits = static_cast<int32_t>(Val);
15974     if (Constraint == "DB") {
15982                                               unsigned MaxSize) const {
15983   unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
15986   MVT VT = Op.getSimpleValueType();
16011 switch (UnalignedClassID) {
16012 case AMDGPU::VReg_64RegClassID:
16013 return AMDGPU::VReg_64_Align2RegClassID;
16014 case AMDGPU::VReg_96RegClassID:
16015 return AMDGPU::VReg_96_Align2RegClassID;
16016 case AMDGPU::VReg_128RegClassID:
16017 return AMDGPU::VReg_128_Align2RegClassID;
16018 case AMDGPU::VReg_160RegClassID:
16019 return AMDGPU::VReg_160_Align2RegClassID;
16020 case AMDGPU::VReg_192RegClassID:
16021 return AMDGPU::VReg_192_Align2RegClassID;
16022 case AMDGPU::VReg_224RegClassID:
16023 return AMDGPU::VReg_224_Align2RegClassID;
16024 case AMDGPU::VReg_256RegClassID:
16025 return AMDGPU::VReg_256_Align2RegClassID;
16026 case AMDGPU::VReg_288RegClassID:
16027 return AMDGPU::VReg_288_Align2RegClassID;
16028 case AMDGPU::VReg_320RegClassID:
16029 return AMDGPU::VReg_320_Align2RegClassID;
16030 case AMDGPU::VReg_352RegClassID:
16031 return AMDGPU::VReg_352_Align2RegClassID;
16032 case AMDGPU::VReg_384RegClassID:
16033 return AMDGPU::VReg_384_Align2RegClassID;
16034 case AMDGPU::VReg_512RegClassID:
16035 return AMDGPU::VReg_512_Align2RegClassID;
16036 case AMDGPU::VReg_1024RegClassID:
16037 return AMDGPU::VReg_1024_Align2RegClassID;
16038 case AMDGPU::AReg_64RegClassID:
16039 return AMDGPU::AReg_64_Align2RegClassID;
16040 case AMDGPU::AReg_96RegClassID:
16041 return AMDGPU::AReg_96_Align2RegClassID;
16042 case AMDGPU::AReg_128RegClassID:
16043 return AMDGPU::AReg_128_Align2RegClassID;
16044 case AMDGPU::AReg_160RegClassID:
16045 return AMDGPU::AReg_160_Align2RegClassID;
16046 case AMDGPU::AReg_192RegClassID:
16047 return AMDGPU::AReg_192_Align2RegClassID;
16048 case AMDGPU::AReg_256RegClassID:
16049 return AMDGPU::AReg_256_Align2RegClassID;
16050 case AMDGPU::AReg_512RegClassID:
16051 return AMDGPU::AReg_512_Align2RegClassID;
16052 case AMDGPU::AReg_1024RegClassID:
16053 return AMDGPU::AReg_1024_Align2RegClassID;
16069   if (Info->isEntryFunction()) {
16076     unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
16078             ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
16079             : TRI->getAlignedHighSGPRForRC(MF, 2,
16080                                            &AMDGPU::SGPR_64RegClass);
16081     Info->setSGPRForEXECCopy(SReg);
16084                              Info->getStackPtrOffsetReg()));
16085   if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
16086     MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
16090   if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
16091     MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
16093   if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
16094     MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
16096   Info->limitOccupancy(MF);
16098   if (ST.isWave32() && !MF.empty()) {
16099     for (auto &MBB : MF) {
16100       for (auto &MI : MBB) {
16101         TII->fixImplicitOperands(MI);
16111   if (ST.needsAlignedVGPRs()) {
16112     for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
16118       if (NewClassID != -1)
16119         MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
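// [Illustrative note] On subtargets where needsAlignedVGPRs() is true, wide
// VGPR/AGPR tuples must start on an even-aligned register, so the loop above
// walks every virtual register and, when getAlignedAGPRClassID() returns a
// valid mapping (VReg_64 -> VReg_64_Align2, AReg_128 -> AReg_128_Align2,
// and so on, per the switch earlier in this listing), retargets the register
// to the aligned class before register allocation.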
16128                                                     const APInt &DemandedElts,
16130                                                     unsigned Depth) const {
16132   unsigned Opc = Op.getOpcode();
16135     unsigned IID = Op.getConstantOperandVal(0);
16137     case Intrinsic::amdgcn_mbcnt_lo:
16138     case Intrinsic::amdgcn_mbcnt_hi: {
16144           IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
16154       Op, Known, DemandedElts, DAG, Depth);
16169   unsigned MaxValue =
16178   switch (MI->getOpcode()) {
16179   case AMDGPU::G_INTRINSIC:
16180   case AMDGPU::G_INTRINSIC_CONVERGENT: {
16183     case Intrinsic::amdgcn_workitem_id_x:
16186     case Intrinsic::amdgcn_workitem_id_y:
16189     case Intrinsic::amdgcn_workitem_id_z:
16192     case Intrinsic::amdgcn_mbcnt_lo:
16193     case Intrinsic::amdgcn_mbcnt_hi: {
16205     case Intrinsic::amdgcn_groupstaticsize: {
16216   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
16219   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
16222   case AMDGPU::G_AMDGPU_SMED3:
16223   case AMDGPU::G_AMDGPU_UMED3: {
16224     auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
16251                                                         unsigned Depth) const {
16253   if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
16259       if (MaybeAlign RetAlign = Attrs.getRetAlignment())
16286   if (Header->getAlignment() != PrefAlign)
16287     return Header->getAlignment();
16289   unsigned LoopSize = 0;
16297       LoopSize += TII->getInstSizeInBytes(MI);
16298       if (LoopSize > 192)
16303   if (LoopSize <= 64)
16306   if (LoopSize <= 128)
16307     return CacheLineAlign;
16313     auto I = Exit->getFirstNonDebugInstr();
16314     if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
16315       return CacheLineAlign;
16324     if (PreTerm == Pre->begin() ||
16325         std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
16329     auto ExitHead = Exit->getFirstNonDebugInstr();
16330     if (ExitHead == Exit->end() ||
16331         ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
16336   return CacheLineAlign;
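// [Illustrative sketch] The loop-alignment logic above sizes the loop body
// with getInstSizeInBytes and then picks an alignment/prefetch strategy by
// byte thresholds. A condensed model of the visible decision points (the
// parameter names stand in for the Align values used by the real hook, and
// the S_INST_PREFETCH insertion for the largest case is elided):
static unsigned pickLoopAlignBytes(unsigned LoopSize, unsigned PrefAlign,
                                   unsigned CacheLineAlign) {
  if (LoopSize > 192)
    return PrefAlign;      // too large to benefit from extra alignment
  if (LoopSize <= 64)
    return PrefAlign;      // small loops: the default alignment is enough
  if (LoopSize <= 128)
    return CacheLineAlign; // fits the prefetched cache lines as-is
  return CacheLineAlign;   // 128..192 bytes: also bracketed with prefetches
}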
16344     N = N->getOperand(0).getNode();
16354   switch (N->getOpcode()) {
16362     if (Reg.isPhysical() || MRI.isLiveIn(Reg))
16363       return !TRI->isSGPRReg(MRI, Reg);
16365     if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
16369     return !TRI->isSGPRReg(MRI, Reg);
16373     unsigned AS = L->getAddressSpace();
16404   if (auto *A = dyn_cast<AtomicSDNode>(N)) {
16406     return A->readMem() && A->writeMem();
16441                                        unsigned Depth) const {
16446     if (Info->getMode().DX10Clamp)
16458   if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
16478          << "Hardware instruction generated for atomic "
16480          << " operation at memory scope " << MemScope;
16484   if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
16485     Type *EltTy = VT->getElementType();
16486     return VT->getNumElements() == 2 &&
16505   if (auto *IT = dyn_cast<IntegerType>(Ty)) {
16506     unsigned BW = IT->getBitWidth();
16507     return BW == 32 || BW == 64;
16519   if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
16521     unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
16522     return BW == 32 || BW == 64;
16529   return VT->getNumElements() == 2 &&
16530          VT->getElementType()->getPrimitiveSizeInBits() == 16;
16540                                           bool HasSystemScope) {
16547   if (HasSystemScope) {
16554   return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
16567   const MDNode *NoaliasAddrSpaceMD =
16568       I->getMetadata(LLVMContext::MD_noalias_addrspace);
16569   if (!NoaliasAddrSpaceMD)
16572   for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E;
16574     auto *Low = mdconst::extract<ConstantInt>(
16577     auto *High = mdconst::extract<ConstantInt>(
16599       DL.getTypeSizeInBits(RMW->getType()) == 64 &&
16612   bool HasSystemScope =
16799     if (HasSystemScope)
16851   if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16852     return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
16853                                  : &AMDGPU::SReg_32RegClass;
16854   if (!TRI->isSGPRClass(RC) && !isDivergent)
16855     return TRI->getEquivalentSGPRClass(RC);
16856   if (TRI->isSGPRClass(RC) && isDivergent)
16857     return TRI->getEquivalentVGPRClass(RC);
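// [Illustrative note] The divergence-based register class selection above:
// a divergent value must live in VGPRs while a uniform value can use SGPRs,
// so an SGPR class is swapped for its VGPR equivalent when the value is
// divergent and vice versa. The special VReg_1 (lowered i1) case maps uniform
// values onto the wave-mask class, SReg_64 on wave64 and SReg_32 on wave32.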
16869                       unsigned WaveSize) {
16874   if (!IT || IT->getBitWidth() != WaveSize)
16877   if (!isa<Instruction>(V))
16879   if (!Visited.insert(V).second)
16881   bool Result = false;
16882   for (const auto *U : V->users()) {
16883     if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
16884       if (V == U->getOperand(1)) {
16885         switch (Intrinsic->getIntrinsicID()) {
16889         case Intrinsic::amdgcn_if_break:
16890         case Intrinsic::amdgcn_if:
16891         case Intrinsic::amdgcn_else:
16896       if (V == U->getOperand(0)) {
16897         switch (Intrinsic->getIntrinsicID()) {
16901         case Intrinsic::amdgcn_end_cf:
16902         case Intrinsic::amdgcn_loop:
16908       Result = hasCFUser(U, Visited, WaveSize);
16917                                                const Value *V) const {
16918   if (const CallInst *CI = dyn_cast<CallInst>(V)) {
16919     if (CI->isInlineAsm()) {
16928       for (auto &TC : TargetConstraints) {
16970   return MRI.hasOneNonDBGUse(N0);
16977   if (I.getMetadata("amdgpu.noclobber"))
16979   if (I.getMetadata("amdgpu.last.use"))
16989   if (!Def->isMachineOpcode())
16999   if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
17000     PhysReg = AMDGPU::SCC;
17002       TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
17011   if (!I->hasOneUse())
17017   switch (I->getOpcode()) {
17018   case Instruction::FMul: {
17019     if (User->getOpcode() != Instruction::FSub &&
17020         User->getOpcode() != Instruction::FAdd)
17025     return ((!I->hasAllowContract() || !User->hasAllowContract()) &&
17084 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
17095 Alignment = RMW->getAlign();
17110 RMW->getType()->isFloatTy();
17113   bool ReturnValueIsUsed = !AI->use_empty();
17122   if (FullFlatEmulation) {
17133   std::prev(BB->end())->eraseFromParent();
17136   Value *LoadedShared = nullptr;
17137   if (FullFlatEmulation) {
17139         Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
17140     Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
17148     LoadedShared = Clone;
17155       Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
17163   Value *LoadedPrivate;
17166         RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
17169         LoadedPrivate, RMW->getValOperand());
17173     auto [ResultLoad, Equal] =
17188   if (FullFlatEmulation) {
17198   if (!FullFlatEmulation) {
17203     MDNode *RangeNotPrivate =
17206     LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
17214   if (ReturnValueIsUsed) {
17217     if (FullFlatEmulation)
17232   if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
17233       ConstVal && ConstVal->isNullValue()) {
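// [Illustrative sketch] The expansion above splits a flat atomicrmw into an
// address-space dispatch. Pseudocode for the control flow that the visible
// is.shared / is.private checks and blocks build (block names are taken from
// the listing; most details are elided):
//
//   if (llvm.amdgcn.is.shared(addr))        // SharedBB
//     result = clone of the atomic, addr cast to the LDS address space
//   else if (llvm.amdgcn.is.private(addr))  // private path
//     v = load addr; result = v; store (v OP val), addr  // non-atomic is fine
//   else                                    // global path
//     result = global atomicrmw, tagged with !noalias.addrspace metadata
//              saying the address is not private
//   phi the per-path results when the atomic's return value is used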
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
#define LLVM_ATTRIBUTE_UNUSED
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
Returns the sub type a function will return at a given Idx. Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx.
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
iv Induction Variable Users
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
unsigned const TargetRegisterInfo * TRI
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static constexpr Register SPReg
const SmallVectorImpl< MachineOperand > & Cond
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static constexpr int Concat[]
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
bool hasCvtPkF16F32Inst() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool hasBF16ConversionInsts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, ...
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
LLVM_READONLY int getExactLog2Abs() const
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
This class represents an incoming formal argument to a Function.
bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
const Function * getParent() const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
Value * getNewValOperand()
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getCompareOperand()
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Min
*p = old <signed v ? old : v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
bool isFPPredicate() const
bool isIntPredicate() const
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Argument * getArg(unsigned i) const
bool hasMemoryAtomicFaddF32DenormalSupport() const
bool hasD16Images() const
bool hasMinimum3Maximum3F32() const
bool useVGPRIndexMode() const
bool hasAtomicDsPkAdd16Insts() const
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
bool hasAtomicFMinFMaxF64FlatInsts() const
bool hasDot7Insts() const
bool hasApertureRegs() const
bool hasFlatInstOffsets() const
bool hasAtomicFMinFMaxF32FlatInsts() const
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasBCNT(unsigned Size) const
bool hasLDSLoadB96_B128() const
Returns true if the target supports global_load_lds_dwordx3/global_load_lds_dwordx4 or buffer_load_dw...
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
bool hasMultiDwordFlatScratchAddressing() const
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
bool hasPrivEnabledTrap2NopBug() const
bool hasUnalignedDSAccessEnabled() const
const SIInstrInfo * getInstrInfo() const override
bool hasDot1Insts() const
bool hasAtomicFaddRtnInsts() const
Align getStackAlignment() const
bool hasScalarSubwordLoads() const
bool enableFlatScratch() const
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
bool supportsGetDoorbellID() const
bool hasFlatAtomicFaddF32Inst() const
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasMinimum3Maximum3PKF16() const
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
const SIFrameLowering * getFrameLowering() const override
bool hasMinimum3Maximum3F16() const
bool hasAtomicFMinFMaxF32GlobalInsts() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
bool isTrapHandlerEnabled() const
bool hasLDSFPAtomicAddF64() const
bool hasFlatGlobalInsts() const
bool getScalarizeGlobalBehavior() const
bool hasScalarSMulU64() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
bool hasNSAEncoding() const
bool hasSMemRealTime() const
bool usePRTStrictNull() const
bool hasAtomicFMinFMaxF64GlobalInsts() const
bool hasUnalignedScratchAccessEnabled() const
bool hasAtomicFlatPkAdd16Insts() const
bool hasUnalignedBufferAccessEnabled() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasImageGather4D16Bug() const
bool hasDot10Insts() const
bool supportsMinMaxDenormModes() const
bool hasAtomicFaddInsts() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const
bool hasAtomicBufferPkAddBF16Inst() const
bool hasAtomicFaddNoRtnInsts() const
bool hasFlatBufferGlobalAtomicFaddF64Inst() const
bool hasScalarDwordx3Loads() const
bool hasLDSFPAtomicAddF32() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
bool hasDot8Insts() const
bool hasDS96AndDS128() const
bool useFlatForGlobal() const
Generation getGeneration() const
bool hasAtomicBufferGlobalPkAddF16Insts() const
bool hasScalarAddSub64() const
bool hasUnpackedD16VMem() const
bool hasAtomicGlobalPkAddBF16Inst() const
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
bool hasPackedTID() const
bool hasAddNoCarry() const
bool hasGWSAutoReplay() const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
int64_t getOffset() const
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
BasicBlock::iterator GetInsertPoint() const
BasicBlock * GetInsertBlock() const
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
LLVMContext & getContext() const
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
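A minimal IRBuilder sketch tying several of the calls above together, assuming Ctx is an LLVMContext and F is a Function* whose first argument is a pointer (hypothetical context):
  IRBuilder<> B(Ctx);
  BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", F);
  BasicBlock *Then  = BasicBlock::Create(Ctx, "then", F);
  BasicBlock *Else  = BasicBlock::Create(Ctx, "else", F);
  B.SetInsertPoint(Entry);
  LoadInst *L = B.CreateAlignedLoad(B.getInt32Ty(), F->getArg(0),
                                    MaybeAlign(4), "val");
  Value *IsZero = B.CreateICmpEQ(L, B.getInt32(0));
  B.CreateCondBr(IsZero, Then, Else);            // branch on the loaded value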
Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
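LLT values are cheap, register-like type descriptors used by GlobalISel; a small sketch (the header path for LLT varies slightly between LLVM releases):
  LLT S32 = LLT::scalar(32);        // 32-bit scalar
  LLT P0  = LLT::pointer(0, 64);    // 64-bit pointer in address space 0
  assert(S32.isScalar() && S32.getScalarSizeInBits() == 32);
  assert(P0.getSizeInBits() == 64);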
This is an important class for using LLVM in a threaded context.
std::optional< StringRef > getSyncScopeName(SyncScope::ID Id) const
getSyncScopeName - Returns the name of a SyncScope::ID registered with LLVMContext,...
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
const MDOperand & getOperand(unsigned I) const
unsigned getNumOperands() const
Return number of MDNode operands.
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
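The MVT helpers above compose as follows (a small self-contained sketch):
  MVT V4I32 = MVT::getVectorVT(MVT::i32, 4);     // MVT::v4i32
  assert(V4I32.isVector() && V4I32.getVectorNumElements() == 4);
  assert(V4I32.getScalarType() == MVT::i32);
  assert(V4I32.getSizeInBits() == 128 && V4I32.getStoreSize() == 16);
  MVT I16 = MVT::getIntegerVT(16);               // MVT::i16
  (void)I16;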
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Align getAlignment() const
Return alignment of the basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
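MachineInstrBuilder methods chain, which is what makes the BuildMI idiom read declaratively; a sketch assuming BB, MI, DL, TII, DstReg and TargetMBB are in scope (hypothetical context):
  BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
      .addImm(42);                 // immediate source operand
  BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_BRANCH))
      .addMBB(TargetMBB);          // branch target basic block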
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
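The flags above are or'd together when allocating a memory operand; a sketch assuming MF (MachineFunction) and PtrInfo (MachinePointerInfo) are in scope (hypothetical context):
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LLT::scalar(32), Align(4));  // 32-bit invariant, dereferenceable load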
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
A Module instance is used to store all the information related to an LLVM module.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns true if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool hasWorkGroupIDZ() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node N (an ISD::FADD, ISD::FSUB, or ISD::FMUL) can be combined with another node to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion using a target-specific method.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns true if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist instruction in then/else to if.
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getRegister(Register Reg, EVT VT)
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
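Most custom lowering reduces to composing getConstant/getNode calls like these; a sketch assuming DAG, DL (an SDLoc) and a 32-bit SDValue X are in scope (hypothetical context):
  SDValue One = DAG.getConstant(1, DL, MVT::i32);
  SDValue Amt = DAG.getShiftAmountConstant(1, MVT::i32, DL);
  SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, X, Amt);
  SDValue Res = DAG.getNode(ISD::OR, DL, MVT::i32, Shl, One);  // (X << 1) | 1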
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
iterator insert(iterator I, T &&Elt)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
StringRef - Represent a constant reference to a string, i.e.
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
constexpr size_t size() const
size - Get the string size.
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
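StringSwitch is the idiom behind getRegisterByName-style lookups; an illustrative sketch assuming RegName is a StringRef (the register names below are examples, not the full set):
  Register Reg = StringSwitch<Register>(RegName)
                     .Case("m0", AMDGPU::M0)
                     .Case("exec_lo", AMDGPU::EXEC_LO)
                     .Case("exec_hi", AMDGPU::EXEC_HI)
                     .Default(Register());      // invalid register if no match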
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
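These configuration hooks are normally called from a target's lowering constructor; an illustrative sketch assuming Subtarget is in scope (the specific types and actions here are examples, not the actual SI configuration):
  addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
  setOperationAction(ISD::BSWAP, MVT::i64, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setBooleanContents(ZeroOrOneBooleanContent);
  setSchedulingPreference(Sched::RegPressure);
  computeRegisterProperties(Subtarget->getRegisterInfo());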
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op: only the DemandedBits bits of its result are ever used downstream; if that knowledge allows Op to be simplified, do so and return true.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
A Use represents the edge between a Value definition and its users.
User * getUser() const
Returns the User that contains this Use.
unsigned getOperandNo() const
Return the operand # of this use in its User.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< use_iterator > uses()
void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
constexpr bool isZero() const
const ParentTy * getParent() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
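An illustrative predicate over the AMDGPUAS enumerators listed above (the helper name is made up): the local, region, private and 32-bit constant segments use 32-bit pointers, while flat, global and (generic) constant addressing is 64-bit.

// AMDGPUAS is declared by the in-tree AMDGPU target headers.
static bool isSegment32BitPointer(unsigned AS) {
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
         AS == AMDGPUAS::PRIVATE_ADDRESS ||
         AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
}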
@ BUFFER_ATOMIC_COND_SUB_U32
@ TBUFFER_LOAD_FORMAT_D16
@ TBUFFER_STORE_FORMAT_D16
@ BUFFER_STORE_FORMAT_D16
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
@ TowardZeroF32_TowardNegativeF64
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
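A small hedged wrapper (the function name is invented) around the inline-literal query above: an operand that is not inlinable has to be materialized as an explicit literal, and HasInv2Pi reflects whether the subtarget accepts the 1/(2*pi) inline value.

#include "Utils/AMDGPUBaseInfo.h" // in-tree header declaring these helpers
static bool needsExplicitLiteral(int64_t Imm, bool HasInv2Pi) {
  return !llvm::AMDGPU::isInlinableLiteral64(Imm, HasInv2Pi);
}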
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SET_ROUNDING
Set rounding mode.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
@ SMULO
Same for multiplication.
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that are the same as FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
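A short sketch of how the ISD opcodes catalogued above are used during lowering: nodes are created with SelectionDAG::getNode, and EXTRACT_ELEMENT with a constant index picks one half of a 64-bit pair. Both helper names below are illustrative.

using namespace llvm;
static SDValue buildMulHiU32(SelectionDAG &DAG, const SDLoc &DL, SDValue A,
                             SDValue B) {
  // High 32 bits of the unsigned 32x32 multiply.
  return DAG.getNode(ISD::MULHU, DL, MVT::i32, A, B);
}
static SDValue getLo32(SelectionDAG &DAG, const SDLoc &DL, SDValue V64) {
  // Index 0 selects the low half of an i64 value.
  return DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, V64,
                     DAG.getConstant(0, DL, MVT::i32));
}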
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
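Illustrative use of the condition-code helpers above: when commuting the operands of a comparison, the predicate has to be swapped along with them (the wrapper name is assumed).

using namespace llvm;
static SDValue buildCommutedSetCC(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
                                  SDValue LHS, SDValue RHS, ISD::CondCode CC) {
  // (X op Y) becomes (Y swapped-op X); the result is equivalent.
  return DAG.getSetCC(DL, VT, RHS, LHS, ISD::getSetCCSwappedOperands(CC));
}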
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
Function * getDeclarationIfExists(Module *M, ID id, ArrayRef< Type * > Tys, FunctionType *FT=nullptr)
This version supports overloaded intrinsics.
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ System
Synchronized with respect to all concurrently executing threads.
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
int popcount(T Value) noexcept
Count the number of set bits in a value.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
testing::Matcher< const detail::ErrorHolder & > Failed()
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
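A few of the MathExtras helpers above, exercised on concrete values so the expected results are visible; the numbers are arbitrary examples.

#include "llvm/Support/MathExtras.h"
#include <cstdint>
static void mathHelperExamples() {
  unsigned Dwords = llvm::divideCeil(70, 32);            // 3 dwords hold 70 bits
  bool IsPow2     = llvm::isPowerOf2_32(64);             // true
  unsigned Log    = llvm::Log2_32(64);                   // 6
  uint32_t Hi     = llvm::Hi_32(0x1234567800000000ULL);  // 0x12345678
  (void)Dwords; (void)IsPow2; (void)Log; (void)Hi;
}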
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
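A sketch of the alignment helpers above: alignTo pads a byte size up to a multiple of an Align, and commonAlignment gives the alignment still guaranteed at a byte offset from an aligned base.

#include "llvm/Support/Alignment.h"
static void alignmentExamples() {
  llvm::Align Base(16);
  uint64_t Padded = llvm::alignTo(20, Base);           // 32
  llvm::Align AtOff = llvm::commonAlignment(Base, 4);  // Align(4)
  (void)Padded; (void)AtOff;
}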
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
static const fltSemantics & IEEEhalf() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
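A sketch of the kind of EVT query lowering code performs with the accessors above; the helper name and the "packed 16-bit" criterion are illustrative.

#include "llvm/CodeGen/ValueTypes.h"
static bool isPacked16BitVector(llvm::EVT VT) {
  // e.g. v2i16, v2f16, v2bf16: 16-bit elements packed into a 32-bit vector.
  return VT.isVector() && VT.getScalarSizeInBits() == 16 &&
         VT.getSizeInBits() == 32;
}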
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
bool isUnknown() const
Returns true if we don't know any bits.
void resetAll()
Resets the known state of all bits.
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
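A hedged example of a KnownBits query as a combine might issue it: ask the DAG what it knows about an i32 operand and test whether the top bits are provably zero.

#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Support/KnownBits.h"
static bool fitsInLow24Bits(llvm::SelectionDAG &DAG, llvm::SDValue Op) {
  llvm::KnownBits Known = DAG.computeKnownBits(Op);
  return Known.countMinLeadingZeros() >= 8; // top 8 bits of an i32 known zero
}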
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
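A sketch building a memory operand from the MachinePointerInfo factories above; the helper name is invented, while the calls are the standard ones for describing a frame-index (spill slot) access.

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
static llvm::MachineMemOperand *makeSpillMMO(llvm::MachineFunction &MF, int FI,
                                             uint64_t Size,
                                             llvm::Align Alignment) {
  llvm::MachinePointerInfo PtrInfo =
      llvm::MachinePointerInfo::getFixedStack(MF, FI);
  return MF.getMachineMemOperand(PtrInfo, llvm::MachineMemOperand::MOStore,
                                 Size, Alignment);
}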
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const