#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "si-lower"
    cl::desc("Do not align and prefetch loops"),
    "amdgpu-use-divergent-register-indexing", cl::Hidden,
    cl::desc("Use indirect register addressing for divergent indexes"),
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
      return AMDGPU::SGPR0 + Reg;
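// This loop belongs to a helper that scans SGPR_32 and returns the first
// scalar register not yet allocated for argument passing; the allocation
// check itself was lost in extraction.  A sketch, under that assumption:
//   for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg)
//     if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg))
//       return AMDGPU::SGPR0 + Reg;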
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},
                     {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
                     {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
                      MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
                      MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
                     {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
                      MVT::v3i16, MVT::v4i16, MVT::Other},
                     {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
  for (MVT VT :
       {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
        MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
        MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
        MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
        MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
        MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
        MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
        MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
                     {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
                     {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
                     {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
                     {MVT::f32, MVT::f64}, Legal);
  for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
                 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
                 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {

                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
  for (MVT VT : {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
                 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
                       {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
                       {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
                        MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
                        MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
                       {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
                       {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
                        MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                        MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
                        MVT::v32f16, MVT::v32bf16},
                       {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
                       {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
                        MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
                       {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
                        MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
                        MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
                        MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
                       {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
                        MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
                        MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                        MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
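  // The type lists above are arguments to the usual setOperationAction /
  // setTruncStoreAction style calls in the SITargetLowering constructor; the
  // call names themselves were lost in extraction.  The general pattern is:
  //   setOperationAction(ISD::EXTRACT_SUBVECTOR,
  //                      {MVT::v2f16, MVT::v2i16, /* ... */}, Custom);
  // i.e. one action (Legal/Custom/Expand) applied to a whole list of MVTs.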
static const MCPhysReg RCRegs[] = {AMDGPU::MODE};

                                        EVT DestVT, EVT SrcVT) const {

                                        LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&

    return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);

  return VT.isInteger() ? MVT::i32 : MVT::f32;

    return (NumElts + 1) / 2;

  return NumElts * ((Size + 31) / 32);

    unsigned &NumIntermediates, MVT &RegisterVT) const {
    if (ScalarVT == MVT::bf16) {
      RegisterVT = MVT::i32;
      IntermediateVT = MVT::v2bf16;
      RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
      IntermediateVT = RegisterVT;
    NumIntermediates = (NumElts + 1) / 2;
    return NumIntermediates;

    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts;
    return NumIntermediates;

  if (Size < 16 && Subtarget->has16BitInsts()) {
    RegisterVT = MVT::i16;
    IntermediateVT = ScalarVT;
    NumIntermediates = NumElts;
    return NumIntermediates;

    RegisterVT = MVT::i32;
    IntermediateVT = ScalarVT;
    NumIntermediates = NumElts;
    return NumIntermediates;

    RegisterVT = MVT::i32;
    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts * ((Size + 31) / 32);
    return NumIntermediates;

      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
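  // This routine breaks a vector argument into the register type the AMDGPU
  // calling convention actually uses: 16-bit element vectors are packed two
  // per 32-bit register (hence NumIntermediates = (NumElts + 1) / 2), while
  // everything else is widened to whole 32-bit registers, i.e.
  // NumElts * ceil(Size / 32) pieces, before falling back to the generic
  // TargetLowering breakdown.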
                               unsigned MaxNumLanes) {
  assert(MaxNumLanes != 0);

  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());

                               unsigned MaxNumLanes) {
  auto *ST = dyn_cast<StructType>(Ty);
  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));

      DL.getPointerSizeInBits(AS) == 192)
      DL.getPointerSizeInBits(AS) == 160) ||
      DL.getPointerSizeInBits(AS) == 192))
                                          unsigned IntrID) const {
  if (CI.hasMetadata(LLVMContext::MD_invariant_load))

  if (RsrcIntr->IsImage) {
    if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
      Info.ptrVal = RsrcArg;

  bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
    if (RsrcIntr->IsImage) {
      unsigned MaxNumLanes = 4;
          std::numeric_limits<unsigned>::max());
    if (RsrcIntr->IsImage) {
      unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
    if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
      Info.memVT = MVT::i32;
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
        std::numeric_limits<unsigned>::max());
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.ptrVal = nullptr;
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
  case Intrinsic::amdgcn_global_atomic_csub: {
  case Intrinsic::amdgcn_image_bvh_intersect_ray: {
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_atomic_cond_sub_u32: {
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64: {
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
    Info.memVT = MVT::i32;
    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
  case Intrinsic::amdgcn_global_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
  case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
    Info.memVT = MVT::i32;
  case Intrinsic::amdgcn_s_prefetch_data: {
  case Intrinsic::amdgcn_addrspacecast_nonnull: {
    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();
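  // The big switch above is part of getTgtMemIntrinsic(), which fills an
  // IntrinsicInfo record (memVT, ptrVal, access flags) for each
  // memory-accessing AMDGCN intrinsic so that generic SelectionDAG code can
  // attach an accurate MachineMemOperand to it.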
                                            Type *&AccessTy) const {
  switch (II->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_csub:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
    Ptr = II->getArgOperand(0);
  case Intrinsic::amdgcn_global_load_lds:
    Ptr = II->getArgOperand(1);
  AccessTy = II->getType();

                                             unsigned AddrSpace) const {
  return AM.Scale == 0 &&
             AM.BaseOffs, AddrSpace, FlatVariant));
    return isLegalMUBUFAddressingMode(AM);

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
  if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
  if (AM.HasBaseReg) {
    return isLegalMUBUFAddressingMode(AM);
  if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
               : isLegalMUBUFAddressingMode(AM);
    unsigned Size, unsigned AddrSpace, Align Alignment,
    Align RequiredAlignment(
        Alignment < RequiredAlignment)
      RequiredAlignment = Align(4);
        *IsFast = (Alignment >= RequiredAlignment) ? 64
                  : (Alignment < Align(4))         ? 32
        *IsFast = (Alignment >= RequiredAlignment) ? 96
                  : (Alignment < Align(4))         ? 32
      RequiredAlignment = Align(8);
        *IsFast = (Alignment >= RequiredAlignment) ? 128
                  : (Alignment < Align(4))         ? 32
      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
    return Alignment >= RequiredAlignment ||
    bool AlignedBy4 = Alignment >= Align(4);
      *IsFast = AlignedBy4;
    return Alignment >= Align(4) ||
  return Size >= 32 && Alignment >= Align(4);

                                                    unsigned *IsFast) const {
                                            Alignment, Flags, IsFast);
  if (Op.size() >= 16 &&
  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
  const MemSDNode *MemNode = cast<MemSDNode>(N);

                                           unsigned DestAS) const {
  return TM.isNoopAddrSpaceCast(SrcAS, DestAS);

                                               unsigned Index) const {
  auto [InputPtrReg, RC, ArgTy] =
      Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

                                          const SDLoc &SL) const {

                                          const SDLoc &SL) const {
  std::optional<uint32_t> KnownSize =
  if (KnownSize.has_value())
    Val = getFPExtOrFPRound(DAG, Val, SL, VT);

SDValue SITargetLowering::lowerKernargMemParameter(
  int64_t OffsetDiff = Offset - AlignDownOffset;
  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
  ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
      ExtType, SL, VA.getLocVT(), Chain, FIN,
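// lowerKernargMemParameter() loads a kernel argument from the kernarg
// segment: the argument offset is rounded down to a 4-byte boundary, an
// aligned load is emitted at the pointer built by lowerKernArgParameterPtr(),
// and convertArgType() then shifts/extends the loaded dword to recover a
// small or misaligned argument value.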
SDValue SITargetLowering::getPreloadedValue(
    Reg = &WorkGroupIDX;
    RC = &AMDGPU::SReg_32RegClass;
    Reg = &WorkGroupIDY;
    RC = &AMDGPU::SReg_32RegClass;
    Reg = &WorkGroupIDZ;
    RC = &AMDGPU::SReg_32RegClass;

  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
           "vector type argument should have been split");
      bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
             "unexpected vector split in ps argument type");
        Info->markPSInputAllocated(PSInputNum);
        Info->markPSInputEnabled(PSInputNum);
  if (Info.hasWorkItemIDX()) {
  if (Info.hasWorkItemIDY()) {
    Info.setWorkItemIDY(
    unsigned Reg = AMDGPU::VGPR1;
  if (Info.hasWorkItemIDZ()) {
    Info.setWorkItemIDZ(
    unsigned Reg = AMDGPU::VGPR2;

  if (RegIdx == ArgVGPRs.size()) {
  unsigned Reg = ArgVGPRs[RegIdx];
  assert(Reg != AMDGPU::NoRegister);

                              unsigned NumArgRegs) {
  if (RegIdx == ArgSGPRs.size())
  unsigned Reg = ArgSGPRs[RegIdx];
  assert(Reg != AMDGPU::NoRegister);
  assert(Reg != AMDGPU::NoRegister);
  const unsigned Mask = 0x3ff;
  if (Info.hasWorkItemIDX()) {
    Info.setWorkItemIDX(Arg);
  if (Info.hasWorkItemIDY()) {
    Info.setWorkItemIDY(Arg);
  if (Info.hasWorkItemIDZ())

  const unsigned Mask = 0x3ff;
  if (Info.hasImplicitArgPtr())
  if (Info.hasWorkGroupIDX())
  if (Info.hasWorkGroupIDY())
  if (Info.hasWorkGroupIDZ())
  if (Info.hasLDSKernelId())
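  // Mask = 0x3ff reflects how the three workitem IDs are packed into a single
  // VGPR on targets with packed work-item IDs: bits [9:0] hold the X id,
  // [19:10] the Y id and [29:20] the Z id, so each component is extracted
  // with a shift followed by this 10-bit mask.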
  MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
  MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
  MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
  MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
  MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
  MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
  MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
  bool InPreloadSequence = true;
  bool AlignedForImplictArgs = false;
  unsigned ImplicitArgOffset = 0;
  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())

    unsigned ArgIdx = Arg.getArgNo();
    if (InIdx < Ins.size() &&
        (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))

    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           Ins[InIdx].getOrigArgIndex() == ArgIdx;
      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];
      unsigned ArgOffset = ArgLoc.getLocMemOffset();
      unsigned NumAllocSGPRs =
          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;

      if (Arg.hasAttribute("amdgpu-hidden-argument")) {
        if (!AlignedForImplictArgs) {
              alignTo(LastExplicitArgOffset,
                  LastExplicitArgOffset;
          AlignedForImplictArgs = true;
        ArgOffset += ImplicitArgOffset;

      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        assert(InIdx >= 1 && "No previous SGPR");
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);

      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
        InPreloadSequence = false;

          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);

      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {

      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;

  if (Info.hasLDSKernelId()) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
                                                   bool IsShader) const {
    assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");

  unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
  unsigned NumRequiredSystemSGPRs =
      Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
      Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
  for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDY()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDZ()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupInfo()) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    unsigned PrivateSegmentWaveByteOffsetReg;
      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);

         Info.getNumPreloadedSGPRs() >= 16);
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);
    HasStackObjects = true;

  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
      Info.setScratchRSrcReg(ReservedBufferReg);

    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);

      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
        if (!MRI.isLiveIn(Reg)) {
          Info.setStackPtrOffsetReg(Reg);

  if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)

  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);

  return !Info->isEntryFunction();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;
    Entry->addLiveIn(*I);
    for (auto *Exit : Exits)
              TII->get(TargetOpcode::COPY), *I)

        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());

           !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
           !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
           !Info->hasWorkGroupIDZ());
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);

    unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
    if ((PsInputBits & 0x7F) == 0 ||
        ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
  } else if (IsKernel) {
    Splits.append(Ins.begin(), Ins.end());
  } else if (!IsGraphics) {
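  // The hardware requires a pixel shader to enable at least one input: if no
  // PERSP input (bits 0-6 of PSInputAddr) is used, and the shader is not
  // already relying on input 11, input 0 is force-allocated and enabled here
  // so the SPI input-enable fields never end up empty.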
  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
    if (IsEntryFunc && VA.isMemLoc()) {
      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
        int64_t OffsetDiff = Offset - AlignDownOffset;
            Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
        NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
                                Ins[i].Flags.isSExt(), &Ins[i]);
            Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
        if (PreloadRegs.size() == 1) {
          Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
              TRI->getRegSizeInBits(*RC)));
          for (auto Reg : PreloadRegs) {
                               PreloadRegs.size()),
          NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
                                  Ins[i].Flags.isSExt(), &Ins[i]);
            "hidden argument in kernel signature was not preloaded",
          lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
                                   Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
          dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));

    if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
      if (AMDGPU::VGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::VGPR_32RegClass;
      else if (AMDGPU::SGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::SGPR_32RegClass;

  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain
  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
  unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
    if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))

  bool IsWaveEnd = Info->returnsVoid() && IsShader;
  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {
    SDValue Arg = OutVals[RealRVLocIdx];

  if (!Info->isEntryFunction()) {
      if (AMDGPU::SReg_64RegClass.contains(*I))
      else if (AMDGPU::SReg_32RegClass.contains(*I))

  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
  auto &ArgUsageInfo =
  CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);

    const auto [OutgoingArg, ArgRC, ArgTy] =
    const auto [IncomingArg, IncomingArgRC, Ty] =
    assert(IncomingArgRC == ArgRC);

    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
      InputReg = getImplicitArgPtr(DAG, DL);
      std::optional<uint32_t> Id =
      if (Id.has_value()) {

    if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
      unsigned SpecialArgOffset =

  auto [OutgoingArg, ArgRC, Ty] =
    std::tie(OutgoingArg, ArgRC, Ty) =
    std::tie(OutgoingArg, ArgRC, Ty) =

  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
                             : IncomingArgY ? *IncomingArgY

  if (OutgoingArg->isRegister()) {
    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);

  if (Callee->isDivergent())

  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (!CallerPreserved)

  bool CCMatch = CallerCC == CalleeCC;
    if (Arg.hasByValAttr())

    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))

  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

  for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
    if (!CCVA.isRegLoc())
    if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
          dbgs() << "Cannot tail call due to divergent outgoing argument in "
  if (IsChainCallConv) {
    RequestedExec = CLI.Args.back();
    assert(RequestedExec.Node && "No node for EXEC");

    assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
    CLI.Outs.pop_back();

    assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
    CLI.Outs.pop_back();

           "Haven't popped all the pieces of the EXEC mask");

  bool IsSibCall = false;
        "unsupported call to variadic function ");
        "unsupported required tail call to function ");
                                                    Outs, OutVals, Ins, DAG);
           "site marked musttail or on llvm.amdgcn.cs.chain");

  if (!TailCallOpt && IsTailCall)
  if (!IsSibCall || IsChainCallConv) {
    RegsToPass.emplace_back(IsChainCallConv
                                ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                                : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,

  const unsigned NumSpecialInputs = RegsToPass.size();

  MVT PtrVT = MVT::i32;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
      RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
      int32_t Offset = LocMemOffset;
      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
                            ? Flags.getNonZeroByValAlign()
      if (Outs[i].Flags.isByVal()) {
            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
                              Outs[i].Flags.getNonZeroByValAlign(),
                              nullptr, std::nullopt, DstInfo,
            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);

  if (!MemOpChains.empty())

  unsigned ArgIdx = 0;
  for (auto [Reg, Val] : RegsToPass) {
    if (ArgIdx++ >= NumSpecialInputs &&
        (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
  if (IsTailCall && !IsSibCall) {

  std::vector<SDValue> Ops({Chain});
    Ops.push_back(Callee);
    Ops.push_back(Callee);

  if (IsChainCallConv)
    Ops.push_back(RequestedExec.Node);

  for (auto &[Reg, Val] : RegsToPass)

  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");
                               MVT::Glue, GlueOps),
    Ops.push_back(InGlue);

    return DAG.getNode(OPC, DL, MVT::Other, Ops);

  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);

  uint64_t CalleePopBytes = NumBytes;
  EVT VT = Op.getValueType();
  Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
         "Stack grows upwards for AMDGPU");
  Chain = BaseAddr.getValue(1);
  if (Alignment > StackAlign) {
    uint64_t StackAlignMask = ScaledAlignment - 1;

  assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
  if (isa<ConstantSDNode>(Size)) {

  if (Op.getValueType() != MVT::i32)

  assert(Op.getValueType() == MVT::i32);
                              Op.getOperand(0), IntrinID, GetRoundBothImm);
  SDValue RoundModeTimesNumBits =
                                     TableEntry, EnumOffset);

  if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
        static_cast<uint32_t>(ConstMode->getZExtValue()),

    if (UseReducedTable) {
      SDValue RoundModeTimesNumBits =
      SDValue RoundModeTimesNumBits =
      NewMode = TruncTable;
                              ReadFirstLaneID, NewMode);
                           IntrinID, RoundBothImm, NewMode);

  if (Op->isDivergent())

  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = Src.getValueType();
  EVT DstVT = Op.getValueType();

  if (Op.getValueType() != MVT::i64)
                             Op.getOperand(0), IntrinID, ModeHwRegImm);
                             Op.getOperand(0), IntrinID, TrapHwRegImm);

  if (Op.getOperand(1).getValueType() != MVT::i64)
                                        ReadFirstLaneID, NewModeReg);
                                        ReadFirstLaneID, NewTrapReg);
  unsigned ModeHwReg =
  unsigned TrapHwReg =
                           IntrinID, ModeHwRegImm, NewModeReg);
                           IntrinID, TrapHwRegImm, NewTrapReg);
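  // GET_ROUNDING / SET_ROUNDING are lowered by reading or writing the MODE
  // register (s_getreg/s_setreg) and translating between the hardware's
  // separate f32 and f64/f16 rounding fields and the generic FLT_ROUNDS
  // encoding with a small constant lookup table: roughly, the table is
  // shifted right by (hwmode * 4) and the low nibble is the result.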
               .Case("m0", AMDGPU::M0)
               .Case("exec", AMDGPU::EXEC)
               .Case("exec_lo", AMDGPU::EXEC_LO)
               .Case("exec_hi", AMDGPU::EXEC_HI)
               .Case("flat_scratch", AMDGPU::FLAT_SCR)
               .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
               .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)

  if (Reg == AMDGPU::NoRegister) {
                                 "\" for subtarget."));

  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:
  case AMDGPU::FLAT_SCR:

  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
static std::pair<MachineBasicBlock *, MachineBasicBlock *>
  auto Next = std::next(I);

  return std::pair(LoopBB, RemainderBB);

  auto I = MI.getIterator();
  auto E = std::next(I);

  Src->setIsKill(false);

  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))

  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
                                 unsigned InitReg, unsigned ResultReg,
                                 unsigned PhiReg, unsigned InitSaveExecReg,
                                 int Offset, bool UseGPRIdxMode,

  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
  Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)

          TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
                                 : AMDGPU::S_AND_SAVEEXEC_B64),

  MRI.setSimpleHint(NewExec, CondReg);

  if (UseGPRIdxMode) {
      SGPRIdxReg = CurrentIdxReg;
      SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)

  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
          TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
                                 : AMDGPU::S_XOR_B64_term),
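  // This is the classic "waterfall" loop used for a divergent vector index:
  // v_readfirstlane picks the index of the first active lane, v_cmp_eq plus
  // s_and_saveexec restricts EXEC to all lanes sharing that index, the
  // indexed access is done once for that group, and the s_xor_*_term clears
  // those lanes from EXEC so the loop repeats until every lane is serviced.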
                             unsigned InitResultReg, unsigned PhiReg,
                             int Offset, bool UseGPRIdxMode,
                             Register &SGPRIdxReg) {

  const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode, SGPRIdxReg);

  LoopBB->removeSuccessor(RemainderBB);
  LoopBB->addSuccessor(LandingPad);

static std::pair<unsigned, int>
  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;

  return std::pair(AMDGPU::sub0, Offset);
  assert(Idx->getReg() != AMDGPU::NoRegister);
    return Idx->getReg();

  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)

  MI.eraseFromParent();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (Idx->getReg() == AMDGPU::NoRegister) {
    MI.eraseFromParent();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
      const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
          TRI.getRegSizeInBits(*VecRC), 32, false);
    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(VecRC);
                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
        TRI.getRegSizeInBits(*VecRC), 32, false);
    BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)

  MI.eraseFromParent();
  bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));

  Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
  Register InitalValReg = MRI.createVirtualRegister(DstRegClass);

  Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
  Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
  Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);

  Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
  Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);

  bool IsWave32 = ST.isWave32();
  unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

      (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;

  BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)

  I = ComputeLoop->end();
  auto Accumulator =
      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
  auto ActiveBits =
      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
          .addReg(TmpSReg->getOperand(0).getReg())

  unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
  auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
                 .addReg(ActiveBits->getOperand(0).getReg());
  auto LaneValue = BuildMI(*ComputeLoop, I, DL,
                           TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
                       .addReg(FF1->getOperand(0).getReg());
  auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
                            .addReg(LaneValue->getOperand(0).getReg());

  unsigned BITSETOpc =
      IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
  auto NewActiveBits =
      BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
          .addReg(FF1->getOperand(0).getReg())
          .addReg(ActiveBits->getOperand(0).getReg());

  Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
      .addMBB(ComputeLoop);
  ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
      .addMBB(ComputeLoop);

  unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
      .addReg(NewActiveBits->getOperand(0).getReg())
  BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))

  MI.eraseFromParent();
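  // The ComputeLoop built here performs a wave-wide reduction of a VGPR value
  // one lane at a time: s_ff1 finds the lowest set bit of the remaining EXEC
  // mask, v_readlane reads that lane's value, the scalar min/max (Opc) folds
  // it into the accumulator, s_bitset0 clears the lane, and s_cbranch_scc1
  // loops until no active lanes remain.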
  switch (MI.getOpcode()) {
  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
  case AMDGPU::S_UADDO_PSEUDO:
  case AMDGPU::S_USUBO_PSEUDO: {
    unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
                       ? AMDGPU::S_ADD_I32
                       : AMDGPU::S_SUB_I32;
    MI.eraseFromParent();
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO: {
    bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
      unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
      Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
          MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
          MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
          MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
          MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
      unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
      unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
    MI.eraseFromParent();
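    // When the subtarget has no native S_ADD_U64/S_SUB_U64, the 64-bit pseudo
    // is split into 32-bit halves: s_add_u32 / s_sub_u32 on the low half
    // produces a carry in SCC, s_addc_u32 / s_subb_u32 consumes it for the
    // high half, and the two results are recombined with REG_SEQUENCE.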
  case AMDGPU::V_ADD_U64_PSEUDO:
  case AMDGPU::V_SUB_U64_PSEUDO: {
    bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);

    if (IsAdd && ST.hasLshlAddB64()) {
      TII->legalizeOperands(*Add);
      MI.eraseFromParent();

    const auto *CarryRC = TRI->getWaveMaskRegClass();

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    Register CarryReg = MRI.createVirtualRegister(CarryRC);
    Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);

                                         : &AMDGPU::VReg_64RegClass;
                                         : &AMDGPU::VReg_64RegClass;

        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);

        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);

        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;

    unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;

    TII->legalizeOperands(*LoHalf);
    TII->legalizeOperands(*HiHalf);
    MI.eraseFromParent();
  case AMDGPU::S_ADD_CO_PSEUDO:
  case AMDGPU::S_SUB_CO_PSEUDO: {
    unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
                       ? AMDGPU::S_ADDC_U32
                       : AMDGPU::S_SUBB_U32;

      Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
      Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
    Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)

    unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
    assert(WaveSize == 64 || WaveSize == 32);

    if (WaveSize == 64) {
      if (ST.hasScalarCompareEq64()) {
            TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
            MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
            MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
        Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)

        (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
    MI.eraseFromParent();
  case AMDGPU::SI_INIT_M0: {
            TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .add(MI.getOperand(0));
    MI.eraseFromParent();
  case AMDGPU::GET_GROUPSTATICSIZE: {
        .add(MI.getOperand(0))
    MI.eraseFromParent();
  case AMDGPU::GET_SHADERCYCLESHILO: {
    using namespace AMDGPU::Hwreg;
    Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
    Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .add(MI.getOperand(0))
    MI.eraseFromParent();
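    // GET_SHADERCYCLESHILO reads SHADER_CYCLES_HI, then SHADER_CYCLES, then
    // SHADER_CYCLES_HI again; if the two high readings match, the low reading
    // did not wrap in between, so the pair forms a consistent 64-bit cycle
    // count (the classic hi/lo/hi pattern for reading a split counter).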
  case AMDGPU::SI_INDIRECT_SRC_V1:
  case AMDGPU::SI_INDIRECT_SRC_V2:
  case AMDGPU::SI_INDIRECT_SRC_V4:
  case AMDGPU::SI_INDIRECT_SRC_V8:
  case AMDGPU::SI_INDIRECT_SRC_V9:
  case AMDGPU::SI_INDIRECT_SRC_V10:
  case AMDGPU::SI_INDIRECT_SRC_V11:
  case AMDGPU::SI_INDIRECT_SRC_V12:
  case AMDGPU::SI_INDIRECT_SRC_V16:
  case AMDGPU::SI_INDIRECT_SRC_V32:
  case AMDGPU::SI_INDIRECT_DST_V1:
  case AMDGPU::SI_INDIRECT_DST_V2:
  case AMDGPU::SI_INDIRECT_DST_V4:
  case AMDGPU::SI_INDIRECT_DST_V8:
  case AMDGPU::SI_INDIRECT_DST_V9:
  case AMDGPU::SI_INDIRECT_DST_V10:
  case AMDGPU::SI_INDIRECT_DST_V11:
  case AMDGPU::SI_INDIRECT_DST_V12:
  case AMDGPU::SI_INDIRECT_DST_V16:
  case AMDGPU::SI_INDIRECT_DST_V32:
  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
  case AMDGPU::SI_KILL_I1_PSEUDO:
  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    Register SrcCond = MI.getOperand(3).getReg();

    Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    const auto *CondRC = TRI->getWaveMaskRegClass();
    Register SrcCondCopy = MRI.createVirtualRegister(CondRC);

                                       : &AMDGPU::VReg_64RegClass;
                                       : &AMDGPU::VReg_64RegClass;

        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);

        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);

    MI.eraseFromParent();
  case AMDGPU::SI_BR_UNDEF: {
        .add(MI.getOperand(0));
    MI.eraseFromParent();
  case AMDGPU::ADJCALLSTACKUP:
  case AMDGPU::ADJCALLSTACKDOWN: {
  case AMDGPU::SI_CALL_ISEL: {
    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);

    MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);

    MI.eraseFromParent();
  case AMDGPU::V_ADD_CO_U32_e32:
  case AMDGPU::V_SUB_CO_U32_e32:
  case AMDGPU::V_SUBREV_CO_U32_e32: {
    unsigned Opc = MI.getOpcode();

    bool NeedClampOperand = false;
    if (TII->pseudoToMCOpcode(Opc) == -1) {
      NeedClampOperand = true;

    if (TII->isVOP3(*I)) {
    I.add(MI.getOperand(1)).add(MI.getOperand(2));
    if (NeedClampOperand)

    TII->legalizeOperands(*I);

    MI.eraseFromParent();
  case AMDGPU::V_ADDC_U32_e32:
  case AMDGPU::V_SUBB_U32_e32:
  case AMDGPU::V_SUBBREV_U32_e32:
    TII->legalizeOperands(MI);
  case AMDGPU::DS_GWS_INIT:
  case AMDGPU::DS_GWS_SEMA_BR:
  case AMDGPU::DS_GWS_BARRIER:
    TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
  case AMDGPU::DS_GWS_SEMA_V:
  case AMDGPU::DS_GWS_SEMA_P:
  case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
  case AMDGPU::S_SETREG_B32: {
    const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
    const unsigned SetMask = WidthMask << Offset;

    unsigned SetDenormOp = 0;
    unsigned SetRoundOp = 0;
      SetRoundOp = AMDGPU::S_ROUND_MODE;
      SetDenormOp = AMDGPU::S_DENORM_MODE;
      SetRoundOp = AMDGPU::S_ROUND_MODE;
      SetDenormOp = AMDGPU::S_DENORM_MODE;

    if (SetRoundOp || SetDenormOp) {
      if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
        unsigned ImmVal = Def->getOperand(1).getImm();
    MI.eraseFromParent();
    MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
  case AMDGPU::S_INVERSE_BALLOT_U32:
  case AMDGPU::S_INVERSE_BALLOT_U64:
    MI.setDesc(TII->get(AMDGPU::COPY));
  case AMDGPU::ENDPGM_TRAP: {
      MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
    MI.eraseFromParent();
  case AMDGPU::SIMULATED_TRAP: {
    TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
    MI.eraseFromParent();
  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;

  EVT VT = N->getValueType(0);
  if (VT == MVT::f16) {

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
         VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
         VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);

      DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
      DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
         VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
         VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
         VT == MVT::v32bf16);
                                  : std::pair(Op0, Op0);

      DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
      DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
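// These helpers lower a wide vector operation by splitting every operand into
// low and high halves, emitting the same opcode on each half, and
// concatenating the two results; this is how the larger vector types listed
// in the asserts above are handled for unary, binary and ternary ops.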
  switch (Op.getOpcode()) {
    return LowerBRCOND(Op, DAG);
    return LowerRETURNADDR(Op, DAG);
    assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    EVT VT = Op.getValueType();
      return lowerFSQRTF32(Op, DAG);
      return lowerFSQRTF64(Op, DAG);
    return LowerTrig(Op, DAG);
    return LowerSELECT(Op, DAG);
    return LowerFDIV(Op, DAG);
    return LowerFFREXP(Op, DAG);
    return LowerATOMIC_CMP_SWAP(Op, DAG);
    return LowerSTORE(Op, DAG);
    return LowerGlobalAddress(MFI, Op, DAG);
    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
    return LowerINTRINSIC_W_CHAIN(Op, DAG);
    return LowerINTRINSIC_VOID(Op, DAG);
    return lowerADDRSPACECAST(Op, DAG);
    return lowerINSERT_SUBVECTOR(Op, DAG);
    return lowerINSERT_VECTOR_ELT(Op, DAG);
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
    return lowerVECTOR_SHUFFLE(Op, DAG);
    return lowerSCALAR_TO_VECTOR(Op, DAG);
    return lowerBUILD_VECTOR(Op, DAG);
    return lowerFP_ROUND(Op, DAG);
    return lowerTRAP(Op, DAG);
    return lowerDEBUGTRAP(Op, DAG);
    return lowerFMINNUM_FMAXNUM(Op, DAG);
    return lowerFLDEXP(Op, DAG);
    return lowerMUL(Op, DAG);
    return lowerXMULO(Op, DAG);
    return lowerXMUL_LOHI(Op, DAG);
  EVT FittingLoadVT = LoadVT;

SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
                                              bool IsIntrinsic) const {
  EVT LoadVT = M->getValueType(0);

  EVT EquivLoadVT = LoadVT;
      M->getMemoryVT(), M->getMemOperand());

  EVT LoadVT = M->getValueType(0);

  assert(M->getNumValues() == 2 || M->getNumValues() == 3);
  bool IsTFE = M->getNumValues() == 3;

    return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),

  return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
                             M->getMemOperand(), DAG);

  SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
                                        M->getMemOperand(), DAG);
  EVT VT = N->getValueType(0);
  unsigned CondCode = N->getConstantOperandVal(3);

  EVT CmpVT = LHS.getValueType();
  if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
    unsigned PromoteOp =

  EVT VT = N->getValueType(0);

  unsigned CondCode = N->getConstantOperandVal(3);
  if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {

  EVT VT = N->getValueType(0);
                     Src.getOperand(1), Src.getOperand(2));

    Exec = AMDGPU::EXEC_LO;
    Exec = AMDGPU::EXEC;

  EVT VT = N->getValueType(0);
  unsigned IID = N->getConstantOperandVal(0);
  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;
  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;
  unsigned SplitSize = 32;
  if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
      ST->hasDPALU_DPP() &&

  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_update_dpp:
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_set_inactive:
  case Intrinsic::amdgcn_set_inactive_chain_arg:
  case Intrinsic::amdgcn_mov_dpp8:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_permlane64:

  if (SDNode *GL = N->getGluedNode()) {
    GL = GL->getOperand(0).getNode();

  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IID == Intrinsic::amdgcn_mov_dpp8 ||
      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    Src1 = N->getOperand(2);
    if (IID == Intrinsic::amdgcn_writelane ||
        IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
      Src2 = N->getOperand(3);

  if (ValSize == SplitSize) {
    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    if (IID == Intrinsic::amdgcn_writelane) {
    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
    return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;

  if (ValSize % SplitSize != 0)

    EVT VT = N->getValueType(0);
    unsigned NumOperands = N->getNumOperands();
    SDNode *GL = N->getGluedNode();

    for (unsigned i = 0; i != NE; ++i) {
      for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
        SDValue Operand = N->getOperand(j);

  if (SplitSize == 32) {
    return unrollLaneOp(LaneOp.getNode());

  unsigned SubVecNumElt =
  SDValue Src0SubVec, Src1SubVec, Src2SubVec;
  for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
    if (IID == Intrinsic::amdgcn_writelane)
        IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
            ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
            : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
    EltIdx += SubVecNumElt;

  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
  if (IID == Intrinsic::amdgcn_writelane)

  SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
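  // Lane intrinsics (readlane, writelane, permlane, update_dpp, set_inactive,
  // ...) only operate on 32-bit pieces, so wider values are bitcast to a
  // vector of 32-bit (or, with DPALU_DPP, 64-bit) elements, the lane op is
  // emitted per sub-vector with createLaneOp, and the pieces are reassembled
  // and bitcast back to the original type.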
  switch (N->getOpcode()) {
    unsigned IID = N->getConstantOperandVal(0);
    case Intrinsic::amdgcn_make_buffer_rsrc:
      Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
    case Intrinsic::amdgcn_cvt_pkrtz: {
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16: {
      if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
      else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
      else if (IID == Intrinsic::amdgcn_cvt_pk_i16)

      EVT VT = N->getValueType(0);
    case Intrinsic::amdgcn_s_buffer_load: {
      EVT VT = Op.getValueType();
      assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
      if (!Offset->isDivergent()) {
        LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);

    for (unsigned I = 0; I < Res.getNumOperands(); I++) {
      Results.push_back(Res.getOperand(I));
    Results.push_back(Res.getValue(1));

    EVT VT = N->getValueType(0);
    EVT SelectVT = NewVT;
    if (NewVT.bitsLT(MVT::i32)) {
      SelectVT = MVT::i32;
    if (NewVT != SelectVT)
    if (N->getValueType(0) != MVT::v2f16)
    if (N->getValueType(0) != MVT::v2f16)
    if (N->getValueType(0) != MVT::f16)
    if (U.get() != Value)
    if (U.getUser()->getOpcode() == Opcode)

unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
  switch (Intr->getConstantOperandVal(1)) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else:
  case Intrinsic::amdgcn_loop:
  case Intrinsic::amdgcn_end_cf:
  SDNode *Intr = BRCOND.getOperand(1).getNode();
  assert(BR && "brcond missing unconditional branch user");
  Target = BR->getOperand(1);

  unsigned CFNode = isCFIntrinsic(Intr);
  Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());

  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
                  Intr->getOperand(0));

  MVT VT = Op.getSimpleValueType();
  if (Op.getConstantOperandVal(0) != 0)
  if (Info->isEntryFunction())
  return Op.getValueType().bitsLE(VT)

  assert(Op.getValueType() == MVT::f16 &&
         "Do not know how to custom lower FP_ROUND for non-f16 type");
  EVT SrcVT = Src.getValueType();
  if (SrcVT != MVT::f64)

  EVT VT = Op.getValueType();
  bool IsIEEEMode = Info->getMode().IEEE;
  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||

  EVT VT = Op.getValueType();
  EVT ExpVT = Exp.getValueType();
  if (ExpVT == MVT::i16)
                     {Op.getOperand(0), Op.getOperand(1), TruncExp});
  switch (Op->getOpcode()) {

                                              DAGCombinerInfo &DCI) const {
  const unsigned Opc = Op.getOpcode();
                         : Op->getOperand(0).getValueType();

  if (DCI.isBeforeLegalizeOps() ||

  auto &DAG = DCI.DAG;

    LHS = Op->getOperand(1);
    RHS = Op->getOperand(2);
    LHS = Op->getOperand(0);
    RHS = Op->getOperand(1);

  EVT VT = Op.getValueType();
  assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");

  if (Op->isDivergent())

  if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
    return SDValue(
        DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
  if (Op0SignBits >= 33 && Op1SignBits >= 33)
    return SDValue(
        DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
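  // A uniform 64-bit multiply can use the cheaper 32x32->64 forms when the
  // inputs are known small: with >= 32 leading zero bits both operands fit in
  // 32 bits unsigned (S_MUL_U64_U32), and with >= 33 sign bits they fit in 32
  // bits signed (S_MUL_I64_I32); otherwise the generic s_mul_u64 expansion is
  // used.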
  EVT VT = Op.getValueType();
    const APInt &C = RHSC->getAPIntValue();
    if (C.isPowerOf2()) {
      bool UseArithShift = isSigned && !C.isMinSignedValue();

  if (Op->isDivergent()) {

    return lowerTrapEndpgm(Op, DAG);
                                       : lowerTrapHsaQueuePtr(Op, DAG);

SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
                                             ImplicitParameter Param) const {
      loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);

  if (UserSGPR == AMDGPU::NoRegister) {
7154 "debugtrap handler not supported",
7167SDValue SITargetLowering::getSegmentAperture(
unsigned AS,
const SDLoc &
DL,
7171 ? AMDGPU::SRC_SHARED_BASE
7172 : AMDGPU::SRC_PRIVATE_BASE;
7195 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
7204 return loadImplicitKernelArgument(DAG, MVT::i32,
DL,
Align(4), Param);
7210 if (UserSGPR == AMDGPU::NoRegister) {
7240 if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
7241 isa<BasicBlockSDNode>(Val))
7244 if (
auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
7245 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
7259 unsigned DestAS, SrcAS;
7261 bool IsNonNull =
false;
7262 if (
const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(
Op)) {
7263 SrcAS = ASC->getSrcAddressSpace();
7264 Src = ASC->getOperand(0);
7265 DestAS = ASC->getDestAddressSpace();
7268 Op.getConstantOperandVal(0) ==
7269 Intrinsic::amdgcn_addrspacecast_nonnull);
7270 Src =
Op->getOperand(1);
7271 SrcAS =
Op->getConstantOperandVal(2);
7272 DestAS =
Op->getConstantOperandVal(3);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);

    unsigned NullVal = TM.getNullPointerValue(SrcAS);

      Op.getValueType() == MVT::i64) {

      Src.getValueType() == MVT::i64)

  EVT InsVT = Ins.getValueType();
  unsigned IdxVal = Idx->getAsZExtVal();
    assert(InsNumElts % 2 == 0 && "expect legal vector types");
    EVT NewInsVT = InsNumElts == 2 ? MVT::i32
                                                  MVT::i32, InsNumElts / 2);
    for (unsigned I = 0; I != InsNumElts / 2; ++I) {
      if (InsNumElts == 2) {

  for (unsigned I = 0; I != InsNumElts; ++I) {

  auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
  if (NumElts == 4 && EltSize == 16 && KIdx) {
    unsigned Idx = KIdx->getZExtValue();
    bool InsertLo = Idx < 2;

  if (isa<ConstantSDNode>(Idx))

  assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");

  const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
  EVT ResultVT = Op.getValueType();

  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))

  if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
    if (VecSize == 128) {
    } else if (VecSize == 256) {
      for (unsigned P = 0; P < 4; ++P) {
                                Parts[0], Parts[1]));
                                Parts[2], Parts[3]));
      for (unsigned P = 0; P < 8; ++P) {
                                Parts[0], Parts[1], Parts[2], Parts[3]));
                                Parts[4], Parts[5], Parts[6], Parts[7]));

    EVT IdxVT = Idx.getValueType();

  Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);

  if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {

    return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);

  EVT ResultVT = Op.getValueType();
  int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();

    int VecIdx = Idx < SrcNumElts ? 0 : 1;
    int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;

    int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
    int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
    int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
    int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
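  // lowerVECTOR_SHUFFLE walks the mask two elements at a time: a pair of
  // adjacent, even-aligned source elements (the predicate above) can be
  // extracted as one packed 32-bit piece, otherwise the two 16-bit elements
  // are extracted individually; VecIdx/EltIdx select between the two source
  // vectors and rebase the index into the chosen one.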
  EVT ResultVT = Op.getValueType();

  EVT VT = Op.getValueType();
  if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {

  for (unsigned P = 0; P < NumParts; ++P) {
        PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});

  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");

  EVT PtrVT = Op.getValueType();

  assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");

  SDValue Param = lowerKernargMemParameter(
      "non-hsa intrinsic with hsa target",
      "intrinsic not supported on subtarget",

  unsigned NumElts = Elts.size();
  if (NumElts <= 12) {

  for (unsigned i = 0; i < Elts.size(); ++i) {
  for (unsigned i = Elts.size(); i < NumElts; ++i)
    VecElts[i] = DAG.getUNDEF(MVT::f32);
EVT SrcVT = Src.getValueType();

// Image return-value construction helper; parameter fragment:
//   bool Unpacked, bool IsD16, int DMaskPop,
//   int NumVDataDwords, bool IsAtomicPacked16Bit, ...
EVT ReqRetVT = ResultTypes[0];
// ...
int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
                        ? (ReqRetNumElts + 1) / 2
                        : ReqRetNumElts;
int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
// ...
if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
  // ...
}
// ...
if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
  // ... (NumDataDwords - MaskPopDwords)
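// Worked example of the dword-count math above: a D16 image load returning
// four 16-bit elements needs only (4 + 1) / 2 == 2 dwords when the target
// packs two 16-bit values per dword, but 4 dwords when results are "unpacked".
static int numDataDwords(int ReqRetNumElts, bool IsD16, bool Unpacked,
                         bool IsAtomicPacked16Bit) {
  return ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
             ? (ReqRetNumElts + 1) / 2
             : ReqRetNumElts;
}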
7970 EVT LegalReqRetVT = ReqRetVT;
7972 if (!
Data.getValueType().isInteger())
7974 Data.getValueType().changeTypeToInteger(),
Data);
7995 if (Result->getNumValues() == 1)
8002 SDValue *LWE,
bool &IsTexFail) {
8003 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.
getNode());
8022 unsigned DimIdx,
unsigned EndIdx,
8023 unsigned NumGradients) {
8025 for (
unsigned I = DimIdx;
I < EndIdx;
I++) {
8033 if (((
I + 1) >= EndIdx) ||
8034 ((NumGradients / 2) % 2 == 1 && (
I == DimIdx + (NumGradients / 2) - 1 ||
8035 I == DimIdx + NumGradients - 1))) {
8036 if (
Addr.getValueType() != MVT::i16)
8057 unsigned IntrOpcode =
Intr->BaseOpcode;
8068 int NumVDataDwords = 0;
8069 bool AdjustRetType =
false;
8070 bool IsAtomicPacked16Bit =
false;
8073 const unsigned ArgOffset = WithChain ? 2 : 1;
8076 unsigned DMaskLanes = 0;
if (BaseOpcode->Atomic) {
  VData = Op.getOperand(2);
  // ...
  IsAtomicPacked16Bit =
      (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
       Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
  // ...
  if (BaseOpcode->AtomicX2) {
    // ...
    ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
    DMask = Is64Bit ? 0xf : 0x3;
    NumVDataDwords = Is64Bit ? 4 : 2;
  } else {
    DMask = Is64Bit ? 0x3 : 0x1;
    NumVDataDwords = Is64Bit ? 2 : 1;
  }
} else {
  DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
8104 if (BaseOpcode->Store) {
8105 VData =
Op.getOperand(2);
8113 VData = handleD16VData(VData, DAG,
true);
8117 }
else if (!BaseOpcode->NoReturn) {
8130 (!LoadVT.
isVector() && DMaskLanes > 1))
8138 NumVDataDwords = (DMaskLanes + 1) / 2;
8140 NumVDataDwords = DMaskLanes;
8142 AdjustRetType =
true;
8146 unsigned VAddrEnd = ArgOffset +
Intr->VAddrEnd;
8151 Op.getOperand(ArgOffset +
Intr->GradientStart).getSimpleValueType();
8153 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8154 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8156 VAddrVT =
Op.getOperand(ArgOffset +
Intr->CoordStart).getSimpleValueType();
8158 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8159 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8162 for (
unsigned I =
Intr->VAddrStart; I < Intr->GradientStart;
I++) {
8163 if (IsA16 && (
Op.getOperand(ArgOffset +
I).getValueType() == MVT::f16)) {
8164 assert(
I ==
Intr->BiasIndex &&
"Got unexpected 16-bit extra argument");
8169 {
Op.getOperand(ArgOffset +
I), DAG.
getUNDEF(MVT::f16)});
8173 "Bias needs to be converted to 16 bit in A16 mode");
8178 if (BaseOpcode->Gradients && !
ST->hasG16() && (IsA16 != IsG16)) {
8182 dbgs() <<
"Failed to lower image intrinsic: 16 bit addresses "
8183 "require 16 bit args for both gradients and addresses");
8188 if (!
ST->hasA16()) {
8189 LLVM_DEBUG(
dbgs() <<
"Failed to lower image intrinsic: Target does not "
8190 "support 16 bit addresses\n");
8200 if (BaseOpcode->Gradients && IsG16 &&
ST->hasG16()) {
8204 IntrOpcode = G16MappingInfo->
G16;
8212 ArgOffset +
Intr->GradientStart,
8213 ArgOffset +
Intr->CoordStart,
Intr->NumGradients);
8215 for (
unsigned I = ArgOffset +
Intr->GradientStart;
8216 I < ArgOffset + Intr->CoordStart;
I++)
8223 ArgOffset +
Intr->CoordStart, VAddrEnd,
8227 for (
unsigned I = ArgOffset +
Intr->CoordStart;
I < VAddrEnd;
I++)
const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
const bool UseNSA = ST->hasNSAEncoding() &&
                    VAddrs.size() >= ST->getNSAThreshold(MF) &&
                    (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
const bool UsePartialNSA =
    UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
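// Sketch of the NSA (non-sequential address) decision above: NSA is used when
// the target supports it, the address count reaches the profitability
// threshold, and the count either fits the NSA limit or the target can encode
// a partial-NSA form (the trailing addresses are then packed contiguously).
// Parameter names are illustrative.
static bool shouldUseNSA(unsigned NumVAddrs, unsigned NSAThreshold,
                         unsigned NSAMaxSize, bool HasNSA, bool HasPartialNSA) {
  return HasNSA && NumVAddrs >= NSAThreshold &&
         (NumVAddrs <= NSAMaxSize || HasPartialNSA);
}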
8254 if (UsePartialNSA) {
8256 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
8257 }
else if (!UseNSA) {
8264 if (!BaseOpcode->Sampler) {
8268 Op.getConstantOperandVal(ArgOffset +
Intr->UnormIndex);
8270 Unorm = UnormConst ? True : False;
8275 SDValue TexFail =
Op.getOperand(ArgOffset +
Intr->TexFailCtrlIndex);
8276 bool IsTexFail =
false;
8277 if (!
parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
8288 NumVDataDwords += 1;
8289 AdjustRetType =
true;
8294 if (AdjustRetType) {
8297 if (DMaskLanes == 0 && !BaseOpcode->Store) {
8300 if (isa<MemSDNode>(
Op))
8306 MVT::i32, NumVDataDwords)
8309 ResultTypes[0] = NewVT;
8310 if (ResultTypes.size() == 3) {
8314 ResultTypes.erase(&ResultTypes[1]);
8318 unsigned CPol =
Op.getConstantOperandVal(ArgOffset +
Intr->CachePolicyIndex);
8319 if (BaseOpcode->Atomic)
8326 if (BaseOpcode->Store || BaseOpcode->Atomic)
8328 if (UsePartialNSA) {
8337 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
8340 if (BaseOpcode->Sampler) {
8349 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8353 ST->hasFeature(AMDGPU::FeatureR128A16)
8363 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8367 if (BaseOpcode->HasD16)
8369 if (isa<MemSDNode>(
Op))
8372 int NumVAddrDwords =
8378 NumVDataDwords, NumVAddrDwords);
8379 }
else if (IsGFX11Plus) {
8381 UseNSA ? AMDGPU::MIMGEncGfx11NSA
8382 : AMDGPU::MIMGEncGfx11Default,
8383 NumVDataDwords, NumVAddrDwords);
8384 }
else if (IsGFX10Plus) {
8386 UseNSA ? AMDGPU::MIMGEncGfx10NSA
8387 : AMDGPU::MIMGEncGfx10Default,
8388 NumVDataDwords, NumVAddrDwords);
8392 NumVDataDwords, NumVAddrDwords);
8395 "requested image instruction is not supported on this GPU");
8400 NumVDataDwords, NumVAddrDwords);
8403 NumVDataDwords, NumVAddrDwords);
8409 if (
auto *
MemOp = dyn_cast<MemSDNode>(
Op)) {
8414 if (BaseOpcode->AtomicX2) {
8419 if (BaseOpcode->NoReturn)
8423 NumVDataDwords, IsAtomicPacked16Bit,
DL);
8441 if (!
Offset->isDivergent()) {
8486 return handleByteShortBufferLoads(DAG, VT,
DL, Ops, MMO);
unsigned NumLoads = 1;
// ...
if (NumElts == 8 || NumElts == 16) {
  NumLoads = NumElts / 4;
  // ...
}
// ...
setBufferOffsets(Offset, DAG, &Ops[3],
                 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
// ...
for (unsigned i = 0; i < NumLoads; ++i) {
  // ...
}
// ...
if (NumElts == 8 || NumElts == 16)
8562 EVT VT =
Op.getValueType();
8564 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
8568 switch (IntrinsicID) {
8569 case Intrinsic::amdgcn_implicit_buffer_ptr: {
8572 return getPreloadedValue(DAG, *MFI, VT,
8575 case Intrinsic::amdgcn_dispatch_ptr:
8576 case Intrinsic::amdgcn_queue_ptr: {
8579 MF.
getFunction(),
"unsupported hsa intrinsic without hsa target",
8585 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
8588 return getPreloadedValue(DAG, *MFI, VT, RegID);
8590 case Intrinsic::amdgcn_implicitarg_ptr: {
8592 return getImplicitArgPtr(DAG,
DL);
8593 return getPreloadedValue(DAG, *MFI, VT,
8596 case Intrinsic::amdgcn_kernarg_segment_ptr: {
8602 return getPreloadedValue(DAG, *MFI, VT,
8605 case Intrinsic::amdgcn_dispatch_id: {
8608 case Intrinsic::amdgcn_rcp:
8610 case Intrinsic::amdgcn_rsq:
8612 case Intrinsic::amdgcn_rsq_legacy:
8616 case Intrinsic::amdgcn_rcp_legacy:
8620 case Intrinsic::amdgcn_rsq_clamp: {
8634 case Intrinsic::r600_read_ngroups_x:
8638 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8641 case Intrinsic::r600_read_ngroups_y:
8645 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8648 case Intrinsic::r600_read_ngroups_z:
8652 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8655 case Intrinsic::r600_read_global_size_x:
8659 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8662 case Intrinsic::r600_read_global_size_y:
8666 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8669 case Intrinsic::r600_read_global_size_z:
8673 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8676 case Intrinsic::r600_read_local_size_x:
8680 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
8682 case Intrinsic::r600_read_local_size_y:
8686 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
8688 case Intrinsic::r600_read_local_size_z:
8692 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
8694 case Intrinsic::amdgcn_workgroup_id_x:
8695 return getPreloadedValue(DAG, *MFI, VT,
8697 case Intrinsic::amdgcn_workgroup_id_y:
8698 return getPreloadedValue(DAG, *MFI, VT,
8700 case Intrinsic::amdgcn_workgroup_id_z:
8701 return getPreloadedValue(DAG, *MFI, VT,
8703 case Intrinsic::amdgcn_wave_id:
8704 return lowerWaveID(DAG,
Op);
8705 case Intrinsic::amdgcn_lds_kernel_id: {
8707 return getLDSKernelId(DAG,
DL);
8708 return getPreloadedValue(DAG, *MFI, VT,
8711 case Intrinsic::amdgcn_workitem_id_x:
8712 return lowerWorkitemID(DAG,
Op, 0, MFI->getArgInfo().WorkItemIDX);
8713 case Intrinsic::amdgcn_workitem_id_y:
8714 return lowerWorkitemID(DAG,
Op, 1, MFI->getArgInfo().WorkItemIDY);
8715 case Intrinsic::amdgcn_workitem_id_z:
8716 return lowerWorkitemID(DAG,
Op, 2, MFI->getArgInfo().WorkItemIDZ);
8717 case Intrinsic::amdgcn_wavefrontsize:
8720 case Intrinsic::amdgcn_s_buffer_load: {
8721 unsigned CPol =
Op.getConstantOperandVal(3);
8728 return lowerSBuffer(VT,
DL,
Op.getOperand(1),
Op.getOperand(2),
8729 Op.getOperand(3), DAG);
8731 case Intrinsic::amdgcn_fdiv_fast:
8732 return lowerFDIV_FAST(
Op, DAG);
8733 case Intrinsic::amdgcn_sin:
8736 case Intrinsic::amdgcn_cos:
8739 case Intrinsic::amdgcn_mul_u24:
8742 case Intrinsic::amdgcn_mul_i24:
8746 case Intrinsic::amdgcn_log_clamp: {
8752 case Intrinsic::amdgcn_fract:
8755 case Intrinsic::amdgcn_class:
8758 case Intrinsic::amdgcn_div_fmas:
8760 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
8762 case Intrinsic::amdgcn_div_fixup:
8764 Op.getOperand(2),
Op.getOperand(3));
8766 case Intrinsic::amdgcn_div_scale: {
8779 SDValue Src0 =
Param->isAllOnes() ? Numerator : Denominator;
8782 Denominator, Numerator);
8784 case Intrinsic::amdgcn_icmp: {
8786 if (
Op.getOperand(1).getValueType() == MVT::i1 &&
8787 Op.getConstantOperandVal(2) == 0 &&
8792 case Intrinsic::amdgcn_fcmp: {
8795 case Intrinsic::amdgcn_ballot:
8797 case Intrinsic::amdgcn_fmed3:
8799 Op.getOperand(2),
Op.getOperand(3));
8800 case Intrinsic::amdgcn_fdot2:
8802 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
8803 case Intrinsic::amdgcn_fmul_legacy:
8806 case Intrinsic::amdgcn_sffbh:
8808 case Intrinsic::amdgcn_sbfe:
8810 Op.getOperand(2),
Op.getOperand(3));
8811 case Intrinsic::amdgcn_ubfe:
8813 Op.getOperand(2),
Op.getOperand(3));
8814 case Intrinsic::amdgcn_cvt_pkrtz:
8815 case Intrinsic::amdgcn_cvt_pknorm_i16:
8816 case Intrinsic::amdgcn_cvt_pknorm_u16:
8817 case Intrinsic::amdgcn_cvt_pk_i16:
8818 case Intrinsic::amdgcn_cvt_pk_u16: {
8820 EVT VT =
Op.getValueType();
8823 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8825 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8827 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8829 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8835 return DAG.
getNode(Opcode,
DL, VT,
Op.getOperand(1),
Op.getOperand(2));
8838 DAG.
getNode(Opcode,
DL, MVT::i32,
Op.getOperand(1),
Op.getOperand(2));
8841 case Intrinsic::amdgcn_fmad_ftz:
8843 Op.getOperand(2),
Op.getOperand(3));
8845 case Intrinsic::amdgcn_if_break:
8847 Op->getOperand(1),
Op->getOperand(2)),
8850 case Intrinsic::amdgcn_groupstaticsize: {
8862 case Intrinsic::amdgcn_is_shared:
8863 case Intrinsic::amdgcn_is_private: {
8865 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
8868 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
8876 case Intrinsic::amdgcn_perm:
8878 Op.getOperand(2),
Op.getOperand(3));
8879 case Intrinsic::amdgcn_reloc_constant: {
8883 auto *RelocSymbol = cast<GlobalVariable>(
8889 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8890 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8891 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8892 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8893 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8894 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8895 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8896 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8897 if (
Op.getOperand(4).getValueType() == MVT::i32)
8903 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
8904 Op.getOperand(3), IndexKeyi32);
8906 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8907 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8908 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8909 if (
Op.getOperand(6).getValueType() == MVT::i32)
8915 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8916 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
8917 IndexKeyi32, Op.getOperand(7)});
8919 case Intrinsic::amdgcn_addrspacecast_nonnull:
8920 return lowerADDRSPACECAST(
Op, DAG);
8921 case Intrinsic::amdgcn_readlane:
8922 case Intrinsic::amdgcn_readfirstlane:
8923 case Intrinsic::amdgcn_writelane:
8924 case Intrinsic::amdgcn_permlane16:
8925 case Intrinsic::amdgcn_permlanex16:
8926 case Intrinsic::amdgcn_permlane64:
8927 case Intrinsic::amdgcn_set_inactive:
8928 case Intrinsic::amdgcn_set_inactive_chain_arg:
8929 case Intrinsic::amdgcn_mov_dpp8:
8930 case Intrinsic::amdgcn_update_dpp:
8935 return lowerImage(
Op, ImageDimIntr, DAG,
false);
8946 return DAG.
getRegister(AMDGPU::SGPR_NULL, MVT::i32);
8952 unsigned NewOpcode)
const {
8956 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
8957 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
8971 auto *
M = cast<MemSDNode>(
Op);
8975 M->getMemOperand());
8980 unsigned NewOpcode)
const {
8984 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
8985 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
8999 auto *
M = cast<MemSDNode>(
Op);
9003 M->getMemOperand());
9008 unsigned IntrID =
Op.getConstantOperandVal(1);
9012 case Intrinsic::amdgcn_ds_ordered_add:
9013 case Intrinsic::amdgcn_ds_ordered_swap: {
unsigned IndexOperand = M->getConstantOperandVal(7);
unsigned WaveRelease = M->getConstantOperandVal(8);
unsigned WaveDone = M->getConstantOperandVal(9);

unsigned OrderedCountIndex = IndexOperand & 0x3f;
IndexOperand &= ~0x3f;
unsigned CountDw = 0;
// ... (gfx10+ carries the dword count in the index operand)
CountDw = (IndexOperand >> 24) & 0xf;
IndexOperand &= ~(0xf << 24);
if (CountDw < 1 || CountDw > 4) {
  report_fatal_error(
      "ds_ordered_count: dword count must be between 1 and 4");
}
// ...
if (WaveDone && !WaveRelease) {
  // ... (wave_done without wave_release is rejected)
}

unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
unsigned ShaderType = /* ... */;
unsigned Offset0 = OrderedCountIndex << 2;
unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
// ...
Offset1 |= (CountDw - 1) << 6;
// ...
Offset1 |= ShaderType << 2;
unsigned Offset = Offset0 | (Offset1 << 8);
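// Worked example of the ds_ordered_count offset encoding built above. Field
// positions follow the shifts in the code (descriptive, not normative):
//   offset0[7:2] = ordered count index
//   offset1[0]   = wave_release, offset1[1] = wave_done
//   offset1[3:2] = shader type,  offset1[4] = instruction (add=0, swap=1)
//   offset1[7:6] = dword count - 1 (gfx10+)
static unsigned packOrderedCountOffset(unsigned CountIndex, bool WaveRelease,
                                       bool WaveDone, unsigned ShaderType,
                                       unsigned Instruction, unsigned CountDw) {
  unsigned Offset0 = CountIndex << 2;
  unsigned Offset1 = (WaveRelease ? 1 : 0) | ((WaveDone ? 1 : 0) << 1) |
                     (ShaderType << 2) | (Instruction << 4) |
                     ((CountDw - 1) << 6);
  return Offset0 | (Offset1 << 8);
}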
9061 M->getVTList(), Ops,
M->getMemoryVT(),
9062 M->getMemOperand());
9064 case Intrinsic::amdgcn_raw_buffer_load:
9065 case Intrinsic::amdgcn_raw_ptr_buffer_load:
9066 case Intrinsic::amdgcn_raw_atomic_buffer_load:
9067 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
9068 case Intrinsic::amdgcn_raw_buffer_load_format:
9069 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
9070 const bool IsFormat =
9071 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
9072 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
9074 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9075 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
9088 auto *
M = cast<MemSDNode>(
Op);
9089 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
9091 case Intrinsic::amdgcn_struct_buffer_load:
9092 case Intrinsic::amdgcn_struct_ptr_buffer_load:
9093 case Intrinsic::amdgcn_struct_buffer_load_format:
9094 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
9095 case Intrinsic::amdgcn_struct_atomic_buffer_load:
9096 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
9097 const bool IsFormat =
9098 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
9099 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
9101 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9102 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
9115 return lowerIntrinsicLoad(cast<MemSDNode>(
Op), IsFormat, DAG, Ops);
9117 case Intrinsic::amdgcn_raw_tbuffer_load:
9118 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
9120 EVT LoadVT =
Op.getValueType();
9121 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9122 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
9141 Op->getVTList(), Ops, LoadVT,
M->getMemOperand(),
9144 case Intrinsic::amdgcn_struct_tbuffer_load:
9145 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
9147 EVT LoadVT =
Op.getValueType();
9148 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9149 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
9168 Op->getVTList(), Ops, LoadVT,
M->getMemOperand(),
9171 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9172 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9174 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9175 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9176 return lowerStructBufferAtomicIntrin(
Op, DAG,
9178 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9179 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9181 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9182 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9183 return lowerStructBufferAtomicIntrin(
Op, DAG,
9185 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9186 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9188 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9189 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9190 return lowerStructBufferAtomicIntrin(
Op, DAG,
9192 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9193 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9195 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9196 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9198 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9199 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9201 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9202 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9204 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9205 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9207 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9208 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9210 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9211 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9213 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9214 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9216 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9217 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9219 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9220 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9222 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9223 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9225 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9226 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9228 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9229 return lowerRawBufferAtomicIntrin(
Op, DAG,
9231 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9232 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9233 return lowerStructBufferAtomicIntrin(
Op, DAG,
9235 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9236 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9238 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9239 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9241 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9242 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9243 return lowerStructBufferAtomicIntrin(
Op, DAG,
9245 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9246 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9247 return lowerStructBufferAtomicIntrin(
Op, DAG,
9249 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9250 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9251 return lowerStructBufferAtomicIntrin(
Op, DAG,
9253 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9254 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9255 return lowerStructBufferAtomicIntrin(
Op, DAG,
9257 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9258 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9260 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9261 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9263 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9264 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9266 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9267 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9269 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9270 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9272 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9273 return lowerStructBufferAtomicIntrin(
Op, DAG,
9276 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9277 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9278 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(4), DAG);
9279 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
9293 EVT VT =
Op.getValueType();
9294 auto *
M = cast<MemSDNode>(
Op);
9297 Op->getVTList(), Ops, VT,
9298 M->getMemOperand());
9300 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9301 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9302 SDValue Rsrc = bufferRsrcPtrToVector(
Op->getOperand(4), DAG);
9303 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(6), DAG);
9317 EVT VT =
Op.getValueType();
9318 auto *
M = cast<MemSDNode>(
Op);
9321 Op->getVTList(), Ops, VT,
9322 M->getMemOperand());
9324 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9326 SDValue NodePtr =
M->getOperand(2);
9327 SDValue RayExtent =
M->getOperand(3);
9328 SDValue RayOrigin =
M->getOperand(4);
9330 SDValue RayInvDir =
M->getOperand(6);
9348 const unsigned NumVDataDwords = 4;
9349 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9350 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9354 const unsigned BaseOpcodes[2][2] = {
9355 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9356 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9357 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9361 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9362 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9363 : AMDGPU::MIMGEncGfx10NSA,
9364 NumVDataDwords, NumVAddrDwords);
9368 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9369 : AMDGPU::MIMGEncGfx10Default,
9370 NumVDataDwords, NumVAddrDwords);
9376 auto packLanes = [&DAG, &Ops, &
DL](
SDValue Op,
bool IsAligned) {
9379 if (Lanes[0].getValueSizeInBits() == 32) {
9380 for (
unsigned I = 0;
I < 3; ++
I)
9399 if (UseNSA && IsGFX11Plus) {
9407 for (
unsigned I = 0;
I < 3; ++
I) {
9410 {DirLanes[I], InvDirLanes[I]})));
9425 packLanes(RayOrigin,
true);
9426 packLanes(RayDir,
true);
9427 packLanes(RayInvDir,
false);
9432 if (NumVAddrDwords > 12) {
9452 case Intrinsic::amdgcn_global_atomic_fmin_num:
9453 case Intrinsic::amdgcn_global_atomic_fmax_num:
9454 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9455 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9462 unsigned Opcode = 0;
9464 case Intrinsic::amdgcn_global_atomic_fmin_num:
9465 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9469 case Intrinsic::amdgcn_global_atomic_fmax_num:
9470 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9478 Ops,
M->getMemOperand());
9480 case Intrinsic::amdgcn_s_get_barrier_state:
9481 case Intrinsic::amdgcn_s_get_named_barrier_state: {
9486 if (isa<ConstantSDNode>(
Op->getOperand(2))) {
9487 uint64_t BarID = cast<ConstantSDNode>(
Op->getOperand(2))->getZExtValue();
9488 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
9489 BarID = (BarID >> 4) & 0x3F;
9490 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9495 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9496 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
9516 return lowerImage(
Op, ImageDimIntr, DAG,
true);
9524SDValue SITargetLowering::getMemIntrinsicNode(
unsigned Opcode,
const SDLoc &
DL,
9534 bool IsTFE = VTList.
NumVTs == 3;
9537 unsigned NumOpDWords = NumValueDWords + 1;
9542 SDValue Op = getMemIntrinsicNode(Opcode,
DL, OpDWordsVTList, Ops,
9543 OpDWordsVT, OpDWordsMMO, DAG);
9558 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9564 WidenedMemVT, WidenedMMO);
9574 bool ImageStore)
const {
9609 for (
unsigned I = 0;
I < Elts.
size() / 2;
I += 1) {
9615 if ((NumElements % 2) == 1) {
9617 unsigned I = Elts.
size() / 2;
9633 if (NumElements == 3) {
9654 unsigned IntrinsicID =
Op.getConstantOperandVal(1);
9657 switch (IntrinsicID) {
9658 case Intrinsic::amdgcn_exp_compr: {
9662 "intrinsic not supported on subtarget",
DL.getDebugLoc());
9685 unsigned Opc =
Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9688 case Intrinsic::amdgcn_s_barrier:
9689 case Intrinsic::amdgcn_s_barrier_signal:
9690 case Intrinsic::amdgcn_s_barrier_wait: {
9693 unsigned WGSize =
ST.getFlatWorkGroupSizes(MF.
getFunction()).second;
9694 if (WGSize <=
ST.getWavefrontSize()) {
9697 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal)
9698 return Op.getOperand(0);
9701 MVT::Other,
Op.getOperand(0)),
9706 if (
ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
9712 MVT::Other, K,
Op.getOperand(0)),
9724 case Intrinsic::amdgcn_struct_tbuffer_store:
9725 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9729 VData = handleD16VData(VData, DAG);
9730 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9731 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
9749 M->getMemoryVT(),
M->getMemOperand());
9752 case Intrinsic::amdgcn_raw_tbuffer_store:
9753 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9757 VData = handleD16VData(VData, DAG);
9758 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9759 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
9777 M->getMemoryVT(),
M->getMemOperand());
9780 case Intrinsic::amdgcn_raw_buffer_store:
9781 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9782 case Intrinsic::amdgcn_raw_buffer_store_format:
9783 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9784 const bool IsFormat =
9785 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9786 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9793 VData = handleD16VData(VData, DAG);
9803 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9804 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
9824 return handleByteShortBufferStores(DAG, VDataVT,
DL, Ops, M);
9827 M->getMemoryVT(),
M->getMemOperand());
9830 case Intrinsic::amdgcn_struct_buffer_store:
9831 case Intrinsic::amdgcn_struct_ptr_buffer_store:
9832 case Intrinsic::amdgcn_struct_buffer_store_format:
9833 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9834 const bool IsFormat =
9835 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9836 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9844 VData = handleD16VData(VData, DAG);
9854 auto Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9855 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
9876 return handleByteShortBufferStores(DAG, VDataType,
DL, Ops, M);
9879 M->getMemoryVT(),
M->getMemOperand());
9881 case Intrinsic::amdgcn_raw_buffer_load_lds:
9882 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9883 case Intrinsic::amdgcn_struct_buffer_load_lds:
9884 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9888 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9889 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9890 unsigned OpOffset = HasVIndex ? 1 : 0;
9891 SDValue VOffset =
Op.getOperand(5 + OpOffset);
9893 unsigned Size =
Op->getConstantOperandVal(4);
9899 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9900 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9901 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9902 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9905 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9906 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9907 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9908 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9911 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9912 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9913 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9914 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
9919 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
9920 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
9921 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
9922 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
9927 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
9928 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
9929 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
9930 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
9938 if (HasVIndex && HasVOffset)
9944 else if (HasVOffset)
9947 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9952 unsigned Aux =
Op.getConstantOperandVal(8 + OpOffset);
9964 auto *
M = cast<MemSDNode>(
Op);
9991 case Intrinsic::amdgcn_global_load_lds: {
9993 unsigned Size =
Op->getConstantOperandVal(4);
9998 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
10001 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
10004 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
10009 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
10014 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
10018 auto *
M = cast<MemSDNode>(
Op);
10031 if (
LHS->isDivergent())
10035 RHS.getOperand(0).getValueType() == MVT::i32) {
10038 VOffset =
RHS.getOperand(0);
10043 if (!
Addr->isDivergent()) {
10060 LoadPtrI.
Offset =
Op->getConstantOperandVal(5);
10080 case Intrinsic::amdgcn_end_cf:
10082 Op->getOperand(2), Chain),
10084 case Intrinsic::amdgcn_s_barrier_init:
10085 case Intrinsic::amdgcn_s_barrier_signal_var: {
10092 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
10093 ? AMDGPU::S_BARRIER_INIT_M0
10094 : AMDGPU::S_BARRIER_SIGNAL_M0;
10109 constexpr unsigned ShAmt = 16;
10121 case Intrinsic::amdgcn_s_barrier_join: {
10128 if (isa<ConstantSDNode>(BarOp)) {
10129 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
10130 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
10133 unsigned BarID = (BarVal >> 4) & 0x3F;
10138 Opc = AMDGPU::S_BARRIER_JOIN_M0;
10154 case Intrinsic::amdgcn_s_prefetch_data: {
10157 return Op.getOperand(0);
10160 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
10162 Chain, bufferRsrcPtrToVector(
Op.getOperand(2), DAG),
10169 Op->getVTList(), Ops,
M->getMemoryVT(),
10170 M->getMemOperand());
10175 return lowerImage(
Op, ImageDimIntr, DAG,
true);
std::pair<SDValue, SDValue>
SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
  // ...
  if ((C1 = dyn_cast<ConstantSDNode>(N0))) {
    // ...
  } else /* base plus constant offset */ {
    C1 = cast<ConstantSDNode>(N0.getOperand(1));
    // ...
  }
  // ...
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    // ...
  }
  // ...
  auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
  // ...
  SDValue Ops[] = {N0, OverflowVal};
  // ...
}
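// Sketch of the split performed above, assuming MaxImm is a low-bit mask of
// the form 2^N - 1 (e.g. 4095 for a 12-bit MUBUF immediate): the bits that
// fit the immediate field stay in ImmOffset, the remainder becomes the
// register ("overflow") part.
static void splitImmOffset(unsigned Combined, unsigned MaxImm,
                           unsigned &ImmOffset, unsigned &Overflow) {
  ImmOffset = Combined & MaxImm;  // low bits -> immediate field
  Overflow = Combined & ~MaxImm;  // high bits -> register offset
}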
10239void SITargetLowering::setBufferOffsets(
SDValue CombinedOffset,
10241 Align Alignment)
const {
10244 if (
auto *
C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
10247 if (
TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10258 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10260 TII->splitMUBUFOffset(
Offset, SOffset, ImmOffset, Alignment)) {
SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
                                                SelectionDAG &DAG) const {
  // ...
  return MaybePointer;
  // ...
}

// Building a buffer resource descriptor from pointer + stride + num-records
// + flags:
SDValue NumRecords = Op->getOperand(3);
// ...
auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
// ...
std::optional<uint32_t> ConstStride = std::nullopt;
if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
  ConstStride = ConstNode->getZExtValue();
// ...
if (!ConstStride || *ConstStride != 0) {
  SDValue ShiftedStride;
  if (ConstStride)
    ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
  // ...
}
// ... (NewHighHalf, NumRecords, Flags are assembled into the descriptor)
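// Worked example of the high-dword packing above: the stride lands in the
// upper bits of the pointer's high half via the << 16 shift. Masking the
// pointer's high half to its low 16 bits first is an assumption based on the
// surrounding lowering, not visible in the fragment above.
#include <cstdint>

static uint32_t packRsrcHighHalf(uint32_t PointerHi, uint32_t Stride) {
  return (PointerHi & 0x0000ffffu) | (Stride << 16); // "NewHighHalf"
}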
10329 bool IsTFE)
const {
10339 SDValue Op = getMemIntrinsicNode(Opc,
DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
10367 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
10371 Ops[1] = BufferStoreExt;
10376 M->getMemOperand());
10401 DAGCombinerInfo &DCI)
const {
10417 if ((MemVT.
isSimple() && !DCI.isAfterLegalizeDAG()) ||
10424 "unexpected vector extload");
10437 "unexpected fp extload");
10455 DCI.AddToWorklist(Cvt.
getNode());
10460 DCI.AddToWorklist(Cvt.
getNode());
10471 if (
Info.isEntryFunction())
10472 return Info.getUserSGPRInfo().hasFlatScratchInit();
10480 EVT MemVT =
Load->getMemoryVT();
10493 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10521 assert(
Op.getValueType().getVectorElementType() == MVT::i32 &&
10522 "Custom lowering for non-i32 vectors hasn't been implemented.");
10525 unsigned AS =
Load->getAddressSpace();
10549 Alignment >=
Align(4) && NumElements < 32) {
10563 if (NumElements > 4)
10582 if (NumElements > 2)
10587 if (NumElements > 4)
10599 auto Flags =
Load->getMemOperand()->getFlags();
10601 Load->getAlign(), Flags, &
Fast) &&
10610 MemVT, *
Load->getMemOperand())) {
10619 EVT VT =
Op.getValueType();
10656 EVT VT =
Op.getValueType();
10659 bool AllowInaccurateRcp =
10666 if (!AllowInaccurateRcp && VT != MVT::f16)
10669 if (CLHS->isExactlyValue(1.0)) {
10686 if (CLHS->isExactlyValue(-1.0)) {
10695 if (!AllowInaccurateRcp && (VT != MVT::f16 || !
Flags.hasAllowReciprocal()))
10709 EVT VT =
Op.getValueType();
10712 bool AllowInaccurateDiv =
10714 if (!AllowInaccurateDiv)
10735 return DAG.
getNode(Opcode, SL, VT,
A,
B, Flags);
10749 return DAG.
getNode(Opcode, SL, VTList,
10758 return DAG.
getNode(Opcode, SL, VT, {
A,
B,
C}, Flags);
10772 return DAG.
getNode(Opcode, SL, VTList,
10778 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
10779 return FastLowered;
10799 unsigned FMADOpCode =
10809 SDValue Err = DAG.
getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
10811 Quot = DAG.
getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot,
Op->getFlags());
10812 Err = DAG.
getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
const APFloat K0Val(0x1p+96f);
// ...
const APFloat K1Val(0x1p-32f);
// ...
assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
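// Worked example of the mode value built above: a sketch assuming the layout
// the code implies, with the requested single-precision denormal mode in
// bits [1:0] and the prevailing double/half default in bits [3:2].
static unsigned packDenormMode(unsigned SPDenormMode,
                               unsigned DPDenormModeDefault) {
  return SPDenormMode | (DPDenormModeDefault << 2);
}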
10872 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
10873 return FastLowered;
10880 Flags.setNoFPExcept(
true);
10901 using namespace AMDGPU::Hwreg;
10902 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
10910 const bool HasDynamicDenormals =
10916 if (!PreservesDenormals) {
10924 if (HasDynamicDenormals) {
10928 SavedDenormMode =
SDValue(GetReg, 0);
10936 const SDValue EnableDenormValue =
10943 const SDValue EnableDenormValue =
10945 EnableDenorm = DAG.
getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
10946 {EnableDenormValue,
BitField, Glue});
10956 ApproxRcp, One, NegDivScale0, Flags);
10959 ApproxRcp, Fma0, Flags);
10965 NumeratorScaled,
Mul, Flags);
10971 NumeratorScaled, Fma3, Flags);
10973 if (!PreservesDenormals) {
10981 DisableDenormValue, Fma4.
getValue(2))
10984 assert(HasDynamicDenormals == (
bool)SavedDenormMode);
10985 const SDValue DisableDenormValue =
10986 HasDynamicDenormals
10991 AMDGPU::S_SETREG_B32, SL, MVT::Other,
11002 {Fma4, Fma1, Fma3, Scale},
Flags);
11008 if (
SDValue FastLowered = lowerFastUnsafeFDIV64(
Op, DAG))
11009 return FastLowered;
11077 EVT VT =
Op.getValueType();
11079 if (VT == MVT::f32)
11080 return LowerFDIV32(
Op, DAG);
11082 if (VT == MVT::f64)
11083 return LowerFDIV64(
Op, DAG);
11085 if (VT == MVT::f16)
11086 return LowerFDIV16(
Op, DAG);
11095 EVT ResultExpVT =
Op->getValueType(1);
11096 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
11126 if (VT == MVT::i1) {
11130 Store->getBasePtr(), MVT::i1,
Store->getMemOperand());
11134 Store->getValue().getValueType().getScalarType() == MVT::i32);
11136 unsigned AS =
Store->getAddressSpace();
11155 if (NumElements > 4)
11162 VT, *
Store->getMemOperand()))
11172 if (NumElements > 2)
11176 if (NumElements > 4 ||
11185 auto Flags =
Store->getMemOperand()->getFlags();
11220 MVT VT =
Op.getValueType().getSimpleVT();
11391 EVT VT =
Op.getValueType();
11408 switch (
Op.getOpcode()) {
11435 EVT VT =
Op.getValueType();
11443 Op->getVTList(), Ops, VT,
11452SITargetLowering::performUCharToFloatCombine(
SDNode *
N,
11453 DAGCombinerInfo &DCI)
const {
11454 EVT VT =
N->getValueType(0);
11456 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11463 EVT SrcVT = Src.getValueType();
11469 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11472 DCI.AddToWorklist(Cvt.
getNode());
11475 if (ScalarVT != MVT::f32) {
11487 DAGCombinerInfo &DCI)
const {
11488 SDValue MagnitudeOp =
N->getOperand(0);
11489 SDValue SignOp =
N->getOperand(1);
11545SDValue SITargetLowering::performSHLPtrCombine(
SDNode *
N,
unsigned AddrSpace,
11547 DAGCombinerInfo &DCI)
const {
11577 AM.HasBaseReg =
true;
11578 AM.BaseOffs =
Offset.getSExtValue();
11583 EVT VT =
N->getValueType(0);
11589 Flags.setNoUnsignedWrap(
11590 N->getFlags().hasNoUnsignedWrap() &&
11600 switch (
N->getOpcode()) {
11611 DAGCombinerInfo &DCI)
const {
11620 SDValue NewPtr = performSHLPtrCombine(
Ptr.getNode(),
N->getAddressSpace(),
11621 N->getMemoryVT(), DCI);
11625 NewOps[PtrIdx] = NewPtr;
11634 return (Opc ==
ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11635 (Opc ==
ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11644SDValue SITargetLowering::splitBinaryBitConstantOp(
11645 DAGCombinerInfo &DCI,
const SDLoc &SL,
unsigned Opc,
SDValue LHS,
11665 if (V.getValueType() != MVT::i1)
11667 switch (V.getOpcode()) {
11686 if (!(
C & 0x000000ff))
11687 ZeroByteMask |= 0x000000ff;
11688 if (!(
C & 0x0000ff00))
11689 ZeroByteMask |= 0x0000ff00;
11690 if (!(
C & 0x00ff0000))
11691 ZeroByteMask |= 0x00ff0000;
11692 if (!(
C & 0xff000000))
11693 ZeroByteMask |= 0xff000000;
11694 uint32_t NonZeroByteMask = ~ZeroByteMask;
11695 if ((NonZeroByteMask &
C) != NonZeroByteMask)
11708 assert(V.getValueSizeInBits() == 32);
11710 if (V.getNumOperands() != 2)
11719 switch (V.getOpcode()) {
11724 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
11729 return (0x03020100 & ~ConstMask) | ConstMask;
11736 return uint32_t((0x030201000c0c0c0cull <<
C) >> 32);
11742 return uint32_t(0x0c0c0c0c03020100ull >>
C);
11749 DAGCombinerInfo &DCI)
const {
11750 if (DCI.isBeforeLegalize())
11754 EVT VT =
N->getValueType(0);
11759 if (VT == MVT::i64 && CRHS) {
11765 if (CRHS && VT == MVT::i32) {
11774 if (
auto *CShift = dyn_cast<ConstantSDNode>(
LHS->getOperand(1))) {
11775 unsigned Shift = CShift->getZExtValue();
11777 unsigned Offset = NB + Shift;
11778 if ((
Offset & (Bits - 1)) == 0) {
11796 isa<ConstantSDNode>(
LHS.getOperand(2))) {
11802 Sel = (
LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
11817 if (
Y.getOpcode() !=
ISD::FABS ||
Y.getOperand(0) !=
X ||
11822 if (
X !=
LHS.getOperand(1))
11827 dyn_cast<ConstantFPSDNode>(
RHS.getOperand(1));
11860 (
RHS.getOperand(0) ==
LHS.getOperand(0) &&
11861 LHS.getOperand(0) ==
LHS.getOperand(1))) {
11863 unsigned NewMask = LCC ==
ISD::SETO ?
Mask->getZExtValue() & ~OrdMask
11864 :
Mask->getZExtValue() & OrdMask;
if (/* ... */ N->isDivergent() &&
    TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
  // ...
  if (LHSMask != ~0u && RHSMask != ~0u) {
    // ...
    if (LHSMask > RHSMask) {
      // ...
    }
    // ...
    uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
    uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
    // ...
    if (!(LHSUsedLanes & RHSUsedLanes) &&
        // ...
        !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
      // ...
      for (unsigned I = 0; I < 32; I += 8) {
        // ...
        if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
          Mask &= (0x0c << I) & 0xffffffff;
      }
    }
  }
}
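// Software model of the V_PERM_B32 byte select that the combine above builds
// masks for: each selector byte in the range 0-3 picks a byte of Src1, 4-7
// picks a byte of Src0, and 0x0c is treated here as "constant zero" (matching
// the 0x0c0c0c0c masks above). The real instruction defines further special
// selector values that this sketch ignores.
#include <cstdint>

static uint32_t permB32(uint32_t Sel, uint32_t Src0, uint32_t Src1) {
  uint64_t Bytes = (uint64_t(Src0) << 32) | Src1; // bytes 0..3 = Src1, 4..7 = Src0
  uint32_t Result = 0;
  for (unsigned I = 0; I < 4; ++I) {
    uint32_t S = (Sel >> (8 * I)) & 0xff;
    uint32_t B = (S == 0x0c) ? 0 : uint32_t((Bytes >> (8 * (S & 7))) & 0xff);
    Result |= B << (8 * I);
  }
  return Result;
}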
11973static const std::optional<ByteProvider<SDValue>>
11975 unsigned Depth = 0) {
11978 return std::nullopt;
11980 if (
Op.getValueSizeInBits() < 8)
11981 return std::nullopt;
11983 if (
Op.getValueType().isVector())
11986 switch (
Op->getOpcode()) {
11997 auto *VTSign = cast<VTSDNode>(
Op->getOperand(1));
11998 NarrowVT = VTSign->getVT();
12001 return std::nullopt;
12004 if (SrcIndex >= NarrowByteWidth)
12005 return std::nullopt;
12011 auto *ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12013 return std::nullopt;
12015 uint64_t BitShift = ShiftOp->getZExtValue();
12017 if (BitShift % 8 != 0)
12018 return std::nullopt;
12020 SrcIndex += BitShift / 8;
12038static const std::optional<ByteProvider<SDValue>>
12040 unsigned StartingIndex = 0) {
12044 return std::nullopt;
12046 unsigned BitWidth =
Op.getScalarValueSizeInBits();
12048 return std::nullopt;
12050 return std::nullopt;
12052 bool IsVec =
Op.getValueType().isVector();
12053 switch (
Op.getOpcode()) {
12056 return std::nullopt;
12061 return std::nullopt;
12065 return std::nullopt;
12068 if (!
LHS->isConstantZero() && !
RHS->isConstantZero())
12069 return std::nullopt;
12070 if (!
LHS ||
LHS->isConstantZero())
12072 if (!
RHS ||
RHS->isConstantZero())
12074 return std::nullopt;
12079 return std::nullopt;
12081 auto *BitMaskOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12083 return std::nullopt;
12085 uint32_t BitMask = BitMaskOp->getZExtValue();
12087 uint32_t IndexMask = 0xFF << (Index * 8);
12089 if ((IndexMask & BitMask) != IndexMask) {
12092 if (IndexMask & BitMask)
12093 return std::nullopt;
12102 return std::nullopt;
12105 auto *ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(2));
12106 if (!ShiftOp ||
Op.getValueType().isVector())
12107 return std::nullopt;
uint64_t BitsProvided = Op.getValueSizeInBits();
if (BitsProvided % 8 != 0)
  return std::nullopt;

uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
if (/* ... */)
  return std::nullopt;

uint64_t ConcatSizeInBytes = BitsProvided / 4;
uint64_t ByteShift = BitShift / 8;

uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
uint64_t BytesProvided = BitsProvided / 8;
SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
NewIndex %= BytesProvided;
12130 return std::nullopt;
12132 auto *ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12134 return std::nullopt;
12136 uint64_t BitShift = ShiftOp->getZExtValue();
12138 return std::nullopt;
12140 auto BitsProvided =
Op.getScalarValueSizeInBits();
12141 if (BitsProvided % 8 != 0)
12142 return std::nullopt;
12144 uint64_t BytesProvided = BitsProvided / 8;
12145 uint64_t ByteShift = BitShift / 8;
12150 return BytesProvided - ByteShift > Index
12158 return std::nullopt;
12160 auto *ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12162 return std::nullopt;
12164 uint64_t BitShift = ShiftOp->getZExtValue();
12165 if (BitShift % 8 != 0)
12166 return std::nullopt;
12167 uint64_t ByteShift = BitShift / 8;
12173 return Index < ByteShift
12176 Depth + 1, StartingIndex);
12185 return std::nullopt;
12192 auto *VTSign = cast<VTSDNode>(
Op->getOperand(1));
12193 NarrowBitWidth = VTSign->getVT().getSizeInBits();
12195 if (NarrowBitWidth % 8 != 0)
12196 return std::nullopt;
12197 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12199 if (Index >= NarrowByteWidth)
12201 ? std::optional<ByteProvider<SDValue>>(
12209 return std::nullopt;
12213 if (NarrowByteWidth >= Index) {
12218 return std::nullopt;
12225 return std::nullopt;
12229 auto *L = cast<LoadSDNode>(
Op.getNode());
12231 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
12232 if (NarrowBitWidth % 8 != 0)
12233 return std::nullopt;
12234 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12239 if (Index >= NarrowByteWidth) {
12241 ? std::optional<ByteProvider<SDValue>>(
12246 if (NarrowByteWidth > Index) {
12250 return std::nullopt;
12255 return std::nullopt;
12258 Depth + 1, StartingIndex);
12262 auto *IdxOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12264 return std::nullopt;
12265 auto VecIdx = IdxOp->getZExtValue();
12266 auto ScalarSize =
Op.getScalarValueSizeInBits();
12267 if (ScalarSize < 32)
12268 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
12270 StartingIndex, Index);
12275 return std::nullopt;
12277 auto *PermMask = dyn_cast<ConstantSDNode>(
Op->getOperand(2));
12279 return std::nullopt;
12282 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
12283 if (IdxMask > 0x07 && IdxMask != 0x0c)
12284 return std::nullopt;
12286 auto NextOp =
Op.getOperand(IdxMask > 0x03 ? 0 : 1);
12287 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12289 return IdxMask != 0x0c ?
calculateSrcByte(NextOp, StartingIndex, NextIndex)
12295 return std::nullopt;
12310 return !OpVT.
isVector() && OpVT.getSizeInBits() == 16;
12314 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12317 auto MemVT = L->getMemoryVT();
12320 return L->getMemoryVT().getSizeInBits() == 16;
int Low8 = Mask & 0xff;
int Hi8 = (Mask & 0xff00) >> 8;

assert(Low8 < 8 && Hi8 < 8);

bool IsConsecutive = (Hi8 - Low8 == 1);
// ...
bool Is16Aligned = !(Low8 % 2);

return IsConsecutive && Is16Aligned;
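// Example of the check above: a 16-bit half of a dword is addressable as a
// unit only if the two selector bytes name consecutive source bytes starting
// at an even offset. 0x0504 (bytes 4 and 5) passes; 0x0605 (bytes 5 and 6)
// is consecutive but misaligned and fails.
static bool isAligned16BitPair(int Mask) {
  int Low8 = Mask & 0xff;
  int Hi8 = (Mask & 0xff00) >> 8;
  return (Hi8 - Low8 == 1) && !(Low8 % 2);
}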
12350 int Low16 = PermMask & 0xffff;
12351 int Hi16 = (PermMask & 0xffff0000) >> 16;
12361 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12363 if (!OtherOpIs16Bit)
12371 unsigned DWordOffset) {
12374 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12376 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12381 if (Src.getValueType().isVector()) {
12382 auto ScalarTySize = Src.getScalarValueSizeInBits();
12383 auto ScalarTy = Src.getValueType().getScalarType();
12384 if (ScalarTySize == 32) {
12388 if (ScalarTySize > 32) {
12391 DAG.
getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12392 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12399 assert(ScalarTySize < 32);
12400 auto NumElements =
TypeSize / ScalarTySize;
12401 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12402 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12403 auto NumElementsIn32 = 32 / ScalarTySize;
12404 auto NumAvailElements = DWordOffset < Trunc32Elements
12406 : NumElements - NormalizedTrunc;
12419 auto ShiftVal = 32 * DWordOffset;
12427 [[maybe_unused]]
EVT VT =
N->getValueType(0);
12432 for (
int i = 0; i < 4; i++) {
12434 std::optional<ByteProvider<SDValue>>
P =
12437 if (!
P ||
P->isConstantZero())
12442 if (PermNodes.
size() != 4)
12445 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12446 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12448 for (
size_t i = 0; i < PermNodes.
size(); i++) {
12449 auto PermOp = PermNodes[i];
12452 int SrcByteAdjust = 4;
12456 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
12457 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12459 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12460 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12464 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12465 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12468 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12470 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12473 SDValue Op = *PermNodes[FirstSrc.first].Src;
12475 assert(
Op.getValueSizeInBits() == 32);
int Low16 = PermMask & 0xffff;
int Hi16 = (PermMask & 0xffff0000) >> 16;

bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);

if (WellFormedLow && WellFormedHi) {
  // ...
}

SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
12499 assert(
Op.getValueType().isByteSized() &&
12517 DAGCombinerInfo &DCI)
const {
12522 EVT VT =
N->getValueType(0);
12523 if (VT == MVT::i1) {
12528 if (Src !=
RHS.getOperand(0))
12533 if (!CLHS || !CRHS)
12537 static const uint32_t MaxMask = 0x3ff;
12552 isa<ConstantSDNode>(
LHS.getOperand(2))) {
12557 Sel |=
LHS.getConstantOperandVal(2);
12566 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12570 auto usesCombinedOperand = [](
SDNode *OrUse) {
12573 !OrUse->getValueType(0).isVector())
12577 for (
auto *VUser : OrUse->users()) {
12578 if (!VUser->getValueType(0).isVector())
12585 if (VUser->getOpcode() == VectorwiseOp)
12591 if (!
any_of(
N->users(), usesCombinedOperand))
12597 if (LHSMask != ~0u && RHSMask != ~0u) {
12600 if (LHSMask > RHSMask) {
12607 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12608 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12611 if (!(LHSUsedLanes & RHSUsedLanes) &&
12614 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12616 LHSMask &= ~RHSUsedLanes;
12617 RHSMask &= ~LHSUsedLanes;
12619 LHSMask |= LHSUsedLanes & 0x04040404;
12629 if (LHSMask == ~0u || RHSMask == ~0u) {
12635 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12650 if (SrcVT == MVT::i32) {
12655 DCI.AddToWorklist(LowOr.
getNode());
12656 DCI.AddToWorklist(HiBits.getNode());
12664 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(
N->getOperand(1));
12667 N->getOperand(0), CRHS))
12675 DAGCombinerInfo &DCI)
const {
12676 if (
SDValue RV = reassociateScalarOps(
N, DCI.DAG))
12685 EVT VT =
N->getValueType(0);
12686 if (CRHS && VT == MVT::i64) {
12708 LHS->getOperand(0), FNegLHS, FNegRHS);
12717 DAGCombinerInfo &DCI)
const {
12722 EVT VT =
N->getValueType(0);
12723 if (VT != MVT::i32)
12727 if (Src.getValueType() != MVT::i16)
12734SITargetLowering::performSignExtendInRegCombine(
SDNode *
N,
12735 DAGCombinerInfo &DCI)
const {
12737 auto *VTSign = cast<VTSDNode>(
N->getOperand(1));
12742 VTSign->getVT() == MVT::i8) ||
12744 VTSign->getVT() == MVT::i16))) {
12746 "s_buffer_load_{u8, i8} are supported "
12747 "in GFX12 (or newer) architectures.");
12748 EVT VT = Src.getValueType();
12753 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12759 auto *
M = cast<MemSDNode>(Src);
12760 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12761 Opc,
DL, ResList, Ops,
M->getMemoryVT(),
M->getMemOperand());
12766 VTSign->getVT() == MVT::i8) ||
12768 VTSign->getVT() == MVT::i16)) &&
12770 auto *
M = cast<MemSDNode>(Src);
12771 SDValue Ops[] = {Src.getOperand(0),
12777 Src.getOperand(6), Src.getOperand(7)};
12780 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
12784 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
12785 Opc,
SDLoc(
N), ResList, Ops,
M->getMemoryVT(),
M->getMemOperand());
12786 return DCI.DAG.getMergeValues(
12793 DAGCombinerInfo &DCI)
const {
12801 if (
N->getOperand(0).isUndef())
12808 DAGCombinerInfo &DCI)
const {
12809 EVT VT =
N->getValueType(0);
12834 unsigned MaxDepth)
const {
12835 unsigned Opcode =
Op.getOpcode();
12839 if (
auto *CFP = dyn_cast<ConstantFPSDNode>(
Op)) {
12840 const auto &
F = CFP->getValueAPF();
12841 if (
F.isNaN() &&
F.isSignaling())
12843 if (!
F.isDenormal())
12906 if (
Op.getValueType() == MVT::i32) {
12911 if (
auto *
RHS = dyn_cast<ConstantSDNode>(
Op.getOperand(1))) {
12912 if (
RHS->getZExtValue() == 0xffff0000) {
12922 return Op.getValueType().getScalarType() != MVT::f16;
12990 if (
Op.getValueType() == MVT::i16) {
13001 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
13003 switch (IntrinsicID) {
13004 case Intrinsic::amdgcn_cvt_pkrtz:
13005 case Intrinsic::amdgcn_cubeid:
13006 case Intrinsic::amdgcn_frexp_mant:
13007 case Intrinsic::amdgcn_fdot2:
13008 case Intrinsic::amdgcn_rcp:
13009 case Intrinsic::amdgcn_rsq:
13010 case Intrinsic::amdgcn_rsq_clamp:
13011 case Intrinsic::amdgcn_rcp_legacy:
13012 case Intrinsic::amdgcn_rsq_legacy:
13013 case Intrinsic::amdgcn_trig_preop:
13014 case Intrinsic::amdgcn_log:
13015 case Intrinsic::amdgcn_exp2:
13016 case Intrinsic::amdgcn_sqrt:
13034 unsigned MaxDepth)
const {
13037 unsigned Opcode =
MI->getOpcode();
13039 if (Opcode == AMDGPU::G_FCANONICALIZE)
13042 std::optional<FPValueAndVReg> FCR;
13045 if (FCR->Value.isSignaling())
13047 if (!FCR->Value.isDenormal())
13058 case AMDGPU::G_FADD:
13059 case AMDGPU::G_FSUB:
13060 case AMDGPU::G_FMUL:
13061 case AMDGPU::G_FCEIL:
13062 case AMDGPU::G_FFLOOR:
13063 case AMDGPU::G_FRINT:
13064 case AMDGPU::G_FNEARBYINT:
13065 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
13066 case AMDGPU::G_INTRINSIC_TRUNC:
13067 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
13068 case AMDGPU::G_FMA:
13069 case AMDGPU::G_FMAD:
13070 case AMDGPU::G_FSQRT:
13071 case AMDGPU::G_FDIV:
13072 case AMDGPU::G_FREM:
13073 case AMDGPU::G_FPOW:
13074 case AMDGPU::G_FPEXT:
13075 case AMDGPU::G_FLOG:
13076 case AMDGPU::G_FLOG2:
13077 case AMDGPU::G_FLOG10:
13078 case AMDGPU::G_FPTRUNC:
13079 case AMDGPU::G_AMDGPU_RCP_IFLAG:
13080 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
13081 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
13082 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
13083 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
13085 case AMDGPU::G_FNEG:
13086 case AMDGPU::G_FABS:
13087 case AMDGPU::G_FCOPYSIGN:
13089 case AMDGPU::G_FMINNUM:
13090 case AMDGPU::G_FMAXNUM:
13091 case AMDGPU::G_FMINNUM_IEEE:
13092 case AMDGPU::G_FMAXNUM_IEEE:
13093 case AMDGPU::G_FMINIMUM:
13094 case AMDGPU::G_FMAXIMUM: {
13102 case AMDGPU::G_BUILD_VECTOR:
13107 case AMDGPU::G_INTRINSIC:
13108 case AMDGPU::G_INTRINSIC_CONVERGENT:
13110 case Intrinsic::amdgcn_fmul_legacy:
13111 case Intrinsic::amdgcn_fmad_ftz:
13112 case Intrinsic::amdgcn_sqrt:
13113 case Intrinsic::amdgcn_fmed3:
13114 case Intrinsic::amdgcn_sin:
13115 case Intrinsic::amdgcn_cos:
13116 case Intrinsic::amdgcn_log:
13117 case Intrinsic::amdgcn_exp2:
13118 case Intrinsic::amdgcn_log_clamp:
13119 case Intrinsic::amdgcn_rcp:
13120 case Intrinsic::amdgcn_rcp_legacy:
13121 case Intrinsic::amdgcn_rsq:
13122 case Intrinsic::amdgcn_rsq_clamp:
13123 case Intrinsic::amdgcn_rsq_legacy:
13124 case Intrinsic::amdgcn_div_scale:
13125 case Intrinsic::amdgcn_div_fmas:
13126 case Intrinsic::amdgcn_div_fixup:
13127 case Intrinsic::amdgcn_fract:
13128 case Intrinsic::amdgcn_cvt_pkrtz:
13129 case Intrinsic::amdgcn_cubeid:
13130 case Intrinsic::amdgcn_cubema:
13131 case Intrinsic::amdgcn_cubesc:
13132 case Intrinsic::amdgcn_cubetc:
13133 case Intrinsic::amdgcn_frexp_mant:
13134 case Intrinsic::amdgcn_fdot2:
13135 case Intrinsic::amdgcn_trig_preop:
13154 if (
C.isDenormal()) {
13168 if (
C.isSignaling()) {
13187 return Op.isUndef() || isa<ConstantFPSDNode>(
Op);
13191SITargetLowering::performFCanonicalizeCombine(
SDNode *
N,
13192 DAGCombinerInfo &DCI)
const {
13195 EVT VT =
N->getValueType(0);
13204 EVT VT =
N->getValueType(0);
13205 return getCanonicalConstantFP(DAG,
SDLoc(
N), VT, CFP->getValueAPF());
13221 EVT EltVT =
Lo.getValueType();
13224 for (
unsigned I = 0;
I != 2; ++
I) {
13228 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
13229 }
else if (
Op.isUndef()) {
13241 if (isa<ConstantFPSDNode>(NewElts[1]))
13242 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
13248 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
13300 if (!MinK || !MaxK)
13313 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->
hasMed3_16()))
13314 return DAG.
getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
13355 if (
Info->getMode().DX10Clamp) {
13364 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->
hasMed3_16())) {
13396 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.
hasMin3Max3_16());
13405 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.
hasMin3Max3_16());
13414 DAGCombinerInfo &DCI)
const {
13417 EVT VT =
N->getValueType(0);
13418 unsigned Opc =
N->getOpcode();
13447 if (
SDValue Med3 = performIntMed3ImmCombine(
13452 if (
SDValue Med3 = performIntMed3ImmCombine(
13458 if (
SDValue Med3 = performIntMed3ImmCombine(
13463 if (
SDValue Med3 = performIntMed3ImmCombine(
13473 (VT == MVT::f32 || VT == MVT::f64 ||
13477 if (
SDValue Res = performFPMed3ImmCombine(DAG,
SDLoc(
N), Op0, Op1))
13488 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13489 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
13498 DAGCombinerInfo &DCI)
const {
13499 EVT VT =
N->getValueType(0);
13522 if (
Info->getMode().DX10Clamp) {
13525 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13528 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
13531 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13542 DAGCombinerInfo &DCI)
const {
13546 return DCI.DAG.getUNDEF(
N->getValueType(0));
13554 bool IsDivergentIdx,
13559 unsigned VecSize = EltSize * NumElem;
13562 if (VecSize <= 64 && EltSize < 32)
13571 if (IsDivergentIdx)
13575 unsigned NumInsts = NumElem +
13576 ((EltSize + 31) / 32) * NumElem ;
13581 return NumInsts <= 16;
13586 return NumInsts <= 15;
13593 if (isa<ConstantSDNode>(
Idx))
13607SITargetLowering::performExtractVectorEltCombine(SDNode *N,
13608 DAGCombinerInfo &DCI) const {
13614 EVT ResVT = N->getValueType(0);
13633 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13661 DCI.AddToWorklist(Elt0.getNode());
13662 DCI.AddToWorklist(Elt1.getNode());
13684 if (!DCI.isBeforeLegalize())
13690 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
13691 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
13692 VecSize > 32 && VecSize % 32 == 0 && Idx) {
13695 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
13696 unsigned EltIdx = BitIndex / 32;
13697 unsigned LeftoverBitIdx = BitIndex % 32;
13701 DCI.AddToWorklist(Cast.getNode());
13705 DCI.AddToWorklist(Elt.getNode());
13708 DCI.AddToWorklist(Srl.getNode());
13712 DCI.AddToWorklist(Trunc.getNode());
13714 if (VecEltVT == ResVT) {
13726SITargetLowering::performInsertVectorEltCombine(SDNode *N,
13727 DAGCombinerInfo &DCI) const {
13741 EVT IdxVT = Idx.getValueType();
13758 Src.getOperand(0).getValueType() == MVT::f16) {
13759 return Src.getOperand(0);
13762 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13763 APFloat Val = CFP->getValueAPF();
13764 bool LosesInfo = true;
13774 DAGCombinerInfo &DCI) const {
13776 "combine only useful on gfx8");
13778 SDValue TruncSrc = N->getOperand(0);
13779 EVT VT = N->getValueType(0);
13780 if (VT != MVT::f16)
13818unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
13820 const SDNode *N1) const {
13825 if (((VT == MVT::f32 &&
13827 (VT == MVT::f16 && Subtarget->hasMadF16() &&
13847 EVT VT = N->getValueType(0);
13848 if (VT != MVT::i32 && VT != MVT::i64)
13854 unsigned Opc = N->getOpcode();
13877 return DAG.getNode(Opc, SL, VT, Add1, Op2);
13909 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
13928 DAGCombinerInfo &DCI) const {
13932 EVT VT = N->getValueType(0);
13942 if (!N->isDivergent() && Subtarget->hasSMulHi())
13946 if (NumBits <= 32 || NumBits > 64)
13958 unsigned NumUsers = 0;
13986 bool MulSignedLo = false;
13987 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
13996 if (VT != MVT::i64) {
14019 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
14021 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
14022 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
14024 if (!MulLHSUnsigned32) {
14031 if (!MulRHSUnsigned32) {
14042 if (VT != MVT::i64)
14048SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
14049 DAGCombinerInfo &DCI) const {
14051 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14075 DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags());
14086static std::optional<ByteProvider<SDValue>>
14089 if (!Byte0 || Byte0->isConstantZero()) {
14090 return std::nullopt;
14093 if (Byte1 && !Byte1->isConstantZero()) {
14094 return std::nullopt;
14100 unsigned FirstCs = First & 0x0c0c0c0c;
14101 unsigned SecondCs = Second & 0x0c0c0c0c;
14102 unsigned FirstNoCs = First & ~0x0c0c0c0c;
14103 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
14103 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
14105 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
14106 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
14107 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
14108 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
14110 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
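A short worked example of the mask merge above, assuming (as the surrounding code does) that 0x0c in a byte lane is the v_perm_b32 selector for a constant zero byte: the merged mask keeps 0x0c only where both inputs are zero and takes the real selector otherwise. The values below are made up for illustration; this is a standalone sketch, not the in-tree helper.

#include <cassert>

static unsigned addPermMasksSketch(unsigned First, unsigned Second) {
  unsigned FirstCs = First & 0x0c0c0c0c;
  unsigned SecondCs = Second & 0x0c0c0c0c;
  unsigned FirstNoCs = First & ~0x0c0c0c0c;
  unsigned SecondNoCs = Second & ~0x0c0c0c0c;
  // Real selectors survive through the NoCs halves; a lane stays 0x0c only
  // when both masks mark it as zero.
  return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
}

int main() {
  // First fills the two high byte lanes, Second the two low ones.
  assert(addPermMasksSketch(0x05040c0c, 0x0c0c0100) == 0x05040100);
}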
14134 for (int BPI = 0; BPI < 2; BPI++) {
14137 BPP = {Src1, Src0};
14139 unsigned ZeroMask = 0x0c0c0c0c;
14140 unsigned FMask = 0xFF << (8 * (3 - Step));
14142 unsigned FirstMask =
14143 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14144 unsigned SecondMask =
14145 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14149 int FirstGroup = -1;
14150 for (int I = 0; I < 2; I++) {
14152 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
14153 return IterElt.SrcOp == *BPP.first.Src &&
14154 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
14164 if (FirstGroup != -1) {
14166 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
14167 return IterElt.SrcOp == *BPP.second.Src &&
14168 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
14174 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
14182 unsigned ZeroMask = 0x0c0c0c0c;
14183 unsigned FMask = 0xFF << (8 * (3 - Step));
14187 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14191 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14200 if (Srcs.size() == 1) {
14201 auto *Elt = Srcs.begin();
14205 if (Elt->PermMask == 0x3020100)
14212 auto *FirstElt = Srcs.begin();
14213 auto *SecondElt = std::next(FirstElt);
14220 auto FirstMask = FirstElt->PermMask;
14221 auto SecondMask = SecondElt->PermMask;
14223 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
14224 unsigned FirstPlusFour = FirstMask | 0x04040404;
14227 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
14239 FirstElt = std::next(SecondElt);
14240 if (FirstElt == Srcs.end())
14243 SecondElt = std::next(FirstElt);
14246 if (SecondElt == Srcs.end()) {
14252 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
14258 return Perms.size() == 2
14264 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
14265 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
14266 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
14267 EntryMask += ZeroMask;
14272 auto Opcode = Op.getOpcode();
14278static std::optional<bool>
14289 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
14292 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
14294 assert(!(S0IsUnsigned && S0IsSigned));
14295 assert(!(S1IsUnsigned && S1IsSigned));
14303 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
14309 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14310 return std::nullopt;
14322 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14323 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14328 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14334 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14335 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14336 return std::nullopt;
14342 DAGCombinerInfo &DCI) const {
14344 EVT VT = N->getValueType(0);
14351 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14356 if (SDValue V = reassociateScalarOps(N, DAG)) {
14360 if (VT == MVT::i64) {
14361 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
14368 std::optional<bool> IsSigned;
14374 int ChainLength = 0;
14375 for (int I = 0; I < 4; I++) {
14376 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
14379 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14382 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14387 TempNode->getOperand(MulIdx), *Src0, *Src1,
14388 TempNode->getOperand(MulIdx)->getOperand(0),
14389 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14393 IsSigned = *IterIsSigned;
14394 if (*IterIsSigned != *IsSigned)
14397 auto AddIdx = 1 - MulIdx;
14400 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14401 Src2s.push_back(TempNode->getOperand(AddIdx));
14411 TempNode->getOperand(AddIdx), *Src0, *Src1,
14412 TempNode->getOperand(AddIdx)->getOperand(0),
14413 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14417 if (*IterIsSigned != *IsSigned)
14421 ChainLength = I + 2;
14425 TempNode = TempNode->getOperand(AddIdx);
14427 ChainLength = I + 1;
14428 if (TempNode->getNumOperands() < 2)
14430 LHS = TempNode->getOperand(0);
14431 RHS = TempNode->getOperand(1);
14434 if (ChainLength < 2)
14440 if (ChainLength < 4) {
14450 bool UseOriginalSrc = false;
14451 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
14452 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14453 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14454 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
14456 auto Src0Mask = Src0s.begin()->PermMask;
14457 SrcBytes.push_back(Src0Mask & 0xFF000000);
14458 bool UniqueEntries = true;
14459 for (auto I = 1; I < 4; I++) {
14460 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14463 UniqueEntries = false;
14469 if (UniqueEntries) {
14470 UseOriginalSrc = true;
14472 auto *FirstElt = Src0s.begin();
14476 auto *SecondElt = Src1s.begin();
14478 SecondElt->DWordOffset);
14487 if (!UseOriginalSrc) {
14494 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14497 : Intrinsic::amdgcn_udot4,
14507 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14512 unsigned Opc = LHS.getOpcode();
14517 Opc = RHS.getOpcode();
14524 auto Cond = RHS.getOperand(0);
14532 return DAG.getNode(Opc, SL, VTList, Args);
14546 DAGCombinerInfo &DCI) const {
14548 EVT VT = N->getValueType(0);
14550 if (VT == MVT::i64) {
14551 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
14555 if (VT != MVT::i32)
14564 unsigned Opc = RHS.getOpcode();
14571 auto Cond = RHS.getOperand(0);
14579 return DAG.getNode(Opc, SL, VTList, Args);
14594SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
14595 DAGCombinerInfo &DCI) const {
14597 if (N->getValueType(0) != MVT::i32)
14608 unsigned LHSOpc = LHS.getOpcode();
14609 unsigned Opc = N->getOpcode();
14619 DAGCombinerInfo &DCI) const {
14624 EVT VT = N->getValueType(0);
14636 if (A == LHS.getOperand(1)) {
14637 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14638 if (FusedOp != 0) {
14640 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
14648 if (A == RHS.getOperand(1)) {
14649 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14650 if (FusedOp != 0) {
14652 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
14661 DAGCombinerInfo &DCI) const {
14667 EVT VT = N->getValueType(0);
14680 if (A == LHS.getOperand(1)) {
14681 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14682 if (FusedOp != 0) {
14686 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
14695 if (A == RHS.getOperand(1)) {
14696 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14697 if (FusedOp != 0) {
14699 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
14708 DAGCombinerInfo &DCI) const {
14711 EVT VT = N->getValueType(0);
14725 bool IsNegative = false;
14726 if (CLHS->isExactlyValue(1.0) ||
14727 (IsNegative = CLHS->isExactlyValue(-1.0))) {
14743 DAGCombinerInfo &DCI) const {
14745 EVT VT = N->getValueType(0);
14759 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
14774 if (ScalarVT == MVT::f32 &&
14780 if (TrueNodeExpVal == INT_MIN)
14783 if (FalseNodeExpVal == INT_MIN)
14803 DAGCombinerInfo &DCI) const {
14805 EVT VT = N->getValueType(0);
14826 (N->getFlags().hasAllowContract() &&
14827 FMA->getFlags().hasAllowContract())) {
14861 if (Vec1 == Vec2 || Vec3 == Vec4)
14867 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
14876 DAGCombinerInfo &DCI) const {
14882 EVT VT = LHS.getValueType();
14885 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14887 CRHS = dyn_cast<ConstantSDNode>(LHS);
14911 return LHS.getOperand(0);
14917 isa<ConstantSDNode>(LHS.getOperand(1)) &&
14918 isa<ConstantSDNode>(LHS.getOperand(2)) &&
14919 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
14926 const APInt &CT = LHS.getConstantOperandAPInt(1);
14927 const APInt &CF = LHS.getConstantOperandAPInt(2);
14935 return LHS.getOperand(0);
14939 if (VT != MVT::f32 && VT != MVT::f64 &&
14955 const unsigned IsInfMask =
14957 const unsigned IsFiniteMask =
14971SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
14972 DAGCombinerInfo &DCI) const {
14990 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
14994 unsigned ShiftOffset = 8 * Offset;
14996 ShiftOffset -= C->getZExtValue();
14998 ShiftOffset += C->getZExtValue();
15000 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
15002 MVT::f32, Shifted);
15013 DCI.AddToWorklist(N);
15020 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
15026 DAGCombinerInfo &DCI) const {
15036 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
15039 APFloat One(F.getSemantics(), "1.0");
15041 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
15048 switch (N->getOpcode()) {
15064 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
15074 switch (N->getOpcode()) {
15076 return performAddCombine(N, DCI);
15078 return performSubCombine(N, DCI);
15081 return performAddCarrySubCarryCombine(N, DCI);
15083 return performFAddCombine(N, DCI);
15085 return performFSubCombine(N, DCI);
15087 return performFDivCombine(N, DCI);
15089 return performFMulCombine(N, DCI);
15091 return performSetCCCombine(N, DCI);
15104 return performMinMaxCombine(N, DCI);
15106 return performFMACombine(N, DCI);
15108 return performAndCombine(N, DCI);
15110 return performOrCombine(N, DCI);
15113 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
15114 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
15120 return performXorCombine(N, DCI);
15122 return performZeroExtendCombine(N, DCI);
15124 return performSignExtendInRegCombine(N, DCI);
15126 return performClassCombine(N, DCI);
15128 return performFCanonicalizeCombine(N, DCI);
15130 return performRcpCombine(N, DCI);
15145 return performUCharToFloatCombine(N, DCI);
15147 return performFCopySignCombine(N, DCI);
15152 return performCvtF32UByteNCombine(N, DCI);
15154 return performFMed3Combine(N, DCI);
15156 return performCvtPkRTZCombine(N, DCI);
15158 return performClampCombine(N, DCI);
15161 EVT VT = N->getValueType(0);
15164 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
15167 EVT EltVT = Src.getValueType();
15168 if (EltVT != MVT::i16)
15178 return performExtractVectorEltCombine(N, DCI);
15180 return performInsertVectorEltCombine(N, DCI);
15182 return performFPRoundCombine(N, DCI);
15184 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
15190 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
15191 return performMemSDNodeCombine(MemNode, DCI);
15222 unsigned Opcode = Node->getMachineOpcode();
15226 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
15231 unsigned DmaskIdx =
15233 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
15234 unsigned NewDmask = 0;
15237 bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
15238 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
15241 unsigned TFCLane = 0;
15242 bool HasChain = Node->getNumValues() > 1;
15244 if (OldDmask == 0) {
15252 TFCLane = OldBitsSet;
15259 if (Use.getResNo() != 0)
15265 if (!User->isMachineOpcode() ||
15266 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
15278 if (UsesTFC && Lane == TFCLane) {
15283 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
15285 Dmask &= ~(1 << Comp);
15293 NewDmask |= 1 << Comp;
15298 bool NoChannels = !NewDmask;
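An illustrative sketch of the dmask recomputation above, under the assumption that result lane L of the image op corresponds to the L-th set bit of the old dmask; a lane that is actually extracted keeps exactly that bit in the new dmask. This is a standalone rework for clarity, not the in-tree helper.

#include <bit>
#include <cassert>
#include <initializer_list>

static unsigned recomputeDmask(unsigned OldDmask,
                               std::initializer_list<unsigned> UsedLanes) {
  unsigned NewDmask = 0;
  for (unsigned Lane : UsedLanes) {
    unsigned Dmask = OldDmask, Comp = 0;
    // Walk to the Lane-th remaining set bit of the old dmask.
    for (unsigned i = 0; i <= Lane && Dmask != 0; ++i) {
      Comp = std::countr_zero(Dmask);
      Dmask &= ~(1u << Comp);
    }
    NewDmask |= 1u << Comp;
  }
  return NewDmask;
}

int main() {
  // dmask 0b1011 returns components {0,1,3}; if only result lanes 0 and 2
  // are extracted, the trimmed dmask keeps components 0 and 3.
  assert(recomputeDmask(0b1011, {0, 2}) == 0b1001);
}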
15305 if (OldBitsSet == 1)
15311 if (NewDmask == OldDmask)
15320 unsigned NewChannels = BitsSet + UsesTFC;
15324 assert(NewOpcode != -1 &&
15325 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
15326 "failed to find equivalent MIMG op");
15334 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
15336 MVT ResultVT = NewChannels == 1
15339 : NewChannels == 5 ? 8
15353 if (NewChannels == 1) {
15363 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
15368 if (i || !NoChannels)
15373 if (NewUser != User) {
15383 Idx = AMDGPU::sub1;
15386 Idx = AMDGPU::sub2;
15389 Idx = AMDGPU::sub3;
15392 Idx = AMDGPU::sub4;
15403 Op = Op.getOperand(0);
15405 return isa<FrameIndexSDNode>(Op);
15415 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
15416 SDValue SrcVal = Node->getOperand(2);
15424 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
15426 SDNode *Glued = Node->getGluedNode();
15428 Node->getOperand(0), SL, VReg, SrcVal,
15434 return ToResultReg.getNode();
15439 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
15447 Node->getOperand(i).getValueType(),
15448 Node->getOperand(i)),
15460 unsigned Opcode = Node->getMachineOpcode();
15462 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
15463 !TII->isGather4(Opcode) &&
15465 return adjustWritemask(Node, DAG);
15468 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
15474 case AMDGPU::V_DIV_SCALE_F32_e64:
15475 case AMDGPU::V_DIV_SCALE_F64_e64: {
15479 SDValue Src0 = Node->getOperand(1);
15480 SDValue Src1 = Node->getOperand(3);
15481 SDValue Src2 = Node->getOperand(5);
15485 (Src0 == Src1 || Src0 == Src2))
15542 unsigned InitIdx = 0;
15544 if (TII->isImage(MI)) {
15552 unsigned TFEVal = TFE ? TFE->getImm() : 0;
15553 unsigned LWEVal = LWE ? LWE->getImm() : 0;
15554 unsigned D16Val = D16 ? D16->getImm() : 0;
15556 if (!TFEVal && !LWEVal)
15567 assert(MO_Dmask && "Expected dmask operand in instruction");
15569 unsigned dmask = MO_Dmask->getImm();
15576 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
15582 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15583 if (DstSize < InitIdx)
15586 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15594 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
15595 unsigned NewDst = 0;
15604 for (; SizeLeft; SizeLeft--, CurrIdx++) {
15605 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
15625 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
15638 if (TII->isVOP3(MI.getOpcode())) {
15640 TII->legalizeOperandsVOP3(MRI, MI);
15645 if (!MI.getDesc().operands().empty()) {
15646 unsigned Opc = MI.getOpcode();
15647 bool HasAGPRs = Info->mayNeedAGPRs();
15655 if ((I == Src2Idx) && (HasAGPRs))
15658 if (!Op.isReg() || !Op.getReg().isVirtual())
15660 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
15661 if (!TRI->hasAGPRs(RC))
15663 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
15664 if (!Src || !Src->isCopy() ||
15665 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
15667 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
15671 MRI.setRegClass(Op.getReg(), NewRC);
15674 if (TII->isMAI(MI)) {
15680 AMDGPU::OpName::scale_src0);
15681 if (Src0Idx != -1) {
15683 AMDGPU::OpName::scale_src1);
15684 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
15685 TII->usesConstantBus(MRI, MI, Src1Idx))
15686 TII->legalizeOpWithMove(MI, Src1Idx);
15694 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
15695 if (Src2->isReg() && Src2->getReg().isVirtual()) {
15696 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
15697 if (TRI->isVectorSuperClass(RC)) {
15698 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
15699 MRI.setRegClass(Src2->getReg(), NewRC);
15700 if (Src2->isTied())
15701 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
15710 if (TII->isImage(MI))
15711 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
15785std::pair<unsigned, const TargetRegisterClass *>
15792 if (Constraint.size() == 1) {
15794 switch (Constraint[0]) {
15801 RC = &AMDGPU::SReg_32RegClass;
15804 RC = &AMDGPU::SGPR_64RegClass;
15809 return std::pair(0U, nullptr);
15816 RC = &AMDGPU::VGPR_32RegClass;
15821 return std::pair(0U, nullptr);
15830 RC = &AMDGPU::AGPR_32RegClass;
15835 return std::pair(0U, nullptr);
15844 return std::pair(0U, RC);
15849 if (RegName.consume_front("v")) {
15850 RC = &AMDGPU::VGPR_32RegClass;
15851 } else if (RegName.consume_front("s")) {
15852 RC = &AMDGPU::SGPR_32RegClass;
15853 } else if (RegName.consume_front("a")) {
15854 RC = &AMDGPU::AGPR_32RegClass;
15859 if (RegName.consume_front("[")) {
15870 return std::pair(0U, nullptr);
15873 RC = TRI->getVGPRClassForBitWidth(Width);
15875 RC = TRI->getSGPRClassForBitWidth(Width);
15877 RC = TRI->getAGPRClassForBitWidth(Width);
15879 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15880 return std::pair(Reg, RC);
15886 return std::pair(0U, nullptr);
15888 if (!Failed && Idx < RC->getNumRegs())
15896 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
15902 if (Constraint.size() == 1) {
15903 switch (Constraint[0]) {
15913 } else if (Constraint == "DA" || Constraint == "DB") {
15921 if (Constraint.size() == 1) {
15922 switch (Constraint[0]) {
15939 Val = Val & maskTrailingOnes<uint64_t>(Size);
15946 std::vector<SDValue> &Ops,
15961 unsigned Size = Op.getScalarValueSizeInBits();
15969 Val = C->getSExtValue();
15973 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15979 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
15982 Val = C->getSExtValue();
15986 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15996 if (Constraint.size() == 1) {
15997 switch (Constraint[0]) {
16001 return isInt<16>(Val);
16005 return isInt<32>(Val);
16012 } else if (Constraint.size() == 2) {
16013 if (Constraint == "DA") {
16014 int64_t HiBits = static_cast<int32_t>(Val >> 32);
16015 int64_t LoBits = static_cast<int32_t>(Val);
16019 if (Constraint == "DB") {
16027 unsigned MaxSize) const {
16028 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
16031 MVT VT = Op.getSimpleValueType();
16056 switch (UnalignedClassID) {
16057 case AMDGPU::VReg_64RegClassID:
16058 return AMDGPU::VReg_64_Align2RegClassID;
16059 case AMDGPU::VReg_96RegClassID:
16060 return AMDGPU::VReg_96_Align2RegClassID;
16061 case AMDGPU::VReg_128RegClassID:
16062 return AMDGPU::VReg_128_Align2RegClassID;
16063 case AMDGPU::VReg_160RegClassID:
16064 return AMDGPU::VReg_160_Align2RegClassID;
16065 case AMDGPU::VReg_192RegClassID:
16066 return AMDGPU::VReg_192_Align2RegClassID;
16067 case AMDGPU::VReg_224RegClassID:
16068 return AMDGPU::VReg_224_Align2RegClassID;
16069 case AMDGPU::VReg_256RegClassID:
16070 return AMDGPU::VReg_256_Align2RegClassID;
16071 case AMDGPU::VReg_288RegClassID:
16072 return AMDGPU::VReg_288_Align2RegClassID;
16073 case AMDGPU::VReg_320RegClassID:
16074 return AMDGPU::VReg_320_Align2RegClassID;
16075 case AMDGPU::VReg_352RegClassID:
16076 return AMDGPU::VReg_352_Align2RegClassID;
16077 case AMDGPU::VReg_384RegClassID:
16078 return AMDGPU::VReg_384_Align2RegClassID;
16079 case AMDGPU::VReg_512RegClassID:
16080 return AMDGPU::VReg_512_Align2RegClassID;
16081 case AMDGPU::VReg_1024RegClassID:
16082 return AMDGPU::VReg_1024_Align2RegClassID;
16083 case AMDGPU::AReg_64RegClassID:
16084 return AMDGPU::AReg_64_Align2RegClassID;
16085 case AMDGPU::AReg_96RegClassID:
16086 return AMDGPU::AReg_96_Align2RegClassID;
16087 case AMDGPU::AReg_128RegClassID:
16088 return AMDGPU::AReg_128_Align2RegClassID;
16089 case AMDGPU::AReg_160RegClassID:
16090 return AMDGPU::AReg_160_Align2RegClassID;
16091 case AMDGPU::AReg_192RegClassID:
16092 return AMDGPU::AReg_192_Align2RegClassID;
16093 case AMDGPU::AReg_256RegClassID:
16094 return AMDGPU::AReg_256_Align2RegClassID;
16095 case AMDGPU::AReg_512RegClassID:
16096 return AMDGPU::AReg_512_Align2RegClassID;
16097 case AMDGPU::AReg_1024RegClassID:
16098 return AMDGPU::AReg_1024_Align2RegClassID;
16114 if (Info->isEntryFunction()) {
16121 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
16123 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
16124 : TRI->getAlignedHighSGPRForRC(MF, 2,
16125 &AMDGPU::SGPR_64RegClass);
16126 Info->setSGPRForEXECCopy(SReg);
16129 Info->getStackPtrOffsetReg()));
16130 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
16131 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
16135 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
16136 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
16138 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
16139 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
16141 Info->limitOccupancy(MF);
16143 if (ST.isWave32() && !MF.empty()) {
16144 for (auto &MBB : MF) {
16145 for (auto &MI : MBB) {
16146 TII->fixImplicitOperands(MI);
16156 if (ST.needsAlignedVGPRs()) {
16157 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
16163 if (NewClassID != -1)
16164 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
16173 const APInt &DemandedElts,
16175 unsigned Depth) const {
16177 unsigned Opc = Op.getOpcode();
16180 unsigned IID = Op.getConstantOperandVal(0);
16182 case Intrinsic::amdgcn_mbcnt_lo:
16183 case Intrinsic::amdgcn_mbcnt_hi: {
16189 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
16199 Op, Known, DemandedElts, DAG, Depth);
16214 unsigned MaxValue =
16223 switch (MI->getOpcode()) {
16224 case AMDGPU::G_INTRINSIC:
16225 case AMDGPU::G_INTRINSIC_CONVERGENT: {
16228 case Intrinsic::amdgcn_workitem_id_x:
16231 case Intrinsic::amdgcn_workitem_id_y:
16234 case Intrinsic::amdgcn_workitem_id_z:
16237 case Intrinsic::amdgcn_mbcnt_lo:
16238 case Intrinsic::amdgcn_mbcnt_hi: {
16250 case Intrinsic::amdgcn_groupstaticsize: {
16261 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
16264 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
16267 case AMDGPU::G_AMDGPU_SMED3:
16268 case AMDGPU::G_AMDGPU_UMED3: {
16269 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
16296 unsigned Depth) const {
16298 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
16304 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
16331 if (Header->getAlignment() != PrefAlign)
16332 return Header->getAlignment();
16334 unsigned LoopSize = 0;
16342 LoopSize += TII->getInstSizeInBytes(MI);
16343 if (LoopSize > 192)
16348 if (LoopSize <= 64)
16351 if (LoopSize <= 128)
16352 return CacheLineAlign;
16358 auto I = Exit->getFirstNonDebugInstr();
16359 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
16360 return CacheLineAlign;
16369 if (PreTerm == Pre->begin() ||
16370 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
16374 auto ExitHead = Exit->getFirstNonDebugInstr();
16375 if (ExitHead == Exit->end() ||
16376 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
16381 return CacheLineAlign;
16389 N = N->getOperand(0).getNode();
16399 switch (N->getOpcode()) {
16407 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
16408 return !TRI->isSGPRReg(MRI, Reg);
16410 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
16414 return !TRI->isSGPRReg(MRI, Reg);
16418 unsigned AS = L->getAddressSpace();
16449 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
16451 return A->readMem() && A->writeMem();
16486 unsigned Depth) const {
16491 if (Info->getMode().DX10Clamp)
16503 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
16523 << "Hardware instruction generated for atomic "
16525 << " operation at memory scope " << MemScope;
16529 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
16530 Type *EltTy = VT->getElementType();
16531 return VT->getNumElements() == 2 &&
16550 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
16551 unsigned BW = IT->getBitWidth();
16552 return BW == 32 || BW == 64;
16564 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
16566 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
16567 return BW == 32 || BW == 64;
16574 return VT->getNumElements() == 2 &&
16575 VT->getElementType()->getPrimitiveSizeInBits() == 16;
16585 bool HasSystemScope) {
16592 if (HasSystemScope) {
16599 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
16612 const MDNode *NoaliasAddrSpaceMD =
16613 I->getMetadata(LLVMContext::MD_noalias_addrspace);
16614 if (!NoaliasAddrSpaceMD)
16617 for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E;
16619 auto *Low = mdconst::extract<ConstantInt>(
16622 auto *High = mdconst::extract<ConstantInt>(
16644 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
16657 bool HasSystemScope =
16844 if (HasSystemScope)
16896 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16897 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
16898 : &AMDGPU::SReg_32RegClass;
16899 if (!TRI->isSGPRClass(RC) && !isDivergent)
16900 return TRI->getEquivalentSGPRClass(RC);
16901 if (TRI->isSGPRClass(RC) && isDivergent)
16902 return TRI->getEquivalentVGPRClass(RC);
16914 unsigned WaveSize) {
16919 if (!IT || IT->getBitWidth() != WaveSize)
16922 if (!isa<Instruction>(V))
16924 if (!Visited.insert(V).second)
16926 bool Result = false;
16927 for (const auto *U : V->users()) {
16928 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
16929 if (V == U->getOperand(1)) {
16930 switch (Intrinsic->getIntrinsicID()) {
16934 case Intrinsic::amdgcn_if_break:
16935 case Intrinsic::amdgcn_if:
16936 case Intrinsic::amdgcn_else:
16941 if (V == U->getOperand(0)) {
16942 switch (Intrinsic->getIntrinsicID()) {
16946 case Intrinsic::amdgcn_end_cf:
16947 case Intrinsic::amdgcn_loop:
16953 Result = hasCFUser(U, Visited, WaveSize);
16962 const Value *V) const {
16963 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
16964 if (CI->isInlineAsm()) {
16973 for (auto &TC : TargetConstraints) {
17015 return MRI.hasOneNonDBGUse(N0);
17022 if (I.getMetadata("amdgpu.noclobber"))
17024 if (I.getMetadata("amdgpu.last.use"))
17034 if (!Def->isMachineOpcode())
17044 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
17045 PhysReg = AMDGPU::SCC;
17047 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
17056 if (!I->hasOneUse())
17062 switch (I->getOpcode()) {
17063 case Instruction::FMul: {
17064 if (User->getOpcode() != Instruction::FSub &&
17065 User->getOpcode() != Instruction::FAdd)
17070 return ((!I->hasAllowContract() || !User->hasAllowContract()) &&
17129 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
17140 Alignment = RMW->getAlign();
17155 RMW->getType()->isFloatTy();
17158 bool ReturnValueIsUsed = !AI->use_empty();
17167 if (FullFlatEmulation) {
17178 std::prev(BB->end())->eraseFromParent();
17181 Value *LoadedShared = nullptr;
17182 if (FullFlatEmulation) {
17184 Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
17185 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
17193 LoadedShared = Clone;
17200 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
17208 Value *LoadedPrivate;
17211 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
17214 LoadedPrivate, RMW->getValOperand());
17218 auto [ResultLoad, Equal] =
17233 if (FullFlatEmulation) {
17243 if (!FullFlatEmulation) {
17248 MDNode *RangeNotPrivate =
17251 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
17259 if (ReturnValueIsUsed) {
17262 if (FullFlatEmulation)
17277 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
17278 ConstVal && ConstVal->isNullValue()) {
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
#define LLVM_ATTRIBUTE_UNUSED
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
Returns the sub type a function will return at a given Idx; should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx.
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
iv Induction Variable Users
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
unsigned const TargetRegisterInfo * TRI
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static constexpr Register SPReg
const SmallVectorImpl< MachineOperand > & Cond
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
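A hedged usage sketch for the masking used by this helper and around line 15939 above: llvm::maskTrailingOnes<T>(N) yields a value with the low N bits set, so ANDing with it clears everything above the operand size. The values below are chosen purely for illustration.

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

// Standalone sketch of clearing bits above an operand's size.
static uint64_t clearUnusedBitsSketch(uint64_t Val, unsigned Size) {
  return Val & llvm::maskTrailingOnes<uint64_t>(Size);
}

int main() {
  // Keep only the low 16 bits of the immediate.
  assert(clearUnusedBitsSketch(0xFFFF1234ULL, 16) == 0x1234);
}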
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static constexpr int Concat[]
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
bool hasCvtPkF16F32Inst() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool hasBF16ConversionInsts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, ...
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
LLVM_READONLY int getExactLog2Abs() const
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
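A small usage sketch for the APFloat factories listed above; the fltSemantics objects such as APFloat::IEEEsingle() select the floating-point format, and convert() re-encodes a value in a different format while reporting precision loss. Purely illustrative.

#include "llvm/ADT/APFloat.h"
using namespace llvm;

void apfloatFactoriesDemo() {
  APFloat QNan = APFloat::getQNaN(APFloat::IEEEsingle());
  APFloat NegInf = APFloat::getInf(APFloat::IEEEdouble(), /*Negative=*/true);
  APFloat Zero = APFloat::getZero(APFloat::IEEEhalf());
  bool LosesInfo = false;
  // Re-encode the half-precision zero as a single-precision value.
  Zero.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &LosesInfo);
  (void)QNan; (void)NegInf; (void)LosesInfo;
}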
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
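A hedged illustration of the APInt bit-block helpers documented above; widths and bit positions are arbitrary examples.

#include "llvm/ADT/APInt.h"
using namespace llvm;

void apintBitBlocksDemo() {
  // 32-bit value with bits [8, 16) set: 0x0000FF00.
  APInt Mid = APInt::getBitsSet(32, 8, 16);
  // 32-bit value with the top 4 bits set: 0xF0000000.
  APInt Hi = APInt::getHighBitsSet(32, 4);
  (void)Mid; (void)Hi;
}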
This class represents an incoming formal argument to a Function.
bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
const Function * getParent() const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
Value * getNewValOperand()
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getCompareOperand()
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Min
*p = old <signed v ? old : v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
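A sketch of creating one of the atomicrmw operations enumerated above through IRBuilder; the builder, pointer, and value are assumed placeholders supplied by the caller.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

Value *emitAtomicFMaxSketch(IRBuilder<> &Builder, Value *Ptr, Value *Val) {
  // *Ptr = maxnum(*Ptr, Val), returning the previous value at Ptr.
  return Builder.CreateAtomicRMW(AtomicRMWInst::FMax, Ptr, Val, MaybeAlign(),
                                 AtomicOrdering::SequentiallyConsistent);
}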
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
bool isFPPredicate() const
bool isIntPredicate() const
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Argument * getArg(unsigned i) const
bool hasMemoryAtomicFaddF32DenormalSupport() const
bool hasD16Images() const
bool hasMinimum3Maximum3F32() const
bool useVGPRIndexMode() const
bool hasAtomicDsPkAdd16Insts() const
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
bool hasAtomicFMinFMaxF64FlatInsts() const
bool hasDot7Insts() const
bool hasApertureRegs() const
bool hasFlatInstOffsets() const
bool hasAtomicFMinFMaxF32FlatInsts() const
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasBCNT(unsigned Size) const
bool hasLDSLoadB96_B128() const
Returns true if the target supports global_load_lds_dwordx3/global_load_lds_dwordx4 or buffer_load_dw...
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
bool hasMultiDwordFlatScratchAddressing() const
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
bool hasPrivEnabledTrap2NopBug() const
bool hasUnalignedDSAccessEnabled() const
const SIInstrInfo * getInstrInfo() const override
bool hasDot1Insts() const
bool hasAtomicFaddRtnInsts() const
Align getStackAlignment() const
bool hasScalarSubwordLoads() const
bool enableFlatScratch() const
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
bool supportsGetDoorbellID() const
bool hasFlatAtomicFaddF32Inst() const
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasMinimum3Maximum3PKF16() const
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
const SIFrameLowering * getFrameLowering() const override
bool hasMinimum3Maximum3F16() const
bool hasAtomicFMinFMaxF32GlobalInsts() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
bool isTrapHandlerEnabled() const
bool hasLDSFPAtomicAddF64() const
bool hasFlatGlobalInsts() const
bool getScalarizeGlobalBehavior() const
bool hasScalarSMulU64() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
bool hasNSAEncoding() const
bool hasSMemRealTime() const
bool usePRTStrictNull() const
bool hasAtomicFMinFMaxF64GlobalInsts() const
bool hasUnalignedScratchAccessEnabled() const
bool hasAtomicFlatPkAdd16Insts() const
bool hasUnalignedBufferAccessEnabled() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasImageGather4D16Bug() const
bool hasDot10Insts() const
bool supportsMinMaxDenormModes() const
bool hasAtomicFaddInsts() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const
bool hasAtomicBufferPkAddBF16Inst() const
bool hasAtomicFaddNoRtnInsts() const
bool hasFlatBufferGlobalAtomicFaddF64Inst() const
bool hasScalarDwordx3Loads() const
bool hasLDSFPAtomicAddF32() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
bool hasDot8Insts() const
bool hasDS96AndDS128() const
bool useFlatForGlobal() const
Generation getGeneration() const
bool hasAtomicBufferGlobalPkAddF16Insts() const
bool hasScalarAddSub64() const
bool hasUnpackedD16VMem() const
bool hasAtomicGlobalPkAddBF16Inst() const
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
bool hasPackedTID() const
bool hasAddNoCarry() const
bool hasGWSAutoReplay() const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
int64_t getOffset() const
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
BasicBlock::iterator GetInsertPoint() const
BasicBlock * GetInsertBlock() const
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
LLVMContext & getContext() const
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
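The IRBuilder calls listed above are the building blocks the IR-level expansions in this file rely on. A minimal sketch with invented names and simplified logic, showing how CreateCondBr, SetInsertPoint, CreatePHI and addIncoming combine into a branch-and-merge diamond:

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Sketch: load from Ptr only when Cond is true, otherwise use Fallback.
static Value *emitConditionalLoad(IRBuilder<> &B, Value *Cond, Type *Ty,
                                  Value *Ptr, Value *Fallback) {
  BasicBlock *EntryBB = B.GetInsertBlock();
  Function *F = EntryBB->getParent();
  LLVMContext &Ctx = B.getContext();

  BasicBlock *LoadBB = BasicBlock::Create(Ctx, "cond.load", F);
  BasicBlock *JoinBB = BasicBlock::Create(Ctx, "cond.join", F);

  B.CreateCondBr(Cond, LoadBB, JoinBB);

  B.SetInsertPoint(LoadBB);
  LoadInst *LI = B.CreateAlignedLoad(Ty, Ptr, MaybeAlign(4), "loaded");
  B.CreateBr(JoinBB);

  B.SetInsertPoint(JoinBB);
  PHINode *Phi = B.CreatePHI(Ty, 2, "merged");
  Phi->addIncoming(LI, LoadBB);
  Phi->addIncoming(Fallback, EntryBB);
  return Phi;
}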
Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
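A tiny sketch of the low-level type (LLT) helpers above, with arbitrarily chosen widths; the header path shown is the current one and has moved between LLVM releases:

#include "llvm/CodeGenTypes/LowLevelType.h" // older trees: llvm/Support/LowLevelTypeImpl.h
using namespace llvm;

static void lltExamples() {
  LLT S32 = LLT::scalar(32);                    // 32-bit scalar "bag of bits"
  LLT P1 = LLT::pointer(1, 64);                 // 64-bit pointer in addrspace(1)
  LLT V4S16 = LLT::fixed_vector(4, 16);         // <4 x s16>
  LLT V4S32 = V4S16.changeElementSize(32);      // <4 x s32>: same count, wider elements
  unsigned EltBits = S32.getScalarSizeInBits(); // 32
  (void)P1; (void)V4S32; (void)EltBits;
}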
This is an important class for using LLVM in a threaded context.
std::optional< StringRef > getSyncScopeName(SyncScope::ID Id) const
getSyncScopeName - Returns the name of a SyncScope::ID registered with LLVMContext,...
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
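MDBuilder::createRange pairs naturally with Instruction::setMetadata further down. A hedged sketch with invented bounds, attaching !range metadata to a load:

#include "llvm/ADT/APInt.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
using namespace llvm;

// Sketch: annotate LI with the half-open range [0, 1024).
static void attachRange(LoadInst *LI) {
  MDBuilder MDB(LI->getContext());
  MDNode *Range = MDB.createRange(APInt(32, 0), APInt(32, 1024));
  LI->setMetadata(LLVMContext::MD_range, Range);
}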
const MDOperand & getOperand(unsigned I) const
unsigned getNumOperands() const
Return number of MDNode operands.
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the vector has a power-of-2 number of elements.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
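For the MVT queries above, a standalone sketch with arbitrary types (again, the header path differs between LLVM releases):

#include "llvm/CodeGenTypes/MachineValueType.h" // older trees: llvm/Support/MachineValueType.h
using namespace llvm;

static void mvtExamples() {
  MVT V4I32 = MVT::getVectorVT(MVT::i32, 4);       // v4i32
  MVT Elt = V4I32.getScalarType();                 // i32
  unsigned NumElts = V4I32.getVectorNumElements(); // 4
  TypeSize Bytes = V4I32.getStoreSize();           // 16 bytes
  MVT I64 = MVT::getIntegerVT(64);                 // i64
  (void)Elt; (void)NumElts; (void)Bytes; (void)I64;
}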
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into two pieces at SplitInst.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Align getAlignment() const
Return alignment of the basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
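The MachineInstrBuilder accessors above are normally reached through BuildMI. A hedged sketch with a caller-supplied opcode and registers (nothing here is specific to this file):

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;

// Sketch: emit "Dst = Opcode Src, 1" at the end of MBB.
static void emitAddOne(MachineBasicBlock &MBB, const TargetInstrInfo &TII,
                       const DebugLoc &DL, unsigned Opcode, Register Dst,
                       Register Src) {
  BuildMI(MBB, MBB.end(), DL, TII.get(Opcode), Dst)
      .addReg(Src)
      .addImm(1);
}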
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
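Tying the flags above to the MachineFunction::getMachineMemOperand signature listed earlier, a hedged sketch of building an MMO for an invariant, dereferenceable 32-bit load; the empty MachinePointerInfo and the alignment are placeholders:

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGenTypes/LowLevelType.h"
#include "llvm/Support/Alignment.h"
using namespace llvm;

static MachineMemOperand *buildInvariantLoadMMO(MachineFunction &MF) {
  auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
               MachineMemOperand::MOInvariant;
  return MF.getMachineMemOperand(MachinePointerInfo(), Flags,
                                 LLT::scalar(32), Align(4));
}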
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
A Module instance is used to store all the information related to an LLVM module.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool hasWorkGroupIDZ() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the given node can be combined with others to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion using a target-specific method.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns true if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist instruction in then/else to if.
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getRegister(Register Reg, EVT VT)
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
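getNode is the workhorse behind most of the lowering hooks above. A minimal, self-contained sketch (the operation built here is invented) combining getNode and getShiftAmountConstant:

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Sketch: return (X + Y) << 1 in X's value type.
static SDValue emitAddThenShift(SelectionDAG &DAG, const SDLoc &DL,
                                SDValue X, SDValue Y) {
  EVT VT = X.getValueType();
  SDValue Sum = DAG.getNode(ISD::ADD, DL, VT, X, Y);
  SDValue Amt = DAG.getShiftAmountConstant(1, VT, DL);
  return DAG.getNode(ISD::SHL, DL, VT, Sum, Amt);
}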
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
iterator insert(iterator I, T &&Elt)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
StringRef - Represent a constant reference to a string, i.e.
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
constexpr size_t size() const
size - Get the string size.
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
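StringRef and StringSwitch above are what name-based hooks such as getRegisterByName typically use. A hedged sketch with made-up names:

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
using namespace llvm;

// Sketch: map a textual name to a small code; 0 means "unknown".
static unsigned parseCounterName(StringRef Name) {
  if (!Name.starts_with("cnt."))
    return 0;
  return StringSwitch<unsigned>(Name.drop_front(4))
      .Case("loads", 1)
      .Case("stores", 2)
      .Default(0);
}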
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
A Use represents the edge between a Value definition and its users.
User * getUser() const
Returns the User that contains this Use.
unsigned getOperandNo() const
Return the operand # of this use in its User.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< use_iterator > uses()
void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
constexpr bool isZero() const
const ParentTy * getParent() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ BUFFER_ATOMIC_COND_SUB_U32
@ TBUFFER_LOAD_FORMAT_D16
@ TBUFFER_STORE_FORMAT_D16
@ BUFFER_STORE_FORMAT_D16
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the inline values reserved for floating-point constants.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
@ TowardZeroF32_TowardNegativeF64
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
@ C
The default llvm calling convention, compatible with C.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SET_ROUNDING
Set rounding mode.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined; 0 Round to 0; 1 Round to nearest, ties to even; 2 Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
@ SMULO
Same for multiplication.
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that behave the same as FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
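A minimal sketch of the swap rule, using a local enum rather than ISD::CondCode so it stays self-contained: exchanging the operands of a comparison flips the direction of ordered predicates and leaves symmetric ones untouched. This is an illustration only, not LLVM's table.

// Hypothetical miniature of the mapping, for illustration.
enum class Cmp { EQ, LT, GT, LE, GE };
constexpr Cmp swapOperands(Cmp C) {
  switch (C) {
  case Cmp::LT: return Cmp::GT; // (X < Y)  is  (Y > X)
  case Cmp::GT: return Cmp::LT;
  case Cmp::LE: return Cmp::GE; // (X <= Y) is  (Y >= X)
  case Cmp::GE: return Cmp::LE;
  case Cmp::EQ: return Cmp::EQ; // symmetric predicates are unchanged
  }
  return C;
}
static_assert(swapOperands(Cmp::LT) == Cmp::GT, "LT swaps to GT");
static_assert(swapOperands(Cmp::EQ) == Cmp::EQ, "EQ is symmetric");
int main() { return 0; }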
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
Function * getDeclarationIfExists(Module *M, ID id, ArrayRef< Type * > Tys, FunctionType *FT=nullptr)
This version supports overloaded intrinsics.
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ System
Synchronized with respect to all concurrently executing threads.
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
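A compile-time sketch of the formula, assuming 1 <= N <= 64: the largest N-bit signed value is 2^(N-1) - 1 (minIntN, listed further down, is the matching -2^(N-1)). maxIntNSketch is a local name, not the LLVM function.

#include <cstdint>
constexpr int64_t maxIntNSketch(unsigned N) {
  return static_cast<int64_t>((UINT64_C(1) << (N - 1)) - 1);
}
static_assert(maxIntNSketch(8) == 127, "matches INT8_MAX");
static_assert(maxIntNSketch(32) == 2147483647, "matches INT32_MAX");
int main() { return 0; }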
int popcount(T Value) noexcept
Count the number of set bits in a value.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
testing::Matcher< const detail::ErrorHolder & > Failed()
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and congruent to Skew modulo Align.
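A sketch of the arithmetic with the Skew parameter left at its default of zero: integer division truncates, so dividing and re-multiplying rounds Value down to a multiple of Align. alignDownSketch is a local name, not the LLVM helper.

#include <cstdint>
constexpr uint64_t alignDownSketch(uint64_t Value, uint64_t Align) {
  return (Value / Align) * Align; // assumes Align != 0
}
static_assert(alignDownSketch(19, 8) == 16, "rounds down");
static_assert(alignDownSketch(16, 8) == 16, "aligned values are kept");
int main() { return 0; }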
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
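A sketch of the rounding, assuming a nonzero input well below 2^63; the real helper computes the same result with bit arithmetic rather than a loop. powerOf2CeilSketch is a local name.

#include <cstdint>
constexpr uint64_t powerOf2CeilSketch(uint64_t A) {
  uint64_t P = 1;
  while (P < A)
    P <<= 1; // keep doubling until P >= A
  return P;
}
static_assert(powerOf2CeilSketch(5) == 8, "5 rounds up to 8");
static_assert(powerOf2CeilSketch(16) == 16, "powers of two are unchanged");
int main() { return 0; }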
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
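A sketch of one way to express that test, shown here as an illustration of the property rather than the LLVM implementation: OR-ing V with V - 1 fills in the zero bits below the run, and the result must then be one less than a power of two (or all ones).

#include <cstdint>
constexpr bool isShiftedMaskSketch(uint64_t V) {
  return V != 0 && ((V | (V - 1)) & ((V | (V - 1)) + 1)) == 0;
}
static_assert(isShiftedMaskSketch(0x0000FF00u), "one contiguous run");
static_assert(!isShiftedMaskSketch(0x0000F0F0u), "two separate runs");
static_assert(!isShiftedMaskSketch(0), "an empty run is rejected");
int main() { return 0; }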
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant, stopping at the first 1.
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
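Pairing this with Hi_32 above, a trivial compile-time sketch of the split; the names are local to this example, not the LLVM helpers.

#include <cstdint>
constexpr uint32_t hi32Sketch(uint64_t V) { return static_cast<uint32_t>(V >> 32); }
constexpr uint32_t lo32Sketch(uint64_t V) { return static_cast<uint32_t>(V); }
static_assert(hi32Sketch(0x123456789ABCDEF0ull) == 0x12345678u, "upper half");
static_assert(lo32Sketch(0x123456789ABCDEF0ull) == 0x9ABCDEF0u, "lower half");
int main() { return 0; }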
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
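The usual rounding-up trick, assuming the sum does not overflow and the denominator is nonzero; divideCeilSketch is a local name.

#include <cstdint>
constexpr uint64_t divideCeilSketch(uint64_t Num, uint64_t Den) {
  return (Num + Den - 1) / Den;
}
static_assert(divideCeilSketch(10, 4) == 3, "10/4 rounds up to 3");
static_assert(divideCeilSketch(8, 4) == 2, "exact division is unchanged");
int main() { return 0; }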
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
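The same ceiling division applied to alignment, i.e. divideCeil(Size, A) * A, assuming a nonzero alignment; alignToSketch is a local name.

#include <cstdint>
constexpr uint64_t alignToSketch(uint64_t Size, uint64_t Align) {
  return ((Size + Align - 1) / Align) * Align;
}
static_assert(alignToSketch(10, 8) == 16, "rounds up to the next multiple");
static_assert(alignToSketch(16, 8) == 16, "multiples are unchanged");
int main() { return 0; }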
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
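A sketch of the underlying arithmetic, treating both arguments as raw byte counts and sidestepping the Align wrapper type: the alignment provable at an offset from an aligned base is the lowest set bit of (A | Offset), and a zero offset preserves A. commonAlignmentSketch is a local name.

#include <cstdint>
constexpr uint64_t commonAlignmentSketch(uint64_t A, uint64_t Offset) {
  uint64_t Bits = A | Offset;
  return Bits & (~Bits + 1); // isolate the lowest set bit
}
static_assert(commonAlignmentSketch(16, 8) == 8, "offset of 8 limits a 16-byte base");
static_assert(commonAlignmentSketch(16, 0) == 16, "a zero offset preserves A");
int main() { return 0; }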
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
static const fltSemantics & IEEEhalf() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
bool isUnknown() const
Returns true if we don't know any bits.
void resetAll()
Resets the known state of all bits.
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const