39#include "llvm/IR/IntrinsicsAMDGPU.h"
40#include "llvm/IR/IntrinsicsR600.h"
50#define DEBUG_TYPE "si-lower"
56 cl::desc(
"Do not align and prefetch loops"),
60 "amdgpu-use-divergent-register-indexing",
cl::Hidden,
61 cl::desc(
"Use indirect register addressing for divergent indexes"),
75 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
76 for (
unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
78 return AMDGPU::SGPR0 + Reg;
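  // Note: only a fragment of this routine survives here; the loop presumably
  // scans the 32-bit SGPRs and returns the first one that is still free, so a
  // caller can claim a scratch SGPR.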
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32,
                      MVT::v7i32, MVT::v8i32, MVT::v9i32, MVT::v10i32, MVT::v11i32,
                      MVT::v12i32, MVT::v16i32, MVT::i1, MVT::v32i32},

                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32,
                      MVT::v7i32, MVT::v8i32, MVT::v9i32, MVT::v10i32, MVT::v11i32,
                      MVT::v12i32, MVT::v16i32, MVT::i1, MVT::v32i32},

                     {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);

                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32,
                      MVT::v7i32, MVT::v8i32, MVT::v9i32, MVT::v10i32, MVT::v11i32,
                      MVT::v12i32, MVT::v16i32},

                     {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32,
                      MVT::v7f32, MVT::v8f32, MVT::v9f32, MVT::v10f32, MVT::v11f32,
                      MVT::v12f32, MVT::v16f32},

                     {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
                      MVT::v3i16, MVT::v4i16, MVT::Other},

                     {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);

       {MVT::v8i32,  MVT::v8f32,  MVT::v9i32,  MVT::v9f32,  MVT::v10i32,
        MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
        MVT::v16i32, MVT::v16f32, MVT::v2i64,  MVT::v2f64,  MVT::v4i16,
        MVT::v4f16,  MVT::v4bf16, MVT::v3i64,  MVT::v3f64,  MVT::v6i32,
        MVT::v6f32,  MVT::v4i64,  MVT::v4f64,  MVT::v8i64,  MVT::v8f64,
        MVT::v8i16,  MVT::v8f16,  MVT::v8bf16, MVT::v16i16, MVT::v16f16,
        MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
        MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {

  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {

  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {

  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {

  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
                     {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},

                     {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},

                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);

                     {MVT::v5i32, MVT::v5f32, MVT::v6i32,  MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32,  MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},

                     {MVT::f32, MVT::f64}, Legal);

         {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
          MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
          MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {

                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

         {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
          MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {

                       {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);

                       {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
                        MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
                        MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})

    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})

                       {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},

                       {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
                        MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                        MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
                        MVT::v32f16, MVT::v32bf16},

                       {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);

                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

                     {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
                      MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,

                     {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
                      MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
                      MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
                      MVT::i16, MVT::bf16, MVT::i8, MVT::i128},

                     {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
                      MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
                      MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
  static const MCPhysReg RCRegs[] = {AMDGPU::MODE};

                                         EVT DestVT, EVT SrcVT) const {

                                         LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&

      return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);

    return VT.isInteger() ? MVT::i32 : MVT::f32;

      return (NumElts + 1) / 2;

    return NumElts * ((Size + 31) / 32);

    unsigned &NumIntermediates, MVT &RegisterVT) const {

    if (ScalarVT == MVT::bf16) {
      RegisterVT = MVT::i32;
      IntermediateVT = MVT::v2bf16;

      RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
      IntermediateVT = RegisterVT;

    NumIntermediates = (NumElts + 1) / 2;
    return NumIntermediates;

    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts;
    return NumIntermediates;

  if (Size < 16 && Subtarget->has16BitInsts()) {
    RegisterVT = MVT::i16;
    IntermediateVT = ScalarVT;
    NumIntermediates = NumElts;
    return NumIntermediates;

    RegisterVT = MVT::i32;
    IntermediateVT = ScalarVT;
    NumIntermediates = NumElts;
    return NumIntermediates;

    RegisterVT = MVT::i32;
    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts * ((Size + 31) / 32);
    return NumIntermediates;

      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
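  // Rough reading of the register/type breakdown fragments above: 16-bit
  // element vectors get packed two elements per 32-bit register (hence the
  // (NumElts + 1) / 2 intermediate count), while wider scalar elements fall
  // back to one or more i32 registers each, NumElts * ((Size + 31) / 32) in
  // total; anything not handled locally defers to the default breakdown at
  // the end.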
                                          unsigned MaxNumLanes) {
  assert(MaxNumLanes != 0);

  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());

                                            unsigned MaxNumLanes) {
  auto *ST = dyn_cast<StructType>(Ty);

  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));

      DL.getPointerSizeInBits(AS) == 192)

       DL.getPointerSizeInBits(AS) == 160) ||
       DL.getPointerSizeInBits(AS) == 192))

                                          unsigned IntrID) const {

  if (CI.hasMetadata(LLVMContext::MD_invariant_load))

  if (RsrcIntr->IsImage) {

    if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {

      Info.ptrVal = RsrcArg;

    bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
    if (RsrcIntr->IsImage) {
      unsigned MaxNumLanes = 4;

              std::numeric_limits<unsigned>::max());

      if (RsrcIntr->IsImage) {
        unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();

    if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
      Info.memVT = MVT::i32;

  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();

  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {

            std::numeric_limits<unsigned>::max());

  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {

  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.ptrVal = nullptr;

  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {

  case Intrinsic::amdgcn_global_atomic_csub: {

  case Intrinsic::amdgcn_image_bvh_intersect_ray: {

  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_atomic_cond_sub_u32: {

  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64: {

  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {

    Info.memVT = MVT::i32;

    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)

  case Intrinsic::amdgcn_global_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();

  case Intrinsic::amdgcn_ds_bvh_stack_rtn: {

    Info.memVT = MVT::i32;

  case Intrinsic::amdgcn_s_prefetch_data: {

  case Intrinsic::amdgcn_addrspacecast_nonnull: {

    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();
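  // The switch above appears to be the getTgtMemIntrinsic hook: for each
  // memory intrinsic it fills an IntrinsicInfo record (memVT, pointer
  // operand, flags) so the generic DAG machinery can model the access. Only
  // selected case labels survive in this excerpt; most per-case bodies are
  // elided.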
1501 Type *&AccessTy)
const {
1503 switch (
II->getIntrinsicID()) {
1504 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1505 case Intrinsic::amdgcn_ds_append:
1506 case Intrinsic::amdgcn_ds_consume:
1507 case Intrinsic::amdgcn_ds_read_tr4_b64:
1508 case Intrinsic::amdgcn_ds_read_tr6_b96:
1509 case Intrinsic::amdgcn_ds_read_tr8_b64:
1510 case Intrinsic::amdgcn_ds_read_tr16_b64:
1511 case Intrinsic::amdgcn_ds_ordered_add:
1512 case Intrinsic::amdgcn_ds_ordered_swap:
1513 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1514 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1515 case Intrinsic::amdgcn_global_atomic_csub:
1516 case Intrinsic::amdgcn_global_atomic_fmax_num:
1517 case Intrinsic::amdgcn_global_atomic_fmin_num:
1518 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1519 case Intrinsic::amdgcn_global_load_tr_b64:
1520 case Intrinsic::amdgcn_global_load_tr_b128:
1521 Ptr =
II->getArgOperand(0);
1523 case Intrinsic::amdgcn_global_load_lds:
1524 Ptr =
II->getArgOperand(1);
1529 AccessTy =
II->getType();
1535 unsigned AddrSpace)
const {
1547 return AM.
Scale == 0 &&
1549 AM.
BaseOffs, AddrSpace, FlatVariant));
1569 return isLegalMUBUFAddressingMode(AM);
1572bool SITargetLowering::isLegalMUBUFAddressingMode(
const AddrMode &AM)
const {
1583 if (!
TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1595 if (AM.HasBaseReg) {
1627 return isLegalMUBUFAddressingMode(AM);
1634 if (Ty->
isSized() &&
DL.getTypeStoreSize(Ty) < 4)
1684 : isLegalMUBUFAddressingMode(AM);
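// Sketch of the addressing-mode checks above, as far as the fragments show:
// flat/global addressing allows only a base register plus an immediate offset
// (no scale), validated against the target's offset encoding, while buffer
// accesses are routed through isLegalMUBUFAddressingMode, which in turn
// bounds the immediate with TII->isLegalMUBUFImmOffset.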
    unsigned Size, unsigned AddrSpace, Align Alignment,

    Align RequiredAlignment(

        Alignment < RequiredAlignment)

      RequiredAlignment = Align(4);

        *IsFast = (Alignment >= RequiredAlignment) ? 64
                  : (Alignment < Align(4))         ? 32

        *IsFast = (Alignment >= RequiredAlignment) ? 96
                  : (Alignment < Align(4))         ? 32

      RequiredAlignment = Align(8);

        *IsFast = (Alignment >= RequiredAlignment) ? 128
                  : (Alignment < Align(4))         ? 32

      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;

    return Alignment >= RequiredAlignment ||

    bool AlignedBy4 = Alignment >= Align(4);
      *IsFast = AlignedBy4;

    return Alignment >= Align(4) ||

  return Size >= 32 && Alignment >= Align(4);

                                                    unsigned *IsFast) const {
                                            Alignment, Flags, IsFast);

  if (Op.size() >= 16 &&

  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))

  const MemSDNode *MemNode = cast<MemSDNode>(N);

                                           unsigned DestAS) const {
  return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
                                               unsigned Index) const {

  auto [InputPtrReg, RC, ArgTy] =

      Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

                                          const SDLoc &SL) const {

                                          const SDLoc &SL) const {
  std::optional<uint32_t> KnownSize =
  if (KnownSize.has_value())

    Val = getFPExtOrFPRound(DAG, Val, SL, VT);

SDValue SITargetLowering::lowerKernargMemParameter(

  int64_t OffsetDiff = Offset - AlignDownOffset;

    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);

    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);

  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);

      ExtType, SL, VA.getLocVT(), Chain, FIN,
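  // How the sub-dword kernarg path above seems to work (treat this as a
  // sketch reconstructed from the fragments): the parameter offset is rounded
  // down to a 4-byte boundary, a full dword is loaded from the aligned
  // address, and the value is shifted right by OffsetDiff * 8 bits before
  // being converted to the argument type. For example, an i16 argument at
  // kernarg offset 6 loads the dword at offset 4 and shifts it right by 16.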
SDValue SITargetLowering::getPreloadedValue(

    Reg = &WorkGroupIDX;
    RC = &AMDGPU::SReg_32RegClass;

    Reg = &WorkGroupIDY;
    RC = &AMDGPU::SReg_32RegClass;

    Reg = &WorkGroupIDZ;
    RC = &AMDGPU::SReg_32RegClass;

  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {

           "vector type argument should have been split");

      bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);

               "unexpected vector split in ps argument type");

      Info->markPSInputAllocated(PSInputNum);

        Info->markPSInputEnabled(PSInputNum);

  if (Info.hasWorkItemIDX()) {

  if (Info.hasWorkItemIDY()) {
    Info.setWorkItemIDY(
      unsigned Reg = AMDGPU::VGPR1;

  if (Info.hasWorkItemIDZ()) {
    Info.setWorkItemIDZ(
      unsigned Reg = AMDGPU::VGPR2;
  if (RegIdx == ArgVGPRs.size()) {

  unsigned Reg = ArgVGPRs[RegIdx];
  assert(Reg != AMDGPU::NoRegister);

                                     unsigned NumArgRegs) {
  if (RegIdx == ArgSGPRs.size())

  unsigned Reg = ArgSGPRs[RegIdx];
  assert(Reg != AMDGPU::NoRegister);

  assert(Reg != AMDGPU::NoRegister);

  const unsigned Mask = 0x3ff;

  if (Info.hasWorkItemIDX()) {
    Info.setWorkItemIDX(Arg);

  if (Info.hasWorkItemIDY()) {
    Info.setWorkItemIDY(Arg);

  if (Info.hasWorkItemIDZ())

  const unsigned Mask = 0x3ff;

  if (Info.hasImplicitArgPtr())

  if (Info.hasWorkGroupIDX())

  if (Info.hasWorkGroupIDY())

  if (Info.hasWorkGroupIDZ())

  if (Info.hasLDSKernelId())

    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);

    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
  bool InPreloadSequence = true;

  bool AlignedForImplictArgs = false;
  unsigned ImplicitArgOffset = 0;
  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())

    unsigned ArgIdx = Arg.getArgNo();

    if (InIdx < Ins.size() &&
        (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))

    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           Ins[InIdx].getOrigArgIndex() == ArgIdx;
      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];

      unsigned ArgOffset = ArgLoc.getLocMemOffset();

      unsigned NumAllocSGPRs =
          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;

      if (Arg.hasAttribute("amdgpu-hidden-argument")) {
        if (!AlignedForImplictArgs) {
              alignTo(LastExplicitArgOffset,
              LastExplicitArgOffset;
          AlignedForImplictArgs = true;
        ArgOffset += ImplicitArgOffset;

      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        assert(InIdx >= 1 && "No previous SGPR");
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);

      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;

        InPreloadSequence = false;

          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);

          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);

      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {

      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;

  if (Info.hasLDSKernelId()) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
                                           bool IsShader) const {

    assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");

    unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();

    unsigned NumRequiredSystemSGPRs =
        Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
        Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
    for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDY()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDZ()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupInfo()) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    unsigned PrivateSegmentWaveByteOffsetReg;

      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();

      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);

      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);

         Info.getNumPreloadedSGPRs() >= 16);
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);

    HasStackObjects = true;

  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {

      Info.setScratchRSrcReg(PrivateSegmentBufferReg);

      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);

      Info.setScratchRSrcReg(ReservedBufferReg);

    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);

      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
        if (!MRI.isLiveIn(Reg)) {
          Info.setStackPtrOffsetReg(Reg);

  if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)

  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);
2784 const MCPhysReg *IStart =
TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2793 if (AMDGPU::SReg_64RegClass.
contains(*
I))
2794 RC = &AMDGPU::SGPR_64RegClass;
2795 else if (AMDGPU::SReg_32RegClass.
contains(*
I))
2796 RC = &AMDGPU::SGPR_32RegClass;
2802 Entry->addLiveIn(*
I);
2807 for (
auto *Exit : Exits)
2809 TII->get(TargetOpcode::COPY), *
I)
        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());

           !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
           !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());

           !Info->hasWorkGroupIDZ());

    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);

      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))

  } else if (IsKernel) {

    Splits.append(Ins.begin(), Ins.end());

  } else if (!IsGraphics) {

  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {

    if (IsEntryFunc && VA.isMemLoc()) {

      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {

          int64_t OffsetDiff = Offset - AlignDownOffset;

              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];

          NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;

          if (PreloadRegs.size() == 1) {
            Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);

                               TRI->getRegSizeInBits(*RC)));

            for (auto Reg : PreloadRegs) {

                                               PreloadRegs.size()),

          NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

            "hidden argument in kernel signature was not preloaded",

          lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
                                   Alignment, Ins[i].Flags.isSExt(), &Ins[i]);

          dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));

    if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);

      if (AMDGPU::VGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::VGPR_32RegClass;
      else if (AMDGPU::SGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::SGPR_32RegClass;

  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain
  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);

    unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
    for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
      if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))

  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {

    SDValue Arg = OutVals[RealRVLocIdx];

  if (!Info->isEntryFunction()) {

      if (AMDGPU::SReg_64RegClass.contains(*I))

      else if (AMDGPU::SReg_32RegClass.contains(*I))

  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3365 auto &ArgUsageInfo =
3367 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3393 const auto [OutgoingArg, ArgRC, ArgTy] =
3398 const auto [IncomingArg, IncomingArgRC, Ty] =
3400 assert(IncomingArgRC == ArgRC);
3403 EVT ArgVT =
TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3411 InputReg = getImplicitArgPtr(DAG,
DL);
3413 std::optional<uint32_t> Id =
3415 if (Id.has_value()) {
3426 if (OutgoingArg->isRegister()) {
3427 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3428 if (!CCInfo.
AllocateReg(OutgoingArg->getRegister()))
3431 unsigned SpecialArgOffset =
3442 auto [OutgoingArg, ArgRC, Ty] =
3445 std::tie(OutgoingArg, ArgRC, Ty) =
3448 std::tie(OutgoingArg, ArgRC, Ty) =
3463 const bool NeedWorkItemIDX = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-x");
3464 const bool NeedWorkItemIDY = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-y");
3465 const bool NeedWorkItemIDZ = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-z");
3497 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3498 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3509 : IncomingArgY ? *IncomingArgY
3516 if (OutgoingArg->isRegister()) {
3518 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
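  // The code above forwards the caller's special inputs (implicit-arg
  // pointer, dispatch/workgroup IDs, packed workitem IDs) to the callee,
  // either in the register the callee expects or at a dedicated stack offset.
  // The "amdgpu-no-workitem-id-*" attributes let it skip IDs the callee never
  // reads.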
  if (Callee->isDivergent())

  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);

  if (!CallerPreserved)

  bool CCMatch = CallerCC == CalleeCC;

    if (Arg.hasByValAttr())

    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))

  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

  for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {

    if (!CCVA.isRegLoc())

    if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
          dbgs() << "Cannot tail call due to divergent outgoing argument in "

  if (IsChainCallConv) {

    RequestedExec = CLI.Args.back();
    assert(RequestedExec.Node && "No node for EXEC");

      assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
      CLI.Outs.pop_back();

      assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
      CLI.Outs.pop_back();

           "Haven't popped all the pieces of the EXEC mask");

  bool IsSibCall = false;

                              "unsupported call to variadic function ");

                              "unsupported required tail call to function ");

                                                  Outs, OutVals, Ins, DAG);

                 "site marked musttail or on llvm.amdgcn.cs.chain");
  if (!TailCallOpt && IsTailCall)

  if (!IsSibCall || IsChainCallConv) {

    RegsToPass.emplace_back(IsChainCallConv
                                ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                                : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,

  const unsigned NumSpecialInputs = RegsToPass.size();

  MVT PtrVT = MVT::i32;

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {

      RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));

      int32_t Offset = LocMemOffset;

      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()

                              ? Flags.getNonZeroByValAlign()

      if (Outs[i].Flags.isByVal()) {
            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);

            Outs[i].Flags.getNonZeroByValAlign(),
            nullptr, std::nullopt, DstInfo,

            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);

  if (!MemOpChains.empty())

  unsigned ArgIdx = 0;
  for (auto [Reg, Val] : RegsToPass) {
    if (ArgIdx++ >= NumSpecialInputs &&
        (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {

  if (IsTailCall && !IsSibCall) {

  std::vector<SDValue> Ops({Chain});

    Ops.push_back(Callee);

    Ops.push_back(Callee);

  if (IsChainCallConv)
    Ops.push_back(RequestedExec.Node);

  for (auto &[Reg, Val] : RegsToPass)

  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");

                                 MVT::Glue, GlueOps),

    Ops.push_back(InGlue);

    return DAG.getNode(OPC, DL, MVT::Other, Ops);

  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);

  uint64_t CalleePopBytes = NumBytes;
  EVT VT = Op.getValueType();

  Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();

         "Stack grows upwards for AMDGPU");

  Chain = BaseAddr.getValue(1);

  if (Alignment > StackAlign) {
    uint64_t StackAlignMask = ScaledAlignment - 1;

  assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");

  if (isa<ConstantSDNode>(Size)) {

  if (Op.getValueType() != MVT::i32)

  assert(Op.getValueType() == MVT::i32);

                              Op.getOperand(0), IntrinID, GetRoundBothImm);

  SDValue RoundModeTimesNumBits =

                                          TableEntry, EnumOffset);

  if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
        static_cast<uint32_t>(ConstMode->getZExtValue()),

    if (UseReducedTable) {

      SDValue RoundModeTimesNumBits =

      SDValue RoundModeTimesNumBits =

      NewMode = TruncTable;

                            ReadFirstLaneID, NewMode);

                            IntrinID, RoundBothImm, NewMode);

  if (Op->isDivergent())

  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = Src.getValueType();

  EVT DstVT = Op.getValueType();

  if (Op.getValueType() != MVT::i64)

                              Op.getOperand(0), IntrinID, ModeHwRegImm);
                              Op.getOperand(0), IntrinID, TrapHwRegImm);

  if (Op.getOperand(1).getValueType() != MVT::i64)

                                   ReadFirstLaneID, NewModeReg);
                                   ReadFirstLaneID, NewTrapReg);

  unsigned ModeHwReg =
  unsigned TrapHwReg =

                                  IntrinID, ModeHwRegImm, NewModeReg);
                                  IntrinID, TrapHwRegImm, NewTrapReg);
          .Case("m0", AMDGPU::M0)
          .Case("exec", AMDGPU::EXEC)
          .Case("exec_lo", AMDGPU::EXEC_LO)
          .Case("exec_hi", AMDGPU::EXEC_HI)
          .Case("flat_scratch", AMDGPU::FLAT_SCR)
          .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
          .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)

  if (Reg == AMDGPU::NoRegister) {
                              "\" for subtarget."));

  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:

  case AMDGPU::FLAT_SCR:

  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
static std::pair<MachineBasicBlock *, MachineBasicBlock *>

  auto Next = std::next(I);

  return std::pair(LoopBB, RemainderBB);

  auto I = MI.getIterator();
  auto E = std::next(I);

    Src->setIsKill(false);

  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))

  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)

                               unsigned InitReg, unsigned ResultReg,
                               unsigned PhiReg, unsigned InitSaveExecReg,
                               int Offset, bool UseGPRIdxMode,

  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
  Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)

          TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
                                 : AMDGPU::S_AND_SAVEEXEC_B64),

  MRI.setSimpleHint(NewExec, CondReg);

  if (UseGPRIdxMode) {
      SGPRIdxReg = CurrentIdxReg;

      SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)

      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)

      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)

  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
          TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
                                 : AMDGPU::S_XOR_B64_term),

                                      unsigned InitResultReg, unsigned PhiReg,
                                      int Offset, bool UseGPRIdxMode,
                                      Register &SGPRIdxReg) {

  const auto *BoolXExecRC = TRI->getWaveMaskRegClass();

  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

                                  InitResultReg, DstReg, PhiReg, TmpExec,
                                  Offset, UseGPRIdxMode, SGPRIdxReg);

  LoopBB->removeSuccessor(RemainderBB);

  LoopBB->addSuccessor(LandingPad);
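// Sketch of the loop built above -- the usual "waterfall" pattern for a
// divergent vector index: V_READFIRSTLANE_B32 picks the index of one active
// lane, V_CMP_EQ_U32 plus S_AND_SAVEEXEC restrict EXEC to every lane sharing
// that index, the indexed access runs once for that group, and the S_XOR_term
// at the bottom clears those lanes from the loop mask until all lanes have
// been serviced.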
static std::pair<unsigned, int>

  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;

    return std::pair(AMDGPU::sub0, Offset);

  assert(Idx->getReg() != AMDGPU::NoRegister);

      return Idx->getReg();

    Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {

    if (UseGPRIdxMode) {

          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);

    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)

    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)

  MI.eraseFromParent();

  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (Idx->getReg() == AMDGPU::NoRegister) {

    MI.eraseFromParent();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {

    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);

      const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
          TRI.getRegSizeInBits(*VecRC), 32, false);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(VecRC);

                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);

    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)

    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
        TRI.getRegSizeInBits(*VecRC), 32, false);
    BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)

  MI.eraseFromParent();
  bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));

    Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
    Register InitalValReg = MRI.createVirtualRegister(DstRegClass);

    Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
    Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
    Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);

    Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
    Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);

    bool IsWave32 = ST.isWave32();
    unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

        (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;

    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)

    I = ComputeLoop->end();

        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)

        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
            .addReg(TmpSReg->getOperand(0).getReg())

    unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
    auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
                   .addReg(ActiveBits->getOperand(0).getReg());
    auto LaneValue = BuildMI(*ComputeLoop, I, DL,
                             TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
                         .addReg(FF1->getOperand(0).getReg());
    auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
                              .addReg(LaneValue->getOperand(0).getReg());

    unsigned BITSETOpc =
        IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
    auto NewActiveBits =
        BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
            .addReg(FF1->getOperand(0).getReg())
            .addReg(ActiveBits->getOperand(0).getReg());

    Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
        .addMBB(ComputeLoop);
    ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
        .addMBB(ComputeLoop);

    unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
        .addReg(NewActiveBits->getOperand(0).getReg())
    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))

  MI.eraseFromParent();
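  // Reading of the reduction loop above: starting from the identity value
  // (UINT32_MAX for unsigned min, 0 for unsigned max), each iteration uses
  // S_FF1 to find the lowest still-active lane, V_READLANE_B32 to fetch that
  // lane's value, folds it into the accumulator with the scalar min/max
  // opcode, and clears that lane's bit with S_BITSET0 until the active mask
  // is empty.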
  switch (MI.getOpcode()) {
  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:

  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:

  case AMDGPU::S_UADDO_PSEUDO:
  case AMDGPU::S_USUBO_PSEUDO: {

    unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
                       ? AMDGPU::S_ADD_I32
                       : AMDGPU::S_SUB_I32;

    MI.eraseFromParent();

  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO: {

    bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);

      unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

    MI.eraseFromParent();

  case AMDGPU::V_ADD_U64_PSEUDO:
  case AMDGPU::V_SUB_U64_PSEUDO: {

    bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);

    if (IsAdd && ST.hasLshlAddB64()) {
      TII->legalizeOperands(*Add);
      MI.eraseFromParent();

    const auto *CarryRC = TRI->getWaveMaskRegClass();

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    Register CarryReg = MRI.createVirtualRegister(CarryRC);
    Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);

                                     : &AMDGPU::VReg_64RegClass;
                                     : &AMDGPU::VReg_64RegClass;

        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);

        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);

        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);

        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;

    unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;

    TII->legalizeOperands(*LoHalf);
    TII->legalizeOperands(*HiHalf);
    MI.eraseFromParent();
  case AMDGPU::S_ADD_CO_PSEUDO:
  case AMDGPU::S_SUB_CO_PSEUDO: {

    unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
                       ? AMDGPU::S_ADDC_U32
                       : AMDGPU::S_SUBB_U32;

      Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)

      Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)

    Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)

    unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
    assert(WaveSize == 64 || WaveSize == 32);

    if (WaveSize == 64) {
      if (ST.hasScalarCompareEq64()) {

            TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);

            MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
            MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
        Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)

        (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;

    MI.eraseFromParent();

  case AMDGPU::SI_INIT_M0: {
            TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .add(MI.getOperand(0));
    MI.eraseFromParent();

  case AMDGPU::GET_GROUPSTATICSIZE: {
        .add(MI.getOperand(0))
    MI.eraseFromParent();

  case AMDGPU::GET_SHADERCYCLESHILO: {

    using namespace AMDGPU::Hwreg;
    Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
    Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));

    Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        .add(MI.getOperand(0))

    MI.eraseFromParent();
  case AMDGPU::SI_INDIRECT_SRC_V1:
  case AMDGPU::SI_INDIRECT_SRC_V2:
  case AMDGPU::SI_INDIRECT_SRC_V4:
  case AMDGPU::SI_INDIRECT_SRC_V8:
  case AMDGPU::SI_INDIRECT_SRC_V9:
  case AMDGPU::SI_INDIRECT_SRC_V10:
  case AMDGPU::SI_INDIRECT_SRC_V11:
  case AMDGPU::SI_INDIRECT_SRC_V12:
  case AMDGPU::SI_INDIRECT_SRC_V16:
  case AMDGPU::SI_INDIRECT_SRC_V32:

  case AMDGPU::SI_INDIRECT_DST_V1:
  case AMDGPU::SI_INDIRECT_DST_V2:
  case AMDGPU::SI_INDIRECT_DST_V4:
  case AMDGPU::SI_INDIRECT_DST_V8:
  case AMDGPU::SI_INDIRECT_DST_V9:
  case AMDGPU::SI_INDIRECT_DST_V10:
  case AMDGPU::SI_INDIRECT_DST_V11:
  case AMDGPU::SI_INDIRECT_DST_V12:
  case AMDGPU::SI_INDIRECT_DST_V16:
  case AMDGPU::SI_INDIRECT_DST_V32:

  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
  case AMDGPU::SI_KILL_I1_PSEUDO:

  case AMDGPU::V_CNDMASK_B64_PSEUDO: {

    Register SrcCond = MI.getOperand(3).getReg();

    Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    const auto *CondRC = TRI->getWaveMaskRegClass();
    Register SrcCondCopy = MRI.createVirtualRegister(CondRC);

                                     : &AMDGPU::VReg_64RegClass;
                                     : &AMDGPU::VReg_64RegClass;

        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);

        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);

        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);

    MI.eraseFromParent();

  case AMDGPU::SI_BR_UNDEF: {
        .add(MI.getOperand(0));
    MI.eraseFromParent();

  case AMDGPU::ADJCALLSTACKUP:
  case AMDGPU::ADJCALLSTACKDOWN: {

  case AMDGPU::SI_CALL_ISEL: {

    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);

    MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);

    MI.eraseFromParent();

  case AMDGPU::V_ADD_CO_U32_e32:
  case AMDGPU::V_SUB_CO_U32_e32:
  case AMDGPU::V_SUBREV_CO_U32_e32: {

    unsigned Opc = MI.getOpcode();

    bool NeedClampOperand = false;
    if (TII->pseudoToMCOpcode(Opc) == -1) {
      NeedClampOperand = true;

    if (TII->isVOP3(*I)) {

    I.add(MI.getOperand(1)).add(MI.getOperand(2));
    if (NeedClampOperand)

    TII->legalizeOperands(*I);

    MI.eraseFromParent();

  case AMDGPU::V_ADDC_U32_e32:
  case AMDGPU::V_SUBB_U32_e32:
  case AMDGPU::V_SUBBREV_U32_e32:
    TII->legalizeOperands(MI);

  case AMDGPU::DS_GWS_INIT:
  case AMDGPU::DS_GWS_SEMA_BR:
  case AMDGPU::DS_GWS_BARRIER:
    TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);

  case AMDGPU::DS_GWS_SEMA_V:
  case AMDGPU::DS_GWS_SEMA_P:
  case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:

  case AMDGPU::S_SETREG_B32: {

    const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
    const unsigned SetMask = WidthMask << Offset;

    unsigned SetDenormOp = 0;
    unsigned SetRoundOp = 0;

      SetRoundOp = AMDGPU::S_ROUND_MODE;
      SetDenormOp = AMDGPU::S_DENORM_MODE;

      SetRoundOp = AMDGPU::S_ROUND_MODE;

      SetDenormOp = AMDGPU::S_DENORM_MODE;

    if (SetRoundOp || SetDenormOp) {

      if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
        unsigned ImmVal = Def->getOperand(1).getImm();

          MI.eraseFromParent();

    MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));

  case AMDGPU::S_INVERSE_BALLOT_U32:
  case AMDGPU::S_INVERSE_BALLOT_U64:
    MI.setDesc(TII->get(AMDGPU::COPY));

  case AMDGPU::ENDPGM_TRAP: {
      MI.setDesc(TII->get(AMDGPU::S_ENDPGM));

    MI.eraseFromParent();

  case AMDGPU::SIMULATED_TRAP: {
    TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
    MI.eraseFromParent();
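  // The switch above is the post-ISel expansion hook
  // (EmitInstrWithCustomInserter): pseudos such as the scalar/vector 64-bit
  // add/sub pseudos, V_CNDMASK_B64_PSEUDO and SI_CALL_ISEL are rewritten into
  // real MachineInstrs here because they need extra virtual registers, carry
  // bits, or control flow that cannot be expressed during DAG selection.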
  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;

  EVT VT = N->getValueType(0);

  if (VT == MVT::f16) {

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
         VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
         VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);

      DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
      DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
         VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
         VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
         VT == MVT::v32bf16);

          : std::pair(Op0, Op0);

      DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
      DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
  switch (Op.getOpcode()) {

    return LowerBRCOND(Op, DAG);

    return LowerRETURNADDR(Op, DAG);

    assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");

    EVT VT = Op.getValueType();

      return lowerFSQRTF32(Op, DAG);

      return lowerFSQRTF64(Op, DAG);

    return LowerTrig(Op, DAG);

    return LowerSELECT(Op, DAG);

    return LowerFDIV(Op, DAG);

    return LowerFFREXP(Op, DAG);

    return LowerATOMIC_CMP_SWAP(Op, DAG);

    return LowerSTORE(Op, DAG);

    return LowerGlobalAddress(MFI, Op, DAG);

    return LowerINTRINSIC_WO_CHAIN(Op, DAG);

    return LowerINTRINSIC_W_CHAIN(Op, DAG);

    return LowerINTRINSIC_VOID(Op, DAG);

    return lowerADDRSPACECAST(Op, DAG);

    return lowerINSERT_SUBVECTOR(Op, DAG);

    return lowerINSERT_VECTOR_ELT(Op, DAG);

    return lowerEXTRACT_VECTOR_ELT(Op, DAG);

    return lowerVECTOR_SHUFFLE(Op, DAG);

    return lowerSCALAR_TO_VECTOR(Op, DAG);

    return lowerBUILD_VECTOR(Op, DAG);

    return lowerFP_ROUND(Op, DAG);

    return lowerTRAP(Op, DAG);

    return lowerDEBUGTRAP(Op, DAG);

    return lowerFMINNUM_FMAXNUM(Op, DAG);

    return lowerFLDEXP(Op, DAG);

    return lowerMUL(Op, DAG);

    return lowerXMULO(Op, DAG);

    return lowerXMUL_LOHI(Op, DAG);
  EVT FittingLoadVT = LoadVT;

SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
                                              bool IsIntrinsic) const {

  EVT LoadVT = M->getValueType(0);

  EVT EquivLoadVT = LoadVT;

      M->getMemoryVT(), M->getMemOperand());

  EVT LoadVT = M->getValueType(0);

  assert(M->getNumValues() == 2 || M->getNumValues() == 3);
  bool IsTFE = M->getNumValues() == 3;

    return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),

    return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
                               M->getMemOperand(), DAG);

  SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
                                        M->getMemOperand(), DAG);

  EVT VT = N->getValueType(0);
  unsigned CondCode = N->getConstantOperandVal(3);

  EVT CmpVT = LHS.getValueType();
  if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
    unsigned PromoteOp =

  EVT VT = N->getValueType(0);

  unsigned CondCode = N->getConstantOperandVal(3);

  if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {

  EVT VT = N->getValueType(0);

                    Src.getOperand(1), Src.getOperand(2));

    Exec = AMDGPU::EXEC_LO;

    Exec = AMDGPU::EXEC;

  EVT VT = N->getValueType(0);

  unsigned IID = N->getConstantOperandVal(0);
  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;
  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;

  unsigned SplitSize = 32;
  if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
      ST->hasDPALU_DPP() &&

    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16:
    case Intrinsic::amdgcn_update_dpp:

    case Intrinsic::amdgcn_writelane:

    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_set_inactive:
    case Intrinsic::amdgcn_set_inactive_chain_arg:
    case Intrinsic::amdgcn_mov_dpp8:

    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_permlane64:

  if (SDNode *GL = N->getGluedNode()) {
    GL = GL->getOperand(0).getNode();

  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IID == Intrinsic::amdgcn_mov_dpp8 ||
      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    Src1 = N->getOperand(2);
    if (IID == Intrinsic::amdgcn_writelane ||
        IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
      Src2 = N->getOperand(3);
  if (ValSize == SplitSize) {

    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {

    if (IID == Intrinsic::amdgcn_writelane) {

    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
    return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;

  if (ValSize % SplitSize != 0)

    EVT VT = N->getValueType(0);

    unsigned NumOperands = N->getNumOperands();

    SDNode *GL = N->getGluedNode();

    for (unsigned i = 0; i != NE; ++i) {
      for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
        SDValue Operand = N->getOperand(j);

  if (SplitSize == 32) {
    return unrollLaneOp(LaneOp.getNode());

  unsigned SubVecNumElt =

  SDValue Src0SubVec, Src1SubVec, Src2SubVec;
  for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {

    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||

    if (IID == Intrinsic::amdgcn_writelane)

        IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
            ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
            : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
    EltIdx += SubVecNumElt;

  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)

  if (IID == Intrinsic::amdgcn_writelane)

  SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
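  // What the splitting above accomplishes: readlane/writelane/permlane-style
  // intrinsics only operate on 32-bit (or, with DPALU_DPP, 64-bit) pieces, so
  // a wider value is bitcast to a vector of SplitSize-bit elements, the lane
  // operation is emitted per sub-vector, and the pieces are reassembled and
  // bitcast back to the original type.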
  switch (N->getOpcode()) {

    unsigned IID = N->getConstantOperandVal(0);

    case Intrinsic::amdgcn_make_buffer_rsrc:
      Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));

    case Intrinsic::amdgcn_cvt_pkrtz: {

    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16: {

      if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)

      else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)

      else if (IID == Intrinsic::amdgcn_cvt_pk_i16)

      EVT VT = N->getValueType(0);

    case Intrinsic::amdgcn_s_buffer_load: {

      EVT VT = Op.getValueType();
      assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");

      if (!Offset->isDivergent()) {

      LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);

      for (unsigned I = 0; I < Res.getNumOperands(); I++) {
        Results.push_back(Res.getOperand(I));

      Results.push_back(Res.getValue(1));

    EVT VT = N->getValueType(0);

    EVT SelectVT = NewVT;
    if (NewVT.bitsLT(MVT::i32)) {
      SelectVT = MVT::i32;

    if (NewVT != SelectVT)

    if (N->getValueType(0) != MVT::v2f16)

    if (N->getValueType(0) != MVT::v2f16)

    if (N->getValueType(0) != MVT::f16)

    if (U.get() != Value)

    if (U.getUser()->getOpcode() == Opcode)

unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
  switch (Intr->getConstantOperandVal(1)) {
  case Intrinsic::amdgcn_if:

  case Intrinsic::amdgcn_else:

  case Intrinsic::amdgcn_loop:

  case Intrinsic::amdgcn_end_cf:

  SDNode *Intr = BRCOND.getOperand(1).getNode();

  assert(BR && "brcond missing unconditional branch user");
  Target = BR->getOperand(1);

  unsigned CFNode = isCFIntrinsic(Intr);

  Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());

  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {

                  Intr->getOperand(0));

  MVT VT = Op.getSimpleValueType();

  if (Op.getConstantOperandVal(0) != 0)

  if (Info->isEntryFunction())
  return Op.getValueType().bitsLE(VT)

  assert(Op.getValueType() == MVT::f16 &&
         "Do not know how to custom lower FP_ROUND for non-f16 type");

  EVT SrcVT = Src.getValueType();
  if (SrcVT != MVT::f64)

  EVT VT = Op.getValueType();

  bool IsIEEEMode = Info->getMode().IEEE;

  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||

  EVT VT = Op.getValueType();

  EVT ExpVT = Exp.getValueType();
  if (ExpVT == MVT::i16)

                     {Op.getOperand(0), Op.getOperand(1), TruncExp});

  switch (Op->getOpcode()) {

                                              DAGCombinerInfo &DCI) const {
  const unsigned Opc = Op.getOpcode();

          : Op->getOperand(0).getValueType();

  if (DCI.isBeforeLegalizeOps() ||

  auto &DAG = DCI.DAG;

    LHS = Op->getOperand(1);
    RHS = Op->getOperand(2);

    LHS = Op->getOperand(0);
    RHS = Op->getOperand(1);

  EVT VT = Op.getValueType();

  assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");

  if (Op->isDivergent())

  if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
    return SDValue(
        DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);

  if (Op0SignBits >= 33 && Op1SignBits >= 33)
    return SDValue(
        DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);

  EVT VT = Op.getValueType();

    const APInt &C = RHSC->getAPIntValue();

    if (C.isPowerOf2()) {

      bool UseArithShift = isSigned && !C.isMinSignedValue();

  if (Op->isDivergent()) {

    return lowerTrapEndpgm(Op, DAG);

             : lowerTrapHsaQueuePtr(Op, DAG);

SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
                                             ImplicitParameter Param) const {

        loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);

  if (UserSGPR == AMDGPU::NoRegister) {

                                   "debugtrap handler not supported",
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,

                             ? AMDGPU::SRC_SHARED_BASE
                             : AMDGPU::SRC_PRIVATE_BASE;

        {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));

    return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);

  if (UserSGPR == AMDGPU::NoRegister) {

  if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
      isa<BasicBlockSDNode>(Val))

  if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
    return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);

  unsigned DestAS, SrcAS;

  bool IsNonNull = false;
  if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
    SrcAS = ASC->getSrcAddressSpace();
    Src = ASC->getOperand(0);
    DestAS = ASC->getDestAddressSpace();

           Op.getConstantOperandVal(0) ==
               Intrinsic::amdgcn_addrspacecast_nonnull);
    Src = Op->getOperand(1);
    SrcAS = Op->getConstantOperandVal(2);
    DestAS = Op->getConstantOperandVal(3);

    unsigned NullVal = TM.getNullPointerValue(DestAS);

    SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);

    unsigned NullVal = TM.getNullPointerValue(SrcAS);

      Op.getValueType() == MVT::i64) {

      Src.getValueType() == MVT::i64)

  EVT InsVT = Ins.getValueType();

  unsigned IdxVal = Idx->getAsZExtVal();

    assert(InsNumElts % 2 == 0 && "expect legal vector types");

    EVT NewInsVT = InsNumElts == 2 ? MVT::i32
                                     MVT::i32, InsNumElts / 2);

    for (unsigned I = 0; I != InsNumElts / 2; ++I) {

      if (InsNumElts == 2) {

  for (unsigned I = 0; I != InsNumElts; ++I) {

  auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
  if (NumElts == 4 && EltSize == 16 && KIdx) {

    unsigned Idx = KIdx->getZExtValue();
    bool InsertLo = Idx < 2;

  if (isa<ConstantSDNode>(Idx))

  assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");

  const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
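  // The EltMask above feeds what looks like the standard bitwise-insert trick
  // for a dynamic index into a <= 64-bit vector (the surrounding code is
  // mostly elided here): shift the element mask to Idx * EltSize bits and
  // form (Vec & ~ShiftedMask) | (Val << Offset) instead of going through
  // memory.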
  EVT ResultVT = Op.getValueType();

  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))

  if (VecSize == 128 || VecSize == 256 || VecSize == 512) {

    if (VecSize == 128) {

    } else if (VecSize == 256) {
      for (unsigned P = 0; P < 4; ++P) {

                                Parts[0], Parts[1]));
                                Parts[2], Parts[3]));

      for (unsigned P = 0; P < 8; ++P) {

                                Parts[0], Parts[1], Parts[2], Parts[3]));
                                Parts[4], Parts[5], Parts[6], Parts[7]));

    EVT IdxVT = Idx.getValueType();

    Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);

  if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {

  return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);

  EVT ResultVT = Op.getValueType();

  int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();

      int VecIdx = Idx < SrcNumElts ? 0 : 1;
      int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;

      int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
      int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
      int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
      int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;

  EVT ResultVT = Op.getValueType();

  EVT VT = Op.getValueType();

  if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {

    for (unsigned P = 0; P < NumParts; ++P) {
          PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});

  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");

  EVT PtrVT = Op.getValueType();

  assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");

  SDValue Param = lowerKernargMemParameter(

                                        "non-hsa intrinsic with hsa target",

                                        "intrinsic not supported on subtarget",

  unsigned NumElts = Elts.size();

  if (NumElts <= 12) {

  for (unsigned i = 0; i < Elts.size(); ++i) {

  for (unsigned i = Elts.size(); i < NumElts; ++i)
    VecElts[i] = DAG.getUNDEF(MVT::f32);
7915 EVT SrcVT = Src.getValueType();
7936                                  bool Unpacked, bool IsD16, int DMaskPop,
7937                                  int NumVDataDwords, bool IsAtomicPacked16Bit,
7941   EVT ReqRetVT = ResultTypes[0];
7943   int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
7944                           ? (ReqRetNumElts + 1) / 2
7947   int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
7958   if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
7969   if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
7971                       NumDataDwords - MaskPopDwords);
7976   EVT LegalReqRetVT = ReqRetVT;
7978   if (!Data.getValueType().isInteger())
7980                        Data.getValueType().changeTypeToInteger(), Data);
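  // The raw image result arrives as a vector of dwords.  With packed D16 data
  // two 16-bit elements share one dword, hence the (ReqRetNumElts + 1) / 2
  // count; the code above trims the dmask-populated portion and then converts
  // back to the type the intrinsic actually asked for.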
8001 if (Result->getNumValues() == 1)
8008                          SDValue *LWE, bool &IsTexFail) {
8009   auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
8028                                       unsigned DimIdx, unsigned EndIdx,
8029                                       unsigned NumGradients) {
8031   for (unsigned I = DimIdx; I < EndIdx; I++) {
8039     if (((I + 1) >= EndIdx) ||
8040         ((NumGradients / 2) % 2 == 1 &&
8041          (I == DimIdx + (NumGradients / 2) - 1 || I == DimIdx + NumGradients - 1))) {
8042       if (Addr.getValueType() != MVT::i16)
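      // A 16-bit operand with no partner (the final coordinate, or the end of
      // an odd-length gradient group) still occupies a full dword; it is
      // packed with an undefined upper half rather than with a neighbour.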
8063   unsigned IntrOpcode = Intr->BaseOpcode;
8074   int NumVDataDwords = 0;
8075   bool AdjustRetType = false;
8076   bool IsAtomicPacked16Bit = false;
8079   const unsigned ArgOffset = WithChain ? 2 : 1;
8082   unsigned DMaskLanes = 0;
8084   if (BaseOpcode->Atomic) {
8085     VData = Op.getOperand(2);
8087     IsAtomicPacked16Bit =
8088         (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
8089          Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
8092     if (BaseOpcode->AtomicX2) {
8099       ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
8100       DMask = Is64Bit ? 0xf : 0x3;
8101       NumVDataDwords = Is64Bit ? 4 : 2;
8103       DMask = Is64Bit ? 0x3 : 0x1;
8104       NumVDataDwords = Is64Bit ? 2 : 1;
8107     DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
8110   if (BaseOpcode->Store) {
8111     VData = Op.getOperand(2);
8119       VData = handleD16VData(VData, DAG, true);
8123   } else if (!BaseOpcode->NoReturn) {
8136         (!LoadVT.isVector() && DMaskLanes > 1))
8144       NumVDataDwords = (DMaskLanes + 1) / 2;
8146       NumVDataDwords = DMaskLanes;
8148     AdjustRetType = true;
8152   unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
8157       Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
8159 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8160 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8162   VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
8164   MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8165   IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8168   for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
8169     if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
8170       assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
8175           {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
8179 "Bias needs to be converted to 16 bit in A16 mode");
8184   if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
8188         dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
8189                    "require 16 bit args for both gradients and addresses");
8194     if (!ST->hasA16()) {
8195       LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
8196                            "support 16 bit addresses\n");
8206   if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
8210     IntrOpcode = G16MappingInfo->G16;
8218                                ArgOffset + Intr->GradientStart,
8219                                ArgOffset + Intr->CoordStart, Intr->NumGradients);
8221     for (unsigned I = ArgOffset + Intr->GradientStart;
8222          I < ArgOffset + Intr->CoordStart; I++)
8229                                ArgOffset + Intr->CoordStart, VAddrEnd,
8233     for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
8251   const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
8252   const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
8253   const bool UseNSA = ST->hasNSAEncoding() &&
8254                       VAddrs.size() >= ST->getNSAThreshold(MF) &&
8255                       (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
8256   const bool UsePartialNSA =
8257       UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
8260   if (UsePartialNSA) {
8262                 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
8263   } else if (!UseNSA) {
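  // NSA (non-sequential address) encoding lets each address operand live in an
  // independently named VGPR instead of one contiguous tuple.  With only
  // partial NSA support the first NSAMaxSize - 1 addresses stay separate and
  // the remainder are packed into a single trailing register tuple; with no
  // NSA at all, every address is packed into one tuple.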
8270   if (!BaseOpcode->Sampler) {
8274         Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
8276     Unorm = UnormConst ? True : False;
8281   SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
8282   bool IsTexFail = false;
8283   if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
8294       NumVDataDwords += 1;
8295     AdjustRetType = true;
8300 if (AdjustRetType) {
8303 if (DMaskLanes == 0 && !BaseOpcode->Store) {
8306       if (isa<MemSDNode>(Op))
8312                                 MVT::i32, NumVDataDwords)
8315     ResultTypes[0] = NewVT;
8316     if (ResultTypes.size() == 3) {
8320       ResultTypes.erase(&ResultTypes[1]);
8324   unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
8325 if (BaseOpcode->Atomic)
8332 if (BaseOpcode->Store || BaseOpcode->Atomic)
8334 if (UsePartialNSA) {
8343 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
8346 if (BaseOpcode->Sampler) {
8355 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8359 ST->hasFeature(AMDGPU::FeatureR128A16)
8369 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8373 if (BaseOpcode->HasD16)
8375   if (isa<MemSDNode>(Op))
8378   int NumVAddrDwords =
8384                                    NumVDataDwords, NumVAddrDwords);
8385   } else if (IsGFX11Plus) {
8387                                    UseNSA ? AMDGPU::MIMGEncGfx11NSA
8388                                           : AMDGPU::MIMGEncGfx11Default,
8389                                    NumVDataDwords, NumVAddrDwords);
8390   } else if (IsGFX10Plus) {
8392                                    UseNSA ? AMDGPU::MIMGEncGfx10NSA
8393                                           : AMDGPU::MIMGEncGfx10Default,
8394                                    NumVDataDwords, NumVAddrDwords);
8398                                    NumVDataDwords, NumVAddrDwords);
8401                       "requested image instruction is not supported on this GPU");
8406                                      NumVDataDwords, NumVAddrDwords);
8409                                      NumVDataDwords, NumVAddrDwords);
8415   if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
8420   if (BaseOpcode->AtomicX2) {
8425   if (BaseOpcode->NoReturn)
8429                            NumVDataDwords, IsAtomicPacked16Bit, DL);
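  // The final machine opcode is looked up per generation (GFX12, GFX11 and
  // GFX10 NSA/default encodings, then older encodings) from the MIMG tables,
  // keyed by the base opcode plus the data and address dword counts above.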
8447   if (!Offset->isDivergent()) {
8492     return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
8496   unsigned NumLoads = 1;
8502   if (NumElts == 8 || NumElts == 16) {
8503     NumLoads = NumElts / 4;
8511   setBufferOffsets(Offset, DAG, &Ops[3],
8512                    NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
8515   for (unsigned i = 0; i < NumLoads; ++i) {
8521 if (NumElts == 8 || NumElts == 16)
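  // Wide s_buffer_load results (8 or 16 elements) are split into several
  // 4-dword loads; a uniform offset keeps the scalar form, while a divergent
  // offset falls back to the VMEM buffer-load path above.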
8568   EVT VT = Op.getValueType();
8570   unsigned IntrinsicID = Op.getConstantOperandVal(0);
8574   switch (IntrinsicID) {
8575   case Intrinsic::amdgcn_implicit_buffer_ptr: {
8578     return getPreloadedValue(DAG, *MFI, VT,
8581   case Intrinsic::amdgcn_dispatch_ptr:
8582   case Intrinsic::amdgcn_queue_ptr: {
8585           MF.getFunction(), "unsupported hsa intrinsic without hsa target",
8591     auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
8594     return getPreloadedValue(DAG, *MFI, VT, RegID);
8596   case Intrinsic::amdgcn_implicitarg_ptr: {
8598       return getImplicitArgPtr(DAG, DL);
8599     return getPreloadedValue(DAG, *MFI, VT,
8602   case Intrinsic::amdgcn_kernarg_segment_ptr: {
8608     return getPreloadedValue(DAG, *MFI, VT,
8611 case Intrinsic::amdgcn_dispatch_id: {
8614 case Intrinsic::amdgcn_rcp:
8616 case Intrinsic::amdgcn_rsq:
8618 case Intrinsic::amdgcn_rsq_legacy:
8622 case Intrinsic::amdgcn_rcp_legacy:
8626 case Intrinsic::amdgcn_rsq_clamp: {
8640 case Intrinsic::r600_read_ngroups_x:
8644 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8647 case Intrinsic::r600_read_ngroups_y:
8651 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8654 case Intrinsic::r600_read_ngroups_z:
8658 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8661 case Intrinsic::r600_read_global_size_x:
8665 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8668 case Intrinsic::r600_read_global_size_y:
8672 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8675 case Intrinsic::r600_read_global_size_z:
8679 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8682 case Intrinsic::r600_read_local_size_x:
8686 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
8688 case Intrinsic::r600_read_local_size_y:
8692 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
8694 case Intrinsic::r600_read_local_size_z:
8698 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
8700 case Intrinsic::amdgcn_workgroup_id_x:
8701 return getPreloadedValue(DAG, *MFI, VT,
8703 case Intrinsic::amdgcn_workgroup_id_y:
8704 return getPreloadedValue(DAG, *MFI, VT,
8706 case Intrinsic::amdgcn_workgroup_id_z:
8707 return getPreloadedValue(DAG, *MFI, VT,
8709   case Intrinsic::amdgcn_wave_id:
8710     return lowerWaveID(DAG, Op);
8711   case Intrinsic::amdgcn_lds_kernel_id: {
8713       return getLDSKernelId(DAG, DL);
8714     return getPreloadedValue(DAG, *MFI, VT,
8717   case Intrinsic::amdgcn_workitem_id_x:
8718     return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
8719   case Intrinsic::amdgcn_workitem_id_y:
8720     return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
8721   case Intrinsic::amdgcn_workitem_id_z:
8722     return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
8723   case Intrinsic::amdgcn_wavefrontsize:
8726   case Intrinsic::amdgcn_s_buffer_load: {
8727     unsigned CPol = Op.getConstantOperandVal(3);
8734     return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
8735                         Op.getOperand(3), DAG);
8737   case Intrinsic::amdgcn_fdiv_fast:
8738     return lowerFDIV_FAST(Op, DAG);
8739 case Intrinsic::amdgcn_sin:
8742 case Intrinsic::amdgcn_cos:
8745 case Intrinsic::amdgcn_mul_u24:
8748 case Intrinsic::amdgcn_mul_i24:
8752 case Intrinsic::amdgcn_log_clamp: {
8758 case Intrinsic::amdgcn_fract:
8761 case Intrinsic::amdgcn_class:
8764 case Intrinsic::amdgcn_div_fmas:
8766 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
8768 case Intrinsic::amdgcn_div_fixup:
8770 Op.getOperand(2),
Op.getOperand(3));
8772 case Intrinsic::amdgcn_div_scale: {
8785 SDValue Src0 =
Param->isAllOnes() ? Numerator : Denominator;
8788 Denominator, Numerator);
8790 case Intrinsic::amdgcn_icmp: {
8792 if (
Op.getOperand(1).getValueType() == MVT::i1 &&
8793 Op.getConstantOperandVal(2) == 0 &&
8798 case Intrinsic::amdgcn_fcmp: {
8801 case Intrinsic::amdgcn_ballot:
8803 case Intrinsic::amdgcn_fmed3:
8805 Op.getOperand(2),
Op.getOperand(3));
8806 case Intrinsic::amdgcn_fdot2:
8808 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
8809 case Intrinsic::amdgcn_fmul_legacy:
8812 case Intrinsic::amdgcn_sffbh:
8814 case Intrinsic::amdgcn_sbfe:
8816 Op.getOperand(2),
Op.getOperand(3));
8817 case Intrinsic::amdgcn_ubfe:
8819 Op.getOperand(2),
Op.getOperand(3));
8820 case Intrinsic::amdgcn_cvt_pkrtz:
8821 case Intrinsic::amdgcn_cvt_pknorm_i16:
8822 case Intrinsic::amdgcn_cvt_pknorm_u16:
8823 case Intrinsic::amdgcn_cvt_pk_i16:
8824 case Intrinsic::amdgcn_cvt_pk_u16: {
8826     EVT VT = Op.getValueType();
8829     if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8831     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8833     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8835     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8841       return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
8844         DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
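    // The cvt_pk* conversions produce one 32-bit packed pair; when the
    // intrinsic's declared result is a vector (v2i16/v2f16), the i32 node
    // built here is bitcast back to that vector type.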
8847 case Intrinsic::amdgcn_fmad_ftz:
8849 Op.getOperand(2),
Op.getOperand(3));
8851 case Intrinsic::amdgcn_if_break:
8853 Op->getOperand(1),
Op->getOperand(2)),
8856 case Intrinsic::amdgcn_groupstaticsize: {
8868 case Intrinsic::amdgcn_is_shared:
8869 case Intrinsic::amdgcn_is_private: {
8871 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
8874 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
8882 case Intrinsic::amdgcn_perm:
8884 Op.getOperand(2),
Op.getOperand(3));
8885 case Intrinsic::amdgcn_reloc_constant: {
8889 auto *RelocSymbol = cast<GlobalVariable>(
8895 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8896 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8897 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8898 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8899 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8900 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8901 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8902 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8903     if (Op.getOperand(4).getValueType() == MVT::i32)
8909                        Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8910                        Op.getOperand(3), IndexKeyi32);
8912   case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8913   case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8914   case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8915     if (Op.getOperand(6).getValueType() == MVT::i32)
8921                        {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8922                         Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
8923                         IndexKeyi32, Op.getOperand(7)});
8925   case Intrinsic::amdgcn_addrspacecast_nonnull:
8926     return lowerADDRSPACECAST(Op, DAG);
8927 case Intrinsic::amdgcn_readlane:
8928 case Intrinsic::amdgcn_readfirstlane:
8929 case Intrinsic::amdgcn_writelane:
8930 case Intrinsic::amdgcn_permlane16:
8931 case Intrinsic::amdgcn_permlanex16:
8932 case Intrinsic::amdgcn_permlane64:
8933 case Intrinsic::amdgcn_set_inactive:
8934 case Intrinsic::amdgcn_set_inactive_chain_arg:
8935 case Intrinsic::amdgcn_mov_dpp8:
8936 case Intrinsic::amdgcn_update_dpp:
8941     return lowerImage(Op, ImageDimIntr, DAG, false);
8952     return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
8958                                                   unsigned NewOpcode) const {
8962   SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8963   auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
8977   auto *M = cast<MemSDNode>(Op);
8981                                  M->getMemOperand());
8986                                                      unsigned NewOpcode) const {
8990   SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8991   auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9005   auto *M = cast<MemSDNode>(Op);
9009                                  M->getMemOperand());
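// Raw and struct buffer atomics share this shape: the 128-bit resource is
// rebuilt from its pointer form if necessary, the combined offset is split
// into a variable voffset plus an immediate part that fits the encoding, and
// the original intrinsic's memory operand is reused on the new atomic node.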
9014   unsigned IntrID = Op.getConstantOperandVal(1);
9018   case Intrinsic::amdgcn_ds_ordered_add:
9019   case Intrinsic::amdgcn_ds_ordered_swap: {
9024     unsigned IndexOperand = M->getConstantOperandVal(7);
9025     unsigned WaveRelease = M->getConstantOperandVal(8);
9026     unsigned WaveDone = M->getConstantOperandVal(9);
9028     unsigned OrderedCountIndex = IndexOperand & 0x3f;
9029     IndexOperand &= ~0x3f;
9030     unsigned CountDw = 0;
9033       CountDw = (IndexOperand >> 24) & 0xf;
9034       IndexOperand &= ~(0xf << 24);
9036       if (CountDw < 1 || CountDw > 4) {
9038             "ds_ordered_count: dword count must be between 1 and 4");
9045     if (WaveDone && !WaveRelease)
9048     unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
9049     unsigned ShaderType =
9051     unsigned Offset0 = OrderedCountIndex << 2;
9052     unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
9055       Offset1 |= (CountDw - 1) << 6;
9058       Offset1 |= ShaderType << 2;
9060     unsigned Offset = Offset0 | (Offset1 << 8);
9067                                    M->getVTList(), Ops, M->getMemoryVT(),
9068                                    M->getMemOperand());
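    // The DS offset field here is a packed control word rather than a byte
    // offset: Offset0 carries the ordered-count index and Offset1 packs
    // wave_release, wave_done, shader type, the add/swap selector and the
    // dword count.  For example, an ordered add with wave_release set, one
    // dword, index 0 and shader type 0 gives Offset1 = 1 and Offset = 0x100.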
9070 case Intrinsic::amdgcn_raw_buffer_load:
9071 case Intrinsic::amdgcn_raw_ptr_buffer_load:
9072 case Intrinsic::amdgcn_raw_atomic_buffer_load:
9073 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
9074 case Intrinsic::amdgcn_raw_buffer_load_format:
9075 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
9076     const bool IsFormat =
9077         IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
9078         IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
9080     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9081     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9094     auto *M = cast<MemSDNode>(Op);
9095     return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
9097 case Intrinsic::amdgcn_struct_buffer_load:
9098 case Intrinsic::amdgcn_struct_ptr_buffer_load:
9099 case Intrinsic::amdgcn_struct_buffer_load_format:
9100 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
9101 case Intrinsic::amdgcn_struct_atomic_buffer_load:
9102 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
9103 const bool IsFormat =
9104 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
9105 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
9107 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9108 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
9121 return lowerIntrinsicLoad(cast<MemSDNode>(
Op), IsFormat, DAG, Ops);
9123 case Intrinsic::amdgcn_raw_tbuffer_load:
9124 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
9126 EVT LoadVT =
Op.getValueType();
9127 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9128 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
9147 Op->getVTList(), Ops, LoadVT,
M->getMemOperand(),
9150 case Intrinsic::amdgcn_struct_tbuffer_load:
9151 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
9153 EVT LoadVT =
Op.getValueType();
9154 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9155 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
9174 Op->getVTList(), Ops, LoadVT,
M->getMemOperand(),
9177 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9178 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9180 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9181 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9182 return lowerStructBufferAtomicIntrin(
Op, DAG,
9184 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9185 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9187 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9188 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9189 return lowerStructBufferAtomicIntrin(
Op, DAG,
9191 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9192 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9194 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9195 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9196 return lowerStructBufferAtomicIntrin(
Op, DAG,
9198 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9199 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9201 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9202 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9204 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9205 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9207 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9208 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9210 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9211 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9213 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9214 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9216 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9217 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9219 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9220 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9222 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9223 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9225 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9226 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9228 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9229 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9231 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9232 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9234 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9235 return lowerRawBufferAtomicIntrin(
Op, DAG,
9237 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9238 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9239 return lowerStructBufferAtomicIntrin(
Op, DAG,
9241 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9242 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9244 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9245 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9247 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9248 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9249 return lowerStructBufferAtomicIntrin(
Op, DAG,
9251 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9252 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9253 return lowerStructBufferAtomicIntrin(
Op, DAG,
9255 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9256 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9257 return lowerStructBufferAtomicIntrin(
Op, DAG,
9259 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9260 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9261 return lowerStructBufferAtomicIntrin(
Op, DAG,
9263 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9264 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9266 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9267 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9269 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9270 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9272 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9273 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9275 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9276 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9278 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9279 return lowerStructBufferAtomicIntrin(
Op, DAG,
9282 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9283 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9284     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
9285     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9299     EVT VT = Op.getValueType();
9300     auto *M = cast<MemSDNode>(Op);
9303                                    Op->getVTList(), Ops, VT,
9304                                    M->getMemOperand());
9306   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9307   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9308     SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
9309     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
9323     EVT VT = Op.getValueType();
9324     auto *M = cast<MemSDNode>(Op);
9327                                    Op->getVTList(), Ops, VT,
9328                                    M->getMemOperand());
9330 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9332 SDValue NodePtr =
M->getOperand(2);
9333 SDValue RayExtent =
M->getOperand(3);
9334 SDValue RayOrigin =
M->getOperand(4);
9336 SDValue RayInvDir =
M->getOperand(6);
9354 const unsigned NumVDataDwords = 4;
9355 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9356 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9360 const unsigned BaseOpcodes[2][2] = {
9361 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9362 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9363 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9367 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9368 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9369 : AMDGPU::MIMGEncGfx10NSA,
9370 NumVDataDwords, NumVAddrDwords);
9374 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9375 : AMDGPU::MIMGEncGfx10Default,
9376 NumVDataDwords, NumVAddrDwords);
9382     auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
9385       if (Lanes[0].getValueSizeInBits() == 32) {
9386         for (unsigned I = 0; I < 3; ++I)
9405     if (UseNSA && IsGFX11Plus) {
9413       for (unsigned I = 0; I < 3; ++I) {
9416             {DirLanes[I], InvDirLanes[I]})));
9431       packLanes(RayOrigin, true);
9432       packLanes(RayDir, true);
9433       packLanes(RayInvDir, false);
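    // Ray origin, direction and inverse direction are appended three lanes at
    // a time; on GFX11+ with NSA the direction and inverse-direction lanes are
    // instead combined pairwise (the {DirLanes[I], InvDirLanes[I]} builds
    // above) to match the packed 16-bit address layout of the instruction.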
9438 if (NumVAddrDwords > 12) {
9458 case Intrinsic::amdgcn_global_atomic_fmin_num:
9459 case Intrinsic::amdgcn_global_atomic_fmax_num:
9460 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9461 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9468 unsigned Opcode = 0;
9470 case Intrinsic::amdgcn_global_atomic_fmin_num:
9471 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9475 case Intrinsic::amdgcn_global_atomic_fmax_num:
9476 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9484 Ops,
M->getMemOperand());
9486 case Intrinsic::amdgcn_s_get_barrier_state:
9487 case Intrinsic::amdgcn_s_get_named_barrier_state: {
9492     if (isa<ConstantSDNode>(Op->getOperand(2))) {
9493       uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
9494       if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
9495         BarID = (BarID >> 4) & 0x3F;
9496       Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9501       Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9502       if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
9522       return lowerImage(Op, ImageDimIntr, DAG, true);
9530 SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
9540   bool IsTFE = VTList.NumVTs == 3;
9543     unsigned NumOpDWords = NumValueDWords + 1;
9548     SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
9549                                      OpDWordsVT, OpDWordsMMO, DAG);
9564 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9570 WidenedMemVT, WidenedMMO);
9580                                          bool ImageStore) const {
9615     for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
9621     if ((NumElements % 2) == 1) {
9623       unsigned I = Elts.size() / 2;
9639     if (NumElements == 3) {
9660   unsigned IntrinsicID = Op.getConstantOperandVal(1);
9663   switch (IntrinsicID) {
9664   case Intrinsic::amdgcn_exp_compr: {
9668           "intrinsic not supported on subtarget", DL.getDebugLoc());
9691     unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9694   case Intrinsic::amdgcn_s_barrier:
9695   case Intrinsic::amdgcn_s_barrier_signal:
9696   case Intrinsic::amdgcn_s_barrier_wait: {
9699     unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
9700     if (WGSize <= ST.getWavefrontSize()) {
9703       if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal)
9704         return Op.getOperand(0);
9707                                         MVT::Other, Op.getOperand(0)),
9712     if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
9718                                 MVT::Other, K, Op.getOperand(0)),
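    // When the whole workgroup fits in one wave no cross-wave synchronisation
    // is needed, so s_barrier_signal simply folds to its chain; on targets
    // with split barriers a plain s_barrier is instead emitted as a
    // signal/wait pair on the default barrier.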
9730 case Intrinsic::amdgcn_struct_tbuffer_store:
9731 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9735 VData = handleD16VData(VData, DAG);
9736     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9737     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9755                                    M->getMemoryVT(), M->getMemOperand());
9758   case Intrinsic::amdgcn_raw_tbuffer_store:
9759   case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9763       VData = handleD16VData(VData, DAG);
9764     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9765     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9783                                    M->getMemoryVT(), M->getMemOperand());
9786 case Intrinsic::amdgcn_raw_buffer_store:
9787 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9788 case Intrinsic::amdgcn_raw_buffer_store_format:
9789 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9790 const bool IsFormat =
9791 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9792 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9799 VData = handleD16VData(VData, DAG);
9809     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9810     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9830       return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
9833                                    M->getMemoryVT(), M->getMemOperand());
9836   case Intrinsic::amdgcn_struct_buffer_store:
9837   case Intrinsic::amdgcn_struct_ptr_buffer_store:
9838   case Intrinsic::amdgcn_struct_buffer_store_format:
9839   case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9840     const bool IsFormat =
9841         IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9842         IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9850       VData = handleD16VData(VData, DAG);
9860     auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9861     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9882       return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9885                                    M->getMemoryVT(), M->getMemOperand());
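  // Stores of i8/i16 data cannot use the normal dword buffer-store path, so
  // they are routed through handleByteShortBufferStores, which selects the
  // byte/short MUBUF store opcode and narrows the data accordingly.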
9887 case Intrinsic::amdgcn_raw_buffer_load_lds:
9888 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9889 case Intrinsic::amdgcn_struct_buffer_load_lds:
9890 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9894 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9895 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9896 unsigned OpOffset = HasVIndex ? 1 : 0;
9897 SDValue VOffset =
Op.getOperand(5 + OpOffset);
9899 unsigned Size =
Op->getConstantOperandVal(4);
9905 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9906 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9907 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9908 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9911 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9912 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9913 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9914 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9917 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9918 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9919 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9920 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
9925 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
9926 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
9927 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
9928 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
9933 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
9934 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
9935 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
9936 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
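    // The buffer-to-LDS opcode is chosen by transfer size (1, 2, 4, 12 or 16
    // bytes) and by which of vindex/voffset are present, selecting the
    // BOTHEN / IDXEN / OFFEN / OFFSET addressing variant.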
9944     if (HasVIndex && HasVOffset)
9950     else if (HasVOffset)
9953     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9958     unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
9970     auto *M = cast<MemSDNode>(Op);
9997 case Intrinsic::amdgcn_global_load_lds: {
9999     unsigned Size = Op->getConstantOperandVal(4);
10004      Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
10007      Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
10010      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
10015      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
10020      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
10024    auto *M = cast<MemSDNode>(Op);
10037    if (LHS->isDivergent())
10041        RHS.getOperand(0).getValueType() == MVT::i32) {
10044      VOffset = RHS.getOperand(0);
10049    if (!Addr->isDivergent()) {
10066    LoadPtrI.Offset = Op->getConstantOperandVal(5);
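    // For global_load_lds the 64-bit address is split where possible into a
    // uniform base (saddr) plus a divergent 32-bit voffset so the SADDR form
    // of the instruction can be used; a fully uniform address takes the
    // scalar-base path directly.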
10086 case Intrinsic::amdgcn_end_cf:
10088 Op->getOperand(2), Chain),
10090 case Intrinsic::amdgcn_s_barrier_init:
10091 case Intrinsic::amdgcn_s_barrier_signal_var: {
10098 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
10099 ? AMDGPU::S_BARRIER_INIT_M0
10100 : AMDGPU::S_BARRIER_SIGNAL_M0;
10115 constexpr unsigned ShAmt = 16;
10127 case Intrinsic::amdgcn_s_barrier_join: {
10134 if (isa<ConstantSDNode>(BarOp)) {
10135 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
10136 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
10139 unsigned BarID = (BarVal >> 4) & 0x3F;
10144 Opc = AMDGPU::S_BARRIER_JOIN_M0;
10160 case Intrinsic::amdgcn_s_prefetch_data: {
10163 return Op.getOperand(0);
10166 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
10168        Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
10175                                    Op->getVTList(), Ops, M->getMemoryVT(),
10176                                    M->getMemOperand());
10181      return lowerImage(Op, ImageDimIntr, DAG, true);
10194std::pair<SDValue, SDValue>
10201 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
10204    C1 = cast<ConstantSDNode>(N0.getOperand(1));
10218    unsigned Overflow = ImmOffset & ~MaxImm;
10219    ImmOffset -= Overflow;
10220    if ((int32_t)Overflow < 0) {
10221      Overflow += ImmOffset;
10226    auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
10230 SDValue Ops[] = {N0, OverflowVal};
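// splitBufferOffsets peels a constant addend off the combined offset and keeps
// as much of it as fits in the immediate-offset field (MaxImm); whatever
// overflows is folded back into the variable voffset operand.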
10245 void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
10247                                         Align Alignment) const {
10250   if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
10253     if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10264     int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10266         TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
10283 SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
10286     return MaybePointer;
10300   SDValue NumRecords = Op->getOperand(3);
10303   auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
10306   std::optional<uint32_t> ConstStride = std::nullopt;
10307   if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
10308     ConstStride = ConstNode->getZExtValue();
10311   if (!ConstStride || *ConstStride != 0) {
10314       ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
10325                                  NewHighHalf, NumRecords, Flags);
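// Building a 128-bit buffer resource: the base pointer is split into two
// dwords, the stride is shifted into the upper bits of the second dword
// (bits 16+), and the record count plus descriptor flags fill the remaining
// dwords; an operand that is already a resource vector is returned unchanged.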
10335                                               bool IsTFE) const {
10345   SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
10373 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
10377 Ops[1] = BufferStoreExt;
10382 M->getMemOperand());
10407                                          DAGCombinerInfo &DCI) const {
10423   if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
10430          "unexpected vector extload");
10443          "unexpected fp extload");
10461   DCI.AddToWorklist(Cvt.getNode());
10466   DCI.AddToWorklist(Cvt.getNode());
10477   if (Info.isEntryFunction())
10478     return Info.getUserSGPRInfo().hasFlatScratchInit();
10486   EVT MemVT = Load->getMemoryVT();
10499     EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10527   assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
10528          "Custom lowering for non-i32 vectors hasn't been implemented.");
10531   unsigned AS = Load->getAddressSpace();
10555         Alignment >= Align(4) && NumElements < 32) {
10569     if (NumElements > 4)
10588     if (NumElements > 2)
10593     if (NumElements > 4)
10605     auto Flags = Load->getMemOperand()->getFlags();
10607                                        Load->getAlign(), Flags, &Fast) &&
10616                                       MemVT, *Load->getMemOperand())) {
10625   EVT VT = Op.getValueType();
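// Vector loads are split by address space: the >4-element and >2-element
// checks above bound how many dwords a single load may keep on the different
// paths, uniform 4-byte-aligned loads can stay in scalar form, and anything
// misaligned or otherwise unsupported falls back to the default expansion.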
10662   EVT VT = Op.getValueType();
10665   bool AllowInaccurateRcp =
10672     if (!AllowInaccurateRcp && VT != MVT::f16)
10675     if (CLHS->isExactlyValue(1.0)) {
10692     if (CLHS->isExactlyValue(-1.0)) {
10701   if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
10715   EVT VT = Op.getValueType();
10718   bool AllowInaccurateDiv =
10720   if (!AllowInaccurateDiv)
10741   return DAG.getNode(Opcode, SL, VT, A, B, Flags);
10755   return DAG.getNode(Opcode, SL, VTList,
10764   return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
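// Fast-math fdiv: 1.0/x and -1.0/x become a (possibly negated) reciprocal, and
// a general x/y with reciprocal allowed becomes x * rcp(y); without the
// relevant fast-math flags these shortcuts are skipped and the precise
// expansions below are used instead.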
10778   return DAG.getNode(Opcode, SL, VTList,
10784   if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10785     return FastLowered;
10805   unsigned FMADOpCode =
10815   SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
10817   Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
10818   Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
10841   const APFloat K0Val(0x1p+96f);
10844   const APFloat K1Val(0x1p-32f);
10871   assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
10872   uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
10873   uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
10878   if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10879     return FastLowered;
10886   Flags.setNoFPExcept(true);
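  // The fdiv.fast path guards the f32 reciprocal against overflow: a
  // denominator whose magnitude exceeds 2^96 is pre-scaled by 2^-32, and the
  // same factor is applied to the quotient afterwards so the result is
  // unchanged.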
10907 using namespace AMDGPU::Hwreg;
10908 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
10916 const bool HasDynamicDenormals =
10922 if (!PreservesDenormals) {
10930 if (HasDynamicDenormals) {
10934       SavedDenormMode = SDValue(GetReg, 0);
10942       const SDValue EnableDenormValue =
10949       const SDValue EnableDenormValue =
10951       EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
10952                                         {EnableDenormValue, BitField, Glue});
10962                              ApproxRcp, One, NegDivScale0, Flags);
10965                              ApproxRcp, Fma0, Flags);
10971                              NumeratorScaled, Mul, Flags);
10977                              NumeratorScaled, Fma3, Flags);
10979   if (!PreservesDenormals) {
10987                                           DisableDenormValue, Fma4.getValue(2))
10990     assert(HasDynamicDenormals == (bool)SavedDenormMode);
10991     const SDValue DisableDenormValue =
10992         HasDynamicDenormals
10997         AMDGPU::S_SETREG_B32, SL, MVT::Other,
11008                              {Fma4, Fma1, Fma3, Scale}, Flags);
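  // The f32 division expansion needs denormals preserved while the
  // Newton-Raphson refinement runs: if the current mode flushes them, it is
  // switched with s_setreg (or s_denorm_mode) and restored afterwards,
  // re-reading the saved mode register when it was unknown at compile time.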
11014   if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
11015     return FastLowered;
11083   EVT VT = Op.getValueType();
11085   if (VT == MVT::f32)
11086     return LowerFDIV32(Op, DAG);
11088   if (VT == MVT::f64)
11089     return LowerFDIV64(Op, DAG);
11091   if (VT == MVT::f16)
11092     return LowerFDIV16(Op, DAG);
11101   EVT ResultExpVT = Op->getValueType(1);
11102   EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
11132   if (VT == MVT::i1) {
11136                         Store->getBasePtr(), MVT::i1, Store->getMemOperand());
11140          Store->getValue().getValueType().getScalarType() == MVT::i32);
11142   unsigned AS = Store->getAddressSpace();
11161     if (NumElements > 4)
11168                                         VT, *Store->getMemOperand()))
11178     if (NumElements > 2)
11182     if (NumElements > 4 ||
11191     auto Flags = Store->getMemOperand()->getFlags();
11226   MVT VT = Op.getValueType().getSimpleVT();
11397   EVT VT = Op.getValueType();
11414   switch (Op.getOpcode()) {
11441   EVT VT = Op.getValueType();
11449                                  Op->getVTList(), Ops, VT,
11458 SITargetLowering::performUCharToFloatCombine(SDNode *N,
11459                                              DAGCombinerInfo &DCI) const {
11460   EVT VT = N->getValueType(0);
11462   if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11469   EVT SrcVT = Src.getValueType();
11475   if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11478     DCI.AddToWorklist(Cvt.getNode());
11481   if (ScalarVT != MVT::f32) {
11493                                                DAGCombinerInfo &DCI) const {
11494   SDValue MagnitudeOp = N->getOperand(0);
11495   SDValue SignOp = N->getOperand(1);
11551 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
11553                                                 DAGCombinerInfo &DCI) const {
11583   AM.HasBaseReg = true;
11584   AM.BaseOffs = Offset.getSExtValue();
11589   EVT VT = N->getValueType(0);
11595   Flags.setNoUnsignedWrap(
11596       N->getFlags().hasNoUnsignedWrap() &&
11606   switch (N->getOpcode()) {
11617                                           DAGCombinerInfo &DCI) const {
11626   SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
11627                                         N->getMemoryVT(), DCI);
11631   NewOps[PtrIdx] = NewPtr;
11640   return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11641          (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11650 SDValue SITargetLowering::splitBinaryBitConstantOp(
11651     DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
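// performSHLPtrCombine: a (shl (add x, c1), c2) feeding a memory operation is
// rewritten as (add (shl x, c2), c1 << c2) when the scaled constant is still a
// legal addressing-mode offset for that address space, so it can fold into the
// instruction's immediate offset field.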
11671 if (V.getValueType() != MVT::i1)
11673 switch (V.getOpcode()) {
11692   if (!(C & 0x000000ff))
11693     ZeroByteMask |= 0x000000ff;
11694   if (!(C & 0x0000ff00))
11695     ZeroByteMask |= 0x0000ff00;
11696   if (!(C & 0x00ff0000))
11697     ZeroByteMask |= 0x00ff0000;
11698   if (!(C & 0xff000000))
11699     ZeroByteMask |= 0xff000000;
11700   uint32_t NonZeroByteMask = ~ZeroByteMask;
11701   if ((NonZeroByteMask & C) != NonZeroByteMask)
11714   assert(V.getValueSizeInBits() == 32);
11716   if (V.getNumOperands() != 2)
11725   switch (V.getOpcode()) {
11730     return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
11735     return (0x03020100 & ~ConstMask) | ConstMask;
11742     return uint32_t((0x030201000c0c0c0cull << C) >> 32);
11748     return uint32_t(0x0c0c0c0c03020100ull >> C);
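// V_PERM_B32 selector bytes: each of the four selector bytes picks one byte of
// the concatenated sources, while 0x0c yields a constant zero byte.
// 0x03020100 is the identity selector for a single 32-bit source; the shifted
// 0x030201000c0c0c0c / 0x0c0c0c0c03020100 patterns above model byte shifts
// that pull zero bytes in from one end.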
11755                                            DAGCombinerInfo &DCI) const {
11756   if (DCI.isBeforeLegalize())
11760   EVT VT = N->getValueType(0);
11765   if (VT == MVT::i64 && CRHS) {
11771   if (CRHS && VT == MVT::i32) {
11780     if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
11781       unsigned Shift = CShift->getZExtValue();
11783       unsigned Offset = NB + Shift;
11784       if ((Offset & (Bits - 1)) == 0) {
11802         isa<ConstantSDNode>(LHS.getOperand(2))) {
11808       Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
11823     if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
11828     if (X != LHS.getOperand(1))
11833         dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
11866         (RHS.getOperand(0) == LHS.getOperand(0) &&
11867          LHS.getOperand(0) == LHS.getOperand(1))) {
11869       unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
11870                                           : Mask->getZExtValue() & OrdMask;
11891       N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
11894     if (LHSMask != ~0u && RHSMask != ~0u) {
11897       if (LHSMask > RHSMask) {
11904       uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11905       uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11908       if (!(LHSUsedLanes & RHSUsedLanes) &&
11911           !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
11918         for (unsigned I = 0; I < 32; I += 8) {
11920           if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
11921             Mask &= (0x0c << I) & 0xffffffff;
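        // (and (v_perm a), (v_perm b)) folds into a single V_PERM when the two
        // masks read disjoint source lanes: any byte that either operand
        // already zeroes (selector 0x0c) stays zero in the combined mask.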
11979static const std::optional<ByteProvider<SDValue>>
11981 unsigned Depth = 0) {
11984 return std::nullopt;
11986   if (Op.getValueSizeInBits() < 8)
11987     return std::nullopt;
11989   if (Op.getValueType().isVector())
11992   switch (Op->getOpcode()) {
12003     auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
12004     NarrowVT = VTSign->getVT();
12007       return std::nullopt;
12010     if (SrcIndex >= NarrowByteWidth)
12011       return std::nullopt;
12017     auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12019       return std::nullopt;
12021     uint64_t BitShift = ShiftOp->getZExtValue();
12023     if (BitShift % 8 != 0)
12024       return std::nullopt;
12026     SrcIndex += BitShift / 8;
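// calculateSrcByte walks through byte-preserving operations (extensions,
// truncations and byte-aligned right shifts), adjusting SrcIndex so the
// returned ByteProvider names the ultimate source value and which of its
// bytes supplies the requested byte of the original operand.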
12044static const std::optional<ByteProvider<SDValue>>
12046 unsigned StartingIndex = 0) {
12050 return std::nullopt;
12052   unsigned BitWidth = Op.getScalarValueSizeInBits();
12054     return std::nullopt;
12056     return std::nullopt;
12058   bool IsVec = Op.getValueType().isVector();
12059   switch (Op.getOpcode()) {
12062       return std::nullopt;
12067       return std::nullopt;
12071       return std::nullopt;
12074     if (!LHS->isConstantZero() && !RHS->isConstantZero())
12075       return std::nullopt;
12076     if (!LHS || LHS->isConstantZero())
12078     if (!RHS || RHS->isConstantZero())
12080     return std::nullopt;
12085 return std::nullopt;
12087 auto *BitMaskOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12089 return std::nullopt;
12091 uint32_t BitMask = BitMaskOp->getZExtValue();
12093 uint32_t IndexMask = 0xFF << (Index * 8);
12095 if ((IndexMask & BitMask) != IndexMask) {
12098 if (IndexMask & BitMask)
12099 return std::nullopt;
12108 return std::nullopt;
12111 auto *ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(2));
12112 if (!ShiftOp ||
Op.getValueType().isVector())
12113 return std::nullopt;
12115 uint64_t BitsProvided =
Op.getValueSizeInBits();
12116 if (BitsProvided % 8 != 0)
12117 return std::nullopt;
12119 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
12121 return std::nullopt;
12123 uint64_t ConcatSizeInBytes = BitsProvided / 4;
12124 uint64_t ByteShift = BitShift / 8;
12126 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
12127 uint64_t BytesProvided = BitsProvided / 8;
12128 SDValue NextOp =
Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
12129 NewIndex %= BytesProvided;
12136 return std::nullopt;
12138 auto *ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12140 return std::nullopt;
12142 uint64_t BitShift = ShiftOp->getZExtValue();
12144 return std::nullopt;
12146 auto BitsProvided =
Op.getScalarValueSizeInBits();
12147 if (BitsProvided % 8 != 0)
12148 return std::nullopt;
12150 uint64_t BytesProvided = BitsProvided / 8;
12151 uint64_t ByteShift = BitShift / 8;
12156 return BytesProvided - ByteShift > Index
12164 return std::nullopt;
12166 auto *ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12168 return std::nullopt;
12170 uint64_t BitShift = ShiftOp->getZExtValue();
12171 if (BitShift % 8 != 0)
12172 return std::nullopt;
12173 uint64_t ByteShift = BitShift / 8;
12179 return Index < ByteShift
12182 Depth + 1, StartingIndex);
12191 return std::nullopt;
12198 auto *VTSign = cast<VTSDNode>(
Op->getOperand(1));
12199 NarrowBitWidth = VTSign->getVT().getSizeInBits();
12201 if (NarrowBitWidth % 8 != 0)
12202 return std::nullopt;
12203 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12205 if (Index >= NarrowByteWidth)
12207 ? std::optional<ByteProvider<SDValue>>(
12215 return std::nullopt;
12219 if (NarrowByteWidth >= Index) {
12224 return std::nullopt;
12231 return std::nullopt;
12235 auto *L = cast<LoadSDNode>(
Op.getNode());
12237 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
12238 if (NarrowBitWidth % 8 != 0)
12239 return std::nullopt;
12240 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12245 if (Index >= NarrowByteWidth) {
12247 ? std::optional<ByteProvider<SDValue>>(
12252 if (NarrowByteWidth > Index) {
12256 return std::nullopt;
12261 return std::nullopt;
12264 Depth + 1, StartingIndex);
12268 auto *IdxOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12270 return std::nullopt;
12271 auto VecIdx = IdxOp->getZExtValue();
12272 auto ScalarSize =
Op.getScalarValueSizeInBits();
12273 if (ScalarSize < 32)
12274 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
12276 StartingIndex, Index);
12281 return std::nullopt;
12283 auto *PermMask = dyn_cast<ConstantSDNode>(
Op->getOperand(2));
12285 return std::nullopt;
12288 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
12289 if (IdxMask > 0x07 && IdxMask != 0x0c)
12290 return std::nullopt;
12292 auto NextOp =
Op.getOperand(IdxMask > 0x03 ? 0 : 1);
12293 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12295 return IdxMask != 0x0c ?
calculateSrcByte(NextOp, StartingIndex, NextIndex)
12301 return std::nullopt;
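    // For a V_PERM source, a selector byte of 0x0c is a hard zero; otherwise
    // selectors 0-3 index the second operand and 4-7 the first (hence the
    // IdxMask > 0x03 choice above), and the walk recurses into whichever
    // source the selector names with the index reduced modulo 4.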
12316 return !OpVT.
isVector() && OpVT.getSizeInBits() == 16;
12320 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12323 auto MemVT = L->getMemoryVT();
12326 return L->getMemoryVT().getSizeInBits() == 16;
12336 int Low8 = Mask & 0xff;
12337 int Hi8 = (Mask & 0xff00) >> 8;
12339 assert(Low8 < 8 && Hi8 < 8);
12341 bool IsConsecutive = (Hi8 - Low8 == 1);
12346 bool Is16Aligned = !(Low8 % 2);
12348 return IsConsecutive && Is16Aligned;
12356 int Low16 = PermMask & 0xffff;
12357 int Hi16 = (PermMask & 0xffff0000) >> 16;
12367 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12369 if (!OtherOpIs16Bit)
12377 unsigned DWordOffset) {
12380 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12382 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12387 if (Src.getValueType().isVector()) {
12388 auto ScalarTySize = Src.getScalarValueSizeInBits();
12389 auto ScalarTy = Src.getValueType().getScalarType();
12390 if (ScalarTySize == 32) {
12394 if (ScalarTySize > 32) {
12397 DAG.
getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12398 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12405 assert(ScalarTySize < 32);
12406 auto NumElements =
TypeSize / ScalarTySize;
12407 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12408 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12409 auto NumElementsIn32 = 32 / ScalarTySize;
12410 auto NumAvailElements = DWordOffset < Trunc32Elements
12412 : NumElements - NormalizedTrunc;
12425 auto ShiftVal = 32 * DWordOffset;
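// getDWordFromOffset extracts the DWordOffset-th 32-bit chunk of Src: scalars
// wider than 32 bits are shifted and truncated, vectors of sub-dword elements
// are regrouped so that enough elements cover the requested dword, and plain
// 32-bit sources are returned as-is.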
12433 [[maybe_unused]]
EVT VT =
N->getValueType(0);
12438 for (
int i = 0; i < 4; i++) {
12440 std::optional<ByteProvider<SDValue>>
P =
12443 if (!
P ||
P->isConstantZero())
12448 if (PermNodes.
size() != 4)
12451 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12452 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12454 for (
size_t i = 0; i < PermNodes.
size(); i++) {
12455 auto PermOp = PermNodes[i];
12458 int SrcByteAdjust = 4;
12462 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
12463 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12465 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12466 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12470 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12471 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12474 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12476 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12479 SDValue Op = *PermNodes[FirstSrc.first].Src;
12481 assert(
Op.getValueSizeInBits() == 32);
12485 int Low16 = PermMask & 0xffff;
12486 int Hi16 = (PermMask & 0xffff0000) >> 16;
12488 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
12489 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
12492 if (WellFormedLow && WellFormedHi)
12496 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src :
Op;
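  // Half-masks of 0x0504/0x0100 (low) and 0x0706/0x0302 (high) mean that
  // 16-bit half is taken contiguously from one source; when both halves are
  // well formed in this sense the byte pattern needs no real permutation, so
  // emitting a V_PERM_B32 would gain nothing.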
12505 assert(
Op.getValueType().isByteSized() &&
12523 DAGCombinerInfo &DCI)
const {
12528 EVT VT =
N->getValueType(0);
12529 if (VT == MVT::i1) {
12534 if (Src !=
RHS.getOperand(0))
12539 if (!CLHS || !CRHS)
12543 static const uint32_t MaxMask = 0x3ff;
12558 isa<ConstantSDNode>(
LHS.getOperand(2))) {
12563 Sel |=
LHS.getConstantOperandVal(2);
12572 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12576 auto usesCombinedOperand = [](
SDNode *OrUse) {
12579 !OrUse->getValueType(0).isVector())
12583 for (
auto *VUser : OrUse->users()) {
12584 if (!VUser->getValueType(0).isVector())
12591 if (VUser->getOpcode() == VectorwiseOp)
12597 if (!
any_of(
N->users(), usesCombinedOperand))
12603 if (LHSMask != ~0u && RHSMask != ~0u) {
12606 if (LHSMask > RHSMask) {
12613 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12614 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12617 if (!(LHSUsedLanes & RHSUsedLanes) &&
12620 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12622 LHSMask &= ~RHSUsedLanes;
12623 RHSMask &= ~LHSUsedLanes;
12625 LHSMask |= LHSUsedLanes & 0x04040404;
12635 if (LHSMask == ~0u || RHSMask == ~0u) {
12641 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12656 if (SrcVT == MVT::i32) {
12661 DCI.AddToWorklist(LowOr.
getNode());
12662 DCI.AddToWorklist(HiBits.getNode());
12670 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(
N->getOperand(1));
12673 N->getOperand(0), CRHS))
12681 DAGCombinerInfo &DCI)
const {
12682 if (
SDValue RV = reassociateScalarOps(
N, DCI.DAG))
12691 EVT VT =
N->getValueType(0);
12692 if (CRHS && VT == MVT::i64) {
12714 LHS->getOperand(0), FNegLHS, FNegRHS);
12723 DAGCombinerInfo &DCI)
const {
12728 EVT VT =
N->getValueType(0);
12729 if (VT != MVT::i32)
12733 if (Src.getValueType() != MVT::i16)
12740SITargetLowering::performSignExtendInRegCombine(
SDNode *
N,
12741 DAGCombinerInfo &DCI)
const {
12743 auto *VTSign = cast<VTSDNode>(
N->getOperand(1));
12748 VTSign->getVT() == MVT::i8) ||
12750 VTSign->getVT() == MVT::i16))) {
12752 "s_buffer_load_{u8, i8} are supported "
12753 "in GFX12 (or newer) architectures.");
12754 EVT VT = Src.getValueType();
12759 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12765 auto *
M = cast<MemSDNode>(Src);
12766 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12767 Opc,
DL, ResList, Ops,
M->getMemoryVT(),
M->getMemOperand());
12772 VTSign->getVT() == MVT::i8) ||
12774 VTSign->getVT() == MVT::i16)) &&
12776 auto *
M = cast<MemSDNode>(Src);
12777 SDValue Ops[] = {Src.getOperand(0),
12783 Src.getOperand(6), Src.getOperand(7)};
12786 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
12790 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
12791 Opc,
SDLoc(
N), ResList, Ops,
M->getMemoryVT(),
M->getMemOperand());
12792 return DCI.DAG.getMergeValues(
12799 DAGCombinerInfo &DCI)
const {
12807 if (
N->getOperand(0).isUndef())
12814 DAGCombinerInfo &DCI)
const {
12815 EVT VT =
N->getValueType(0);
12840 unsigned MaxDepth)
const {
12841 unsigned Opcode =
Op.getOpcode();
12845 if (
auto *CFP = dyn_cast<ConstantFPSDNode>(
Op)) {
12846 const auto &
F = CFP->getValueAPF();
12847 if (
F.isNaN() &&
F.isSignaling())
12849 if (!
F.isDenormal())
12912 if (
Op.getValueType() == MVT::i32) {
12917 if (
auto *
RHS = dyn_cast<ConstantSDNode>(
Op.getOperand(1))) {
12918 if (
RHS->getZExtValue() == 0xffff0000) {
12928 return Op.getValueType().getScalarType() != MVT::f16;
12996 if (
Op.getValueType() == MVT::i16) {
13007 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
13009 switch (IntrinsicID) {
13010 case Intrinsic::amdgcn_cvt_pkrtz:
13011 case Intrinsic::amdgcn_cubeid:
13012 case Intrinsic::amdgcn_frexp_mant:
13013 case Intrinsic::amdgcn_fdot2:
13014 case Intrinsic::amdgcn_rcp:
13015 case Intrinsic::amdgcn_rsq:
13016 case Intrinsic::amdgcn_rsq_clamp:
13017 case Intrinsic::amdgcn_rcp_legacy:
13018 case Intrinsic::amdgcn_rsq_legacy:
13019 case Intrinsic::amdgcn_trig_preop:
13020 case Intrinsic::amdgcn_log:
13021 case Intrinsic::amdgcn_exp2:
13022 case Intrinsic::amdgcn_sqrt:
13040 unsigned MaxDepth)
const {
13043 unsigned Opcode =
MI->getOpcode();
13045 if (Opcode == AMDGPU::G_FCANONICALIZE)
13048 std::optional<FPValueAndVReg> FCR;
13051 if (FCR->Value.isSignaling())
13053 if (!FCR->Value.isDenormal())
13064 case AMDGPU::G_FADD:
13065 case AMDGPU::G_FSUB:
13066 case AMDGPU::G_FMUL:
13067 case AMDGPU::G_FCEIL:
13068 case AMDGPU::G_FFLOOR:
13069 case AMDGPU::G_FRINT:
13070 case AMDGPU::G_FNEARBYINT:
13071 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
13072 case AMDGPU::G_INTRINSIC_TRUNC:
13073 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
13074 case AMDGPU::G_FMA:
13075 case AMDGPU::G_FMAD:
13076 case AMDGPU::G_FSQRT:
13077 case AMDGPU::G_FDIV:
13078 case AMDGPU::G_FREM:
13079 case AMDGPU::G_FPOW:
13080 case AMDGPU::G_FPEXT:
13081 case AMDGPU::G_FLOG:
13082 case AMDGPU::G_FLOG2:
13083 case AMDGPU::G_FLOG10:
13084 case AMDGPU::G_FPTRUNC:
13085 case AMDGPU::G_AMDGPU_RCP_IFLAG:
13086 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
13087 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
13088 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
13089 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
13091 case AMDGPU::G_FNEG:
13092 case AMDGPU::G_FABS:
13093 case AMDGPU::G_FCOPYSIGN:
13095 case AMDGPU::G_FMINNUM:
13096 case AMDGPU::G_FMAXNUM:
13097 case AMDGPU::G_FMINNUM_IEEE:
13098 case AMDGPU::G_FMAXNUM_IEEE:
13099 case AMDGPU::G_FMINIMUM:
13100 case AMDGPU::G_FMAXIMUM: {
13108 case AMDGPU::G_BUILD_VECTOR:
13113 case AMDGPU::G_INTRINSIC:
13114 case AMDGPU::G_INTRINSIC_CONVERGENT:
13116 case Intrinsic::amdgcn_fmul_legacy:
13117 case Intrinsic::amdgcn_fmad_ftz:
13118 case Intrinsic::amdgcn_sqrt:
13119 case Intrinsic::amdgcn_fmed3:
13120 case Intrinsic::amdgcn_sin:
13121 case Intrinsic::amdgcn_cos:
13122 case Intrinsic::amdgcn_log:
13123 case Intrinsic::amdgcn_exp2:
13124 case Intrinsic::amdgcn_log_clamp:
13125 case Intrinsic::amdgcn_rcp:
13126 case Intrinsic::amdgcn_rcp_legacy:
13127 case Intrinsic::amdgcn_rsq:
13128 case Intrinsic::amdgcn_rsq_clamp:
13129 case Intrinsic::amdgcn_rsq_legacy:
13130 case Intrinsic::amdgcn_div_scale:
13131 case Intrinsic::amdgcn_div_fmas:
13132 case Intrinsic::amdgcn_div_fixup:
13133 case Intrinsic::amdgcn_fract:
13134 case Intrinsic::amdgcn_cvt_pkrtz:
13135 case Intrinsic::amdgcn_cubeid:
13136 case Intrinsic::amdgcn_cubema:
13137 case Intrinsic::amdgcn_cubesc:
13138 case Intrinsic::amdgcn_cubetc:
13139 case Intrinsic::amdgcn_frexp_mant:
13140 case Intrinsic::amdgcn_fdot2:
13141 case Intrinsic::amdgcn_trig_preop:
13160 if (
C.isDenormal()) {
13174 if (
C.isSignaling()) {
13193 return Op.isUndef() || isa<ConstantFPSDNode>(
Op);
13197SITargetLowering::performFCanonicalizeCombine(
SDNode *
N,
13198 DAGCombinerInfo &DCI)
const {
13201 EVT VT =
N->getValueType(0);
13210 EVT VT =
N->getValueType(0);
13211 return getCanonicalConstantFP(DAG,
SDLoc(
N), VT, CFP->getValueAPF());
13227 EVT EltVT =
Lo.getValueType();
13230 for (
unsigned I = 0;
I != 2; ++
I) {
13234 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
13235 }
else if (
Op.isUndef()) {
13247 if (isa<ConstantFPSDNode>(NewElts[1]))
13248 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
13254 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
13306 if (!MinK || !MaxK)
13319 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->
hasMed3_16()))
13320 return DAG.
getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
13361 if (
Info->getMode().DX10Clamp) {
13370 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->
hasMed3_16())) {
13402 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.
hasMin3Max3_16());
13411 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.
hasMin3Max3_16());
13420 DAGCombinerInfo &DCI)
const {
13423 EVT VT =
N->getValueType(0);
13424 unsigned Opc =
N->getOpcode();
13453 if (
SDValue Med3 = performIntMed3ImmCombine(
13458 if (
SDValue Med3 = performIntMed3ImmCombine(
13464 if (
SDValue Med3 = performIntMed3ImmCombine(
13469 if (
SDValue Med3 = performIntMed3ImmCombine(
13479 (VT == MVT::f32 || VT == MVT::f64 ||
13483 if (
SDValue Res = performFPMed3ImmCombine(DAG,
SDLoc(
N), Op0, Op1))
13494 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13495 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
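// A min/max pair whose constants are exactly 0.0 and 1.0 (in either order) is
// the clamp pattern, which can use the hardware clamp modifier instead of a
// med3 instruction.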
13504 DAGCombinerInfo &DCI)
const {
13505 EVT VT =
N->getValueType(0);
13528 if (
Info->getMode().DX10Clamp) {
13531 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13534 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
13537 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13548 DAGCombinerInfo &DCI)
const {
13552 return DCI.DAG.getUNDEF(
N->getValueType(0));
13560 bool IsDivergentIdx,
13565 unsigned VecSize = EltSize * NumElem;
13568 if (VecSize <= 64 && EltSize < 32)
13577 if (IsDivergentIdx)
13581 unsigned NumInsts = NumElem +
13582 ((EltSize + 31) / 32) * NumElem ;
13587 return NumInsts <= 16;
13592 return NumInsts <= 15;
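// Expanding a divergent dynamic extract/insert into a compare-and-select chain
// costs roughly one instruction per element plus one move per 32-bit piece of
// each element (the NumInsts formula above); the 16 / 15 instruction budgets
// bound when that expansion is still preferable to indirect indexing.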
13599 if (isa<ConstantSDNode>(Idx))
13613 SITargetLowering::performExtractVectorEltCombine(SDNode *N,
13614 DAGCombinerInfo &DCI) const {
13620 EVT ResVT = N->getValueType(0);
13639 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13667 DCI.AddToWorklist(Elt0.getNode());
13668 DCI.AddToWorklist(Elt1.getNode());
13690 if (!DCI.isBeforeLegalize())
13696 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
13697 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
13698 VecSize > 32 && VecSize % 32 == 0 && Idx) {
13701 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
13702 unsigned EltIdx = BitIndex / 32;
13703 unsigned LeftoverBitIdx = BitIndex % 32;
13707 DCI.AddToWorklist(Cast.getNode());
13711 DCI.AddToWorklist(Elt.getNode());
13714 DCI.AddToWorklist(Srl.getNode());
13718 DCI.AddToWorklist(Trunc.getNode());
13720 if (VecEltVT == ResVT) {
13732 SITargetLowering::performInsertVectorEltCombine(SDNode *N,
13733 DAGCombinerInfo &DCI) const {
13747 EVT IdxVT = Idx.getValueType();
13764 Src.getOperand(0).getValueType() == MVT::f16) {
13765 return Src.getOperand(0);
13768 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13769 APFloat Val = CFP->getValueAPF();
13770 bool LosesInfo = true;
13780 DAGCombinerInfo &DCI) const {
13782 "combine only useful on gfx8");
13784 SDValue TruncSrc = N->getOperand(0);
13785 EVT VT = N->getValueType(0);
13786 if (VT != MVT::f16)
13824 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
13826 const SDNode *N1) const {
13831 if (((VT == MVT::f32 &&
13833 (VT == MVT::f16 && Subtarget->hasMadF16() &&
13853 EVT VT = N->getValueType(0);
13854 if (VT != MVT::i32 && VT != MVT::i64)
13860 unsigned Opc = N->getOpcode();
13883 return DAG.getNode(Opc, SL, VT, Add1, Op2);
13915 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
13934 DAGCombinerInfo &DCI) const {
13938 EVT VT = N->getValueType(0);
13948 if (!N->isDivergent() && Subtarget->hasSMulHi())
13952 if (NumBits <= 32 || NumBits > 64)
13964 unsigned NumUsers = 0;
13992 bool MulSignedLo = false;
13993 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
14002 if (VT != MVT::i64) {
14025 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
14027 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
14028 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
14030 if (!MulLHSUnsigned32) {
14037 if (!MulRHSUnsigned32) {
14048 if (VT != MVT::i64)
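// The mad64_32 fold above relies on the usual wide-multiply decomposition:
// the low 32x32->64 product absorbs the addend, while the cross products
// only ever contribute to the high half. A standalone sketch of that
// arithmetic with plain integers (illustrative names, not the DAG code):
#include <cassert>
#include <cstdint>

static uint64_t mulAdd64(uint64_t A, uint64_t B, uint64_t C) {
  uint32_t ALo = uint32_t(A), AHi = uint32_t(A >> 32);
  uint32_t BLo = uint32_t(B), BHi = uint32_t(B >> 32);
  uint64_t Acc = uint64_t(ALo) * BLo + C;  // the mad_u64_u32-style low piece
  uint32_t Hi = uint32_t(Acc >> 32);
  Hi += uint32_t(uint64_t(AHi) * BLo);     // cross products: high half only
  Hi += uint32_t(uint64_t(ALo) * BHi);
  return (uint64_t(Hi) << 32) | uint32_t(Acc);
}

int main() {
  uint64_t A = 0x123456789abcdef0ULL, B = 0xfedcba9876543210ULL, C = 42;
  assert(mulAdd64(A, B, C) == A * B + C);
  return 0;
}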
14054 SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
14055 DAGCombinerInfo &DCI) const {
14057 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14081 DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags());
14092 static std::optional<ByteProvider<SDValue>>
14095 if (!Byte0 || Byte0->isConstantZero()) {
14096 return std::nullopt;
14099 if (Byte1 && !Byte1->isConstantZero()) {
14100 return std::nullopt;
14106 unsigned FirstCs = First & 0x0c0c0c0c;
14107 unsigned SecondCs = Second & 0x0c0c0c0c;
14108 unsigned FirstNoCs = First & ~0x0c0c0c0c;
14109 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
14109 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
14111 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
14112 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
14113 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
14114 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
14116 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
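// The helper above merges two v_perm_b32 byte-select masks. A mask byte of
// 0x0c selects a constant zero byte; in this combine the non-zero bytes are
// small dword-local indices, so each lane keeps the selecting byte from
// whichever mask is not zero there. Standalone sketch with a worked example
// (the mask values in main are illustrative):
#include <cassert>
#include <cstdint>

static uint32_t mergePermMasks(uint32_t First, uint32_t Second) {
  uint32_t FirstCs = First & 0x0c0c0c0c;    // lanes where First selects zero
  uint32_t SecondCs = Second & 0x0c0c0c0c;  // lanes where Second selects zero
  uint32_t FirstNoCs = First & ~0x0c0c0c0c;
  uint32_t SecondNoCs = Second & ~0x0c0c0c0c;
  // Each byte lane must be a zero-select in at least one of the two masks.
  assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
  assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
  assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
  assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
  return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
}

int main() {
  // Upper two lanes select bytes 3,2 from one mask; lower two lanes select
  // bytes 1,0 from the other; the merge yields the identity mask 0x03020100.
  assert(mergePermMasks(0x03020c0c, 0x0c0c0100) == 0x03020100);
  return 0;
}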
14140 for (int BPI = 0; BPI < 2; BPI++) {
14143 BPP = {Src1, Src0};
14145 unsigned ZeroMask = 0x0c0c0c0c;
14146 unsigned FMask = 0xFF << (8 * (3 - Step));
14148 unsigned FirstMask =
14149 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14150 unsigned SecondMask =
14151 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14155 int FirstGroup = -1;
14156 for (int I = 0; I < 2; I++) {
14158 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
14159 return IterElt.SrcOp == *BPP.first.Src &&
14160 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
14170 if (FirstGroup != -1) {
14172 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
14173 return IterElt.SrcOp == *BPP.second.Src &&
14174 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
14180 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
14188 unsigned ZeroMask = 0x0c0c0c0c;
14189 unsigned FMask = 0xFF << (8 * (3 - Step));
14193 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14197 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14206 if (Srcs.size() == 1) {
14207 auto *Elt = Srcs.begin();
14211 if (Elt->PermMask == 0x3020100)
14218 auto *FirstElt = Srcs.begin();
14219 auto *SecondElt = std::next(FirstElt);
14226 auto FirstMask = FirstElt->PermMask;
14227 auto SecondMask = SecondElt->PermMask;
14229 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
14230 unsigned FirstPlusFour = FirstMask | 0x04040404;
14233 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
14245 FirstElt = std::next(SecondElt);
14246 if (FirstElt == Srcs.end())
14249 SecondElt = std::next(FirstElt);
14252 if (SecondElt == Srcs.end()) {
14258 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
14264 return Perms.size() == 2
14270 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
14271 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
14272 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
14273 EntryMask += ZeroMask;
14278 auto Opcode = Op.getOpcode();
14284 static std::optional<bool>
14295 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
14298 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
14300 assert(!(S0IsUnsigned && S0IsSigned));
14301 assert(!(S1IsUnsigned && S1IsSigned));
14309 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
14315 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14316 return std::nullopt;
14328 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14329 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14334 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14340 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14341 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14342 return std::nullopt;
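// The signedness check above classifies each multiply operand from its known
// bits (leading zeros => known non-negative, leading ones => known negative)
// and gives up when the two operands disagree, since no single sdot4/udot4
// flavour fits. Standalone sketch of the unambiguous cases only; the
// partially-known cases use extra rules in the original and are not modelled
// here (struct and function names are illustrative):
#include <optional>

struct KnownSign {
  bool KnownNonNegative; // sign bit known zero
  bool KnownNegative;    // sign bit known one
};

// true => signed dot, false => unsigned dot, nullopt => give up.
static std::optional<bool> chooseDotSignedness(KnownSign S0, KnownSign S1) {
  if (S0.KnownNonNegative && S1.KnownNonNegative)
    return false;                      // udot4-style
  if (S0.KnownNegative && S1.KnownNegative)
    return true;                       // sdot4-style
  if ((S0.KnownNegative && S1.KnownNonNegative) ||
      (S0.KnownNonNegative && S1.KnownNegative))
    return std::nullopt;               // mixed signedness
  return std::nullopt;                 // partially unknown: not modelled here
}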
14348 DAGCombinerInfo &DCI) const {
14350 EVT VT = N->getValueType(0);
14357 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14362 if (SDValue V = reassociateScalarOps(N, DAG)) {
14366 if (VT == MVT::i64) {
14367 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
14374 std::optional<bool> IsSigned;
14380 int ChainLength = 0;
14381 for (int I = 0; I < 4; I++) {
14382 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
14385 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14388 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14393 TempNode->getOperand(MulIdx), *Src0, *Src1,
14394 TempNode->getOperand(MulIdx)->getOperand(0),
14395 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14399 IsSigned = *IterIsSigned;
14400 if (*IterIsSigned != *IsSigned)
14403 auto AddIdx = 1 - MulIdx;
14406 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14407 Src2s.push_back(TempNode->getOperand(AddIdx));
14417 TempNode->getOperand(AddIdx), *Src0, *Src1,
14418 TempNode->getOperand(AddIdx)->getOperand(0),
14419 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14423 if (*IterIsSigned != *IsSigned)
14427 ChainLength = I + 2;
14431 TempNode = TempNode->getOperand(AddIdx);
14433 ChainLength = I + 1;
14434 if (TempNode->getNumOperands() < 2)
14436 LHS = TempNode->getOperand(0);
14437 RHS = TempNode->getOperand(1);
14440 if (ChainLength < 2)
14446 if (ChainLength < 4) {
14456 bool UseOriginalSrc = false;
14457 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
14458 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14459 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14460 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
14462 auto Src0Mask = Src0s.begin()->PermMask;
14463 SrcBytes.push_back(Src0Mask & 0xFF000000);
14464 bool UniqueEntries = true;
14465 for (auto I = 1; I < 4; I++) {
14466 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14469 UniqueEntries = false;
14475 if (UniqueEntries) {
14476 UseOriginalSrc = true;
14478 auto *FirstElt = Src0s.begin();
14482 auto *SecondElt = Src1s.begin();
14484 SecondElt->DWordOffset);
14493 if (!UseOriginalSrc) {
14500 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14503 : Intrinsic::amdgcn_udot4,
14513 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14518 unsigned Opc = LHS.getOpcode();
14523 Opc = RHS.getOpcode();
14530 auto Cond = RHS.getOperand(0);
14538 return DAG.getNode(Opc, SL, VTList, Args);
14552 DAGCombinerInfo &DCI) const {
14554 EVT VT = N->getValueType(0);
14556 if (VT == MVT::i64) {
14557 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
14561 if (VT != MVT::i32)
14570 unsigned Opc = RHS.getOpcode();
14577 auto Cond = RHS.getOperand(0);
14585 return DAG.getNode(Opc, SL, VTList, Args);
14600 SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
14601 DAGCombinerInfo &DCI) const {
14603 if (N->getValueType(0) != MVT::i32)
14614 unsigned LHSOpc = LHS.getOpcode();
14615 unsigned Opc = N->getOpcode();
14625 DAGCombinerInfo &DCI) const {
14630 EVT VT = N->getValueType(0);
14642 if (A == LHS.getOperand(1)) {
14643 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14644 if (FusedOp != 0) {
14646 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
14654 if (A == RHS.getOperand(1)) {
14655 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14656 if (FusedOp != 0) {
14658 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
14667 DAGCombinerInfo &DCI) const {
14673 EVT VT = N->getValueType(0);
14686 if (A == LHS.getOperand(1)) {
14687 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14688 if (FusedOp != 0) {
14692 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
14701 if (A == RHS.getOperand(1)) {
14702 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14703 if (FusedOp != 0) {
14705 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
14714 DAGCombinerInfo &DCI) const {
14717 EVT VT = N->getValueType(0);
14731 bool IsNegative = false;
14732 if (CLHS->isExactlyValue(1.0) ||
14733 (IsNegative = CLHS->isExactlyValue(-1.0))) {
14749 DAGCombinerInfo &DCI) const {
14751 EVT VT = N->getValueType(0);
14765 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
14780 if (ScalarVT == MVT::f32 &&
14786 if (TrueNodeExpVal == INT_MIN)
14789 if (FalseNodeExpVal == INT_MIN)
14809 DAGCombinerInfo &DCI) const {
14811 EVT VT = N->getValueType(0);
14832 (N->getFlags().hasAllowContract() &&
14833 FMA->getFlags().hasAllowContract())) {
14867 if (Vec1 == Vec2 || Vec3 == Vec4)
14873 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
14882 DAGCombinerInfo &DCI) const {
14888 EVT VT = LHS.getValueType();
14891 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14893 CRHS = dyn_cast<ConstantSDNode>(LHS);
14917 return LHS.getOperand(0);
14923 isa<ConstantSDNode>(LHS.getOperand(1)) &&
14924 isa<ConstantSDNode>(LHS.getOperand(2)) &&
14925 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
14932 const APInt &CT = LHS.getConstantOperandAPInt(1);
14933 const APInt &CF = LHS.getConstantOperandAPInt(2);
14941 return LHS.getOperand(0);
14945 if (VT != MVT::f32 && VT != MVT::f64 &&
14961 const unsigned IsInfMask =
14963 const unsigned IsFiniteMask =
14977 SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
14978 DAGCombinerInfo &DCI) const {
14996 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
15000 unsigned ShiftOffset = 8 * Offset;
15002 ShiftOffset -= C->getZExtValue();
15004 ShiftOffset += C->getZExtValue();
15006 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
15008 MVT::f32, Shifted);
15019 DCI.AddToWorklist(N);
15026 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
15032 DAGCombinerInfo &DCI) const {
15042 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
15045 APFloat One(F.getSemantics(), "1.0");
15047 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
15054 switch (N->getOpcode()) {
15070 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
15080 switch (N->getOpcode()) {
15082 return performAddCombine(N, DCI);
15084 return performSubCombine(N, DCI);
15087 return performAddCarrySubCarryCombine(N, DCI);
15089 return performFAddCombine(N, DCI);
15091 return performFSubCombine(N, DCI);
15093 return performFDivCombine(N, DCI);
15095 return performFMulCombine(N, DCI);
15097 return performSetCCCombine(N, DCI);
15110 return performMinMaxCombine(N, DCI);
15112 return performFMACombine(N, DCI);
15114 return performAndCombine(N, DCI);
15116 return performOrCombine(N, DCI);
15119 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
15120 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
15126 return performXorCombine(N, DCI);
15128 return performZeroExtendCombine(N, DCI);
15130 return performSignExtendInRegCombine(N, DCI);
15132 return performClassCombine(N, DCI);
15134 return performFCanonicalizeCombine(N, DCI);
15136 return performRcpCombine(N, DCI);
15151 return performUCharToFloatCombine(N, DCI);
15153 return performFCopySignCombine(N, DCI);
15158 return performCvtF32UByteNCombine(N, DCI);
15160 return performFMed3Combine(N, DCI);
15162 return performCvtPkRTZCombine(N, DCI);
15164 return performClampCombine(N, DCI);
15167 EVT VT = N->getValueType(0);
15170 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
15173 EVT EltVT = Src.getValueType();
15174 if (EltVT != MVT::i16)
15184 return performExtractVectorEltCombine(N, DCI);
15186 return performInsertVectorEltCombine(N, DCI);
15188 return performFPRoundCombine(N, DCI);
15190 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
15196 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
15197 return performMemSDNodeCombine(MemNode, DCI);
15228 unsigned Opcode = Node->getMachineOpcode();
15232 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
15237 unsigned DmaskIdx =
15239 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
15240 unsigned NewDmask = 0;
15243 bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
15244 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
15247 unsigned TFCLane = 0;
15248 bool HasChain = Node->getNumValues() > 1;
15250 if (OldDmask == 0) {
15258 TFCLane = OldBitsSet;
15265 if (Use.getResNo() != 0)
15271 if (!User->isMachineOpcode() ||
15272 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
15284 if (UsesTFC && Lane == TFCLane) {
15289 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
15291 Dmask &= ~(1 << Comp);
15299 NewDmask |= 1 << Comp;
15304 bool NoChannels = !NewDmask;
15311 if (OldBitsSet == 1)
15317 if (NewDmask == OldDmask)
15326 unsigned NewChannels = BitsSet + UsesTFC;
15330 assert(NewOpcode != -1 &&
15331 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
15332 "failed to find equivalent MIMG op");
15340 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
15342 MVT ResultVT = NewChannels == 1
15345 : NewChannels == 5 ? 8
15359 if (NewChannels == 1) {
15369 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
15374 if (i || !NoChannels)
15379 if (NewUser != User) {
15389 Idx = AMDGPU::sub1;
15392 Idx = AMDGPU::sub2;
15395 Idx = AMDGPU::sub3;
15398 Idx = AMDGPU::sub4;
15409 Op = Op.getOperand(0);
15411 return isa<FrameIndexSDNode>(Op);
15421 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
15422 SDValue SrcVal = Node->getOperand(2);
15430 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
15432 SDNode *Glued = Node->getGluedNode();
15434 Node->getOperand(0), SL, VReg, SrcVal,
15440 return ToResultReg.getNode();
15445 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
15453 Node->getOperand(i).getValueType(),
15454 Node->getOperand(i)),
15466 unsigned Opcode = Node->getMachineOpcode();
15468 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
15469 !TII->isGather4(Opcode) &&
15471 return adjustWritemask(Node, DAG);
15474 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
15480 case AMDGPU::V_DIV_SCALE_F32_e64:
15481 case AMDGPU::V_DIV_SCALE_F64_e64: {
15485 SDValue Src0 = Node->getOperand(1);
15486 SDValue Src1 = Node->getOperand(3);
15487 SDValue Src2 = Node->getOperand(5);
15491 (Src0 == Src1 || Src0 == Src2))
15548 unsigned InitIdx = 0;
15550 if (TII->isImage(MI)) {
15558 unsigned TFEVal = TFE ? TFE->getImm() : 0;
15559 unsigned LWEVal = LWE ? LWE->getImm() : 0;
15560 unsigned D16Val = D16 ? D16->getImm() : 0;
15562 if (!TFEVal && !LWEVal)
15573 assert(MO_Dmask && "Expected dmask operand in instruction");
15575 unsigned dmask = MO_Dmask->getImm();
15582 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
15588 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15589 if (DstSize < InitIdx)
15592 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15600 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
15601 unsigned NewDst = 0;
15610 for (; SizeLeft; SizeLeft--, CurrIdx++) {
15611 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
15631 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
15644 if (TII->isVOP3(MI.getOpcode())) {
15646 TII->legalizeOperandsVOP3(MRI, MI);
15651 if (!MI.getDesc().operands().empty()) {
15652 unsigned Opc = MI.getOpcode();
15653 bool HasAGPRs = Info->mayNeedAGPRs();
15661 if ((I == Src2Idx) && (HasAGPRs))
15664 if (!Op.isReg() || !Op.getReg().isVirtual())
15666 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
15667 if (!TRI->hasAGPRs(RC))
15669 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
15670 if (!Src || !Src->isCopy() ||
15671 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
15673 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
15677 MRI.setRegClass(Op.getReg(), NewRC);
15680 if (TII->isMAI(MI)) {
15686 AMDGPU::OpName::scale_src0);
15687 if (Src0Idx != -1) {
15689 AMDGPU::OpName::scale_src1);
15690 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
15691 TII->usesConstantBus(MRI, MI, Src1Idx))
15692 TII->legalizeOpWithMove(MI, Src1Idx);
15700 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
15701 if (Src2->isReg() && Src2->getReg().isVirtual()) {
15702 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
15703 if (TRI->isVectorSuperClass(RC)) {
15704 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
15705 MRI.setRegClass(Src2->getReg(), NewRC);
15706 if (Src2->isTied())
15707 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
15716 if (TII->isImage(MI))
15717 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
15791 std::pair<unsigned, const TargetRegisterClass *>
15798 if (Constraint.size() == 1) {
15800 switch (Constraint[0]) {
15807 RC = &AMDGPU::SReg_32RegClass;
15810 RC = &AMDGPU::SGPR_64RegClass;
15815 return std::pair(0U, nullptr);
15822 RC = &AMDGPU::VGPR_32RegClass;
15827 return std::pair(0U, nullptr);
15836 RC = &AMDGPU::AGPR_32RegClass;
15841 return std::pair(0U, nullptr);
15850 return std::pair(0U, RC);
15855 if (RegName.consume_front("v")) {
15856 RC = &AMDGPU::VGPR_32RegClass;
15857 } else if (RegName.consume_front("s")) {
15858 RC = &AMDGPU::SGPR_32RegClass;
15859 } else if (RegName.consume_front("a")) {
15860 RC = &AMDGPU::AGPR_32RegClass;
15865 if (RegName.consume_front("[")) {
15876 return std::pair(0U, nullptr);
15879 RC = TRI->getVGPRClassForBitWidth(Width);
15881 RC = TRI->getSGPRClassForBitWidth(Width);
15883 RC = TRI->getAGPRClassForBitWidth(Width);
15885 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15890 return std::pair(0U, nullptr);
15892 return std::pair(Reg, RC);
15898 return std::pair(0U, nullptr);
15900 if (!Failed && Idx < RC->getNumRegs())
15908 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
15914 if (Constraint.size() == 1) {
15915 switch (Constraint[0]) {
15925 } else if (Constraint == "DA" || Constraint == "DB") {
15933 if (Constraint.size() == 1) {
15934 switch (Constraint[0]) {
15951 Val = Val & maskTrailingOnes<uint64_t>(Size);
15958 std::vector<SDValue> &Ops,
15973 unsigned Size = Op.getScalarValueSizeInBits();
15981 Val = C->getSExtValue();
15985 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15991 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
15994 Val = C->getSExtValue();
15998 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
16008 if (Constraint.size() == 1) {
16009 switch (Constraint[0]) {
16013 return isInt<16>(Val);
16017 return isInt<32>(Val);
16024 } else if (Constraint.size() == 2) {
16025 if (Constraint == "DA") {
16026 int64_t HiBits = static_cast<int32_t>(Val >> 32);
16027 int64_t LoBits = static_cast<int32_t>(Val);
16031 if (Constraint == "DB") {
16039 unsigned MaxSize) const {
16040 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
16043 MVT VT = Op.getSimpleValueType();
16068 switch (UnalignedClassID) {
16069 case AMDGPU::VReg_64RegClassID:
16070 return AMDGPU::VReg_64_Align2RegClassID;
16071 case AMDGPU::VReg_96RegClassID:
16072 return AMDGPU::VReg_96_Align2RegClassID;
16073 case AMDGPU::VReg_128RegClassID:
16074 return AMDGPU::VReg_128_Align2RegClassID;
16075 case AMDGPU::VReg_160RegClassID:
16076 return AMDGPU::VReg_160_Align2RegClassID;
16077 case AMDGPU::VReg_192RegClassID:
16078 return AMDGPU::VReg_192_Align2RegClassID;
16079 case AMDGPU::VReg_224RegClassID:
16080 return AMDGPU::VReg_224_Align2RegClassID;
16081 case AMDGPU::VReg_256RegClassID:
16082 return AMDGPU::VReg_256_Align2RegClassID;
16083 case AMDGPU::VReg_288RegClassID:
16084 return AMDGPU::VReg_288_Align2RegClassID;
16085 case AMDGPU::VReg_320RegClassID:
16086 return AMDGPU::VReg_320_Align2RegClassID;
16087 case AMDGPU::VReg_352RegClassID:
16088 return AMDGPU::VReg_352_Align2RegClassID;
16089 case AMDGPU::VReg_384RegClassID:
16090 return AMDGPU::VReg_384_Align2RegClassID;
16091 case AMDGPU::VReg_512RegClassID:
16092 return AMDGPU::VReg_512_Align2RegClassID;
16093 case AMDGPU::VReg_1024RegClassID:
16094 return AMDGPU::VReg_1024_Align2RegClassID;
16095 case AMDGPU::AReg_64RegClassID:
16096 return AMDGPU::AReg_64_Align2RegClassID;
16097 case AMDGPU::AReg_96RegClassID:
16098 return AMDGPU::AReg_96_Align2RegClassID;
16099 case AMDGPU::AReg_128RegClassID:
16100 return AMDGPU::AReg_128_Align2RegClassID;
16101 case AMDGPU::AReg_160RegClassID:
16102 return AMDGPU::AReg_160_Align2RegClassID;
16103 case AMDGPU::AReg_192RegClassID:
16104 return AMDGPU::AReg_192_Align2RegClassID;
16105 case AMDGPU::AReg_256RegClassID:
16106 return AMDGPU::AReg_256_Align2RegClassID;
16107 case AMDGPU::AReg_512RegClassID:
16108 return AMDGPU::AReg_512_Align2RegClassID;
16109 case AMDGPU::AReg_1024RegClassID:
16110 return AMDGPU::AReg_1024_Align2RegClassID;
16126 if (Info->isEntryFunction()) {
16133 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
16135 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
16136 : TRI->getAlignedHighSGPRForRC(MF, 2,
16137 &AMDGPU::SGPR_64RegClass);
16138 Info->setSGPRForEXECCopy(SReg);
16141 Info->getStackPtrOffsetReg()));
16142 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
16143 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
16147 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
16148 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
16150 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
16151 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
16153 Info->limitOccupancy(MF);
16155 if (ST.isWave32() && !MF.empty()) {
16156 for (auto &MBB : MF) {
16157 for (auto &MI : MBB) {
16158 TII->fixImplicitOperands(MI);
16168 if (ST.needsAlignedVGPRs()) {
16169 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
16175 if (NewClassID != -1)
16176 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
16185 const APInt &DemandedElts,
16187 unsigned Depth) const {
16189 unsigned Opc = Op.getOpcode();
16192 unsigned IID = Op.getConstantOperandVal(0);
16194 case Intrinsic::amdgcn_mbcnt_lo:
16195 case Intrinsic::amdgcn_mbcnt_hi: {
16201 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
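// The known-bits computation here exploits the fact that a lane-count result
// is bounded by a small power of two, so every bit above that width is known
// zero. Standalone sketch of which bits become known zero for a value known
// to be < 2^k (illustrative; not the KnownBits API):
#include <cassert>
#include <cstdint>

static uint32_t knownZeroMask(unsigned MaxValueBits) {
  if (MaxValueBits >= 32)
    return 0;                             // nothing is known
  return ~((1u << MaxValueBits) - 1);     // all bits at or above MaxValueBits
}

int main() {
  assert(knownZeroMask(6) == 0xFFFFFFC0); // value < 64 (wave64 lane count)
  assert(knownZeroMask(5) == 0xFFFFFFE0); // value < 32 (wave32 lane count)
  return 0;
}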
16211 Op, Known, DemandedElts, DAG, Depth);
16226 unsigned MaxValue =
16235 switch (MI->getOpcode()) {
16236 case AMDGPU::G_INTRINSIC:
16237 case AMDGPU::G_INTRINSIC_CONVERGENT: {
16240 case Intrinsic::amdgcn_workitem_id_x:
16243 case Intrinsic::amdgcn_workitem_id_y:
16246 case Intrinsic::amdgcn_workitem_id_z:
16249 case Intrinsic::amdgcn_mbcnt_lo:
16250 case Intrinsic::amdgcn_mbcnt_hi: {
16262 case Intrinsic::amdgcn_groupstaticsize: {
16273 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
16276 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
16279 case AMDGPU::G_AMDGPU_SMED3:
16280 case AMDGPU::G_AMDGPU_UMED3: {
16281 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
16308 unsigned Depth) const {
16310 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
16316 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
16343 if (Header->getAlignment() != PrefAlign)
16344 return Header->getAlignment();
16346 unsigned LoopSize = 0;
16354 LoopSize += TII->getInstSizeInBytes(MI);
16355 if (LoopSize > 192)
16360 if (LoopSize <= 64)
16363 if (LoopSize <= 128)
16364 return CacheLineAlign;
16370 auto I = Exit->getFirstNonDebugInstr();
16371 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
16372 return CacheLineAlign;
16381 if (PreTerm == Pre->begin() ||
16382 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
16386 auto ExitHead = Exit->getFirstNonDebugInstr();
16387 if (ExitHead == Exit->end() ||
16388 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
16393 return CacheLineAlign;
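// The alignment logic above sizes the loop by summing instruction byte sizes
// and then compares it against the 64/128/192-byte thresholds before deciding
// whether the header gets cache-line alignment and whether S_INST_PREFETCH
// bracketing is worth checking. A standalone sketch of the size bucketing
// only; the policy applied to each bucket lives in the surrounding code and
// is not reproduced here (names are illustrative):
#include <cstdio>

static const char *loopSizeBucket(unsigned LoopSizeBytes) {
  if (LoopSizeBytes <= 64)
    return "<=64";   // returns early in the code above
  if (LoopSizeBytes <= 128)
    return "<=128";  // the fragment returns the cache-line alignment here
  if (LoopSizeBytes <= 192)
    return "<=192";  // prefetch placement is examined in this range
  return ">192";     // returns early in the code above
}

int main() {
  std::printf("%s\n", loopSizeBucket(100));
  return 0;
}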
16401 N = N->getOperand(0).getNode();
16411 switch (N->getOpcode()) {
16419 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
16420 return !TRI->isSGPRReg(MRI, Reg);
16422 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
16426 return !TRI->isSGPRReg(MRI, Reg);
16430 unsigned AS = L->getAddressSpace();
16461 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
16463 return A->readMem() && A->writeMem();
16498 unsigned Depth) const {
16503 if (Info->getMode().DX10Clamp)
16515 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
16535 << "Hardware instruction generated for atomic "
16537 << " operation at memory scope " << MemScope;
16541 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
16542 Type *EltTy = VT->getElementType();
16543 return VT->getNumElements() == 2 &&
16562 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
16563 unsigned BW = IT->getBitWidth();
16564 return BW == 32 || BW == 64;
16576 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
16578 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
16579 return BW == 32 || BW == 64;
16586 return VT->getNumElements() == 2 &&
16587 VT->getElementType()->getPrimitiveSizeInBits() == 16;
16597 bool HasSystemScope) {
16604 if (HasSystemScope) {
16611 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
16624 const MDNode *NoaliasAddrSpaceMD =
16625 I->getMetadata(LLVMContext::MD_noalias_addrspace);
16626 if (!NoaliasAddrSpaceMD)
16629 for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E;
16631 auto *Low = mdconst::extract<ConstantInt>(
16634 auto *High = mdconst::extract<ConstantInt>(
16656 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
16669 bool HasSystemScope =
16856 if (HasSystemScope)
16908 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16909 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
16910 : &AMDGPU::SReg_32RegClass;
16911 if (!TRI->isSGPRClass(RC) && !isDivergent)
16912 return TRI->getEquivalentSGPRClass(RC);
16913 if (TRI->isSGPRClass(RC) && isDivergent)
16914 return TRI->getEquivalentVGPRClass(RC);
16926 unsigned WaveSize) {
16931 if (!IT || IT->getBitWidth() != WaveSize)
16934 if (!isa<Instruction>(V))
16936 if (!Visited.insert(V).second)
16938 bool Result = false;
16939 for (const auto *U : V->users()) {
16940 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
16941 if (V == U->getOperand(1)) {
16942 switch (Intrinsic->getIntrinsicID()) {
16946 case Intrinsic::amdgcn_if_break:
16947 case Intrinsic::amdgcn_if:
16948 case Intrinsic::amdgcn_else:
16953 if (V == U->getOperand(0)) {
16954 switch (Intrinsic->getIntrinsicID()) {
16958 case Intrinsic::amdgcn_end_cf:
16959 case Intrinsic::amdgcn_loop:
16965 Result = hasCFUser(U, Visited, WaveSize);
16974 const Value *V) const {
16975 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
16976 if (CI->isInlineAsm()) {
16985 for (auto &TC : TargetConstraints) {
17027 return MRI.hasOneNonDBGUse(N0);
17034 if (I.getMetadata("amdgpu.noclobber"))
17036 if (I.getMetadata("amdgpu.last.use"))
17046 if (!Def->isMachineOpcode())
17056 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
17057 PhysReg = AMDGPU::SCC;
17059 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
17068 if (!I->hasOneUse())
17074 switch (I->getOpcode()) {
17075 case Instruction::FMul: {
17076 if (User->getOpcode() != Instruction::FSub &&
17077 User->getOpcode() != Instruction::FAdd)
17082 return ((!I->hasAllowContract() || !User->hasAllowContract()) &&
17141 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
17152 Alignment = RMW->getAlign();
17167 RMW->getType()->isFloatTy();
17170 bool ReturnValueIsUsed = !AI->use_empty();
17179 if (FullFlatEmulation) {
17190 std::prev(BB->end())->eraseFromParent();
17193 Value *LoadedShared = nullptr;
17194 if (FullFlatEmulation) {
17196 Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
17197 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
17205 LoadedShared = Clone;
17212 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
17220 Value *LoadedPrivate;
17223 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
17226 LoadedPrivate, RMW->getValOperand());
17230 auto [ResultLoad, Equal] =
17245 if (FullFlatEmulation) {
17255 if (!FullFlatEmulation) {
17260 MDNode *RangeNotPrivate =
17263 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
17271 if (ReturnValueIsUsed) {
17274 if (FullFlatEmulation)
17289 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
17290 ConstVal && ConstVal->isNullValue()) {
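// The expansion above lowers a flat-address atomicrmw that might touch LDS or
// private (scratch) memory by branching on the amdgcn.is.shared and
// amdgcn.is.private address tests, with a non-atomic load/op/store on the
// private path and the real hardware atomic on the global path. A standalone
// sketch of just the dispatch structure, with no IR building; AddressKind and
// the handlers are illustrative stand-ins for the basic blocks it creates:
#include <functional>

enum class FlatAddressKind { Shared, Private, Global };

static int expandFlatAtomicRMW(FlatAddressKind Kind,
                               const std::function<int()> &SharedPath,
                               const std::function<int()> &PrivatePath,
                               const std::function<int()> &GlobalPath) {
  if (Kind == FlatAddressKind::Shared)
    return SharedPath();   // "is.shared" was true: LDS path
  if (Kind == FlatAddressKind::Private)
    return PrivatePath();  // "is.private" was true: plain load/op/store
  return GlobalPath();     // otherwise: the real global/flat atomic
}

int main() {
  return expandFlatAtomicRMW(
      FlatAddressKind::Global, [] { return 1; }, [] { return 2; },
      [] { return 0; });
}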
bool hasAtomicFaddInsts() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const
bool hasAtomicBufferPkAddBF16Inst() const
bool hasAtomicFaddNoRtnInsts() const
bool hasFlatBufferGlobalAtomicFaddF64Inst() const
bool hasScalarDwordx3Loads() const
bool hasLDSFPAtomicAddF32() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
bool hasDot8Insts() const
bool hasDS96AndDS128() const
bool useFlatForGlobal() const
Generation getGeneration() const
bool hasAtomicBufferGlobalPkAddF16Insts() const
bool hasScalarAddSub64() const
bool hasUnpackedD16VMem() const
bool hasAtomicGlobalPkAddBF16Inst() const
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
bool hasPackedTID() const
bool hasAddNoCarry() const
bool hasGWSAutoReplay() const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
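The long run of hasXXX() predicates above is how lowering code asks the GCNSubtarget which instructions and behaviors a given GPU generation provides; lowering decisions are usually simple branches on these queries. A hedged sketch (the helper and the particular predicates chosen are illustrative, not taken from this file):

#include "GCNSubtarget.h"

using namespace llvm;

// Illustrative only: choose an addressing strategy from subtarget features.
static bool preferFlatGlobalAccess(const GCNSubtarget &ST) {
  // Targets without dedicated global instructions must go through FLAT.
  if (!ST.hasFlatGlobalInsts())
    return true;
  // A tuning flag can still force FLAT for global accesses.
  return ST.useFlatForGlobal();
}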
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
int64_t getOffset() const
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
BasicBlock::iterator GetInsertPoint() const
BasicBlock * GetInsertBlock() const
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
LLVMContext & getContext() const
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
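Taken together, the IRBuilder entries above cover what is needed to stitch new control flow into a function at the IR level: position the builder, create the branches, and merge values back with a PHI. A minimal sketch, assuming the caller has already created the blocks and the condition value:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/BasicBlock.h"

using namespace llvm;

// Sketch: emit `Res = Cond ? ThenVal : ElseVal` as explicit control flow.
static PHINode *emitSelectAsBranch(IRBuilder<> &B, Value *Cond,
                                   Value *ThenVal, Value *ElseVal,
                                   BasicBlock *ThenBB, BasicBlock *ElseBB,
                                   BasicBlock *MergeBB) {
  // Branch from the current insertion block into the two arms.
  B.CreateCondBr(Cond, ThenBB, ElseBB);

  B.SetInsertPoint(ThenBB);
  B.CreateBr(MergeBB);

  B.SetInsertPoint(ElseBB);
  B.CreateBr(MergeBB);

  // Join the two arms with a PHI in the merge block.
  B.SetInsertPoint(MergeBB);
  PHINode *Phi = B.CreatePHI(ThenVal->getType(), /*NumReservedValues=*/2);
  Phi->addIncoming(ThenVal, ThenBB);
  Phi->addIncoming(ElseVal, ElseBB);
  return Phi;
}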
Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
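LLT is the GlobalISel-facing type used by hooks such as allowsMisalignedMemoryAccesses and getPreferredShiftAmountTy later in this list; the factories and queries above cover most uses. A small sketch (the header path differs between LLVM versions, as noted):

#include "llvm/CodeGenTypes/LowLevelType.h"  // "llvm/CodeGen/LowLevelType.h" in older trees

using namespace llvm;

static void lltExamples() {
  LLT S32 = LLT::scalar(32);                  // plain 32-bit scalar
  LLT P1  = LLT::pointer(1, 64);              // 64-bit pointer in address space 1

  unsigned Bits = S32.getScalarSizeInBits();  // 32
  LLT S16 = S32.changeElementSize(16);        // same shape, 16-bit elements
  (void)P1; (void)Bits; (void)S16;
}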
This is an important class for using LLVM in a threaded context.
std::optional< StringRef > getSyncScopeName(SyncScope::ID Id) const
getSyncScopeName - Returns the name of a SyncScope::ID registered with LLVMContext,...
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
const MDOperand & getOperand(unsigned I) const
unsigned getNumOperands() const
Return number of MDNode operands.
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector type has a power-of-2 number of elements.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
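The MVT queries above appear throughout type legalization: the static factories build machine value types, and the accessors decompose them. A short sketch (header path differs between LLVM versions, as noted):

#include "llvm/CodeGenTypes/MachineValueType.h"  // "llvm/Support/MachineValueType.h" in older trees

using namespace llvm;

static void mvtExamples() {
  MVT V4I32 = MVT::getVectorVT(MVT::i32, 4);             // <4 x i32>
  MVT Elt = V4I32.getScalarType();                        // MVT::i32
  unsigned N = V4I32.getVectorNumElements();              // 4
  uint64_t Bytes = V4I32.getStoreSize().getFixedValue();  // 16 bytes per full store
  MVT I128 = MVT::getIntegerVT(128);                      // wide integer type
  (void)Elt; (void)N; (void)Bytes; (void)I128;
}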
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into two pieces at SplitInst.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Align getAlignment() const
Return alignment of the basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
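MachineInstrBuilder is the fluent interface used inside custom-inserter style code to assemble MachineInstrs operand by operand (addReg, addImm, addMBB, cloneMemRefs). A hedged sketch that emits a generic COPY into a fresh virtual register; the register class is an assumption supplied by the caller:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"

using namespace llvm;

// Sketch: copy SrcReg into a new virtual register of class RC.
static Register emitCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                         const DebugLoc &DL, const TargetInstrInfo &TII,
                         MachineRegisterInfo &MRI, Register SrcReg,
                         const TargetRegisterClass *RC) {
  Register DstReg = MRI.createVirtualRegister(RC);
  BuildMI(MBB, I, DL, TII.get(TargetOpcode::COPY), DstReg)
      .addReg(SrcReg);  // single source operand
  return DstReg;
}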
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
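The MachineMemOperand flags and accessors above describe what a memory-touching node is allowed to do; backends create MMOs through MachineFunction::getMachineMemOperand and attach them to loads, stores, and memory-intrinsic nodes. A sketch, assuming the caller already has pointer info and an alignment:

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"

using namespace llvm;

// Sketch: describe an invariant, dereferenceable 32-bit load.
static MachineMemOperand *makeLoadMMO(MachineFunction &MF,
                                      MachinePointerInfo PtrInfo,
                                      Align Alignment) {
  auto Flags = MachineMemOperand::MOLoad |
               MachineMemOperand::MODereferenceable |
               MachineMemOperand::MOInvariant;
  return MF.getMachineMemOperand(PtrInfo, Flags, LLT::scalar(32), Alignment);
}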
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
A Module instance is used to store all the information related to an LLVM module.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
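SDValue and SDNode are inspected constantly in DAG combines: check the opcode, peek at operands, and pull constants out. A small, purely illustrative pattern-match helper (not a combine from this file):

#include "llvm/CodeGen/SelectionDAGNodes.h"

using namespace llvm;

// Sketch: match (add X, C) with C a constant, reporting X and C on success.
static bool matchAddWithConstant(SDValue V, SDValue &X, uint64_t &C) {
  if (V.getOpcode() != ISD::ADD)
    return false;
  auto *CN = dyn_cast<ConstantSDNode>(V.getOperand(1));
  if (!CN)
    return false;
  X = V.getOperand(0);
  C = CN->getZExtValue();  // zero-extended constant value
  return true;
}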
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns whether Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool hasWorkGroupIDZ() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the given node can be combined to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, tests whether the value is known to never be any NaN; if SNaN is true, tests whether it is known to never be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
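LowerOperation is the entry point for every opcode a target marks Custom; the conventional implementation is a switch that dispatches to per-opcode helpers like the lowerGET_ROUNDING/lowerPREFETCH members listed here. A generic sketch of that dispatch shape only; MyTargetLowering, lowerGlobalAddress, and the opcode choices are hypothetical and not the SI implementation:

// Illustrative dispatch shape in a hypothetical TargetLowering subclass.
SDValue MyTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("custom lowering requested for unexpected opcode");
  case ISD::GlobalAddress:
    return lowerGlobalAddress(Op, DAG);      // hypothetical helper
  case ISD::DYNAMIC_STACKALLOC:
    return LowerDYNAMIC_STACKALLOC(Op, DAG); // per-opcode helper as listed above
  }
}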
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion using a target-specific method.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns whether it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist instruction in then/else to if.
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getRegister(Register Reg, EVT VT)
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
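Most of the SelectionDAG entries above are node factories; custom lowering code builds a replacement subgraph with them and hands the result back to the legalizer. A small sketch that widens a value and adds a constant (the types and the choice of operation are illustrative only):

#include "llvm/CodeGen/SelectionDAG.h"

using namespace llvm;

// Sketch: produce (i32 (add (zext/trunc Op), 4)) at the given location.
static SDValue buildZExtAddFour(SelectionDAG &DAG, const SDLoc &DL, SDValue Op) {
  EVT VT = MVT::i32;
  SDValue Wide = DAG.getZExtOrTrunc(Op, DL, VT);     // adjust to i32
  SDValue Four = DAG.getConstant(4, DL, VT);         // constant node
  return DAG.getNode(ISD::ADD, DL, VT, Wide, Four);  // arithmetic node
}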
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
iterator insert(iterator I, T &&Elt)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
StringRef - Represent a constant reference to a string, i.e.
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
constexpr size_t size() const
size - Get the string size.
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
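StringRef and StringSwitch are the usual tools for matching textual constraint codes and register names in the inline-assembly hooks above. A short sketch; the constraint strings and the enum are examples only:

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

using namespace llvm;

enum class MyConstraint { SGPR, VGPR, Unknown };

// Sketch: classify a constraint string into a small enum.
static MyConstraint classifyConstraint(StringRef C) {
  if (C.starts_with("{") && C.ends_with("}"))
    return MyConstraint::Unknown;  // physical-register form, handled elsewhere
  return StringSwitch<MyConstraint>(C)
      .Case("s", MyConstraint::SGPR)
      .Case("v", MyConstraint::VGPR)
      .Default(MyConstraint::Unknown);
}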
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
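The TargetLoweringBase setters above are normally called once, in a target's TargetLowering constructor, to describe register files and per-operation legalization actions. A hedged sketch of that constructor-time setup; MyTargetLowering, MyTargetSubtarget, and MyTarget::GPR32RegClass are hypothetical names, and the chosen types and actions are examples only:

// Sketch of constructor-time configuration in a hypothetical target.
MyTargetLowering::MyTargetLowering(const TargetMachine &TM,
                                   const MyTargetSubtarget &STI)
    : TargetLowering(TM) {
  // Tell the legalizer which types live in which register file.
  addRegisterClass(MVT::i32, &MyTarget::GPR32RegClass);

  // Mark operations the target cannot do natively.
  setOperationAction(ISD::SDIV, MVT::i32, Expand);   // expand into a sequence/libcall
  setOperationAction(ISD::BR_CC, MVT::i32, Custom);  // handled in LowerOperation

  // Truncating f64 -> f32 stores are unsupported in this sketch.
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // Boolean results are 0/1 in this hypothetical ABI.
  setBooleanContents(ZeroOrOneBooleanContent);

  // Derive register properties once all register classes are registered.
  computeRegisterProperties(STI.getRegisterInfo());
}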
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
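A standalone illustration of the Type predicates listed above (purely illustrative, not taken from this file): classify a scalar or vector type by its scalar element.

  #include "llvm/IR/Type.h"
  using namespace llvm;

  // True for half/bfloat/float elements and for integer elements of at most 32 bits.
  static bool hasSmallScalarElement(Type *Ty) {
    Type *Elt = Ty->getScalarType(); // element type for vectors, Ty itself otherwise
    if (Elt->isHalfTy() || Elt->isBFloatTy() || Elt->isFloatTy())
      return true;
    return Elt->isIntegerTy() && Elt->getScalarSizeInBits() <= 32;
  }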
A Use represents the edge between a Value definition and its users.
User * getUser() const
Returns the User that contains this Use.
unsigned getOperandNo() const
Return the operand # of this use in its User.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< use_iterator > uses()
void takeName(Value *V)
Transfer the name from V to this value.
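The usual replacement idiom built from the Value API above; a minimal sketch that assumes NewV already computes the same result as OldV.

  #include "llvm/IR/Value.h"
  using namespace llvm;

  static void replaceAndRename(Value *OldV, Value *NewV) {
    NewV->takeName(OldV);           // keep the old name for readable IR
    OldV->replaceAllUsesWith(NewV); // rewrite every use edge to point at NewV
  }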
Type * getElementType() const
constexpr bool isZero() const
const ParentTy * getParent() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
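A small sketch of how these enumerators are typically consulted (it assumes the AMDGPU backend headers that define AMDGPUAS are on the include path): flat and global pointers may reach global memory, whereas LOCAL or PRIVATE accesses cannot. The isFlatGlobalAddrSpace helper listed further below wraps a broader version of the same test.

  #include "llvm/IR/Instructions.h"
  // AMDGPUAS is provided by the AMDGPU backend headers (assumed available).

  static bool mayAccessGlobalMemory(const llvm::LoadInst &LI) {
    unsigned AS = LI.getPointerAddressSpace();
    return AS == llvm::AMDGPUAS::FLAT_ADDRESS ||
           AS == llvm::AMDGPUAS::GLOBAL_ADDRESS;
  }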
@ BUFFER_ATOMIC_COND_SUB_U32
@ TBUFFER_LOAD_FORMAT_D16
@ TBUFFER_STORE_FORMAT_D16
@ BUFFER_STORE_FORMAT_D16
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
@ TowardZeroF32_TowardNegativeF64
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.

bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
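These predicates decide whether an immediate can be encoded inline instead of consuming an extra literal dword; roughly, integers in [-16, 64] and a short list of FP constants (0.0, ±0.5, ±1.0, ±2.0, ±4.0, plus 1/(2*pi) when HasInv2Pi) qualify. A thin sketch assuming AMDGPUBaseInfo.h from the backend is available:

  // 0 if Imm folds into the instruction encoding, 1 extra dword otherwise.
  static unsigned literalCost32(int32_t Imm, bool HasInv2Pi) {
    return llvm::AMDGPU::isInlinableLiteral32(Imm, HasInv2Pi) ? 0 : 1;
  }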
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SET_ROUNDING
Set rounding mode.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
@ SMULO
Same as the overflow-aware addition nodes ([SU]ADDO), but for multiplication.
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that behave the same as FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
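As a concrete use of a few of these opcodes, a hedged fragment (assumed to live inside a target's fadd combine; N, DAG, and the enclosing function are assumptions, not code from this file): fold (fadd (fmul a, b), c) into FMAD when both nodes permit contraction. A real combine would also check that FMAD is legal for VT.

  // Inside a hypothetical performFAddCombine(SDNode *N, SelectionDAG &DAG):
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  SDValue Mul = N->getOperand(0);
  SDValue Addend = N->getOperand(1);
  if (Mul.getOpcode() == ISD::FMUL && N->getFlags().hasAllowContract() &&
      Mul->getFlags().hasAllowContract())
    return DAG.getNode(ISD::FMAD, DL, VT, Mul.getOperand(0), Mul.getOperand(1),
                       Addend);
  return SDValue();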
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
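A tiny standalone illustration of these two helpers (it assumes the relevant LLVM libraries are linked): exchanging the operands of a comparison must also swap the predicate, and signedness can be queried before choosing a machine compare.

  #include "llvm/CodeGen/ISDOpcodes.h"
  #include <cassert>
  using namespace llvm;

  int main() {
    // (X > Y) with the operands exchanged becomes (Y < X).
    assert(ISD::getSetCCSwappedOperands(ISD::SETGT) == ISD::SETLT);
    assert(ISD::isSignedIntSetCC(ISD::SETLT) &&
           !ISD::isSignedIntSetCC(ISD::SETULT));
    return 0;
  }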
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
Function * getDeclarationIfExists(Module *M, ID id, ArrayRef< Type * > Tys, FunctionType *FT=nullptr)
This version supports overloaded intrinsics.
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ System
Synchronized with respect to all concurrently executing threads.
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
int popcount(T Value) noexcept
Count the number of set bits in a value.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
testing::Matcher< const detail::ErrorHolder & > Failed()
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is congruent to Skew modulo Align.
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant, stopping at the first 1.
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
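A standalone illustration of several of the integer helpers above (the values in the comments are what the calls return):

  #include "llvm/Support/Alignment.h"
  #include "llvm/Support/MathExtras.h"
  #include <cstdint>
  #include <cstdio>
  using namespace llvm;

  int main() {
    uint64_t V = 0x0000000123456789ULL;
    std::printf("hi=%#x lo=%#x\n", Hi_32(V), Lo_32(V));     // 0x1, 0x23456789
    std::printf("log2=%u pow2=%d\n", Log2_32(64),           // 6
                int(isPowerOf2_32(96)));                    // 0 (96 is not a power of two)
    std::printf("ceil=%u aligned=%llu\n",
                unsigned(divideCeil(10u, 3u)),              // 4
                (unsigned long long)alignTo(10, Align(8))); // 16
    return 0;
  }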
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
static const fltSemantics & IEEEhalf() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has fewer bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
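A standalone illustration of the EVT queries above: build <4 x half>, measure it, and derive the same-sized integer vector type (the usual step before bitcasting through integers).

  #include "llvm/CodeGen/ValueTypes.h"
  #include "llvm/IR/LLVMContext.h"
  using namespace llvm;

  int main() {
    LLVMContext Ctx;
    EVT V4F16 = EVT::getVectorVT(Ctx, MVT::f16, 4);  // <4 x half>
    uint64_t Bits = V4F16.getFixedSizeInBits();      // 64
    EVT AsInt = V4F16.changeTypeToInteger();         // <4 x i16>
    bool SameSize = AsInt.bitsEq(V4F16);             // true
    (void)Bits; (void)SameSize;
    return 0;
  }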
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
bool isUnknown() const
Returns true if we don't know any bits.
void resetAll()
Resets the known state of all bits.
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
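A standalone illustration of the KnownBits API above: if both addends are known to fit in 16 bits, their sum still has at least 15 known leading zeros (one bit is reserved for the carry).

  #include "llvm/Support/KnownBits.h"
  using namespace llvm;

  int main() {
    KnownBits LHS(32), RHS(32);
    LHS.Zero.setHighBits(16); // both values are known to fit in the low 16 bits
    RHS.Zero.setHighBits(16);
    KnownBits Sum = KnownBits::add(LHS, RHS);
    unsigned LZ = Sum.countMinLeadingZeros(); // at least 15
    (void)LZ;
    return 0;
  }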
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
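A short fragment showing the typical pairing of these factories with a DAG memory node (DAG, MF, FI, Chain, DL, and PtrVT are assumed to exist in the surrounding lowering code; not code from this file):

  // Describe a frame-slot access so alias analysis and scheduling know exactly
  // which stack object the load reads.
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
  SDValue Load = DAG.getLoad(MVT::i32, DL, Chain, FIN, PtrInfo);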
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const