27#include "llvm/IR/IntrinsicsAMDGPU.h"
35#define DEBUG_TYPE "AMDGPUtti"
38 "amdgpu-unroll-threshold-private",
39 cl::desc(
"Unroll threshold for AMDGPU if private memory used in a loop"),
43 "amdgpu-unroll-threshold-local",
44 cl::desc(
"Unroll threshold for AMDGPU if local memory used in a loop"),
48 "amdgpu-unroll-threshold-if",
49 cl::desc(
"Unroll threshold increment for AMDGPU for each if statement inside loop"),
53 "amdgpu-unroll-runtime-local",
54 cl::desc(
"Allow runtime unroll for AMDGPU if local memory used in a loop"),
58 "amdgpu-unroll-max-block-to-analyze",
59 cl::desc(
"Inner loop block size threshold to analyze in unroll for AMDGPU"),
64 cl::desc(
"Cost of alloca argument"));
72 cl::desc(
"Maximum alloca size to use for inline cost"));
77 cl::desc(
"Maximum number of BBs allowed in a function after inlining"
78 " (compile time constraint)"));
82 "amdgpu-memcpy-loop-unroll",
83 cl::desc(
"Unroll factor (affecting 4x32-bit operations) to use for memory "
84 "operations when lowering statically-sized memcpy, memmove, or"
94 for (
const Value *V :
I->operand_values()) {
99 return SubLoop->contains(PHI); }))
109 TargetTriple(TM->getTargetTriple()),
111 TLI(ST->getTargetLowering()) {}
116 const Function &
F = *L->getHeader()->getParent();
118 F.getFnAttributeAsParsedInteger(
"amdgpu-unroll-threshold", 300);
119 UP.
MaxCount = std::numeric_limits<unsigned>::max();
132 const unsigned MaxAlloca = (256 - 16) * 4;
138 if (
MDNode *LoopUnrollThreshold =
140 if (LoopUnrollThreshold->getNumOperands() == 2) {
142 LoopUnrollThreshold->getOperand(1));
143 if (MetaThresholdValue) {
149 ThresholdPrivate = std::min(ThresholdPrivate, UP.
Threshold);
150 ThresholdLocal = std::min(ThresholdLocal, UP.
Threshold);
155 unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
158 unsigned LocalGEPsSeen = 0;
161 return SubLoop->contains(BB); }))
174 if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
175 (L->contains(Succ1) && L->isLoopExiting(Succ1)))
181 << *L <<
" due to " << *Br <<
'\n');
193 unsigned AS =
GEP->getAddressSpace();
194 unsigned Threshold = 0;
196 Threshold = ThresholdPrivate;
198 Threshold = ThresholdLocal;
206 const Value *Ptr =
GEP->getPointerOperand();
212 if (!AllocaSize || AllocaSize->getFixedValue() > MaxAlloca)
221 if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
226 << *L <<
" due to LDS use.\n");
231 bool HasLoopDef =
false;
234 if (!Inst || L->isLoopInvariant(
Op))
238 return SubLoop->contains(Inst); }))
262 << *L <<
" due to " << *
GEP <<
'\n');
290 AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
291 AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureUseFlatForGlobal,
292 AMDGPU::FeatureUnalignedScratchAccess, AMDGPU::FeatureUnalignedAccessMode,
294 AMDGPU::FeatureAutoWaitcntBeforeBarrier,
297 AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
298 AMDGPU::FeatureTrapHandler,
302 AMDGPU::FeatureSRAMECC,
305 AMDGPU::FeatureFastFMAF32, AMDGPU::FeatureHalfRate64Ops};
310 TLI(ST->getTargetLowering()), CommonTTI(TM,
F),
311 IsGraphics(
AMDGPU::isGraphics(
F.getCallingConv())) {
314 HasFP64FP16Denormals =
319 return !
F || !ST->isSingleLaneExecution(*
F);
351 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
352 return 32 * 4 / ElemWidth;
355 return (ElemWidth == 8 && ST->has16BitInsts()) ? 4
356 : (ElemWidth == 16 && ST->has16BitInsts()) ? 2
357 : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
362 unsigned ChainSizeInBytes,
364 unsigned VecRegBitWidth = VF * LoadSize;
367 return 128 / LoadSize;
373 unsigned ChainSizeInBytes,
375 unsigned VecRegBitWidth = VF * StoreSize;
376 if (VecRegBitWidth > 128)
377 return 128 / StoreSize;
393 return 8 * ST->getMaxPrivateElementSize();
401 unsigned AddrSpace)
const {
406 return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) &&
407 ChainSizeInBytes <= ST->getMaxPrivateElementSize();
414 unsigned AddrSpace)
const {
420 unsigned AddrSpace)
const {
430 unsigned DestAddrSpace,
Align SrcAlign,
Align DestAlign,
431 std::optional<uint32_t> AtomicElementSize)
const {
433 if (AtomicElementSize)
447 unsigned I32EltsInVector = 4;
457 unsigned RemainingBytes,
unsigned SrcAddrSpace,
unsigned DestAddrSpace,
459 std::optional<uint32_t> AtomicCpySize)
const {
463 OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
464 DestAlign, AtomicCpySize);
467 while (RemainingBytes >= 16) {
469 RemainingBytes -= 16;
473 while (RemainingBytes >= 8) {
479 while (RemainingBytes >= 4) {
485 while (RemainingBytes >= 2) {
491 while (RemainingBytes) {
509 case Intrinsic::amdgcn_ds_ordered_add:
510 case Intrinsic::amdgcn_ds_ordered_swap: {
513 if (!Ordering || !Volatile)
516 unsigned OrderingVal = Ordering->getZExtValue();
523 Info.WriteMem =
true;
524 Info.IsVolatile = !Volatile->isZero();
538 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
539 int ISD = TLI->InstructionOpcodeToISD(Opcode);
543 unsigned NElts = LT.second.isVector() ?
544 LT.second.getVectorNumElements() : 1;
553 return get64BitInstrCost(
CostKind) * LT.first * NElts;
555 if (ST->has16BitInsts() && SLT == MVT::i16)
556 NElts = (NElts + 1) / 2;
559 return getFullRateInstrCost() * LT.first * NElts;
565 if (SLT == MVT::i64) {
567 return 2 * getFullRateInstrCost() * LT.first * NElts;
570 if (ST->has16BitInsts() && SLT == MVT::i16)
571 NElts = (NElts + 1) / 2;
573 return LT.first * NElts * getFullRateInstrCost();
575 const int QuarterRateCost = getQuarterRateInstrCost(
CostKind);
576 if (SLT == MVT::i64) {
577 const int FullRateCost = getFullRateInstrCost();
578 return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
581 if (ST->has16BitInsts() && SLT == MVT::i16)
582 NElts = (NElts + 1) / 2;
585 return QuarterRateCost * NElts * LT.first;
593 const int OPC = TLI->InstructionOpcodeToISD(
FAdd->getOpcode());
595 if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
597 if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
610 if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
611 NElts = (NElts + 1) / 2;
612 if (ST->hasBF16PackedInsts() && SLT == MVT::bf16)
613 NElts = (NElts + 1) / 2;
615 return LT.first * NElts * get64BitInstrCost(
CostKind);
617 if (ST->has16BitInsts() && SLT == MVT::f16)
618 NElts = (NElts + 1) / 2;
620 if (SLT == MVT::f32 || SLT == MVT::f16 || SLT == MVT::bf16)
621 return LT.first * NElts * getFullRateInstrCost();
627 if (SLT == MVT::f64) {
632 if (!ST->hasUsableDivScaleConditionOutput())
633 Cost += 3 * getFullRateInstrCost();
635 return LT.first *
Cost * NElts;
640 if ((SLT == MVT::f32 && !HasFP32Denormals) ||
641 (SLT == MVT::f16 && ST->has16BitInsts())) {
642 return LT.first * getQuarterRateInstrCost(
CostKind) * NElts;
646 if (SLT == MVT::f16 && ST->has16BitInsts()) {
653 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(
CostKind);
654 return LT.first *
Cost * NElts;
661 int Cost = getQuarterRateInstrCost(
CostKind) + getFullRateInstrCost();
662 return LT.first *
Cost * NElts;
665 if (SLT == MVT::f32 || SLT == MVT::f16) {
667 int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
668 1 * getQuarterRateInstrCost(
CostKind);
670 if (!HasFP32Denormals) {
672 Cost += 2 * getFullRateInstrCost();
675 return LT.first * NElts *
Cost;
681 return TLI->isFNegFree(SLT) ? 0 : NElts;
695 case Intrinsic::fmuladd:
696 case Intrinsic::copysign:
697 case Intrinsic::minimumnum:
698 case Intrinsic::maximumnum:
699 case Intrinsic::canonicalize:
701 case Intrinsic::round:
702 case Intrinsic::uadd_sat:
703 case Intrinsic::usub_sat:
704 case Intrinsic::sadd_sat:
705 case Intrinsic::ssub_sat:
716 switch (ICA.
getID()) {
717 case Intrinsic::fabs:
720 case Intrinsic::amdgcn_workitem_id_x:
721 case Intrinsic::amdgcn_workitem_id_y:
722 case Intrinsic::amdgcn_workitem_id_z:
726 case Intrinsic::amdgcn_workgroup_id_x:
727 case Intrinsic::amdgcn_workgroup_id_y:
728 case Intrinsic::amdgcn_workgroup_id_z:
729 case Intrinsic::amdgcn_lds_kernel_id:
730 case Intrinsic::amdgcn_dispatch_ptr:
731 case Intrinsic::amdgcn_dispatch_id:
732 case Intrinsic::amdgcn_implicitarg_ptr:
733 case Intrinsic::amdgcn_queue_ptr:
745 case Intrinsic::exp2:
746 case Intrinsic::exp10: {
748 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
751 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
753 if (SLT == MVT::f64) {
755 if (IID == Intrinsic::exp)
757 else if (IID == Intrinsic::exp10)
763 if (SLT == MVT::f32) {
764 unsigned NumFullRateOps = 0;
766 unsigned NumQuarterRateOps = 1;
772 NumFullRateOps = ST->hasFastFMAF32() ? 13 : 17;
774 if (IID == Intrinsic::exp) {
777 }
else if (IID == Intrinsic::exp10) {
780 NumQuarterRateOps = 2;
783 if (HasFP32Denormals)
788 NumFullRateOps * getFullRateInstrCost() +
789 NumQuarterRateOps * getQuarterRateInstrCost(
CostKind);
790 return LT.first * NElts *
Cost;
802 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
804 unsigned NElts = LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
806 if ((ST->hasVOP3PInsts() &&
807 (SLT == MVT::f16 || SLT == MVT::i16 ||
808 (SLT == MVT::bf16 && ST->hasBF16PackedInsts()))) ||
809 (ST->hasPackedFP32Ops() && SLT == MVT::f32))
810 NElts = (NElts + 1) / 2;
813 unsigned InstRate = getQuarterRateInstrCost(
CostKind);
815 switch (ICA.
getID()) {
817 case Intrinsic::fmuladd:
818 if (SLT == MVT::f64) {
819 InstRate = get64BitInstrCost(
CostKind);
823 if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16)
824 InstRate = getFullRateInstrCost();
826 InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(
CostKind)
827 : getQuarterRateInstrCost(
CostKind);
830 case Intrinsic::copysign:
831 return NElts * getFullRateInstrCost();
832 case Intrinsic::minimumnum:
833 case Intrinsic::maximumnum: {
845 SLT == MVT::f64 ? get64BitInstrCost(
CostKind) : getFullRateInstrCost();
846 InstRate = BaseRate *
NumOps;
849 case Intrinsic::canonicalize: {
851 SLT == MVT::f64 ? get64BitInstrCost(
CostKind) : getFullRateInstrCost();
854 case Intrinsic::uadd_sat:
855 case Intrinsic::usub_sat:
856 case Intrinsic::sadd_sat:
857 case Intrinsic::ssub_sat: {
858 if (SLT == MVT::i16 || SLT == MVT::i32)
859 InstRate = getFullRateInstrCost();
861 static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
868 if (SLT == MVT::i16 || SLT == MVT::i32)
869 InstRate = 2 * getFullRateInstrCost();
875 return LT.first * NElts * InstRate;
881 assert((
I ==
nullptr ||
I->getOpcode() == Opcode) &&
882 "Opcode should reflect passed instruction.");
885 const int CBrCost = SCost ? 5 : 7;
887 case Instruction::UncondBr:
889 return SCost ? 1 : 4;
890 case Instruction::CondBr:
894 case Instruction::Switch: {
898 return (
SI ? (
SI->getNumCases() + 1) : 4) * (CBrCost + 1);
900 case Instruction::Ret:
901 return SCost ? 1 : 10;
908 std::optional<FastMathFlags> FMF,
913 EVT OrigTy = TLI->getValueType(
DL, Ty);
920 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
921 return LT.first * getFullRateInstrCost();
928 EVT OrigTy = TLI->getValueType(
DL, Ty);
935 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
936 return LT.first * getHalfRateInstrCost(
CostKind);
943 case Instruction::ExtractElement:
944 case Instruction::InsertElement: {
948 if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
959 return Index == ~0u ? 2 : 0;
974 if (Indices.
size() > 1)
980 TLI->ParseConstraints(
DL, ST->getRegisterInfo(), *CI);
982 const int TargetOutputIdx = Indices.
empty() ? -1 : Indices[0];
985 for (
auto &TC : TargetConstraints) {
990 if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
993 TLI->ComputeConstraintToUse(TC,
SDValue());
996 TRI, TC.ConstraintCode, TC.ConstraintVT).second;
1000 if (!RC || !
TRI->isSGPRClass(RC))
1030bool GCNTTIImpl::isSourceOfDivergence(
const Value *V)
const {
1054 case Intrinsic::read_register:
1056 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1058 Intrinsic->getOperand(0)->getType()->getPointerAddressSpace();
1059 unsigned DstAS =
Intrinsic->getType()->getPointerAddressSpace();
1062 ST->hasGloballyAddressableScratch();
1064 case Intrinsic::amdgcn_workitem_id_y:
1065 case Intrinsic::amdgcn_workitem_id_z: {
1070 *
F, IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2);
1071 return !HasUniformYZ && (!ThisDimSize || *ThisDimSize != 1);
1080 if (CI->isInlineAsm())
1095 ST->hasGloballyAddressableScratch();
1101bool GCNTTIImpl::isAlwaysUniform(
const Value *V)
const {
1106 if (CI->isInlineAsm())
1124 bool XDimDoesntResetWithinWaves =
false;
1127 XDimDoesntResetWithinWaves = ST->hasWavefrontsEvenlySplittingXDim(*
F);
1129 using namespace llvm::PatternMatch;
1135 return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves;
1142 ST->getWavefrontSizeLog2() &&
1143 XDimDoesntResetWithinWaves;
1158 case Intrinsic::amdgcn_if:
1159 case Intrinsic::amdgcn_else: {
1160 ArrayRef<unsigned> Indices = ExtValue->
getIndices();
1161 return Indices.
size() == 1 && Indices[0] == 1;
1178 case Intrinsic::amdgcn_is_shared:
1179 case Intrinsic::amdgcn_is_private:
1180 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1181 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1182 case Intrinsic::amdgcn_load_to_lds:
1183 case Intrinsic::amdgcn_make_buffer_rsrc:
1193 Value *NewV)
const {
1194 auto IntrID =
II->getIntrinsicID();
1196 case Intrinsic::amdgcn_is_shared:
1197 case Intrinsic::amdgcn_is_private: {
1198 unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1206 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1207 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
1208 Type *DestTy =
II->getType();
1215 M,
II->getIntrinsicID(), {DestTy, SrcTy, DestTy});
1216 II->setArgOperand(0, NewV);
1217 II->setCalledFunction(NewDecl);
1220 case Intrinsic::amdgcn_load_to_lds: {
1225 II->setArgOperand(0, NewV);
1226 II->setCalledFunction(NewDecl);
1229 case Intrinsic::amdgcn_make_buffer_rsrc: {
1231 Type *DstTy =
II->getType();
1234 M,
II->getIntrinsicID(), {DstTy, SrcTy});
1235 II->setArgOperand(0, NewV);
1236 II->setCalledFunction(NewDecl);
1257 unsigned ScalarSize =
DL.getTypeSizeInBits(SrcTy->getElementType());
1259 (ScalarSize == 16 || ScalarSize == 8)) {
1272 unsigned NumSrcElts = SrcVecTy->getNumElements();
1273 if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumSrcElts == 2 &&
1279 unsigned EltsPerReg = 32 / ScalarSize;
1287 return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
1290 if (Index % EltsPerReg == 0)
1293 return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
1299 unsigned NumDstElts = DstVecTy->getNumElements();
1301 unsigned EndIndex = Index + NumInsertElts;
1302 unsigned BeginSubIdx = Index % EltsPerReg;
1303 unsigned EndSubIdx = EndIndex % EltsPerReg;
1306 if (BeginSubIdx != 0) {
1314 if (EndIndex < NumDstElts && BeginSubIdx < EndSubIdx)
1323 unsigned NumElts = DstVecTy->getNumElements();
1327 unsigned EltsFromLHS = NumElts - Index;
1328 bool LHSIsAligned = (Index % EltsPerReg) == 0;
1329 bool RHSIsAligned = (EltsFromLHS % EltsPerReg) == 0;
1330 if (LHSIsAligned && RHSIsAligned)
1332 if (LHSIsAligned && !RHSIsAligned)
1333 return divideCeil(NumElts, EltsPerReg) - (EltsFromLHS / EltsPerReg);
1334 if (!LHSIsAligned && RHSIsAligned)
1342 if (!Mask.empty()) {
1352 for (
unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx += EltsPerReg) {
1355 for (
unsigned I = 0;
I < EltsPerReg && DstIdx +
I < Mask.size(); ++
I) {
1356 int SrcIdx = Mask[DstIdx +
I];
1360 if (SrcIdx < (
int)NumSrcElts) {
1361 Reg = SrcIdx / EltsPerReg;
1362 if (SrcIdx % EltsPerReg !=
I)
1365 Reg = NumSrcElts + (SrcIdx - NumSrcElts) / EltsPerReg;
1366 if ((SrcIdx - NumSrcElts) % EltsPerReg !=
I)
1372 if (Regs.
size() >= 2)
1392 for (
auto &
Op :
I->operands()) {
1405 if (OpInst->getType()->isVectorTy() && OpInst->getNumOperands() > 1) {
1407 if (VecOpInst && VecOpInst->
hasOneUse())
1412 OpInst->getOperand(0),
1413 OpInst->getOperand(1)) == 0) {
1422 unsigned EltSize =
DL.getTypeSizeInBits(
1427 if (EltSize < 16 || !ST->has16BitInsts())
1430 int NumSubElts, SubIndex;
1431 if (Shuffle->changesLength()) {
1432 if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) {
1437 if ((Shuffle->isExtractSubvectorMask(SubIndex) ||
1438 Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) &&
1439 !(SubIndex & 0x1)) {
1445 if (Shuffle->isReverse() || Shuffle->isZeroEltSplat() ||
1446 Shuffle->isSingleSource()) {
1453 return !
Ops.empty();
1464 const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1465 const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1467 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1468 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1469 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1479 if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
1480 Callee->hasFnAttribute(Attribute::InlineHint))
1486 if (Callee->size() == 1)
1488 size_t BBSize = Caller->size() + Callee->size() - 1;
1498 const int NrOfSGPRUntilSpill = 26;
1499 const int NrOfVGPRUntilSpill = 32;
1503 unsigned adjustThreshold = 0;
1509 for (
auto ArgVT : ValueVTs) {
1513 SGPRsInUse += CCRegNum;
1515 VGPRsInUse += CCRegNum;
1525 ArgStackCost +=
const_cast<GCNTTIImpl *
>(TTIImpl)->getMemoryOpCost(
1528 ArgStackCost +=
const_cast<GCNTTIImpl *
>(TTIImpl)->getMemoryOpCost(
1534 adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *
1536 adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *
1538 return adjustThreshold;
1547 unsigned AllocaSize = 0;
1554 unsigned AddrSpace = Ty->getAddressSpace();
1564 AllocaSize +=
Size->getFixedValue();
1608 static_assert(InlinerVectorBonusPercent == 0,
"vector bonus assumed to be 0");
1612 return BB.getTerminator()->getNumSuccessors() > 1;
1615 Threshold += Threshold / 2;
1623 unsigned AllocaThresholdBonus =
1624 (Threshold * ArgAllocaSize->getFixedValue()) / AllocaSize;
1626 return AllocaThresholdBonus;
1632 CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
1637 CommonTTI.getPeelingPreferences(L, SE, PP);
1641 return ST->hasFullRate64Ops()
1642 ? getFullRateInstrCost()
1643 : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(
CostKind)
1644 : getQuarterRateInstrCost(
CostKind);
1647std::pair<InstructionCost, MVT>
1648GCNTTIImpl::getTypeLegalizationCost(
Type *Ty)
const {
1650 auto Size =
DL.getTypeSizeInBits(Ty);
1657 Cost.first += (
Size + 255) / 256;
1662 return ST->hasPrefetch() ? 128 : 0;
1673 LB.
push_back({
"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]});
1674 LB.push_back({
"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]});
1675 LB.push_back({
"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]});
1676 std::pair<unsigned, unsigned> FlatWorkGroupSize =
1677 ST->getFlatWorkGroupSizes(
F);
1678 LB.push_back({
"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first});
1679 LB.push_back({
"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second});
1680 std::pair<unsigned, unsigned> WavesPerEU = ST->getWavesPerEU(
F);
1681 LB.push_back({
"amdgpu-waves-per-eu[0]", WavesPerEU.first});
1682 LB.push_back({
"amdgpu-waves-per-eu[1]", WavesPerEU.second});
1687 if (!ST->hasFeature(AMDGPU::FeatureDX10ClampAndIEEEMode))
1694 Attribute IEEEAttr =
F->getFnAttribute(
"amdgpu-ieee");
1709 if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1710 VecTy->getElementType()->isIntegerTy(8)) {
1721 if (VecTy->getElementType()->isIntegerTy(8)) {
1731 if (isAlwaysUniform(V))
1734 if (isSourceOfDivergence(V))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Register const TargetRegisterInfo * TRI
uint64_t IntrinsicInst * II
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
static unsigned getNumElements(Type *Ty)
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
an instruction to allocate memory on the stack
LLVM_ABI bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
This class represents an incoming formal argument to a Function.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
Functions, function parameters, and return types can have attributes to indicate how they should be t...
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
bool isValid() const
Return true if the attribute is any kind of attribute.
LLVM Basic Block Representation.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
unsigned getNumberOfParts(Type *Tp) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
bool isInlineAsm() const
Check if this call is an inline asm statement.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
CallingConv::ID getCallingConv() const
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned getArgOperandNo(const Use *U) const
Given a use for an arg operand, get the arg operand number that corresponds to it.
This class represents a function call, abstracting a target machine's calling convention.
Conditional Branch instruction.
This is the shared class of boolean and integer constants.
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
A parsed version of the target data layout string, and methods for querying it.
constexpr bool isScalar() const
Exactly one element.
Convenience struct for specifying and reasoning about fast-math flags.
Container class for subtarget features.
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Account for loads of i8 vector types to have reduced cost.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
void collectKernelLaunchBounds(const Function &F, SmallVectorImpl< std::pair< StringRef, int64_t > > &LB) const override
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override
bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef< unsigned > Indices={}) const
Analyze if the results of inline asm are divergent.
bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override
unsigned getNumberOfRegisters(unsigned RCID) const override
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
bool shouldPrefetchAddressSpace(unsigned AS) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
bool hasBranchDivergence(const Function *F=nullptr) const override
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const override
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
unsigned getInliningThresholdMultiplier() const override
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override
unsigned getPrefetchDistance() const override
How much before a load we should place the prefetch instruction.
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
KnownIEEEMode fpenvIEEEMode(const Instruction &I) const
Return KnownIEEEMode::On if we know if the use context can assume "amdgpu-ieee"="true" and KnownIEEEM...
unsigned adjustInliningThreshold(const CallBase *CB) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Whether it is profitable to sink the operands of an Instruction I to the basic block of I.
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
int getInliningLastCallToStaticBonus() const override
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const override
unsigned getNumberOfParts(Type *Tp) const override
When counting parts on AMD GPUs, account for i8s being grouped together under a single i32 value.
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
unsigned getMinVectorRegisterBitWidth() const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
Type * getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicElementSize) const override
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
InstructionUniformity getInstructionUniformity(const Value *V) const override
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool hasApproxFunc() const LLVM_READONLY
Determine whether the approximate-math-functions flag is set.
LLVM_ABI bool hasAllowContract() const LLVM_READONLY
Determine whether the allow-contract flag is set.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
FastMathFlags getFlags() const
Type * getReturnType() const
const IntrinsicInst * getInst() const
Intrinsic::ID getID() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
An instruction for reading from memory.
Represents a single loop in the control flow graph.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
A Module instance is used to store all the information related to an LLVM module.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
The main scalar evolution driver.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
std::vector< AsmOperandInfo > AsmOperandInfoVector
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
A Use represents the edge between a Value definition and its users.
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVMContext & getContext() const
All values hold a context through their type.
Base class of all SIMD vector types.
constexpr ScalarTy getFixedValue() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isArgPassedInSGPR(const Argument *A)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isExtendedGlobalAddrSpace(unsigned AS)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
@ ADD
Simple integer binary arithmetic operators.
@ FADD
Simple binary floating point operators.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SHL
Shift and rotation operations.
@ AND
Bitwise operators - logical and, logical or, logical xor.
LLVM_ABI int getInstrCost()
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract_or_null(Y &&MD)
Extract a Value from Metadata, allowing null.
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI MDNode * findOptionMDForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for a loop.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
auto dyn_cast_or_null(const Y &Val)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
DWARFExpression::Operation Op
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
@ AlwaysUniform
The result values are always uniform.
@ NeverUniform
The result values can never be assumed to be uniform.
@ Default
The result values are uniform if and only if all operands are uniform.
This struct is a compact representation of a valid (non-zero power of two) alignment.
static constexpr DenormalMode getPreserveSign()
uint64_t getScalarSizeInBits() const
Information about a load/store intrinsic defined by the target.
bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const
const unsigned PragmaCount
const bool PragmaEnableUnroll