17#include "llvm/IR/IntrinsicsAMDGPU.h"
18#include "llvm/IR/IntrinsicsR600.h"
22#define DEBUG_TYPE "amdgpu-attributor"
27 "amdgpu-indirect-call-specialization-threshold",
29 "A threshold controls whether an indirect call will be specialized"),
32#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,
35#include "AMDGPUAttributes.def"
39#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,
43#include "AMDGPUAttributes.def"
48#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
49static constexpr std::pair<ImplicitArgumentMask, StringLiteral>
51#include "AMDGPUAttributes.def"
61 bool HasApertureRegs,
bool SupportsGetDoorBellID,
62 unsigned CodeObjectVersion) {
64 case Intrinsic::amdgcn_workitem_id_x:
67 case Intrinsic::amdgcn_workgroup_id_x:
69 return WORKGROUP_ID_X;
70 case Intrinsic::amdgcn_workitem_id_y:
71 case Intrinsic::r600_read_tidig_y:
73 case Intrinsic::amdgcn_workitem_id_z:
74 case Intrinsic::r600_read_tidig_z:
76 case Intrinsic::amdgcn_workgroup_id_y:
77 case Intrinsic::r600_read_tgid_y:
78 return WORKGROUP_ID_Y;
79 case Intrinsic::amdgcn_workgroup_id_z:
80 case Intrinsic::r600_read_tgid_z:
81 return WORKGROUP_ID_Z;
82 case Intrinsic::amdgcn_cluster_id_x:
85 case Intrinsic::amdgcn_cluster_id_y:
87 case Intrinsic::amdgcn_cluster_id_z:
89 case Intrinsic::amdgcn_lds_kernel_id:
91 case Intrinsic::amdgcn_dispatch_ptr:
93 case Intrinsic::amdgcn_dispatch_id:
95 case Intrinsic::amdgcn_implicitarg_ptr:
96 return IMPLICIT_ARG_PTR;
99 case Intrinsic::amdgcn_queue_ptr:
102 case Intrinsic::amdgcn_is_shared:
103 case Intrinsic::amdgcn_is_private:
111 case Intrinsic::amdgcn_wwm:
112 case Intrinsic::amdgcn_strict_wwm:
113 return WHOLE_WAVE_MODE;
114 case Intrinsic::trap:
115 case Intrinsic::debugtrap:
116 case Intrinsic::ubsantrap:
117 if (SupportsGetDoorBellID)
141 return F.hasFnAttribute(Attribute::SanitizeAddress) ||
142 F.hasFnAttribute(Attribute::SanitizeThread) ||
143 F.hasFnAttribute(Attribute::SanitizeMemory) ||
144 F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
145 F.hasFnAttribute(Attribute::SanitizeMemTag);
151 AMDGPUInformationCache(
const Module &M, AnalysisGetter &AG,
153 SetVector<Function *> *
CGSCC, TargetMachine &TM)
159 enum ConstantStatus : uint8_t {
162 ADDR_SPACE_CAST_PRIVATE_TO_FLAT = 1 << 1,
163 ADDR_SPACE_CAST_LOCAL_TO_FLAT = 1 << 2,
164 ADDR_SPACE_CAST_BOTH_TO_FLAT =
165 ADDR_SPACE_CAST_PRIVATE_TO_FLAT | ADDR_SPACE_CAST_LOCAL_TO_FLAT
169 bool hasApertureRegs(Function &
F) {
170 const GCNSubtarget &
ST = TM.getSubtarget<GCNSubtarget>(
F);
171 return ST.hasApertureRegs();
175 bool supportsGetDoorbellID(Function &
F) {
176 const GCNSubtarget &
ST = TM.getSubtarget<GCNSubtarget>(
F);
177 return ST.supportsGetDoorbellID();
180 std::optional<std::pair<unsigned, unsigned>>
181 getFlatWorkGroupSizeAttr(
const Function &
F)
const {
185 return std::make_pair(
R->first, *(
R->second));
188 std::pair<unsigned, unsigned>
189 getDefaultFlatWorkGroupSize(
const Function &
F)
const {
190 const GCNSubtarget &
ST = TM.getSubtarget<GCNSubtarget>(
F);
191 return ST.getDefaultFlatWorkGroupSize(
F.getCallingConv());
194 std::pair<unsigned, unsigned>
195 getMaximumFlatWorkGroupRange(
const Function &
F) {
196 const GCNSubtarget &
ST = TM.getSubtarget<GCNSubtarget>(
F);
197 return {
ST.getMinFlatWorkGroupSize(),
ST.getMaxFlatWorkGroupSize()};
200 SmallVector<unsigned> getMaxNumWorkGroups(
const Function &
F) {
201 const GCNSubtarget &
ST = TM.getSubtarget<GCNSubtarget>(
F);
202 return ST.getMaxNumWorkGroups(
F);
206 unsigned getCodeObjectVersion()
const {
return CodeObjectVersion; }
208 std::optional<std::pair<unsigned, unsigned>>
209 getWavesPerEUAttr(
const Function &
F) {
215 const GCNSubtarget &
ST = TM.getSubtarget<GCNSubtarget>(
F);
216 Val->second =
ST.getMaxWavesPerEU();
218 return std::make_pair(Val->first, *(Val->second));
222 const GCNSubtarget &
ST = TM.getSubtarget<GCNSubtarget>(
F);
223 return ST.getMaxWavesPerEU();
226 unsigned getMaxAddrSpace()
const override {
233 static uint8_t visitConstExpr(
const ConstantExpr *CE) {
234 uint8_t Status = NONE;
236 if (
CE->getOpcode() == Instruction::AddrSpaceCast) {
237 unsigned SrcAS =
CE->getOperand(0)->getType()->getPointerAddressSpace();
239 Status |= ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
241 Status |= ADDR_SPACE_CAST_LOCAL_TO_FLAT;
248 uint8_t getConstantAccess(
const Constant *
C,
249 SmallPtrSetImpl<const Constant *> &Visited) {
250 auto It = ConstantStatus.find(
C);
251 if (It != ConstantStatus.end())
259 Result |= visitConstExpr(CE);
261 for (
const Use &U :
C->operands()) {
263 if (!OpC || !Visited.
insert(OpC).second)
266 Result |= getConstantAccess(OpC, Visited);
273 bool needsQueuePtr(
const Constant *
C, Function &Fn) {
275 bool HasAperture = hasApertureRegs(Fn);
278 if (!IsNonEntryFunc && HasAperture)
281 SmallPtrSet<const Constant *, 8> Visited;
282 uint8_t
Access = getConstantAccess(
C, Visited);
285 if (IsNonEntryFunc && (
Access & DS_GLOBAL))
288 return !HasAperture && (
Access & ADDR_SPACE_CAST_BOTH_TO_FLAT);
291 bool checkConstForAddrSpaceCastFromPrivate(
const Constant *
C) {
292 SmallPtrSet<const Constant *, 8> Visited;
293 uint8_t
Access = getConstantAccess(
C, Visited);
294 return Access & ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
299 DenseMap<const Constant *, uint8_t> ConstantStatus;
300 const unsigned CodeObjectVersion;
303struct AAAMDAttributes
304 :
public StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
306 using Base = StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
309 AAAMDAttributes(
const IRPosition &IRP, Attributor &
A) : Base(IRP) {}
312 static AAAMDAttributes &createForPosition(
const IRPosition &IRP,
316 StringRef
getName()
const override {
return "AAAMDAttributes"; }
319 const char *getIdAddr()
const override {
return &ID; }
323 static bool classof(
const AbstractAttribute *AA) {
328 static const char ID;
// Out-of-line definition of the type-ID anchor; its address (returned by
// getIdAddr) uniquely identifies AAAMDAttributes to the Attributor framework.
330const char AAAMDAttributes::ID = 0;
332struct AAUniformWorkGroupSize
333 :
public StateWrapper<BooleanState, AbstractAttribute> {
334 using Base = StateWrapper<BooleanState, AbstractAttribute>;
335 AAUniformWorkGroupSize(
const IRPosition &IRP, Attributor &
A) : Base(IRP) {}
338 static AAUniformWorkGroupSize &createForPosition(
const IRPosition &IRP,
342 StringRef
getName()
const override {
return "AAUniformWorkGroupSize"; }
345 const char *getIdAddr()
const override {
return &ID; }
349 static bool classof(
const AbstractAttribute *AA) {
354 static const char ID;
// Type-ID anchor for AAUniformWorkGroupSize; the address of this byte is what
// getIdAddr returns for classof-style identification in the Attributor.
356const char AAUniformWorkGroupSize::ID = 0;
358struct AAUniformWorkGroupSizeFunction :
public AAUniformWorkGroupSize {
359 AAUniformWorkGroupSizeFunction(
const IRPosition &IRP, Attributor &
A)
360 : AAUniformWorkGroupSize(IRP,
A) {}
364 CallingConv::ID CC =
F->getCallingConv();
366 if (CC != CallingConv::AMDGPU_KERNEL)
369 bool InitialValue =
F->hasFnAttribute(
"uniform-work-group-size");
372 indicateOptimisticFixpoint();
374 indicatePessimisticFixpoint();
380 auto CheckCallSite = [&](AbstractCallSite CS) {
383 <<
"->" << getAssociatedFunction()->
getName() <<
"\n");
385 const auto *CallerInfo =
A.getAAFor<AAUniformWorkGroupSize>(
387 if (!CallerInfo || !CallerInfo->isValidState())
391 CallerInfo->getState());
396 bool AllCallSitesKnown =
true;
397 if (!
A.checkForAllCallSites(CheckCallSite, *
this,
true, AllCallSitesKnown))
398 return indicatePessimisticFixpoint();
405 return ChangeStatus::UNCHANGED;
407 LLVMContext &Ctx = getAssociatedFunction()->getContext();
408 return A.manifestAttrs(getIRPosition(),
409 {Attribute::get(Ctx,
"uniform-work-group-size")},
413 bool isValidState()
const override {
418 const std::string getAsStr(Attributor *)
const override {
419 return "AMDWorkGroupSize[" + std::to_string(getAssumed()) +
"]";
423 void trackStatistics()
const override {}
426AAUniformWorkGroupSize &
427AAUniformWorkGroupSize::createForPosition(
const IRPosition &IRP,
430 return *
new (
A.Allocator) AAUniformWorkGroupSizeFunction(IRP,
A);
432 "AAUniformWorkGroupSize is only valid for function position");
435struct AAAMDAttributesFunction :
public AAAMDAttributes {
436 AAAMDAttributesFunction(
const IRPosition &IRP, Attributor &
A)
437 : AAAMDAttributes(IRP,
A) {}
449 if (HasSanitizerAttrs) {
450 removeAssumedBits(IMPLICIT_ARG_PTR);
451 removeAssumedBits(HOSTCALL_PTR);
452 removeAssumedBits(FLAT_SCRATCH_INIT);
456 if (HasSanitizerAttrs &&
457 (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR ||
458 Attr.first == FLAT_SCRATCH_INIT))
461 if (
F->hasFnAttribute(Attr.second))
462 addKnownBits(Attr.first);
465 if (
F->isDeclaration())
471 indicatePessimisticFixpoint();
479 auto OrigAssumed = getAssumed();
482 const AACallEdges *AAEdges =
A.getAAFor<AACallEdges>(
483 *
this, this->getIRPosition(), DepClassTy::REQUIRED);
486 return indicatePessimisticFixpoint();
490 bool NeedsImplicit =
false;
491 auto &InfoCache =
static_cast<AMDGPUInformationCache &
>(
A.getInfoCache());
492 bool HasApertureRegs = InfoCache.hasApertureRegs(*
F);
493 bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*
F);
494 unsigned COV = InfoCache.getCodeObjectVersion();
499 const AAAMDAttributes *AAAMD =
A.getAAFor<AAAMDAttributes>(
501 if (!AAAMD || !AAAMD->isValidState())
502 return indicatePessimisticFixpoint();
507 bool NonKernelOnly =
false;
510 HasApertureRegs, SupportsGetDoorbellID, COV);
521 if (!
Callee->hasFnAttribute(Attribute::NoCallback))
522 return indicatePessimisticFixpoint();
527 if ((IsNonEntryFunc || !NonKernelOnly))
528 removeAssumedBits(AttrMask);
534 removeAssumedBits(IMPLICIT_ARG_PTR);
536 if (isAssumed(QUEUE_PTR) && checkForQueuePtr(
A)) {
540 removeAssumedBits(IMPLICIT_ARG_PTR);
542 removeAssumedBits(QUEUE_PTR);
545 if (funcRetrievesMultigridSyncArg(
A, COV)) {
546 assert(!isAssumed(IMPLICIT_ARG_PTR) &&
547 "multigrid_sync_arg needs implicitarg_ptr");
548 removeAssumedBits(MULTIGRID_SYNC_ARG);
551 if (funcRetrievesHostcallPtr(
A, COV)) {
552 assert(!isAssumed(IMPLICIT_ARG_PTR) &&
"hostcall needs implicitarg_ptr");
553 removeAssumedBits(HOSTCALL_PTR);
556 if (funcRetrievesHeapPtr(
A, COV)) {
557 assert(!isAssumed(IMPLICIT_ARG_PTR) &&
"heap_ptr needs implicitarg_ptr");
558 removeAssumedBits(HEAP_PTR);
561 if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(
A, COV)) {
562 assert(!isAssumed(IMPLICIT_ARG_PTR) &&
"queue_ptr needs implicitarg_ptr");
563 removeAssumedBits(QUEUE_PTR);
566 if (isAssumed(LDS_KERNEL_ID) && funcRetrievesLDSKernelId(
A)) {
567 removeAssumedBits(LDS_KERNEL_ID);
570 if (isAssumed(DEFAULT_QUEUE) && funcRetrievesDefaultQueue(
A, COV))
571 removeAssumedBits(DEFAULT_QUEUE);
573 if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(
A, COV))
574 removeAssumedBits(COMPLETION_ACTION);
576 if (isAssumed(FLAT_SCRATCH_INIT) && needFlatScratchInit(
A))
577 removeAssumedBits(FLAT_SCRATCH_INIT);
579 return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
580 : ChangeStatus::UNCHANGED;
585 LLVMContext &Ctx = getAssociatedFunction()->getContext();
588 if (isKnown(Attr.first))
589 AttrList.
push_back(Attribute::get(Ctx, Attr.second));
592 return A.manifestAttrs(getIRPosition(), AttrList,
596 const std::string getAsStr(Attributor *)
const override {
598 raw_string_ostream OS(Str);
601 if (isAssumed(Attr.first))
602 OS <<
' ' << Attr.second;
608 void trackStatistics()
const override {}
611 bool checkForQueuePtr(Attributor &
A) {
615 auto &InfoCache =
static_cast<AMDGPUInformationCache &
>(
A.getInfoCache());
617 bool NeedsQueuePtr =
false;
620 unsigned SrcAS =
static_cast<AddrSpaceCastInst &
>(
I).getSrcAddressSpace();
622 NeedsQueuePtr =
true;
628 bool HasApertureRegs = InfoCache.hasApertureRegs(*
F);
634 if (!HasApertureRegs) {
635 bool UsedAssumedInformation =
false;
636 A.checkForAllInstructions(CheckAddrSpaceCasts, *
this,
637 {Instruction::AddrSpaceCast},
638 UsedAssumedInformation);
645 if (!IsNonEntryFunc && HasApertureRegs)
648 for (BasicBlock &BB : *
F) {
649 for (Instruction &
I : BB) {
650 for (
const Use &U :
I.operands()) {
652 if (InfoCache.needsQueuePtr(
C, *
F))
662 bool funcRetrievesMultigridSyncArg(Attributor &
A,
unsigned COV) {
664 AA::RangeTy
Range(Pos, 8);
665 return funcRetrievesImplicitKernelArg(
A,
Range);
668 bool funcRetrievesHostcallPtr(Attributor &
A,
unsigned COV) {
670 AA::RangeTy
Range(Pos, 8);
671 return funcRetrievesImplicitKernelArg(
A,
Range);
674 bool funcRetrievesDefaultQueue(Attributor &
A,
unsigned COV) {
676 AA::RangeTy
Range(Pos, 8);
677 return funcRetrievesImplicitKernelArg(
A,
Range);
680 bool funcRetrievesCompletionAction(Attributor &
A,
unsigned COV) {
682 AA::RangeTy
Range(Pos, 8);
683 return funcRetrievesImplicitKernelArg(
A,
Range);
686 bool funcRetrievesHeapPtr(Attributor &
A,
unsigned COV) {
690 return funcRetrievesImplicitKernelArg(
A,
Range);
693 bool funcRetrievesQueuePtr(Attributor &
A,
unsigned COV) {
697 return funcRetrievesImplicitKernelArg(
A,
Range);
700 bool funcRetrievesImplicitKernelArg(Attributor &
A, AA::RangeTy
Range) {
712 const auto *PointerInfoAA =
A.getAAFor<AAPointerInfo>(
714 if (!PointerInfoAA || !PointerInfoAA->getState().isValidState())
717 return PointerInfoAA->forallInterferingAccesses(
718 Range, [](
const AAPointerInfo::Access &Acc,
bool IsExact) {
723 bool UsedAssumedInformation =
false;
724 return !
A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *
this,
725 UsedAssumedInformation);
728 bool funcRetrievesLDSKernelId(Attributor &
A) {
733 bool UsedAssumedInformation =
false;
734 return !
A.checkForAllCallLikeInstructions(DoesNotRetrieve, *
this,
735 UsedAssumedInformation);
740 bool needFlatScratchInit(Attributor &
A) {
741 assert(isAssumed(FLAT_SCRATCH_INIT));
750 bool UsedAssumedInformation =
false;
751 if (!
A.checkForAllInstructions(AddrSpaceCastNotFromPrivate, *
this,
752 {Instruction::AddrSpaceCast},
753 UsedAssumedInformation))
757 auto &InfoCache =
static_cast<AMDGPUInformationCache &
>(
A.getInfoCache());
761 for (
const Use &U :
I.operands()) {
763 if (InfoCache.checkConstForAddrSpaceCastFromPrivate(
C))
785 return Callee->getIntrinsicID() !=
786 Intrinsic::amdgcn_addrspacecast_nonnull;
789 UsedAssumedInformation =
false;
793 return !
A.checkForAllCallLikeInstructions(CheckForNoFlatScratchInit, *
this,
794 UsedAssumedInformation);
798AAAMDAttributes &AAAMDAttributes::createForPosition(
const IRPosition &IRP,
801 return *
new (
A.Allocator) AAAMDAttributesFunction(IRP,
A);
806struct AAAMDSizeRangeAttribute
807 :
public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
808 using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;
812 AAAMDSizeRangeAttribute(
const IRPosition &IRP, Attributor &
A,
814 :
Base(IRP, 32), AttrName(AttrName) {}
817 void trackStatistics()
const override {}
819 template <
class AttributeImpl>
ChangeStatus updateImplImpl(Attributor &
A) {
822 auto CheckCallSite = [&](AbstractCallSite CS) {
825 <<
"->" << getAssociatedFunction()->
getName() <<
'\n');
827 const auto *CallerInfo =
A.getAAFor<AttributeImpl>(
829 if (!CallerInfo || !CallerInfo->isValidState())
838 bool AllCallSitesKnown =
true;
839 if (!
A.checkForAllCallSites(CheckCallSite, *
this,
842 return indicatePessimisticFixpoint();
850 emitAttributeIfNotDefaultAfterClamp(Attributor &
A,
851 std::pair<unsigned, unsigned>
Default) {
853 unsigned Lower = getAssumed().getLower().getZExtValue();
854 unsigned Upper = getAssumed().getUpper().getZExtValue();
864 return ChangeStatus::UNCHANGED;
867 LLVMContext &Ctx =
F->getContext();
868 SmallString<10> Buffer;
869 raw_svector_ostream OS(Buffer);
871 return A.manifestAttrs(getIRPosition(),
872 {Attribute::get(Ctx, AttrName, OS.str())},
876 const std::string getAsStr(Attributor *)
const override {
878 raw_string_ostream OS(Str);
880 OS << getAssumed().getLower() <<
',' << getAssumed().getUpper() - 1;
887struct AAAMDFlatWorkGroupSize :
public AAAMDSizeRangeAttribute {
888 AAAMDFlatWorkGroupSize(
const IRPosition &IRP, Attributor &
A)
889 : AAAMDSizeRangeAttribute(IRP,
A,
"amdgpu-flat-work-group-size") {}
893 auto &InfoCache =
static_cast<AMDGPUInformationCache &
>(
A.getInfoCache());
895 bool HasAttr =
false;
896 auto Range = InfoCache.getDefaultFlatWorkGroupSize(*
F);
897 auto MaxRange = InfoCache.getMaximumFlatWorkGroupRange(*
F);
899 if (
auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*
F)) {
903 if (*Attr != MaxRange) {
911 if (
Range == MaxRange)
915 ConstantRange CR(APInt(32, Min), APInt(32, Max + 1));
916 IntegerRangeState IRS(CR);
920 indicateOptimisticFixpoint();
924 return updateImplImpl<AAAMDFlatWorkGroupSize>(
A);
928 static AAAMDFlatWorkGroupSize &createForPosition(
const IRPosition &IRP,
933 auto &InfoCache =
static_cast<AMDGPUInformationCache &
>(
A.getInfoCache());
934 return emitAttributeIfNotDefaultAfterClamp(
935 A, InfoCache.getMaximumFlatWorkGroupRange(*
F));
939 StringRef
getName()
const override {
return "AAAMDFlatWorkGroupSize"; }
942 const char *getIdAddr()
const override {
return &
ID; }
946 static bool classof(
const AbstractAttribute *AA) {
951 static const char ID;
// Type-ID anchor for AAAMDFlatWorkGroupSize (address-based identity used by
// getIdAddr/classof in the Attributor framework).
954const char AAAMDFlatWorkGroupSize::ID = 0;
956AAAMDFlatWorkGroupSize &
957AAAMDFlatWorkGroupSize::createForPosition(
const IRPosition &IRP,
960 return *
new (
A.Allocator) AAAMDFlatWorkGroupSize(IRP,
A);
962 "AAAMDFlatWorkGroupSize is only valid for function position");
965struct TupleDecIntegerRangeState :
public AbstractState {
966 DecIntegerState<uint32_t>
X,
Y, Z;
968 bool isValidState()
const override {
969 return X.isValidState() &&
Y.isValidState() &&
Z.isValidState();
972 bool isAtFixpoint()
const override {
973 return X.isAtFixpoint() &&
Y.isAtFixpoint() &&
Z.isAtFixpoint();
977 return X.indicateOptimisticFixpoint() |
Y.indicateOptimisticFixpoint() |
978 Z.indicateOptimisticFixpoint();
982 return X.indicatePessimisticFixpoint() |
Y.indicatePessimisticFixpoint() |
983 Z.indicatePessimisticFixpoint();
986 TupleDecIntegerRangeState
operator^=(
const TupleDecIntegerRangeState &
Other) {
997 TupleDecIntegerRangeState &getAssumed() {
return *
this; }
998 const TupleDecIntegerRangeState &getAssumed()
const {
return *
this; }
1001using AAAMDMaxNumWorkgroupsState =
1002 StateWrapper<TupleDecIntegerRangeState, AbstractAttribute, uint32_t>;
1005struct AAAMDMaxNumWorkgroups
1006 :
public StateWrapper<TupleDecIntegerRangeState, AbstractAttribute> {
1007 using Base = StateWrapper<TupleDecIntegerRangeState, AbstractAttribute>;
1009 AAAMDMaxNumWorkgroups(
const IRPosition &IRP, Attributor &
A) :
Base(IRP) {}
1013 auto &InfoCache =
static_cast<AMDGPUInformationCache &
>(
A.getInfoCache());
1015 SmallVector<unsigned> MaxNumWorkgroups = InfoCache.getMaxNumWorkGroups(*
F);
1017 X.takeKnownMinimum(MaxNumWorkgroups[0]);
1018 Y.takeKnownMinimum(MaxNumWorkgroups[1]);
1019 Z.takeKnownMinimum(MaxNumWorkgroups[2]);
1022 indicatePessimisticFixpoint();
1028 auto CheckCallSite = [&](AbstractCallSite CS) {
1031 <<
"->" << getAssociatedFunction()->
getName() <<
'\n');
1033 const auto *CallerInfo =
A.getAAFor<AAAMDMaxNumWorkgroups>(
1035 if (!CallerInfo || !CallerInfo->isValidState())
1043 bool AllCallSitesKnown =
true;
1044 if (!
A.checkForAllCallSites(CheckCallSite, *
this,
1047 return indicatePessimisticFixpoint();
1053 static AAAMDMaxNumWorkgroups &createForPosition(
const IRPosition &IRP,
1058 LLVMContext &Ctx =
F->getContext();
1059 SmallString<32> Buffer;
1060 raw_svector_ostream OS(Buffer);
1061 OS <<
X.getAssumed() <<
',' <<
Y.getAssumed() <<
',' <<
Z.getAssumed();
1065 return A.manifestAttrs(
1067 {Attribute::get(Ctx,
"amdgpu-max-num-workgroups", OS.str())},
1071 StringRef
getName()
const override {
return "AAAMDMaxNumWorkgroups"; }
1073 const std::string getAsStr(Attributor *)
const override {
1074 std::string Buffer =
"AAAMDMaxNumWorkgroupsState[";
1075 raw_string_ostream OS(Buffer);
1076 OS <<
X.getAssumed() <<
',' <<
Y.getAssumed() <<
',' <<
Z.getAssumed()
1081 const char *getIdAddr()
const override {
return &
ID; }
1085 static bool classof(
const AbstractAttribute *AA) {
1089 void trackStatistics()
const override {}
1092 static const char ID;
// Type-ID anchor for AAAMDMaxNumWorkgroups (address-based identity used by
// getIdAddr/classof in the Attributor framework).
1095const char AAAMDMaxNumWorkgroups::ID = 0;
1097AAAMDMaxNumWorkgroups &
1098AAAMDMaxNumWorkgroups::createForPosition(
const IRPosition &IRP, Attributor &
A) {
1100 return *
new (
A.Allocator) AAAMDMaxNumWorkgroups(IRP,
A);
1101 llvm_unreachable(
"AAAMDMaxNumWorkgroups is only valid for function position");
1105struct AAAMDWavesPerEU :
public AAAMDSizeRangeAttribute {
1106 AAAMDWavesPerEU(
const IRPosition &IRP, Attributor &
A)
1107 : AAAMDSizeRangeAttribute(IRP,
A,
"amdgpu-waves-per-eu") {}
1111 auto &InfoCache =
static_cast<AMDGPUInformationCache &
>(
A.getInfoCache());
1114 if (
auto Attr = InfoCache.getWavesPerEUAttr(*
F)) {
1115 std::pair<unsigned, unsigned> MaxWavesPerEURange{
1116 1U, InfoCache.getMaxWavesPerEU(*
F)};
1117 if (*Attr != MaxWavesPerEURange) {
1118 auto [Min,
Max] = *Attr;
1119 ConstantRange
Range(APInt(32, Min), APInt(32, Max + 1));
1120 IntegerRangeState RangeState(
Range);
1121 this->getState() = RangeState;
1122 indicateOptimisticFixpoint();
1128 indicatePessimisticFixpoint();
1134 auto CheckCallSite = [&](AbstractCallSite CS) {
1138 <<
"->" <<
Func->getName() <<
'\n');
1141 const auto *CallerAA =
A.getAAFor<AAAMDWavesPerEU>(
1143 if (!CallerAA || !CallerAA->isValidState())
1146 ConstantRange Assumed = getAssumed();
1148 CallerAA->getAssumed().getLower().getZExtValue());
1150 CallerAA->getAssumed().getUpper().getZExtValue());
1151 ConstantRange
Range(APInt(32, Min), APInt(32, Max));
1152 IntegerRangeState RangeState(
Range);
1153 getState() = RangeState;
1154 Change |= getState() == Assumed ? ChangeStatus::UNCHANGED
1155 : ChangeStatus::CHANGED;
1160 bool AllCallSitesKnown =
true;
1161 if (!
A.checkForAllCallSites(CheckCallSite, *
this,
true, AllCallSitesKnown))
1162 return indicatePessimisticFixpoint();
1168 static AAAMDWavesPerEU &createForPosition(
const IRPosition &IRP,
1173 auto &InfoCache =
static_cast<AMDGPUInformationCache &
>(
A.getInfoCache());
1174 return emitAttributeIfNotDefaultAfterClamp(
1175 A, {1U, InfoCache.getMaxWavesPerEU(*
F)});
1179 StringRef
getName()
const override {
return "AAAMDWavesPerEU"; }
1182 const char *getIdAddr()
const override {
return &
ID; }
1186 static bool classof(
const AbstractAttribute *AA) {
1191 static const char ID;
// Type-ID anchor for AAAMDWavesPerEU (address-based identity used by
// getIdAddr/classof in the Attributor framework).
1194const char AAAMDWavesPerEU::ID = 0;
1196AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(
const IRPosition &IRP,
1199 return *
new (
A.Allocator) AAAMDWavesPerEU(IRP,
A);
1204static unsigned inlineAsmGetNumRequiredAGPRs(
const InlineAsm *IA,
1205 const CallBase &
Call) {
1208 unsigned AGPRDefCount = 0;
1209 unsigned AGPRUseCount = 0;
1210 unsigned MaxPhysReg = 0;
1214 for (
const InlineAsm::ConstraintInfo &CI :
IA->ParseConstraints()) {
1220 Ty = STy->getElementType(ResNo);
1235 for (StringRef Code : CI.Codes) {
1236 unsigned RegCount = 0;
1237 if (
Code.starts_with(
"a")) {
1248 MaxPhysReg = std::max(MaxPhysReg, std::min(RegIdx + NumRegs, 256u));
1258 AGPRDefCount =
alignTo(AGPRDefCount, RegCount);
1260 AGPRDefCount += RegCount;
1261 if (CI.isEarlyClobber) {
1262 AGPRUseCount =
alignTo(AGPRUseCount, RegCount);
1263 AGPRUseCount += RegCount;
1266 AGPRUseCount =
alignTo(AGPRUseCount, RegCount);
1267 AGPRUseCount += RegCount;
1272 unsigned MaxVirtReg = std::max(AGPRUseCount, AGPRDefCount);
1277 return std::min(MaxVirtReg + MaxPhysReg, 256u);
1280struct AAAMDGPUMinAGPRAlloc
1281 :
public StateWrapper<DecIntegerState<>, AbstractAttribute> {
1282 using Base = StateWrapper<DecIntegerState<>, AbstractAttribute>;
1283 AAAMDGPUMinAGPRAlloc(
const IRPosition &IRP, Attributor &
A) :
Base(IRP) {}
1285 static AAAMDGPUMinAGPRAlloc &createForPosition(
const IRPosition &IRP,
1288 return *
new (
A.Allocator) AAAMDGPUMinAGPRAlloc(IRP,
A);
1290 "AAAMDGPUMinAGPRAlloc is only valid for function position");
1295 auto [MinNumAGPR, MaxNumAGPR] =
1298 if (MinNumAGPR == 0)
1299 indicateOptimisticFixpoint();
1302 const std::string getAsStr(Attributor *
A)
const override {
1303 std::string Str =
"amdgpu-agpr-alloc=";
1304 raw_string_ostream OS(Str);
1309 void trackStatistics()
const override {}
1312 DecIntegerState<> Maximum;
1319 const Value *CalleeOp = CB.getCalledOperand();
1324 unsigned NumRegs = inlineAsmGetNumRequiredAGPRs(IA, CB);
1328 switch (CB.getIntrinsicID()) {
1331 case Intrinsic::write_register:
1332 case Intrinsic::read_register:
1333 case Intrinsic::read_volatile_register: {
1338 auto [
Kind, RegIdx, NumRegs] =
1352 case Intrinsic::trap:
1353 case Intrinsic::debugtrap:
1354 case Intrinsic::ubsantrap:
1355 return CB.hasFnAttr(Attribute::NoCallback) ||
1356 !CB.hasFnAttr(
"trap-func-name");
1362 return CB.hasFnAttr(Attribute::NoCallback);
1366 auto *CBEdges =
A.getAAFor<AACallEdges>(
1368 if (!CBEdges || CBEdges->hasUnknownCallee()) {
1373 for (
const Function *PossibleCallee : CBEdges->getOptimisticEdges()) {
1374 const auto *CalleeInfo =
A.getAAFor<AAAMDGPUMinAGPRAlloc>(
1376 if (!CalleeInfo || !CalleeInfo->isValidState()) {
1387 bool UsedAssumedInformation =
false;
1388 if (!
A.checkForAllCallLikeInstructions(CheckForMinAGPRAllocs, *
this,
1389 UsedAssumedInformation))
1390 return indicatePessimisticFixpoint();
1396 LLVMContext &Ctx = getAssociatedFunction()->getContext();
1397 SmallString<4> Buffer;
1398 raw_svector_ostream OS(Buffer);
1401 return A.manifestAttrs(
1402 getIRPosition(), {Attribute::get(Ctx,
"amdgpu-agpr-alloc", OS.str())});
1405 StringRef
getName()
const override {
return "AAAMDGPUMinAGPRAlloc"; }
1406 const char *getIdAddr()
const override {
return &
ID; }
1410 static bool classof(
const AbstractAttribute *AA) {
1414 static const char ID;
// Type-ID anchor for AAAMDGPUMinAGPRAlloc (address-based identity used by
// getIdAddr/classof in the Attributor framework).
1417const char AAAMDGPUMinAGPRAlloc::ID = 0;
1421struct AAAMDGPUClusterDims
1422 :
public StateWrapper<BooleanState, AbstractAttribute> {
1423 using Base = StateWrapper<BooleanState, AbstractAttribute>;
1424 AAAMDGPUClusterDims(
const IRPosition &IRP, Attributor &
A) :
Base(IRP) {}
1427 static AAAMDGPUClusterDims &createForPosition(
const IRPosition &IRP,
1431 StringRef
getName()
const override {
return "AAAMDGPUClusterDims"; }
1434 const char *getIdAddr()
const override {
return &
ID; }
1438 static bool classof(
const AbstractAttribute *AA) {
1442 virtual const AMDGPU::ClusterDimsAttr &getClusterDims()
const = 0;
1445 static const char ID;
// Type-ID anchor for AAAMDGPUClusterDims (address-based identity used by
// getIdAddr/classof in the Attributor framework).
1448const char AAAMDGPUClusterDims::ID = 0;
1450struct AAAMDGPUClusterDimsFunction :
public AAAMDGPUClusterDims {
1451 AAAMDGPUClusterDimsFunction(
const IRPosition &IRP, Attributor &
A)
1452 : AAAMDGPUClusterDims(IRP,
A) {}
1456 assert(
F &&
"empty associated function");
1463 indicatePessimisticFixpoint();
1465 indicateOptimisticFixpoint();
1469 const std::string getAsStr(Attributor *
A)
const override {
1479 void trackStatistics()
const override {}
1482 auto OldState = Attr;
1484 auto CheckCallSite = [&](AbstractCallSite CS) {
1485 const auto *CallerAA =
A.getAAFor<AAAMDGPUClusterDims>(
1487 DepClassTy::REQUIRED);
1488 if (!CallerAA || !CallerAA->isValidState())
1491 return merge(CallerAA->getClusterDims());
1494 bool UsedAssumedInformation =
false;
1495 if (!
A.checkForAllCallSites(CheckCallSite, *
this,
1497 UsedAssumedInformation))
1498 return indicatePessimisticFixpoint();
1500 return OldState == Attr ? ChangeStatus::UNCHANGED : ChangeStatus::CHANGED;
1505 return ChangeStatus::UNCHANGED;
1506 return A.manifestAttrs(
1508 {Attribute::get(getAssociatedFunction()->
getContext(), AttrName,
1513 const AMDGPU::ClusterDimsAttr &getClusterDims()
const override {
1518 bool merge(
const AMDGPU::ClusterDimsAttr &
Other) {
1533 if (
Other.isUnknown())
1558 AMDGPU::ClusterDimsAttr Attr;
1560 static constexpr char AttrName[] =
"amdgpu-cluster-dims";
1563AAAMDGPUClusterDims &
1564AAAMDGPUClusterDims::createForPosition(
const IRPosition &IRP, Attributor &
A) {
1566 return *
new (
A.Allocator) AAAMDGPUClusterDimsFunction(IRP,
A);
1567 llvm_unreachable(
"AAAMDGPUClusterDims is only valid for function position");
1570static bool runImpl(SetVector<Function *> &Functions,
bool IsModulePass,
1571 bool DeleteFns,
Module &M, AnalysisGetter &AG,
1572 TargetMachine &TM, AMDGPUAttributorOptions
Options,
1575 CallGraphUpdater CGUpdater;
1577 AMDGPUInformationCache InfoCache(M, AG,
Allocator,
nullptr, TM);
1578 DenseSet<const char *>
Allowed(
1579 {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
1581 &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID,
1587 AttributorConfig AC(CGUpdater);
1588 AC.IsClosedWorldModule =
Options.IsClosedWorld;
1590 AC.IsModulePass = IsModulePass;
1591 AC.DeleteFns = DeleteFns;
1592 AC.DefaultInitializeLiveInternals =
false;
1593 AC.IndirectCalleeSpecializationCallback =
1594 [](Attributor &
A,
const AbstractAttribute &AA, CallBase &CB,
1599 AC.IPOAmendableCB = [](
const Function &
F) {
1600 return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
1603 Attributor
A(Functions, InfoCache, AC);
1606 StringRef LTOPhaseStr =
to_string(LTOPhase);
1607 dbgs() <<
"[AMDGPUAttributor] Running at phase " << LTOPhaseStr <<
'\n'
1608 <<
"[AMDGPUAttributor] Module " <<
M.getName() <<
" is "
1609 << (AC.IsClosedWorldModule ?
"" :
"not ")
1610 <<
"assumed to be a closed world.\n";
1613 for (
auto *
F : Functions) {
1617 CallingConv::ID CC =
F->getCallingConv();
1624 if (!
F->isDeclaration() &&
ST.hasClusters())
1627 if (
ST.hasGFX90AInsts())
1631 Value *Ptr =
nullptr;
1633 Ptr = LI->getPointerOperand();
1635 Ptr =
SI->getPointerOperand();
1637 Ptr = RMW->getPointerOperand();
1639 Ptr = CmpX->getPointerOperand();
1645 if (
II->getIntrinsicID() == Intrinsic::amdgcn_make_buffer_rsrc)
1652 return A.run() == ChangeStatus::CHANGED;
1665 if (!
F.isIntrinsic())
1666 Functions.insert(&
F);
1670 return runImpl(Functions,
true,
true, M, AG,
1671 TM, Options, LTOPhase)
1688 if (!
F->isIntrinsic())
1689 Functions.insert(
F);
1693 Module *M =
C.begin()->getFunction().getParent();
1696 return runImpl(Functions,
false,
false, *M, AG,
assert(UImm && (UImm != ~static_cast<T>(0)) && "Invalid immediate!")
static bool isDSAddress(const Constant *C)
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static cl::opt< unsigned > IndirectCallSpecializationThreshold("amdgpu-indirect-call-specialization-threshold", cl::desc("A threshold controls whether an indirect call will be specialized"), cl::init(3))
static ImplicitArgumentMask intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit, bool HasApertureRegs, bool SupportsGetDoorBellID, unsigned CodeObjectVersion)
static bool hasSanitizerAttributes(const Function &F)
Returns true if sanitizer attributes are present on a function.
ImplicitArgumentPositions
static bool castRequiresQueuePtr(unsigned SrcAS)
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Expand Atomic instructions
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI, const LibcallLoweringInfo &Libcalls, AssumptionCache *AC)
AMD GCN specific subclass of TargetSubtarget.
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
Machine Check Debug Module
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
FunctionAnalysisManager FAM
static StringRef getName(Value *V)
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, const llvm::StringTable &StandardNames, VectorLibrary VecLib)
Initialize the set of available library functions based on the specified target triple.
PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM, LazyCallGraph &CG, CGSCCUpdateResult &UR)
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
static ClusterDimsAttr get(const Function &F)
std::string to_string() const
bool isVariableDims() const
uint64_t getZExtValue() const
Get zero extended value.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Value * getArgOperand(unsigned i) const
LLVM_ABI Intrinsic::ID getIntrinsicID() const
Returns the intrinsic ID of the intrinsic called, or Intrinsic::not_intrinsic if the called function is not an intrinsic, or if this is an indirect call.
const APInt & getLower() const
Return the lower value for this range.
const APInt & getUpper() const
Return the upper value for this range.
This is an important base class in LLVM.
A proxy from a FunctionAnalysisManager to an SCC.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
A node in the call graph.
An SCC of the call graph.
A lazily constructed view of the call graph of a module.
A Module instance is used to store all the information related to an LLVM module.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
A vector that has set insertion semantics.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
void push_back(const T &Elt)
std::string str() const
str - Get the contents as an std::string.
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
LLVM_ABI bool isDroppable() const
A droppable user is a user for which uses can be dropped without affecting correctness and should be ...
Type * getType() const
All values are typed, get the type of this value.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI)
unsigned getAMDHSACodeObjectVersion(const Module &M)
unsigned getDefaultQueueImplicitArgPosition(unsigned CodeObjectVersion)
std::tuple< char, unsigned, unsigned > parseAsmPhysRegName(StringRef RegName)
Returns a valid charcode or 0 in the first entry if this is a valid physical register name.
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
unsigned getHostcallImplicitArgPosition(unsigned CodeObjectVersion)
unsigned getCompletionActionImplicitArgPosition(unsigned CodeObjectVersion)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
unsigned getMultigridSyncArgImplicitArgPosition(unsigned CodeObjectVersion)
E & operator^=(E &LHS, E RHS)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
@ CE
Windows NT (Windows on ARM)
initializer< Ty > init(const Ty &Val)
NodeAddr< CodeNode * > Code
NodeAddr< FuncNode * > Func
Context & getContext() const
friend class Instruction
Iterator for Instructions in a `BasicBlock.
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
InnerAnalysisManagerProxy< FunctionAnalysisManager, Module > FunctionAnalysisManagerModuleProxy
Provide the FunctionAnalysisManager to Module proxy.
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
AnalysisManager< LazyCallGraph::SCC, LazyCallGraph & > CGSCCAnalysisManager
The CGSCC analysis manager.
ThinOrFullLTOPhase
This enumerates the LLVM full LTO or ThinLTO optimization phases.
@ None
No LTO/ThinLTO behavior needed.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
const char * to_string(ThinOrFullLTOPhase Phase)
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
ChangeStatus clampStateAndIndicateChange(StateType &S, const StateType &R)
Helper function to clamp a state S of type StateType with the information in R and indicate/return if...
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
BumpPtrAllocatorImpl<> BumpPtrAllocator
The standard BumpPtrAllocator which just uses the default template parameters.
AnalysisManager< Module > ModuleAnalysisManager
Convenience typedef for the Module analysis manager.
static LLVM_ABI const char ID
Unique ID (due to the unique address)
static LLVM_ABI const char ID
Unique ID (due to the unique address)
virtual const SetVector< Function * > & getOptimisticEdges() const =0
Get the optimistic edges.
static LLVM_ABI const char ID
Unique ID (due to the unique address)
virtual bool hasNonAsmUnknownCallee() const =0
Is there any call with a unknown callee, excluding any inline asm.
static LLVM_ABI const char ID
Unique ID (due to the unique address)
static LLVM_ABI const char ID
Unique ID (due to the unique address)
Instruction * getRemoteInst() const
Return the actual instruction that causes the access.
static LLVM_ABI const char ID
Unique ID (due to the unique address)
static LLVM_ABI const char ID
Unique ID (due to the unique address)
static LLVM_ABI const char ID
Unique ID (due to the unique address)
static LLVM_ABI const char ID
Unique ID (due to the unique address)
virtual const char * getIdAddr() const =0
This function should return the address of the ID of the AbstractAttribute.
Wrapper for FunctionAnalysisManager.
The fixpoint analysis framework that orchestrates the attribute deduction.
Support structure for SCC passes to communicate updates the call graph back to the CGSCC pass manager...
DecIntegerState & takeAssumedMaximum(base_t Value)
Take maximum of assumed and Value.
Helper to describe and deal with positions in the LLVM-IR.
static const IRPosition callsite_returned(const CallBase &CB)
Create a position describing the returned value of CB.
static const IRPosition value(const Value &V, const CallBaseContext *CBContext=nullptr)
Create a position describing the value of V.
@ IRP_FUNCTION
An attribute for a function (scope).
static const IRPosition function(const Function &F, const CallBaseContext *CBContext=nullptr)
Create a position describing the function scope of F.
Kind getPositionKind() const
Return the associated position kind.
static const IRPosition callsite_function(const CallBase &CB)
Create a position describing the function scope of CB.
bool isValidState() const override
See AbstractState::isValidState() NOTE: For now we simply pretend that the worst possible state is in...
ChangeStatus indicatePessimisticFixpoint() override
See AbstractState::indicatePessimisticFixpoint(...)
Helper to tie a abstract state implementation to an abstract attribute.