#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-lower-kernel-attributes"

using namespace llvm;

// Field offsets in hsa_kernel_dispatch_packet_t (pre-v5 code objects read the
// work-group and grid sizes straight from the dispatch packet).
enum DispatchPackedOffsets {
  WORKGROUP_SIZE_X = 4,
  WORKGROUP_SIZE_Y = 6,
  WORKGROUP_SIZE_Z = 8,

  GRID_SIZE_X = 12,
  GRID_SIZE_Y = 16,
  GRID_SIZE_Z = 20
};

// Field offsets from the hidden implicit kernel argument pointer (code object
// v5 and above).
enum ImplicitArgOffsets {
  HIDDEN_BLOCK_COUNT_X = 0,
  HIDDEN_BLOCK_COUNT_Y = 4,
  HIDDEN_BLOCK_COUNT_Z = 8,

  HIDDEN_GROUP_SIZE_X = 12,
  HIDDEN_GROUP_SIZE_Y = 14,
  HIDDEN_GROUP_SIZE_Z = 16,

  HIDDEN_REMAINDER_X = 18,
  HIDDEN_REMAINDER_Y = 20,
  HIDDEN_REMAINDER_Z = 22,
};
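// A hedged illustration (offsets and exact IR shape are approximate, not taken
// verbatim from any particular frontend) of the kind of load these offsets
// describe. For code object v5 and above, the group size comes from the hidden
// implicit kernel arguments:
//
//   %implicitarg = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
//   %gep = getelementptr i8, ptr addrspace(4) %implicitarg, i64 12 ; HIDDEN_GROUP_SIZE_X
//   %group.size.x = load i16, ptr addrspace(4) %gep, align 2
//
// For earlier code objects the same information is read from the in-memory
// hsa_kernel_dispatch_packet_t reached through @llvm.amdgcn.dispatch.ptr.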
class AMDGPULowerKernelAttributes : public ModulePass {
public:
  static char ID;

  AMDGPULowerKernelAttributes() : ModulePass(ID) {}

  bool runOnModule(Module &M) override;

  StringRef getPassName() const override {
    return "AMDGPU Kernel Attributes";
  }
};

// Return the intrinsic call providing the base pointer the hidden values are
// loaded from: the implicit-argument pointer for code object v5 and above,
// otherwise the dispatch-packet pointer.
Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
  auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr
                                 : Intrinsic::amdgcn_dispatch_ptr;
  StringRef Name = Intrinsic::getName(IntrinsicId);
  return M.getFunction(Name);
}
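// The folds in processUse() below are driven by two pieces of frontend
// information (a hedged summary; exact producer behavior varies by frontend):
//
//  * !reqd_work_group_size metadata with three i32 operands, e.g. produced for
//    the OpenCL kernel attribute __attribute__((reqd_work_group_size(64, 1, 1))).
//
//  * the "uniform-work-group-size"="true" function attribute, which asserts
//    that each grid dimension is a multiple of the corresponding work-group
//    size (the OpenCL 1.2 enqueue model).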
static bool processUse(CallInst *CI, bool IsV5OrAbove) {
  Function *F = CI->getParent()->getParent();

  auto MD = F->getMetadata("reqd_work_group_size");
  const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;

  const bool HasUniformWorkGroupSize =
      F->getFnAttribute("uniform-work-group-size").getValueAsBool();

  if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize)
    return false;

  Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
  Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
  Value *Remainders[3] = {nullptr, nullptr, nullptr};
  Value *GridSizes[3] = {nullptr, nullptr, nullptr};

  const DataLayout &DL = F->getParent()->getDataLayout();

  // Collect the loads of interest: each is reached from the base pointer
  // either directly, through a bitcast, or through a constant-offset GEP.
  for (User *U : CI->users()) {
    if (!U->hasOneUse())
      continue;

    int64_t Offset = 0;
    auto *Load = dyn_cast<LoadInst>(U); // Load directly from the base pointer?
    auto *BCI = dyn_cast<BitCastInst>(U);
    if (!Load && !BCI) {
      if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
        continue;
      Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from a GEP?
      BCI = dyn_cast<BitCastInst>(*U->user_begin());
    }

    if (BCI) {
      if (!BCI->hasOneUse())
        continue;
      Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load through the bitcast?
    }

    if (!Load)
      continue;

    unsigned LoadSize = DL.getTypeStoreSize(Load->getType());

    if (IsV5OrAbove) { // Base is the implicit-argument pointer.
      switch (Offset) {
      case HIDDEN_BLOCK_COUNT_X:
        if (LoadSize == 4)
          BlockCounts[0] = Load;
        break;
      case HIDDEN_BLOCK_COUNT_Y:
        if (LoadSize == 4)
          BlockCounts[1] = Load;
        break;
      case HIDDEN_BLOCK_COUNT_Z:
        if (LoadSize == 4)
          BlockCounts[2] = Load;
        break;
      case HIDDEN_GROUP_SIZE_X:
        if (LoadSize == 2)
          GroupSizes[0] = Load;
        break;
      case HIDDEN_GROUP_SIZE_Y:
        if (LoadSize == 2)
          GroupSizes[1] = Load;
        break;
      case HIDDEN_GROUP_SIZE_Z:
        if (LoadSize == 2)
          GroupSizes[2] = Load;
        break;
      case HIDDEN_REMAINDER_X:
        if (LoadSize == 2)
          Remainders[0] = Load;
        break;
      case HIDDEN_REMAINDER_Y:
        if (LoadSize == 2)
          Remainders[1] = Load;
        break;
      case HIDDEN_REMAINDER_Z:
        if (LoadSize == 2)
          Remainders[2] = Load;
        break;
      default:
        break;
      }
    } else { // Base is the dispatch-packet pointer.
      switch (Offset) {
      case WORKGROUP_SIZE_X:
        if (LoadSize == 2)
          GroupSizes[0] = Load;
        break;
      case WORKGROUP_SIZE_Y:
        if (LoadSize == 2)
          GroupSizes[1] = Load;
        break;
      case WORKGROUP_SIZE_Z:
        if (LoadSize == 2)
          GroupSizes[2] = Load;
        break;
      case GRID_SIZE_X:
        if (LoadSize == 4)
          GridSizes[0] = Load;
        break;
      case GRID_SIZE_Y:
        if (LoadSize == 4)
          GridSizes[1] = Load;
        break;
      case GRID_SIZE_Z:
        if (LoadSize == 4)
          GridSizes[2] = Load;
        break;
      default:
        break;
      }
    }
  }
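  // Shape of the code being matched below (a hedged summary of the device
  // library's get_local_size idiom; names are illustrative):
  //
  //   pre-v5:  local_size = umin(grid_size - group_id * group_size, group_size)
  //   v5+:     local_size = group_id < block_count ? group_size : remainder
  //
  // With "uniform-work-group-size"="true" every work group is full, so the
  // pre-v5 umin always evaluates to group_size, the v5+ compare is always
  // true, and every remainder field is zero.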
  using namespace llvm::PatternMatch;

  bool MadeChange = false;
  if (IsV5OrAbove && HasUniformWorkGroupSize) {
    for (int I = 0; I < 3; ++I) {
      Value *BlockCount = BlockCounts[I];
      if (!BlockCount)
        continue;

      auto GroupIDIntrin =
          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

      // "group_id < block_count" is always true with a uniform work-group size.
      for (User *ICmp : BlockCount->users()) {
        ICmpInst::Predicate Pred;
        if (match(ICmp, m_ICmp(Pred, GroupIDIntrin, m_Specific(BlockCount))) &&
            Pred == ICmpInst::ICMP_ULT) {
          ICmp->replaceAllUsesWith(ConstantInt::getTrue(ICmp->getType()));
          MadeChange = true;
        }
      }
    }

    // All remainders are zero with a uniform work-group size.
    for (Value *Remainder : Remainders) {
      if (!Remainder)
        continue;
      Remainder->replaceAllUsesWith(Constant::getNullValue(Remainder->getType()));
      MadeChange = true;
    }
  } else if (HasUniformWorkGroupSize) { // Pre-v5: fold the umin idiom.
    for (int I = 0; I < 3; ++I) {
      Value *GroupSize = GroupSizes[I];
      Value *GridSize = GridSizes[I];
      if (!GroupSize || !GridSize)
        continue;

      auto GroupIDIntrin =
          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

      for (User *U : GroupSize->users()) {
        auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
        if (!ZextGroupSize)
          continue;

        for (User *UMin : ZextGroupSize->users()) {
          if (match(UMin,
                    m_UMin(m_Sub(m_Specific(GridSize),
                                 m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))),
                           m_Specific(ZextGroupSize)))) {
            if (HasReqdWorkGroupSize) {
              ConstantInt *KnownSize =
                  mdconst::extract<ConstantInt>(MD->getOperand(I));
              UMin->replaceAllUsesWith(ConstantExpr::getIntegerCast(
                  KnownSize, UMin->getType(), false));
            } else {
              UMin->replaceAllUsesWith(ZextGroupSize);
            }

            MadeChange = true;
          }
        }
      }
    }
  }

  // With reqd_work_group_size, each surviving group-size load can itself be
  // replaced by the known constant.
  if (!HasReqdWorkGroupSize)
    return MadeChange;

  for (int I = 0; I < 3; I++) {
    Value *GroupSize = GroupSizes[I];
    if (!GroupSize)
      continue;

    ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
    GroupSize->replaceAllUsesWith(
        ConstantExpr::getIntegerCast(KnownSize, GroupSize->getType(), false));
    MadeChange = true;
  }

  return MadeChange;
}
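// Net effect for a kernel carrying reqd_work_group_size(64, 1, 1) (a hedged
// illustration): every recognized load of the X group size is rewritten to the
// constant 64 (cast to the load's type), so downstream address arithmetic such
// as  global_id.x = group_id.x * group_size.x + local_id.x  folds around a
// compile-time-known group size.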
bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
  bool MadeChange = false;
  bool IsV5OrAbove = AMDGPU::getAmdhsaCodeObjectVersion() >= 5;
  Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove);
  if (!BasePtr) // The base pointer intrinsic is not used in this module.
    return false;

  SmallPtrSet<Instruction *, 4> HandledUses;
  for (auto *U : BasePtr->users()) {
    CallInst *CI = cast<CallInst>(U);
    if (HandledUses.insert(CI).second) {
      if (processUse(CI, IsV5OrAbove))
        MadeChange = true;
    }
  }
  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
                      "AMDGPU Kernel Attributes", false, false)
INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,
                    "AMDGPU Kernel Attributes", false, false)

char AMDGPULowerKernelAttributes::ID = 0;

ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
  return new AMDGPULowerKernelAttributes();
}

PreservedAnalyses
AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) {
  bool IsV5OrAbove = AMDGPU::getAmdhsaCodeObjectVersion() >= 5;
  Function *BasePtr = getBasePtrIntrinsic(*F.getParent(), IsV5OrAbove);
  if (!BasePtr) // The base pointer intrinsic is not used in this function.
    return PreservedAnalyses::all();

  for (Instruction &I : instructions(F)) {
    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
      if (CI->getCalledFunction() == BasePtr)
        processUse(CI, IsV5OrAbove);
    }
  }
  return PreservedAnalyses::all();
}
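// Hedged usage note: the legacy pass is constructed via
// createAMDGPULowerKernelAttributesPass() above, and the new pass manager
// wrapper is AMDGPULowerKernelAttributesPass::run(). The standalone pass name
// is assumed to match DEBUG_TYPE, i.e. something like
//   opt -passes=amdgpu-lower-kernel-attributes in.ll -S -o out.ll
// when exercising the transform in isolation.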