#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"

#define DEBUG_TYPE "amdgpu-lower-kernel-attributes"

using namespace llvm;
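// This pass folds loads of the work group size, block count, grid size, and
// remainder fields, read either from the dispatch packet (code object V4 and
// earlier) or from the implicit kernel arguments (V5 and above), when the
// kernel carries reqd_work_group_size metadata or the uniform-work-group-size
// attribute.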
// Field offsets in hsa_kernel_dispatch_packet_t.
enum DispatchPackedOffsets {
  WORKGROUP_SIZE_X = 4,
  WORKGROUP_SIZE_Y = 6,
  WORKGROUP_SIZE_Z = 8,

  GRID_SIZE_X = 12,
  GRID_SIZE_Y = 16,
  GRID_SIZE_Z = 20
};
// Field offsets to the implicit kernel argument pointer.
enum ImplicitArgOffsets {
  HIDDEN_BLOCK_COUNT_X = 0,
  HIDDEN_BLOCK_COUNT_Y = 4,
  HIDDEN_BLOCK_COUNT_Z = 8,

  HIDDEN_GROUP_SIZE_X = 12,
  HIDDEN_GROUP_SIZE_Y = 14,
  HIDDEN_GROUP_SIZE_Z = 16,

  HIDDEN_REMAINDER_X = 18,
  HIDDEN_REMAINDER_Y = 20,
  HIDDEN_REMAINDER_Z = 22,
};
class AMDGPULowerKernelAttributes : public ModulePass {
public:
  static char ID;

  StringRef getPassName() const override {
    return "AMDGPU Kernel Attributes";
  }
};
Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
  auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr
                                 : Intrinsic::amdgcn_dispatch_ptr;
  StringRef Name = Intrinsic::getName(IntrinsicId);
  return M.getFunction(Name);
}
static bool processUse(CallInst *CI, bool IsV5OrAbove) {
  Function *F = CI->getParent()->getParent();

  auto MD = F->getMetadata("reqd_work_group_size");
  const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;

  const bool HasUniformWorkGroupSize =
      F->getFnAttribute("uniform-work-group-size").getValueAsBool();

  if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize)
    return false;
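  // For reference, a kernel this pass applies to typically looks like
  // (assumed example, not from this file):
  //   define amdgpu_kernel void @k(...) #0 !reqd_work_group_size !0 { ... }
  //   attributes #0 = { "uniform-work-group-size"="true" }
  //   !0 = !{i32 64, i32 1, i32 1}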
  Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
  Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
  Value *Remainders[3] = {nullptr, nullptr, nullptr};
  Value *GridSizes[3] = {nullptr, nullptr, nullptr};
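  // Walk the users of the base-pointer call and record loads of the known
  // fields: direct loads, loads through a constant-offset GEP, and loads
  // through a bitcast of either.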
  const DataLayout &DL = F->getParent()->getDataLayout();

  for (User *U : CI->users()) {
    int64_t Offset = 0;
    auto *Load = dyn_cast<LoadInst>(U); // Load directly from the base pointer?
    auto *BCI = dyn_cast<BitCastInst>(U);
    if (!Load && !BCI) {
      if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
        continue;
      Load = dyn_cast<LoadInst>(*U->user_begin()); // Load through a GEP?
      BCI = dyn_cast<BitCastInst>(*U->user_begin());
    }

    if (BCI) {
      if (!BCI->hasOneUse())
        continue;
      Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load through the bitcast?
    }

    if (!Load || !Load->isSimple())
      continue;
    unsigned LoadSize = DL.getTypeStoreSize(Load->getType());

    if (IsV5OrAbove) { // Base is the implicit kernel argument pointer.
      switch (Offset) {
      case HIDDEN_BLOCK_COUNT_X:
        if (LoadSize == 4)
          BlockCounts[0] = Load;
        break;
      case HIDDEN_BLOCK_COUNT_Y:
        if (LoadSize == 4)
          BlockCounts[1] = Load;
        break;
      case HIDDEN_BLOCK_COUNT_Z:
        if (LoadSize == 4)
          BlockCounts[2] = Load;
        break;
      case HIDDEN_GROUP_SIZE_X:
        if (LoadSize == 2)
          GroupSizes[0] = Load;
        break;
      case HIDDEN_GROUP_SIZE_Y:
        if (LoadSize == 2)
          GroupSizes[1] = Load;
        break;
      case HIDDEN_GROUP_SIZE_Z:
        if (LoadSize == 2)
          GroupSizes[2] = Load;
        break;
      case HIDDEN_REMAINDER_X:
        if (LoadSize == 2)
          Remainders[0] = Load;
        break;
      case HIDDEN_REMAINDER_Y:
        if (LoadSize == 2)
          Remainders[1] = Load;
        break;
      case HIDDEN_REMAINDER_Z:
        if (LoadSize == 2)
          Remainders[2] = Load;
        break;
      }
    } else { // Base is the dispatch packet pointer.
      switch (Offset) {
      case WORKGROUP_SIZE_X:
        if (LoadSize == 2)
          GroupSizes[0] = Load;
        break;
      case WORKGROUP_SIZE_Y:
        if (LoadSize == 2)
          GroupSizes[1] = Load;
        break;
      case WORKGROUP_SIZE_Z:
        if (LoadSize == 2)
          GroupSizes[2] = Load;
        break;
      // GRID_SIZE_{X,Y,Z} loads (4 bytes each) are recorded into GridSizes[]
      // in the same way.
      }
    }
  }
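  // For code object V5 with a uniform work group size, every workgroup is a
  // full group: a compare "workgroup_id_* u< hidden_block_count_*" folds to
  // true, and the hidden remainder values fold to zero.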
  bool MadeChange = false;
  if (IsV5OrAbove && HasUniformWorkGroupSize) {
    for (int I = 0; I < 3; ++I) {
      Value *BlockCount = BlockCounts[I];
      if (!BlockCount)
        continue;

      using namespace llvm::PatternMatch;
      auto GroupIDIntrin =
          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

      for (User *ICmp : BlockCount->users()) {
        ICmpInst::Predicate Pred;
        if (match(ICmp, m_ICmp(Pred, GroupIDIntrin, m_Specific(BlockCount)))) {
          if (Pred != ICmpInst::ICMP_ULT)
            continue;

          ICmp->replaceAllUsesWith(ConstantInt::getTrue(ICmp->getContext()));
          MadeChange = true;
        }
      }
    }

    // All remainders are known to be zero here.
    for (Value *Remainder : Remainders) {
      if (!Remainder)
        continue;
      Remainder->replaceAllUsesWith(Constant::getNullValue(Remainder->getType()));
      MadeChange = true;
    }
  } else if (HasUniformWorkGroupSize) { // Code object V4 and earlier.
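    // Pre-V5, the group size seen by a workgroup is computed as
    //   umin(grid_size - workgroup_id * group_size, group_size)
    // to handle a partial last group. With a uniform work group size the
    // subtraction never yields less than group_size, so the umin can be
    // replaced by the (possibly known-constant) group size itself.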
    for (int I = 0; I < 3; ++I) {
      Value *GroupSize = GroupSizes[I];
      Value *GridSize = GridSizes[I];
      if (!GroupSize || !GridSize)
        continue;

      using namespace llvm::PatternMatch;
      auto GroupIDIntrin =
          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
      for (User *U : GroupSize->users()) {
        auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
        if (!ZextGroupSize)
          continue;

        for (User *UMin : ZextGroupSize->users()) {
          if (match(UMin,
                    m_UMin(m_Sub(m_Specific(GridSize),
                                 m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))),
                           m_Specific(ZextGroupSize)))) {
            if (HasReqdWorkGroupSize) {
              ConstantInt *KnownSize
                  = mdconst::extract<ConstantInt>(MD->getOperand(I));
              UMin->replaceAllUsesWith(ConstantExpr::getIntegerCast(
                  KnownSize, UMin->getType(), false));
            } else {
              UMin->replaceAllUsesWith(ZextGroupSize);
            }

            MadeChange = true;
          }
        }
      }
    }
  }
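  // If reqd_work_group_size is set, the group size loads themselves can be
  // replaced with the known constants from the metadata.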
  if (!HasReqdWorkGroupSize)
    return MadeChange;

  for (int I = 0; I < 3; I++) {
    Value *GroupSize = GroupSizes[I];
    if (!GroupSize)
      continue;

    ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
    GroupSize->replaceAllUsesWith(
        ConstantExpr::getIntegerCast(KnownSize, GroupSize->getType(), false));
    MadeChange = true;
  }

  return MadeChange;
}
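// Legacy pass manager entry point: find the declaration of the base-pointer
// intrinsic and process each unique call site.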
bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
  bool MadeChange = false;
  bool IsV5OrAbove = AMDGPU::getCodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5;
  Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove);

  if (!BasePtr) // No calls to the base-pointer intrinsic in this module.
    return false;

  SmallPtrSet<Instruction *, 4> HandledUses;
  for (auto *U : BasePtr->users()) {
    CallInst *CI = cast<CallInst>(U);
    if (HandledUses.insert(CI).second) {
      if (processUse(CI, IsV5OrAbove))
        MadeChange = true;
    }
  }

  return MadeChange;
}
345 "AMDGPU Kernel Attributes",
false,
false)
349char AMDGPULowerKernelAttributes::
ID = 0;
352 return new AMDGPULowerKernelAttributes();
PreservedAnalyses
AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) {
  bool IsV5OrAbove =
      AMDGPU::getCodeObjectVersion(*F.getParent()) >= AMDGPU::AMDHSA_COV5;
  Function *BasePtr = getBasePtrIntrinsic(*F.getParent(), IsV5OrAbove);

  if (!BasePtr) // No calls to the base-pointer intrinsic in this module.
    return PreservedAnalyses::all();

  for (Instruction &I : instructions(F)) {
    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
      if (CI->getCalledFunction() == BasePtr)
        processUse(CI, IsV5OrAbove);
    }
  }

  return PreservedAnalyses::all();
}
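// Illustrative effect (assumed example, code object V5): given
//   %p = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
//   %gep = getelementptr i8, ptr addrspace(4) %p, i64 12   ; HIDDEN_GROUP_SIZE_X
//   %size.x = load i16, ptr addrspace(4) %gep
// and !reqd_work_group_size !{i32 64, i32 1, i32 1}, the load %size.x is
// replaced by the constant i16 64.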