LLVM  10.0.0svn
AMDGPULowerKernelAttributes.cpp
Go to the documentation of this file.
1 //===-- AMDGPULowerKernelAttributes.cpp ------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file This pass attempts to make use of reqd_work_group_size metadata
10 /// to eliminate loads from the dispatch packet and to constant fold OpenCL
11 /// get_local_size-like functions.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPU.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "llvm/Analysis/ValueTracking.h"
18 #include "llvm/CodeGen/Passes.h"
20 #include "llvm/IR/Constants.h"
21 #include "llvm/IR/Function.h"
22 #include "llvm/IR/Instructions.h"
23 #include "llvm/IR/PatternMatch.h"
24 #include "llvm/Pass.h"
25 
26 #define DEBUG_TYPE "amdgpu-lower-kernel-attributes"
27 
28 using namespace llvm;
29 
30 namespace {
31 
/// Byte offsets of the fields this pass reads from the dispatch packet
/// (hsa_kernel_dispatch_packet_t).
///
/// This is deliberately an unscoped enum: the enumerators are used directly
/// as `case` labels when switching on an integer byte offset in processUse.
/// There, the WORKGROUP_SIZE_* fields are only matched by 2-byte loads and the
/// GRID_SIZE_* fields only by 4-byte loads.
enum DispatchPackedOffsets {
  WORKGROUP_SIZE_X = 4,
  WORKGROUP_SIZE_Y = 6,
  WORKGROUP_SIZE_Z = 8,

  GRID_SIZE_X = 12,
  GRID_SIZE_Y = 16,
  GRID_SIZE_Z = 20
};
42 
43 class AMDGPULowerKernelAttributes : public ModulePass {
44  Module *Mod = nullptr;
45 
46 public:
47  static char ID;
48 
49  AMDGPULowerKernelAttributes() : ModulePass(ID) {}
50 
51  bool processUse(CallInst *CI);
52 
53  bool doInitialization(Module &M) override;
54  bool runOnModule(Module &M) override;
55 
56  StringRef getPassName() const override {
57  return "AMDGPU Kernel Attributes";
58  }
59 
60  void getAnalysisUsage(AnalysisUsage &AU) const override {
61  AU.setPreservesAll();
62  }
63 };
64 
65 } // end anonymous namespace
66 
67 bool AMDGPULowerKernelAttributes::doInitialization(Module &M) {
68  Mod = &M;
69  return false;
70 }
71 
72 bool AMDGPULowerKernelAttributes::processUse(CallInst *CI) {
73  Function *F = CI->getParent()->getParent();
74 
75  auto MD = F->getMetadata("reqd_work_group_size");
76  const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;
77 
78  const bool HasUniformWorkGroupSize =
79  F->getFnAttribute("uniform-work-group-size").getValueAsString() == "true";
80 
81  if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize)
82  return false;
83 
84  Value *WorkGroupSizeX = nullptr;
85  Value *WorkGroupSizeY = nullptr;
86  Value *WorkGroupSizeZ = nullptr;
87 
88  Value *GridSizeX = nullptr;
89  Value *GridSizeY = nullptr;
90  Value *GridSizeZ = nullptr;
91 
92  const DataLayout &DL = Mod->getDataLayout();
93 
94  // We expect to see several GEP users, casted to the appropriate type and
95  // loaded.
96  for (User *U : CI->users()) {
97  if (!U->hasOneUse())
98  continue;
99 
100  int64_t Offset = 0;
101  if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
102  continue;
103 
104  auto *BCI = dyn_cast<BitCastInst>(*U->user_begin());
105  if (!BCI || !BCI->hasOneUse())
106  continue;
107 
108  auto *Load = dyn_cast<LoadInst>(*BCI->user_begin());
109  if (!Load || !Load->isSimple())
110  continue;
111 
112  unsigned LoadSize = DL.getTypeStoreSize(Load->getType());
113 
114  // TODO: Handle merged loads.
115  switch (Offset) {
116  case WORKGROUP_SIZE_X:
117  if (LoadSize == 2)
118  WorkGroupSizeX = Load;
119  break;
120  case WORKGROUP_SIZE_Y:
121  if (LoadSize == 2)
122  WorkGroupSizeY = Load;
123  break;
124  case WORKGROUP_SIZE_Z:
125  if (LoadSize == 2)
126  WorkGroupSizeZ = Load;
127  break;
128  case GRID_SIZE_X:
129  if (LoadSize == 4)
130  GridSizeX = Load;
131  break;
132  case GRID_SIZE_Y:
133  if (LoadSize == 4)
134  GridSizeY = Load;
135  break;
136  case GRID_SIZE_Z:
137  if (LoadSize == 4)
138  GridSizeZ = Load;
139  break;
140  default:
141  break;
142  }
143  }
144 
145  // Pattern match the code used to handle partial workgroup dispatches in the
146  // library implementation of get_local_size, so the entire function can be
147  // constant folded with a known group size.
148  //
149  // uint r = grid_size - group_id * group_size;
150  // get_local_size = (r < group_size) ? r : group_size;
151  //
152  // If we have uniform-work-group-size (which is the default in OpenCL 1.2),
153  // the grid_size is required to be a multiple of group_size). In this case:
154  //
155  // grid_size - (group_id * group_size) < group_size
156  // ->
157  // grid_size < group_size + (group_id * group_size)
158  //
159  // (grid_size / group_size) < 1 + group_id
160  //
161  // grid_size / group_size is at least 1, so we can conclude the select
162  // condition is false (except for group_id == 0, where the select result is
163  // the same).
164 
165  bool MadeChange = false;
166  Value *WorkGroupSizes[3] = { WorkGroupSizeX, WorkGroupSizeY, WorkGroupSizeZ };
167  Value *GridSizes[3] = { GridSizeX, GridSizeY, GridSizeZ };
168 
169  for (int I = 0; HasUniformWorkGroupSize && I < 3; ++I) {
170  Value *GroupSize = WorkGroupSizes[I];
171  Value *GridSize = GridSizes[I];
172  if (!GroupSize || !GridSize)
173  continue;
174 
175  for (User *U : GroupSize->users()) {
176  auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
177  if (!ZextGroupSize)
178  continue;
179 
180  for (User *ZextUser : ZextGroupSize->users()) {
181  auto *SI = dyn_cast<SelectInst>(ZextUser);
182  if (!SI)
183  continue;
184 
185  using namespace llvm::PatternMatch;
186  auto GroupIDIntrin = I == 0 ?
187  m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>() :
188  (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>() :
189  m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
190 
191  auto SubExpr = m_Sub(m_Specific(GridSize),
192  m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize)));
193 
194  ICmpInst::Predicate Pred;
195  if (match(SI,
196  m_Select(m_ICmp(Pred, SubExpr, m_Specific(ZextGroupSize)),
197  SubExpr,
198  m_Specific(ZextGroupSize))) &&
199  Pred == ICmpInst::ICMP_ULT) {
200  if (HasReqdWorkGroupSize) {
201  ConstantInt *KnownSize
202  = mdconst::extract<ConstantInt>(MD->getOperand(I));
203  SI->replaceAllUsesWith(ConstantExpr::getIntegerCast(KnownSize,
204  SI->getType(),
205  false));
206  } else {
207  SI->replaceAllUsesWith(ZextGroupSize);
208  }
209 
210  MadeChange = true;
211  }
212  }
213  }
214  }
215 
216  if (!HasReqdWorkGroupSize)
217  return MadeChange;
218 
219  // Eliminate any other loads we can from the dispatch packet.
220  for (int I = 0; I < 3; ++I) {
221  Value *GroupSize = WorkGroupSizes[I];
222  if (!GroupSize)
223  continue;
224 
225  ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
226  GroupSize->replaceAllUsesWith(
228  GroupSize->getType(),
229  false));
230  MadeChange = true;
231  }
232 
233  return MadeChange;
234 }
235 
236 // TODO: Move makeLIDRangeMetadata usage into here. Seem to not get
237 // TargetPassConfig for subtarget.
238 bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
239  StringRef DispatchPtrName
240  = Intrinsic::getName(Intrinsic::amdgcn_dispatch_ptr);
241 
242  Function *DispatchPtr = Mod->getFunction(DispatchPtrName);
243  if (!DispatchPtr) // Dispatch ptr not used.
244  return false;
245 
246  bool MadeChange = false;
247 
248  SmallPtrSet<Instruction *, 4> HandledUses;
249  for (auto *U : DispatchPtr->users()) {
250  CallInst *CI = cast<CallInst>(U);
251  if (HandledUses.insert(CI).second) {
252  if (processUse(CI))
253  MadeChange = true;
254  }
255  }
256 
257  return MadeChange;
258 }
259 
260 INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
261  "AMDGPU IR optimizations", false, false)
262 INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE, "AMDGPU IR optimizations",
263  false, false)
264 
265 char AMDGPULowerKernelAttributes::ID = 0;
266 
268  return new AMDGPULowerKernelAttributes();
269 }
A parsed version of the target data layout string in and methods for querying it. ...
Definition: DataLayout.h:111
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
Definition: PatternMatch.h:728
This class represents lattice values for constants.
Definition: AllocatorList.h:23
A Module instance is used to store all the information related to an LLVM module. ...
Definition: Module.h:65
AMDGPU IR optimizations
This class represents zero extension of integer types.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
Definition: PatternMatch.h:783
This class represents a function call, abstracting a target machine's calling convention.
unsigned less than
Definition: InstrTypes.h:757
ModulePass * createAMDGPULowerKernelAttributesPass()
F(f)
An instruction for reading from memory.
Definition: Instructions.h:167
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:47
static Constant * getIntegerCast(Constant *C, Type *Ty, bool isSigned)
Create a ZExt, Bitcast or Trunc for integer -> integer casts.
Definition: Constants.cpp:1631
StringRef getName(ID id)
Return the LLVM name for an intrinsic, such as "llvm.ppc.altivec.lvx".
Definition: Function.cpp:637
This class represents the LLVM 'select' instruction.
MDNode * getMetadata(unsigned KindID) const
Get the current metadata attachments for the given kind, if any.
Definition: Metadata.cpp:1440
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:245
#define DEBUG_TYPE
This class represents a no-op cast from one type to another.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:429
This file contains the declarations for the subclasses of Constant, which represent the different fla...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:370
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:576
Represent the analysis usage information of a pass.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:732
The AMDGPU TargetMachine interface definition for hw codegen targets.
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
The access may modify the value stored in memory.
void setPreservesAll()
Set by analyses that do not transform their input at all.
iterator_range< user_iterator > users()
Definition: Value.h:419
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:223
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:106
#define I(x, y, z)
Definition: MD5.cpp:58
ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...
Definition: Pass.h:224
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:332
LLVM Value Representation.
Definition: Value.h:73
uint64_t getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type...
Definition: DataLayout.h:445
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset...
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.h:333
INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE, "AMDGPU IR optimizations", false, false) INITIALIZE_PASS_END(AMDGPULowerKernelAttributes
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:48
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1074
Statically lint checks LLVM IR
Definition: Lint.cpp:192
const BasicBlock * getParent() const
Definition: Instruction.h:66
CmpClass_match< LHS, RHS, ICmpInst, ICmpInst::Predicate > m_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R)