LLVM  13.0.0git
AMDGPUOpenCLEnqueuedBlockLowering.cpp
Go to the documentation of this file.
1 //===- AMDGPUOpenCLEnqueuedBlockLowering.cpp - Lower enqueued block -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // This post-linking pass replaces the function pointer of enqueued
11 // block kernel with a global variable (runtime handle) and adds
12 // "runtime-handle" attribute to the enqueued block kernel.
13 //
14 // In LLVM CodeGen the runtime-handle metadata will be translated to
15 // RuntimeHandle metadata in code object. Runtime allocates a global buffer
16 // for each kernel with RuntimeHandle metadata and saves the kernel address
17 // required for the AQL packet into the buffer. The __enqueue_kernel function
18 // in the device library knows that the invoke function pointer in the block
19 // literal is actually a runtime handle, loads the kernel address from it,
20 // and puts it into the AQL packet for dispatching.
21 //
22 // This cannot be done in FE since FE cannot create a unique global variable
23 // with external linkage across LLVM modules. The global variable with internal
24 // linkage does not work since optimization passes will try to replace loads
25 // of the global variable with its initialization value.
26 //
27 // It also identifies the kernels that directly or indirectly enqueue kernels
28 // and adds "calls-enqueue-kernel" function attribute to them, which will
29 // be used to determine whether to emit runtime metadata for the kernel
30 // enqueue related hidden kernel arguments.
31 //
32 //===----------------------------------------------------------------------===//
33 
34 #include "AMDGPU.h"
35 #include "llvm/ADT/DenseSet.h"
36 #include "llvm/ADT/SmallString.h"
37 #include "llvm/IR/Instructions.h"
38 #include "llvm/IR/Mangler.h"
39 #include "llvm/IR/Module.h"
40 #include "llvm/Pass.h"
41 #include "llvm/Support/Debug.h"
42 
43 #define DEBUG_TYPE "amdgpu-lower-enqueued-block"
44 
45 using namespace llvm;
46 
47 namespace {
48 
49 /// Lower enqueued blocks.
50 class AMDGPUOpenCLEnqueuedBlockLowering : public ModulePass {
51 public:
52  static char ID;
53 
54  explicit AMDGPUOpenCLEnqueuedBlockLowering() : ModulePass(ID) {}
55 
56 private:
57  bool runOnModule(Module &M) override;
58 };
59 
60 } // end anonymous namespace
61 
63 
66 
67 INITIALIZE_PASS(AMDGPUOpenCLEnqueuedBlockLowering, DEBUG_TYPE,
68  "Lower OpenCL enqueued blocks", false, false)
69 
71  return new AMDGPUOpenCLEnqueuedBlockLowering();
72 }
73 
74 /// Collect direct or indrect callers of \p F and save them
75 /// to \p Callers.
76 static void collectCallers(Function *F, DenseSet<Function *> &Callers) {
77  for (auto U : F->users()) {
78  if (auto *CI = dyn_cast<CallInst>(&*U)) {
79  auto *Caller = CI->getParent()->getParent();
80  if (Callers.insert(Caller).second)
81  collectCallers(Caller, Callers);
82  }
83  }
84 }
85 
86 /// If \p U is instruction or constant, collect functions which directly or
87 /// indirectly use it.
89  if (auto *I = dyn_cast<Instruction>(U)) {
90  auto *F = I->getParent()->getParent();
91  if (Funcs.insert(F).second)
92  collectCallers(F, Funcs);
93  return;
94  }
95  if (!isa<Constant>(U))
96  return;
97  for (auto UU : U->users())
98  collectFunctionUsers(&*UU, Funcs);
99 }
100 
101 bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
102  DenseSet<Function *> Callers;
103  auto &C = M.getContext();
104  bool Changed = false;
105  for (auto &F : M.functions()) {
106  if (F.hasFnAttribute("enqueued-block")) {
107  if (!F.hasName()) {
109  Mangler::getNameWithPrefix(Name, "__amdgpu_enqueued_kernel",
110  M.getDataLayout());
111  F.setName(Name);
112  }
113  LLVM_DEBUG(dbgs() << "found enqueued kernel: " << F.getName() << '\n');
114  auto RuntimeHandle = (F.getName() + ".runtime_handle").str();
115  auto T = ArrayType::get(Type::getInt64Ty(C), 2);
116  auto *GV = new GlobalVariable(
117  M, T,
118  /*isConstant=*/false, GlobalValue::ExternalLinkage,
119  /*Initializer=*/Constant::getNullValue(T), RuntimeHandle,
120  /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
122  /*isExternallyInitialized=*/false);
123  LLVM_DEBUG(dbgs() << "runtime handle created: " << *GV << '\n');
124 
125  for (auto U : F.users()) {
126  auto *UU = &*U;
127  if (!isa<ConstantExpr>(UU))
128  continue;
129  collectFunctionUsers(UU, Callers);
130  auto *BitCast = cast<ConstantExpr>(UU);
131  auto *NewPtr = ConstantExpr::getPointerCast(GV, BitCast->getType());
132  BitCast->replaceAllUsesWith(NewPtr);
133  F.addFnAttr("runtime-handle", RuntimeHandle);
134  F.setLinkage(GlobalValue::ExternalLinkage);
135  Changed = true;
136  }
137  }
138  }
139 
140  for (auto F : Callers) {
141  if (F->getCallingConv() != CallingConv::AMDGPU_KERNEL)
142  continue;
143  F->addFnAttr("calls-enqueue-kernel");
144  LLVM_DEBUG(dbgs() << "mark enqueue_kernel caller:" << F->getName() << '\n');
145  }
146  return Changed;
147 }
llvm
Definition: AllocatorList.h:23
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::ModulePass
ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...
Definition: Pass.h:238
llvm::Function
Definition: Function.h:61
Pass.h
llvm::GlobalValue::NotThreadLocal
@ NotThreadLocal
Definition: GlobalValue.h:179
llvm::GlobalVariable
Definition: GlobalVariable.h:40
Module.h
T
#define T
Definition: Mips16ISelLowering.cpp:341
llvm::detail::DenseSetImpl< ValueT, DenseMap< ValueT, detail::DenseSetEmpty, DenseMapInfo< ValueT >, detail::DenseSetPair< ValueT > >, DenseMapInfo< ValueT > >::insert
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:122
F
#define F(x, y, z)
Definition: MD5.cpp:56
llvm::ConstantExpr::getPointerCast
static Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
Definition: Constants.cpp:2032
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
SmallString.h
llvm::User
Definition: User.h:44
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
DenseSet.h
llvm::AMDGPUAS::GLOBAL_ADDRESS
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
Definition: AMDGPU.h:360
llvm::Mangler::getNameWithPrefix
void getNameWithPrefix(raw_ostream &OS, const GlobalValue *GV, bool CannotUsePrivateLabel) const
Print the appropriate prefix and the specified global variable's name.
Definition: Mangler.cpp:114
llvm::AMDGPUOpenCLEnqueuedBlockLoweringID
char & AMDGPUOpenCLEnqueuedBlockLoweringID
Definition: AMDGPUOpenCLEnqueuedBlockLowering.cpp:64
collectCallers
static void collectCallers(Function *F, DenseSet< Function * > &Callers)
Collect direct or indrect callers of F and save them to Callers.
Definition: AMDGPUOpenCLEnqueuedBlockLowering.cpp:76
llvm::SmallString< 64 >
llvm::DenseSet
Implements a dense probed hash-table based set.
Definition: DenseSet.h:268
I
#define I(x, y, z)
Definition: MD5.cpp:59
llvm::ArrayType::get
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
Definition: Type.cpp:598
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass
ModulePass * createAMDGPUOpenCLEnqueuedBlockLoweringPass()
INITIALIZE_PASS
INITIALIZE_PASS(AMDGPUOpenCLEnqueuedBlockLowering, DEBUG_TYPE, "Lower OpenCL enqueued blocks", false, false) ModulePass *llvm
Definition: AMDGPUOpenCLEnqueuedBlockLowering.cpp:67
Mangler.h
AMDGPU.h
DEBUG_TYPE
#define DEBUG_TYPE
Definition: AMDGPUOpenCLEnqueuedBlockLowering.cpp:43
llvm::Type::getInt64Ty
static IntegerType * getInt64Ty(LLVMContext &C)
Definition: Type.cpp:198
llvm::GraphProgram::Name
Name
Definition: GraphWriter.h:52
llvm::Constant::getNullValue
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:346
llvm::AMDGPU::HSAMD::Kernel::Attrs::Key::RuntimeHandle
constexpr char RuntimeHandle[]
Key for Kernel::Attr::Metadata::mRuntimeHandle.
Definition: AMDGPUMetadata.h:129
llvm::GlobalValue::ExternalLinkage
@ ExternalLinkage
Externally visible function.
Definition: GlobalValue.h:48
Instructions.h
llvm::CallingConv::AMDGPU_KERNEL
@ AMDGPU_KERNEL
Calling convention for AMDGPU code object kernels.
Definition: CallingConv.h:216
Debug.h
llvm::Value::users
iterator_range< user_iterator > users()
Definition: Value.h:422
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38
collectFunctionUsers
static void collectFunctionUsers(User *U, DenseSet< Function * > &Funcs)
If U is instruction or constant, collect functions which directly or indirectly use it.
Definition: AMDGPUOpenCLEnqueuedBlockLowering.cpp:88