Line data Source code
1 : //===- AMDGPUOpenCLEnqueuedBlockLowering.cpp - Lower enqueued block -------===//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 : //
10 : // \file
11 : // This post-linking pass replaces the function pointer of enqueued
12 : // block kernel with a global variable (runtime handle) and adds
13 : // "runtime-handle" attribute to the enqueued block kernel.
14 : //
15 : // In LLVM CodeGen the runtime-handle metadata will be translated to
16 : // RuntimeHandle metadata in the code object. The runtime allocates a global
17 : // buffer for each kernel with RuntimeHandle metadata and saves the kernel
18 : // address required for the AQL packet into the buffer. The __enqueue_kernel
19 : // function in the device library knows that the invoke function pointer in
20 : // the block literal is actually a runtime handle; it loads the kernel address
21 : // from it and puts it into the AQL packet for dispatching.
22 : //
23 : // This cannot be done in FE since FE cannot create a unique global variable
24 : // with external linkage across LLVM modules. The global variable with internal
25 : // linkage does not work since optimization passes will try to replace loads
26 : // of the global variable with its initialization value.
27 : //
28 : // It also identifies the kernels that directly or indirectly enqueue kernels
29 : // and adds "calls-enqueue-kernel" function attribute to them, which will
30 : // be used to determine whether to emit runtime metadata for the kernel
31 : // enqueue related hidden kernel arguments.
32 : //
33 : //===----------------------------------------------------------------------===//
34 :
35 : #include "AMDGPU.h"
36 : #include "llvm/ADT/DenseSet.h"
37 : #include "llvm/ADT/StringRef.h"
38 : #include "llvm/IR/Constants.h"
39 : #include "llvm/IR/DerivedTypes.h"
40 : #include "llvm/IR/Instructions.h"
41 : #include "llvm/IR/Mangler.h"
42 : #include "llvm/IR/Module.h"
43 : #include "llvm/IR/User.h"
44 : #include "llvm/Pass.h"
45 : #include "llvm/Support/Debug.h"
46 : #include "llvm/Support/raw_ostream.h"
47 :
48 : #define DEBUG_TYPE "amdgpu-lower-enqueued-block"
49 :
50 : using namespace llvm;
51 :
52 : namespace {
53 :
54 : /// Lower enqueued blocks.
55 : class AMDGPUOpenCLEnqueuedBlockLowering : public ModulePass {
56 : public:
57 : static char ID;
58 :
59 2247 : explicit AMDGPUOpenCLEnqueuedBlockLowering() : ModulePass(ID) {}
60 :
61 : private:
62 : bool runOnModule(Module &M) override;
63 : };
64 :
65 : } // end anonymous namespace
66 :
67 : char AMDGPUOpenCLEnqueuedBlockLowering::ID = 0;
68 :
69 : char &llvm::AMDGPUOpenCLEnqueuedBlockLoweringID =
70 : AMDGPUOpenCLEnqueuedBlockLowering::ID;
71 :
72 199024 : INITIALIZE_PASS(AMDGPUOpenCLEnqueuedBlockLowering, DEBUG_TYPE,
73 : "Lower OpenCL enqueued blocks", false, false)
74 :
75 2246 : ModulePass* llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() {
76 2246 : return new AMDGPUOpenCLEnqueuedBlockLowering();
77 : }
78 :
79 : /// Collect direct or indrect callers of \p F and save them
80 : /// to \p Callers.
81 3 : static void collectCallers(Function *F, DenseSet<Function *> &Callers) {
82 4 : for (auto U : F->users()) {
83 : if (auto *CI = dyn_cast<CallInst>(&*U)) {
84 1 : auto *Caller = CI->getParent()->getParent();
85 1 : if (Callers.insert(Caller).second)
86 1 : collectCallers(Caller, Callers);
87 : }
88 : }
89 3 : }
90 :
91 : /// If \p U is instruction or constant, collect functions which directly or
92 : /// indirectly use it.
93 12 : static void collectFunctionUsers(User *U, DenseSet<Function *> &Funcs) {
94 : if (auto *I = dyn_cast<Instruction>(U)) {
95 6 : auto *F = I->getParent()->getParent();
96 6 : if (Funcs.insert(F).second)
97 2 : collectCallers(F, Funcs);
98 : return;
99 : }
100 6 : if (!isa<Constant>(U))
101 : return;
102 13 : for (auto UU : U->users())
103 7 : collectFunctionUsers(&*UU, Funcs);
104 : }
105 :
106 2229 : bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
107 : DenseSet<Function *> Callers;
108 2229 : auto &C = M.getContext();
109 : bool Changed = false;
110 28223 : for (auto &F : M.functions()) {
111 25994 : if (F.hasFnAttribute("enqueued-block")) {
112 4 : if (!F.hasName()) {
113 : SmallString<64> Name;
114 2 : Mangler::getNameWithPrefix(Name, "__amdgpu_enqueued_kernel",
115 : M.getDataLayout());
116 4 : F.setName(Name);
117 : }
118 : LLVM_DEBUG(dbgs() << "found enqueued kernel: " << F.getName() << '\n');
119 4 : auto RuntimeHandle = (F.getName() + ".runtime_handle").str();
120 4 : auto T = ArrayType::get(Type::getInt64Ty(C), 2);
121 : auto *GV = new GlobalVariable(
122 : M, T,
123 : /*IsConstant=*/false, GlobalValue::ExternalLinkage,
124 4 : /*Initializer=*/Constant::getNullValue(T), RuntimeHandle,
125 : /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
126 : AMDGPUAS::GLOBAL_ADDRESS,
127 4 : /*IsExternallyInitialized=*/false);
128 : LLVM_DEBUG(dbgs() << "runtime handle created: " << *GV << '\n');
129 :
130 9 : for (auto U : F.users()) {
131 : auto *UU = &*U;
132 5 : if (!isa<ConstantExpr>(UU))
133 : continue;
134 5 : collectFunctionUsers(UU, Callers);
135 : auto *BitCast = cast<ConstantExpr>(UU);
136 5 : auto *NewPtr = ConstantExpr::getPointerCast(GV, BitCast->getType());
137 5 : BitCast->replaceAllUsesWith(NewPtr);
138 5 : F.addFnAttr("runtime-handle", RuntimeHandle);
139 : F.setLinkage(GlobalValue::ExternalLinkage);
140 : Changed = true;
141 : }
142 : }
143 : }
144 :
145 2232 : for (auto F : Callers) {
146 3 : if (F->getCallingConv() != CallingConv::AMDGPU_KERNEL)
147 : continue;
148 6 : F->addFnAttr("calls-enqueue-kernel");
149 : LLVM_DEBUG(dbgs() << "mark enqueue_kernel caller:" << F->getName() << '\n');
150 : }
151 2229 : return Changed;
152 : }
|