Line data Source code
1 : //===-- AMDGPULowerKernelArguments.cpp ------------------------------------------===//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 : //
10 : /// \file This pass replaces accesses to kernel arguments with loads from
11 : /// offsets from the kernarg base pointer.
12 : //
13 : //===----------------------------------------------------------------------===//
14 :
15 : #include "AMDGPU.h"
16 : #include "AMDGPUSubtarget.h"
17 : #include "AMDGPUTargetMachine.h"
18 : #include "llvm/ADT/StringRef.h"
19 : #include "llvm/Analysis/Loads.h"
20 : #include "llvm/CodeGen/Passes.h"
21 : #include "llvm/CodeGen/TargetPassConfig.h"
22 : #include "llvm/IR/Attributes.h"
23 : #include "llvm/IR/BasicBlock.h"
24 : #include "llvm/IR/Constants.h"
25 : #include "llvm/IR/DerivedTypes.h"
26 : #include "llvm/IR/Function.h"
27 : #include "llvm/IR/IRBuilder.h"
28 : #include "llvm/IR/InstrTypes.h"
29 : #include "llvm/IR/Instruction.h"
30 : #include "llvm/IR/Instructions.h"
31 : #include "llvm/IR/LLVMContext.h"
32 : #include "llvm/IR/MDBuilder.h"
33 : #include "llvm/IR/Metadata.h"
34 : #include "llvm/IR/Operator.h"
35 : #include "llvm/IR/Type.h"
36 : #include "llvm/IR/Value.h"
37 : #include "llvm/Pass.h"
38 : #include "llvm/Support/Casting.h"
39 :
40 : #define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
41 :
42 : using namespace llvm;
43 :
44 : namespace {
45 :
46 : class AMDGPULowerKernelArguments : public FunctionPass{
47 : public:
48 : static char ID;
49 :
50 1964 : AMDGPULowerKernelArguments() : FunctionPass(ID) {}
51 :
52 : bool runOnFunction(Function &F) override;
53 :
54 1948 : void getAnalysisUsage(AnalysisUsage &AU) const override {
55 : AU.addRequired<TargetPassConfig>();
56 : AU.setPreservesAll();
57 1948 : }
58 : };
59 :
60 : } // end anonymous namespace
61 :
62 19424 : bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
63 : CallingConv::ID CC = F.getCallingConv();
64 19424 : if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
65 : return false;
66 :
67 14857 : auto &TPC = getAnalysis<TargetPassConfig>();
68 :
69 14857 : const TargetMachine &TM = TPC.getTM<TargetMachine>();
70 : const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
71 14857 : LLVMContext &Ctx = F.getParent()->getContext();
72 14857 : const DataLayout &DL = F.getParent()->getDataLayout();
73 : BasicBlock &EntryBlock = *F.begin();
74 14857 : IRBuilder<> Builder(&*EntryBlock.begin());
75 :
76 14857 : const unsigned KernArgBaseAlign = 16; // FIXME: Increase if necessary
77 14857 : const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F);
78 :
79 : unsigned MaxAlign;
80 : // FIXME: Alignment is broken broken with explicit arg offset.;
81 14857 : const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
82 14857 : if (TotalKernArgSize == 0)
83 : return false;
84 :
85 : CallInst *KernArgSegment =
86 29710 : Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, {}, {},
87 14855 : nullptr, F.getName() + ".kernarg.segment");
88 :
89 14855 : KernArgSegment->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
90 14855 : KernArgSegment->addAttribute(AttributeList::ReturnIndex,
91 : Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
92 :
93 14855 : unsigned AS = KernArgSegment->getType()->getPointerAddressSpace();
94 : uint64_t ExplicitArgOffset = 0;
95 :
96 49194 : for (Argument &Arg : F.args()) {
97 34339 : Type *ArgTy = Arg.getType();
98 34339 : unsigned Align = DL.getABITypeAlignment(ArgTy);
99 34339 : unsigned Size = DL.getTypeSizeInBits(ArgTy);
100 34339 : unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
101 :
102 34339 : uint64_t EltOffset = alignTo(ExplicitArgOffset, Align) + BaseOffset;
103 34339 : ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
104 :
105 34339 : if (Arg.use_empty())
106 3960 : continue;
107 :
108 : if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
109 : // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
110 : // modes on SI to know the high bits are 0 so pointer adds don't wrap. We
111 : // can't represent this with range metadata because it's only allowed for
112 : // integer types.
113 23773 : if (PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
114 2627 : ST.getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
115 : continue;
116 :
117 : // FIXME: We can replace this with equivalent alias.scope/noalias
118 : // metadata, but this appears to be a lot of work.
119 23131 : if (Arg.hasNoAliasAttr())
120 : continue;
121 : }
122 :
123 : VectorType *VT = dyn_cast<VectorType>(ArgTy);
124 1506 : bool IsV3 = VT && VT->getNumElements() == 3;
125 : VectorType *V4Ty = nullptr;
126 :
127 : int64_t AlignDownOffset = alignDown(EltOffset, 4);
128 30379 : int64_t OffsetDiff = EltOffset - AlignDownOffset;
129 30379 : unsigned AdjustedAlign = MinAlign(KernArgBaseAlign, AlignDownOffset);
130 :
131 : Value *ArgPtr;
132 30379 : if (Size < 32 && !ArgTy->isAggregateType()) { // FIXME: Handle aggregate types
133 : // Since we don't have sub-dword scalar loads, avoid doing an extload by
134 : // loading earlier than the argument address, and extracting the relevant
135 : // bits.
136 : //
137 : // Additionally widen any sub-dword load to i32 even if suitably aligned,
138 : // so that CSE between different argument loads works easily.
139 :
140 653 : ArgPtr = Builder.CreateConstInBoundsGEP1_64(
141 : KernArgSegment,
142 : AlignDownOffset,
143 653 : Arg.getName() + ".kernarg.offset.align.down");
144 : ArgPtr = Builder.CreateBitCast(ArgPtr,
145 653 : Builder.getInt32Ty()->getPointerTo(AS),
146 1306 : ArgPtr->getName() + ".cast");
147 : } else {
148 29726 : ArgPtr = Builder.CreateConstInBoundsGEP1_64(
149 : KernArgSegment,
150 : AlignDownOffset,
151 29726 : Arg.getName() + ".kernarg.offset");
152 29726 : ArgPtr = Builder.CreateBitCast(ArgPtr, ArgTy->getPointerTo(AS),
153 59452 : ArgPtr->getName() + ".cast");
154 : }
155 :
156 30379 : if (IsV3 && Size >= 32) {
157 210 : V4Ty = VectorType::get(VT->getVectorElementType(), 4);
158 : // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
159 210 : ArgPtr = Builder.CreateBitCast(ArgPtr, V4Ty->getPointerTo(AS));
160 : }
161 :
162 30379 : LoadInst *Load = Builder.CreateAlignedLoad(ArgPtr, AdjustedAlign);
163 30379 : Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));
164 :
165 : MDBuilder MDB(Ctx);
166 :
167 30379 : if (isa<PointerType>(ArgTy)) {
168 21885 : if (Arg.hasNonNullAttr())
169 4 : Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {}));
170 :
171 21885 : uint64_t DerefBytes = Arg.getDereferenceableBytes();
172 21885 : if (DerefBytes != 0) {
173 4 : Load->setMetadata(
174 : LLVMContext::MD_dereferenceable,
175 : MDNode::get(Ctx,
176 8 : MDB.createConstant(
177 4 : ConstantInt::get(Builder.getInt64Ty(), DerefBytes))));
178 : }
179 :
180 21885 : uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
181 21885 : if (DerefOrNullBytes != 0) {
182 2 : Load->setMetadata(
183 : LLVMContext::MD_dereferenceable_or_null,
184 : MDNode::get(Ctx,
185 4 : MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
186 : DerefOrNullBytes))));
187 : }
188 :
189 21885 : unsigned ParamAlign = Arg.getParamAlignment();
190 21885 : if (ParamAlign != 0) {
191 2 : Load->setMetadata(
192 : LLVMContext::MD_align,
193 : MDNode::get(Ctx,
194 4 : MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
195 : ParamAlign))));
196 : }
197 : }
198 :
199 : // TODO: Convert noalias arg to !noalias
200 :
201 30379 : if (Size < 32 && !ArgTy->isAggregateType()) {
202 653 : Value *ExtractBits = OffsetDiff == 0 ?
203 111 : Load : Builder.CreateLShr(Load, OffsetDiff * 8);
204 :
205 653 : IntegerType *ArgIntTy = Builder.getIntNTy(Size);
206 653 : Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy);
207 : Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy,
208 653 : Arg.getName() + ".load");
209 653 : Arg.replaceAllUsesWith(NewVal);
210 29726 : } else if (IsV3) {
211 105 : Value *Shuf = Builder.CreateShuffleVector(Load, UndefValue::get(V4Ty),
212 : {0, 1, 2},
213 105 : Arg.getName() + ".load");
214 105 : Arg.replaceAllUsesWith(Shuf);
215 : } else {
216 29621 : Load->setName(Arg.getName() + ".load");
217 29621 : Arg.replaceAllUsesWith(Load);
218 : }
219 : }
220 :
221 14855 : KernArgSegment->addAttribute(
222 : AttributeList::ReturnIndex,
223 14855 : Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
224 :
225 14855 : return true;
226 : }
227 :
228 85105 : INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE,
229 : "AMDGPU Lower Kernel Arguments", false, false)
230 199024 : INITIALIZE_PASS_END(AMDGPULowerKernelArguments, DEBUG_TYPE, "AMDGPU Lower Kernel Arguments",
231 : false, false)
232 :
233 : char AMDGPULowerKernelArguments::ID = 0;
234 :
235 1962 : FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() {
236 1962 : return new AMDGPULowerKernelArguments();
237 : }
|