Line data Source code
1 : //===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 : //
10 : // This pass eliminates allocas by either converting them into vectors or
11 : // by migrating them to local address space.
12 : //
13 : //===----------------------------------------------------------------------===//
14 :
15 : #include "AMDGPU.h"
16 : #include "AMDGPUSubtarget.h"
17 : #include "Utils/AMDGPUBaseInfo.h"
18 : #include "llvm/ADT/APInt.h"
19 : #include "llvm/ADT/None.h"
20 : #include "llvm/ADT/STLExtras.h"
21 : #include "llvm/ADT/StringRef.h"
22 : #include "llvm/ADT/Triple.h"
23 : #include "llvm/ADT/Twine.h"
24 : #include "llvm/Analysis/CaptureTracking.h"
25 : #include "llvm/Analysis/ValueTracking.h"
26 : #include "llvm/CodeGen/TargetPassConfig.h"
27 : #include "llvm/IR/Attributes.h"
28 : #include "llvm/IR/BasicBlock.h"
29 : #include "llvm/IR/Constant.h"
30 : #include "llvm/IR/Constants.h"
31 : #include "llvm/IR/DataLayout.h"
32 : #include "llvm/IR/DerivedTypes.h"
33 : #include "llvm/IR/Function.h"
34 : #include "llvm/IR/GlobalValue.h"
35 : #include "llvm/IR/GlobalVariable.h"
36 : #include "llvm/IR/IRBuilder.h"
37 : #include "llvm/IR/Instruction.h"
38 : #include "llvm/IR/Instructions.h"
39 : #include "llvm/IR/IntrinsicInst.h"
40 : #include "llvm/IR/Intrinsics.h"
41 : #include "llvm/IR/LLVMContext.h"
42 : #include "llvm/IR/Metadata.h"
43 : #include "llvm/IR/Module.h"
44 : #include "llvm/IR/Type.h"
45 : #include "llvm/IR/User.h"
46 : #include "llvm/IR/Value.h"
47 : #include "llvm/Pass.h"
48 : #include "llvm/Support/Casting.h"
49 : #include "llvm/Support/Debug.h"
50 : #include "llvm/Support/ErrorHandling.h"
51 : #include "llvm/Support/MathExtras.h"
52 : #include "llvm/Support/raw_ostream.h"
53 : #include "llvm/Target/TargetMachine.h"
54 : #include <algorithm>
55 : #include <cassert>
56 : #include <cstdint>
57 : #include <map>
58 : #include <tuple>
59 : #include <utility>
60 : #include <vector>
61 :
62 : #define DEBUG_TYPE "amdgpu-promote-alloca"
63 :
64 : using namespace llvm;
65 :
66 : namespace {
67 :
68 : static cl::opt<bool> DisablePromoteAllocaToVector(
69 : "disable-promote-alloca-to-vector",
70 : cl::desc("Disable promote alloca to vector"),
71 : cl::init(false));
72 :
73 : // FIXME: This can create globals so should be a module pass.
74 : class AMDGPUPromoteAlloca : public FunctionPass {
75 : private:
76 : const TargetMachine *TM;
77 : Module *Mod = nullptr;
78 : const DataLayout *DL = nullptr;
79 :
80 : // FIXME: This should be per-kernel.
81 : uint32_t LocalMemLimit = 0;
82 : uint32_t CurrentLocalMemUsage = 0;
83 :
84 : bool IsAMDGCN = false;
85 : bool IsAMDHSA = false;
86 :
87 : std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
88 : Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
89 :
90 : /// BaseAlloca is the alloca root the search started from.
91 : /// Val may be that alloca or a recursive user of it.
92 : bool collectUsesWithPtrTypes(Value *BaseAlloca,
93 : Value *Val,
94 : std::vector<Value*> &WorkList) const;
95 :
96 : /// Val is a pointer derived from Alloca. OpIdx0/OpIdx1 are the operand
97 : /// indices of an instruction with two pointer inputs (e.g. select, icmp).
98 : /// Returns true if both operands are derived from the same alloca. Val should
99 : /// be the same value as one of the input operands of UseInst.
100 : bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
101 : Instruction *UseInst,
102 : int OpIdx0, int OpIdx1) const;
103 :
104 : /// Check whether we have enough local memory for promotion.
105 : bool hasSufficientLocalMem(const Function &F);
106 :
107 : public:
108 : static char ID;
109 :
110 4442 : AMDGPUPromoteAlloca() : FunctionPass(ID) {}
111 :
112 : bool doInitialization(Module &M) override;
113 : bool runOnFunction(Function &F) override;
114 :
115 0 : StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
116 :
117 : bool handleAlloca(AllocaInst &I, bool SufficientLDS);
118 :
119 2204 : void getAnalysisUsage(AnalysisUsage &AU) const override {
120 2204 : AU.setPreservesCFG();
121 2204 : FunctionPass::getAnalysisUsage(AU);
122 2204 : }
123 : };
124 :
125 : } // end anonymous namespace
126 :
127 : char AMDGPUPromoteAlloca::ID = 0;
128 :
129 199024 : INITIALIZE_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
130 : "AMDGPU promote alloca to vector or LDS", false, false)
131 :
132 : char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
133 :
134 2203 : bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
135 2203 : Mod = &M;
136 2203 : DL = &Mod->getDataLayout();
137 :
138 2203 : return false;
139 : }
140 :
141 21577 : bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
142 21577 : if (skipFunction(F))
143 : return false;
144 :
145 21574 : if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
146 21574 : TM = &TPC->getTM<TargetMachine>();
147 : else
148 : return false;
149 :
150 : const Triple &TT = TM->getTargetTriple();
151 21574 : IsAMDGCN = TT.getArch() == Triple::amdgcn;
152 21574 : IsAMDHSA = TT.getOS() == Triple::AMDHSA;
153 :
154 21574 : const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
155 21574 : if (!ST.isPromoteAllocaEnabled())
156 : return false;
157 :
158 20989 : bool SufficientLDS = hasSufficientLocalMem(F);
159 : bool Changed = false;
160 : BasicBlock &EntryBB = *F.begin();
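  // Only the entry block needs to be scanned: handleAlloca only accepts static
  // allocas, and static allocas always live in the entry block.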
161 150821 : for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) {
162 : AllocaInst *AI = dyn_cast<AllocaInst>(I);
163 :
164 : ++I;
165 129832 : if (AI)
166 544 : Changed |= handleAlloca(*AI, SufficientLDS);
167 : }
168 :
169 : return Changed;
170 : }
171 :
172 : std::pair<Value *, Value *>
173 165 : AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
174 165 : const Function &F = *Builder.GetInsertBlock()->getParent();
175 165 : const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
176 :
177 165 : if (!IsAMDHSA) {
178 : Function *LocalSizeYFn
179 99 : = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
180 : Function *LocalSizeZFn
181 99 : = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);
182 :
183 99 : CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
184 99 : CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
185 :
186 99 : ST.makeLIDRangeMetadata(LocalSizeY);
187 99 : ST.makeLIDRangeMetadata(LocalSizeZ);
188 :
189 99 : return std::make_pair(LocalSizeY, LocalSizeZ);
190 : }
191 :
192 : // We must read the size out of the dispatch pointer.
193 : assert(IsAMDGCN);
194 :
195 : // We are indexing into this struct, and want to extract the workgroup_size_*
196 : // fields.
197 : //
198 : // typedef struct hsa_kernel_dispatch_packet_s {
199 : // uint16_t header;
200 : // uint16_t setup;
201 : // uint16_t workgroup_size_x;
202 : // uint16_t workgroup_size_y;
203 : // uint16_t workgroup_size_z;
204 : // uint16_t reserved0;
205 : // uint32_t grid_size_x;
206 : // uint32_t grid_size_y;
207 : // uint32_t grid_size_z;
208 : //
209 : // uint32_t private_segment_size;
210 : // uint32_t group_segment_size;
211 : // uint64_t kernel_object;
212 : //
213 : // #ifdef HSA_LARGE_MODEL
214 : // void *kernarg_address;
215 : // #elif defined HSA_LITTLE_ENDIAN
216 : // void *kernarg_address;
217 : // uint32_t reserved1;
218 : // #else
219 : // uint32_t reserved1;
220 : // void *kernarg_address;
221 : // #endif
222 : // uint64_t reserved2;
223 : // hsa_signal_t completion_signal; // uint64_t wrapper
224 : // } hsa_kernel_dispatch_packet_t
225 : //
226 : Function *DispatchPtrFn
227 66 : = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
228 :
229 66 : CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
230 66 : DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
231 66 : DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
232 :
233 : // Size of the dispatch packet struct.
234 66 : DispatchPtr->addDereferenceableAttr(AttributeList::ReturnIndex, 64);
235 :
236 66 : Type *I32Ty = Type::getInt32Ty(Mod->getContext());
237 66 : Value *CastDispatchPtr = Builder.CreateBitCast(
238 66 : DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));
239 :
240 : // We could do a single 64-bit load here, but it's likely that the basic
241 : // 32-bit load and extract sequence is already present, and it is probably
242 : // easier to CSE this. The loads should be mergeable later anyway.
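  // With the dispatch pointer viewed as an i32 array, index 1 is byte offset 4
  // (workgroup_size_x in the low 16 bits, workgroup_size_y in the high 16
  // bits) and index 2 is byte offset 8 (workgroup_size_z in the low 16 bits,
  // reserved0, which is zero, in the high bits).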
243 66 : Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 1);
244 66 : LoadInst *LoadXY = Builder.CreateAlignedLoad(GEPXY, 4);
245 :
246 66 : Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2);
247 66 : LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4);
248 :
249 66 : MDNode *MD = MDNode::get(Mod->getContext(), None);
250 66 : LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
251 66 : LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
252 66 : ST.makeLIDRangeMetadata(LoadZU);
253 :
254 : // Extract y component. Upper half of LoadZU should be zero already.
255 66 : Value *Y = Builder.CreateLShr(LoadXY, 16);
256 :
257 66 : return std::make_pair(Y, LoadZU);
258 : }
259 :
260 495 : Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
261 : const AMDGPUSubtarget &ST =
262 495 : AMDGPUSubtarget::get(*TM, *Builder.GetInsertBlock()->getParent());
263 : Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic;
264 :
265 495 : switch (N) {
266 165 : case 0:
267 165 : IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_x
268 : : Intrinsic::r600_read_tidig_x;
269 : break;
270 165 : case 1:
271 165 : IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_y
272 : : Intrinsic::r600_read_tidig_y;
273 : break;
274 :
275 165 : case 2:
276 165 : IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_z
277 : : Intrinsic::r600_read_tidig_z;
278 : break;
279 0 : default:
280 0 : llvm_unreachable("invalid dimension");
281 : }
282 :
283 495 : Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
284 495 : CallInst *CI = Builder.CreateCall(WorkitemIdFn);
285 495 : ST.makeLIDRangeMetadata(CI);
286 :
287 495 : return CI;
288 : }
289 :
290 : static VectorType *arrayTypeToVecType(ArrayType *ArrayTy) {
291 60 : return VectorType::get(ArrayTy->getElementType(),
292 60 : ArrayTy->getNumElements());
293 : }
294 :
295 : static Value *
296 : calculateVectorIndex(Value *Ptr,
297 : const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
298 211 : GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
299 :
300 : auto I = GEPIdx.find(GEP);
301 211 : return I == GEPIdx.end() ? nullptr : I->second;
302 : }
303 :
304 271 : static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
305 : // FIXME: We only support simple cases.
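  // Only a GEP with exactly two indices where the first is constant zero is
  // handled (a direct element access on the alloca's array type); the second
  // index then serves as the vector lane.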
306 271 : if (GEP->getNumOperands() != 3)
307 : return nullptr;
308 :
309 : ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
310 271 : if (!I0 || !I0->isZero())
311 : return nullptr;
312 :
313 : return GEP->getOperand(2);
314 : }
315 :
316 : // Returns true if Inst is an instruction that the vector-promotion rewrite
317 : // below knows how to handle; any other user blocks the promotion.
318 : // TODO: Check isTriviallyVectorizable for calls and handle other
319 : // instructions.
320 288 : static bool canVectorizeInst(Instruction *Inst, User *User) {
321 288 : switch (Inst->getOpcode()) {
322 : case Instruction::Load: {
323 : // Currently we only handle the case where the pointer operand is a GEP.
324 : // Also, we cannot vectorize volatile or atomic loads.
325 : LoadInst *LI = cast<LoadInst>(Inst);
326 : return isa<GetElementPtrInst>(LI->getPointerOperand()) && LI->isSimple();
327 : }
328 : case Instruction::BitCast:
329 : return true;
330 : case Instruction::Store: {
331 : // The pointer must be the store's pointer operand, not a stored value, and,
332 : // since the IR should be in canonical form, the user should be a GEP.
333 : // Also, we cannot vectorize volatile or atomic stores.
334 : StoreInst *SI = cast<StoreInst>(Inst);
335 179 : return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && SI->isSimple();
336 : }
337 22 : default:
338 22 : return false;
339 : }
340 : }
341 :
342 529 : static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
343 :
344 529 : if (DisablePromoteAllocaToVector) {
345 : LLVM_DEBUG(dbgs() << " Promotion alloca to vector is disabled\n");
346 : return false;
347 : }
348 :
349 309 : ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());
350 :
351 : LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");
352 :
353 : // FIXME: There is no reason why we can't support larger arrays; we are just
354 : // being conservative for now.
355 : // FIXME: We also reject allocas of the form [2 x [2 x i32]] or equivalent.
356 : // These could potentially be promoted, but we don't currently handle that case.
357 198 : if (!AllocaTy ||
358 198 : AllocaTy->getNumElements() > 16 ||
359 143 : AllocaTy->getNumElements() < 2 ||
360 143 : !VectorType::isValidElementType(AllocaTy->getElementType())) {
361 : LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
362 193 : return false;
363 : }
364 :
365 : std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
366 : std::vector<Value*> WorkList;
367 346 : for (User *AllocaUser : Alloca->users()) {
368 286 : GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
369 286 : if (!GEP) {
370 15 : if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
371 56 : return false;
372 :
373 6 : WorkList.push_back(AllocaUser);
374 6 : continue;
375 : }
376 :
377 271 : Value *Index = GEPToVectorIndex(GEP);
378 :
379 : // If we can't compute a vector index from this GEP, then we can't
380 : // promote this alloca to vector.
381 271 : if (!Index) {
382 : LLVM_DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP
383 : << '\n');
384 : return false;
385 : }
386 :
387 270 : GEPVectorIdx[GEP] = Index;
388 497 : for (User *GEPUser : AllocaUser->users()) {
389 273 : if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
390 : return false;
391 :
392 227 : WorkList.push_back(GEPUser);
393 : }
394 : }
395 :
396 : VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
397 :
398 : LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> "
399 : << *VectorTy << '\n');
400 :
401 286 : for (Value *V : WorkList) {
402 : Instruction *Inst = cast<Instruction>(V);
403 226 : IRBuilder<> Builder(Inst);
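    // Each access is rewritten against the whole promoted vector: loads become
    // a full-vector load plus extractelement, stores become a full-vector
    // load, insertelement, and full-vector store, with the GEP index as the
    // vector lane.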
404 226 : switch (Inst->getOpcode()) {
405 63 : case Instruction::Load: {
406 63 : Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
407 : Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
408 : Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
409 :
410 63 : Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
411 63 : Value *VecValue = Builder.CreateLoad(BitCast);
412 63 : Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
413 63 : Inst->replaceAllUsesWith(ExtractElement);
414 63 : Inst->eraseFromParent();
415 63 : break;
416 : }
417 148 : case Instruction::Store: {
418 148 : Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
419 :
420 : StoreInst *SI = cast<StoreInst>(Inst);
421 : Value *Ptr = SI->getPointerOperand();
422 : Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
423 148 : Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
424 148 : Value *VecValue = Builder.CreateLoad(BitCast);
425 148 : Value *NewVecValue = Builder.CreateInsertElement(VecValue,
426 : SI->getValueOperand(),
427 : Index);
428 148 : Builder.CreateStore(NewVecValue, BitCast);
429 148 : Inst->eraseFromParent();
430 148 : break;
431 : }
432 : case Instruction::BitCast:
433 : case Instruction::AddrSpaceCast:
434 : break;
435 :
436 0 : default:
437 0 : llvm_unreachable("Inconsistency in instructions promotable to vector");
438 : }
439 : }
440 : return true;
441 : }
442 :
443 33 : static bool isCallPromotable(CallInst *CI) {
444 : IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
445 : if (!II)
446 : return false;
447 :
448 : switch (II->getIntrinsicID()) {
449 : case Intrinsic::memcpy:
450 : case Intrinsic::memmove:
451 : case Intrinsic::memset:
452 : case Intrinsic::lifetime_start:
453 : case Intrinsic::lifetime_end:
454 : case Intrinsic::invariant_start:
455 : case Intrinsic::invariant_end:
456 : case Intrinsic::launder_invariant_group:
457 : case Intrinsic::strip_invariant_group:
458 : case Intrinsic::objectsize:
459 : return true;
460 : default:
461 : return false;
462 : }
463 : }
464 :
465 0 : bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
466 : Value *Val,
467 : Instruction *Inst,
468 : int OpIdx0,
469 : int OpIdx1) const {
470 : // Figure out which operand is the one we might not be promoting.
471 0 : Value *OtherOp = Inst->getOperand(OpIdx0);
472 0 : if (Val == OtherOp)
473 0 : OtherOp = Inst->getOperand(OpIdx1);
474 :
475 0 : if (isa<ConstantPointerNull>(OtherOp))
476 0 : return true;
477 :
478 0 : Value *OtherObj = GetUnderlyingObject(OtherOp, *DL);
479 : if (!isa<AllocaInst>(OtherObj))
480 0 : return false;
481 :
482 : // TODO: We should be able to replace undefs with the right pointer type.
483 :
484 : // TODO: If we know the other base object is another promotable
485 : // alloca, not necessarily this alloca, we can do this. The
486 : // important part is both must have the same address space at
487 : // the end.
488 0 : if (OtherObj != BaseAlloca) {
489 : LLVM_DEBUG(
490 : dbgs() << "Found a binary instruction with another alloca object\n");
491 0 : return false;
492 : }
493 :
494 : return true;
495 : }
496 :
497 681 : bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
498 : Value *BaseAlloca,
499 : Value *Val,
500 : std::vector<Value*> &WorkList) const {
501 :
502 1559 : for (User *User : Val->users()) {
503 1026 : if (is_contained(WorkList, User))
504 : continue;
505 :
506 1017 : if (CallInst *CI = dyn_cast<CallInst>(User)) {
507 33 : if (!isCallPromotable(CI))
508 148 : return false;
509 :
510 11 : WorkList.push_back(User);
511 11 : continue;
512 : }
513 :
514 : Instruction *UseInst = cast<Instruction>(User);
515 984 : if (UseInst->getOpcode() == Instruction::PtrToInt)
516 : return false;
517 :
518 : if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
519 231 : if (LI->isVolatile())
520 : return false;
521 :
522 : continue;
523 : }
524 :
525 : if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
526 260 : if (SI->isVolatile())
527 : return false;
528 :
529 : // Reject if Val is the value being stored rather than the pointer operand.
530 253 : if (SI->getPointerOperand() != Val)
531 : return false;
532 : } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
533 2 : if (RMW->isVolatile())
534 : return false;
535 : } else if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
536 2 : if (CAS->isVolatile())
537 : return false;
538 : }
539 :
540 : // Only promote an icmp if we know the other compare operand is null or is
541 : // derived from the same alloca that is being promoted.
542 : if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
543 6 : if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
544 : return false;
545 :
546 : // May need to rewrite constant operands.
547 3 : WorkList.push_back(ICmp);
548 : }
549 :
550 729 : if (UseInst->getOpcode() == Instruction::AddrSpaceCast) {
551 : // Give up if the pointer may be captured.
552 4 : if (PointerMayBeCaptured(UseInst, true, true))
553 : return false;
554 : // Don't collect the users of this.
555 1 : WorkList.push_back(User);
556 1 : continue;
557 : }
558 :
559 1450 : if (!User->getType()->isPointerTy())
560 : continue;
561 :
562 : if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
563 : // Be conservative if an address could be computed outside the bounds of
564 : // the alloca.
565 440 : if (!GEP->isInBounds())
566 : return false;
567 : }
568 :
569 : // Only promote a select if we know that the other select operand is from
570 : // another pointer that will also be promoted.
571 : if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
572 11 : if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
573 : return false;
574 : }
575 :
576 : // Repeat for phis.
577 : if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
578 : // TODO: Handle more complex cases. We should be able to replace loops
579 : // over arrays.
580 7 : switch (Phi->getNumIncomingValues()) {
581 : case 1:
582 : break;
583 6 : case 2:
584 6 : if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
585 : return false;
586 : break;
587 : default:
588 : return false;
589 : }
590 : }
591 :
592 374 : WorkList.push_back(User);
593 374 : if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
594 : return false;
595 : }
596 :
597 533 : return true;
598 : }
599 :
600 20989 : bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
601 :
602 : FunctionType *FTy = F.getFunctionType();
603 20989 : const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
604 :
605 : // If the function has any arguments in the local address space, then it's
606 : // possible these arguments require the entire local memory space, so
607 : // we cannot use local memory in the pass.
608 66350 : for (Type *ParamTy : FTy->params()) {
609 : PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
610 27376 : if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
611 2019 : LocalMemLimit = 0;
612 : LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
613 : "local memory disabled.\n");
614 2019 : return false;
615 : }
616 : }
617 :
618 18970 : LocalMemLimit = ST.getLocalMemorySize();
619 18970 : if (LocalMemLimit == 0)
620 : return false;
621 :
622 18943 : const DataLayout &DL = Mod->getDataLayout();
623 :
624 : // Check how much local memory is being used by global objects
625 18943 : CurrentLocalMemUsage = 0;
626 22141 : for (GlobalVariable &GV : Mod->globals()) {
627 3198 : if (GV.getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
628 : continue;
629 :
630 9017 : for (const User *U : GV.users()) {
631 : const Instruction *Use = dyn_cast<Instruction>(U);
632 : if (!Use)
633 : continue;
634 :
635 6016 : if (Use->getParent()->getParent() == &F) {
636 : unsigned Align = GV.getAlignment();
637 312 : if (Align == 0)
638 20 : Align = DL.getABITypeAlignment(GV.getValueType());
639 :
640 : // FIXME: Try to account for padding here. The padding is currently
641 : // determined from the inverse order of uses in the function. I'm not
642 : // sure if the use list order is in any way connected to this, so the
643 : // total reported size is likely incorrect.
644 312 : uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
645 312 : CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
646 312 : CurrentLocalMemUsage += AllocSize;
647 312 : break;
648 : }
649 : }
650 : }
651 :
652 18943 : unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
653 18943 : F);
654 :
655 : // Restrict local memory usage so that we don't drastically reduce occupancy,
656 : // unless it is already significantly reduced.
657 :
658 : // TODO: Have some sort of hint or other heuristics to guess occupancy based
659 : // on other factors.
660 18943 : unsigned OccupancyHint = ST.getWavesPerEU(F).second;
661 18943 : if (OccupancyHint == 0)
662 0 : OccupancyHint = 7;
663 :
664 : // Clamp to max value.
665 18943 : OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
666 :
667 : // Check the hint but ignore it if it's obviously wrong from the existing LDS
668 : // usage.
669 18943 : MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
670 :
671 :
672 : // Round up to the next tier of usage.
673 : unsigned MaxSizeWithWaveCount
674 18943 : = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
675 :
676 : // Program is possibly broken by using more local mem than available.
677 18943 : if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
678 : return false;
679 :
680 18939 : LocalMemLimit = MaxSizeWithWaveCount;
681 :
682 : LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
683 : << " bytes of LDS\n"
684 : << " Rounding size to " << MaxSizeWithWaveCount
685 : << " with a maximum occupancy of " << MaxOccupancy << '\n'
686 : << " and " << (LocalMemLimit - CurrentLocalMemUsage)
687 : << " available for promotion\n");
688 :
689 18939 : return true;
690 : }
691 :
692 : // FIXME: Should try to pick the most likely to be profitable allocas first.
693 544 : bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
694 : // Array allocations are probably not worth handling, since an allocation of
695 : // the array type is the canonical form.
696 544 : if (!I.isStaticAlloca() || I.isArrayAllocation())
697 15 : return false;
698 :
699 529 : IRBuilder<> Builder(&I);
700 :
701 : // First try to replace the alloca with a vector
702 529 : Type *AllocaTy = I.getAllocatedType();
703 :
704 : LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
705 :
706 529 : if (tryPromoteAllocaToVector(&I))
707 : return true; // Promoted to vector.
708 :
709 469 : const Function &ContainingFunction = *I.getParent()->getParent();
710 : CallingConv::ID CC = ContainingFunction.getCallingConv();
711 :
712 : // Don't promote the alloca to LDS for shader calling conventions as the work
713 : // item ID intrinsics are not supported for these calling conventions.
714 : // Furthermore, not all LDS is available for some of the stages.
715 469 : switch (CC) {
716 : case CallingConv::AMDGPU_KERNEL:
717 : case CallingConv::SPIR_KERNEL:
718 : break;
719 : default:
720 : LLVM_DEBUG(
721 : dbgs()
722 : << " promote alloca to LDS not supported with calling convention.\n");
723 : return false;
724 : }
725 :
726 : // Not likely to have sufficient local memory for promotion.
727 380 : if (!SufficientLDS)
728 : return false;
729 :
730 379 : const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, ContainingFunction);
731 379 : unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
732 :
733 379 : const DataLayout &DL = Mod->getDataLayout();
734 :
735 : unsigned Align = I.getAlignment();
736 379 : if (Align == 0)
737 207 : Align = DL.getABITypeAlignment(I.getAllocatedType());
738 :
739 : // FIXME: This computed padding is likely wrong since it depends on inverse
740 : // usage order.
741 : //
742 : // FIXME: It is also possible that if we're allowed to use all of the memory,
743 : // we could end up using more than the maximum due to alignment padding.
744 :
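  // Promotion gives every work item in the group its own copy of the alloca in
  // LDS, so the cost is the alloca's size times the maximum flat work-group
  // size.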
745 379 : uint32_t NewSize = alignTo(CurrentLocalMemUsage, Align);
746 379 : uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
747 379 : NewSize += AllocSize;
748 :
749 379 : if (NewSize > LocalMemLimit) {
750 : LLVM_DEBUG(dbgs() << " " << AllocSize
751 : << " bytes of local memory not available to promote\n");
752 : return false;
753 : }
754 :
755 307 : CurrentLocalMemUsage = NewSize;
756 :
757 : std::vector<Value*> WorkList;
758 :
759 307 : if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
760 : LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
761 : return false;
762 : }
763 :
764 : LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");
765 :
766 165 : Function *F = I.getParent()->getParent();
767 :
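  // Replace the private alloca with one LDS array holding WorkGroupSize copies
  // of the allocated type; each work item addresses its own element with the
  // linearized work item id computed below.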
768 165 : Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
769 : GlobalVariable *GV = new GlobalVariable(
770 165 : *Mod, GVTy, false, GlobalValue::InternalLinkage,
771 165 : UndefValue::get(GVTy),
772 330 : Twine(F->getName()) + Twine('.') + I.getName(),
773 : nullptr,
774 : GlobalVariable::NotThreadLocal,
775 165 : AMDGPUAS::LOCAL_ADDRESS);
776 : GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
777 330 : GV->setAlignment(I.getAlignment());
778 :
779 : Value *TCntY, *TCntZ;
780 :
781 165 : std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
782 165 : Value *TIdX = getWorkitemID(Builder, 0);
783 165 : Value *TIdY = getWorkitemID(Builder, 1);
784 165 : Value *TIdZ = getWorkitemID(Builder, 2);
785 :
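  // Linearize the 3D work item id as
  //   TID = TIdX * TCntY * TCntZ + TIdY * TCntZ + TIdZ.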
786 165 : Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
787 165 : Tmp0 = Builder.CreateMul(Tmp0, TIdX);
788 165 : Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
789 165 : Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
790 165 : TID = Builder.CreateAdd(TID, TIdZ);
791 :
792 : Value *Indices[] = {
793 165 : Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
794 : TID
795 165 : };
796 :
797 165 : Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
798 165 : I.mutateType(Offset->getType());
799 165 : I.replaceAllUsesWith(Offset);
800 165 : I.eraseFromParent();
801 :
802 522 : for (Value *V : WorkList) {
803 : CallInst *Call = dyn_cast<CallInst>(V);
804 : if (!Call) {
805 : if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
806 : Value *Src0 = CI->getOperand(0);
807 3 : Type *EltTy = Src0->getType()->getPointerElementType();
808 3 : PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
809 :
810 3 : if (isa<ConstantPointerNull>(CI->getOperand(0)))
811 1 : CI->setOperand(0, ConstantPointerNull::get(NewTy));
812 :
813 3 : if (isa<ConstantPointerNull>(CI->getOperand(1)))
814 1 : CI->setOperand(1, ConstantPointerNull::get(NewTy));
815 :
816 3 : continue;
817 : }
818 :
819 : // The operand's value should be corrected on its own and we don't want to
820 : // touch the users.
821 : if (isa<AddrSpaceCastInst>(V))
822 : continue;
823 :
824 342 : Type *EltTy = V->getType()->getPointerElementType();
825 342 : PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
826 :
827 : // FIXME: It doesn't really make sense to try to do this for all
828 : // instructions.
829 : V->mutateType(NewTy);
830 :
831 : // Adjust the types of any constant operands.
832 : if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
833 9 : if (isa<ConstantPointerNull>(SI->getOperand(1)))
834 1 : SI->setOperand(1, ConstantPointerNull::get(NewTy));
835 :
836 9 : if (isa<ConstantPointerNull>(SI->getOperand(2)))
837 1 : SI->setOperand(2, ConstantPointerNull::get(NewTy));
838 : } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
839 11 : for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
840 7 : if (isa<ConstantPointerNull>(Phi->getIncomingValue(I)))
841 2 : Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy));
842 : }
843 : }
844 :
845 342 : continue;
846 : }
847 :
848 : IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
849 11 : Builder.SetInsertPoint(Intr);
850 11 : switch (Intr->getIntrinsicID()) {
851 2 : case Intrinsic::lifetime_start:
852 : case Intrinsic::lifetime_end:
853 : // These intrinsics are for address space 0 only
854 2 : Intr->eraseFromParent();
855 2 : continue;
856 : case Intrinsic::memcpy: {
857 : MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
858 2 : Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getDestAlignment(),
859 : MemCpy->getRawSource(), MemCpy->getSourceAlignment(),
860 2 : MemCpy->getLength(), MemCpy->isVolatile());
861 2 : Intr->eraseFromParent();
862 2 : continue;
863 : }
864 : case Intrinsic::memmove: {
865 : MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
866 2 : Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getDestAlignment(),
867 : MemMove->getRawSource(), MemMove->getSourceAlignment(),
868 2 : MemMove->getLength(), MemMove->isVolatile());
869 2 : Intr->eraseFromParent();
870 2 : continue;
871 : }
872 : case Intrinsic::memset: {
873 : MemSetInst *MemSet = cast<MemSetInst>(Intr);
874 1 : Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
875 : MemSet->getLength(), MemSet->getDestAlignment(),
876 1 : MemSet->isVolatile());
877 1 : Intr->eraseFromParent();
878 1 : continue;
879 : }
880 3 : case Intrinsic::invariant_start:
881 : case Intrinsic::invariant_end:
882 : case Intrinsic::launder_invariant_group:
883 : case Intrinsic::strip_invariant_group:
884 3 : Intr->eraseFromParent();
885 : // FIXME: I think the invariant marker should still theoretically apply,
886 : // but the intrinsics need to be changed to accept pointers with any
887 : // address space.
888 3 : continue;
889 1 : case Intrinsic::objectsize: {
890 1 : Value *Src = Intr->getOperand(0);
891 1 : Type *SrcTy = Src->getType()->getPointerElementType();
892 2 : Function *ObjectSize = Intrinsic::getDeclaration(Mod,
893 : Intrinsic::objectsize,
894 1 : { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) }
895 : );
896 :
897 1 : CallInst *NewCall = Builder.CreateCall(
898 : ObjectSize, {Src, Intr->getOperand(1), Intr->getOperand(2)});
899 1 : Intr->replaceAllUsesWith(NewCall);
900 1 : Intr->eraseFromParent();
901 1 : continue;
902 : }
903 0 : default:
904 0 : Intr->print(errs());
905 0 : llvm_unreachable("Don't know how to promote alloca intrinsic use.");
906 : }
907 : }
908 : return true;
909 : }
910 :
911 2200 : FunctionPass *llvm::createAMDGPUPromoteAlloca() {
912 2200 : return new AMDGPUPromoteAlloca();
913 : }