LLVM  mainline
AMDGPUPromoteAlloca.cpp
Go to the documentation of this file.
00001 //===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 // This pass eliminates allocas by either converting them into vectors or
00011 // by migrating them to local address space.
00012 //
00013 //===----------------------------------------------------------------------===//
00014 
00015 #include "AMDGPU.h"
00016 #include "AMDGPUSubtarget.h"
00017 #include "llvm/Analysis/ValueTracking.h"
00018 #include "llvm/IR/IRBuilder.h"
00019 #include "llvm/IR/InstVisitor.h"
00020 #include "llvm/Support/Debug.h"
00021 #include "llvm/Support/raw_ostream.h"
00022 
00023 #define DEBUG_TYPE "amdgpu-promote-alloca"
00024 
00025 using namespace llvm;
00026 
00027 namespace {
00028 
00029 class AMDGPUPromoteAlloca : public FunctionPass,
00030                        public InstVisitor<AMDGPUPromoteAlloca> {
00031 
00032   static char ID;
00033   Module *Mod;
00034   const AMDGPUSubtarget &ST;
00035   int LocalMemAvailable;
00036 
00037 public:
00038   AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st),
00039                                                    LocalMemAvailable(0) { }
00040   bool doInitialization(Module &M) override;
00041   bool runOnFunction(Function &F) override;
00042   const char *getPassName() const override { return "AMDGPU Promote Alloca"; }
00043   void visitAlloca(AllocaInst &I);
00044 };
00045 
00046 } // End anonymous namespace
00047 
00048 char AMDGPUPromoteAlloca::ID = 0;
00049 
00050 bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
00051   Mod = &M;
00052   return false;
00053 }
00054 
00055 bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
00056 
00057   FunctionType *FTy = F.getFunctionType();
00058 
00059   LocalMemAvailable = ST.getLocalMemorySize();
00060 
00061 
00062   // If the function has any arguments in the local address space, then it's
00063   // possible these arguments require the entire local memory space, so
00064   // we cannot use local memory in the pass.
00065   for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
00066     Type *ParamTy = FTy->getParamType(i);
00067     if (ParamTy->isPointerTy() &&
00068         ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
00069       LocalMemAvailable = 0;
00070       DEBUG(dbgs() << "Function has local memory argument.  Promoting to "
00071                       "local memory disabled.\n");
00072       break;
00073     }
00074   }
00075 
00076   if (LocalMemAvailable > 0) {
00077     // Check how much local memory is being used by global objects
00078     for (Module::global_iterator I = Mod->global_begin(),
00079                                  E = Mod->global_end(); I != E; ++I) {
00080       GlobalVariable *GV = &*I;
00081       if (GV->getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
00082         continue;
00083       for (Value::use_iterator U = GV->use_begin(),
00084                                UE = GV->use_end(); U != UE; ++U) {
00085         Instruction *Use = dyn_cast<Instruction>(*U);
00086         if (!Use)
00087           continue;
00088         if (Use->getParent()->getParent() == &F)
00089           LocalMemAvailable -=
00090               Mod->getDataLayout().getTypeAllocSize(GV->getValueType());
00091       }
00092     }
00093   }
00094 
00095   LocalMemAvailable = std::max(0, LocalMemAvailable);
00096   DEBUG(dbgs() << LocalMemAvailable << "bytes free in local memory.\n");
00097 
00098   visit(F);
00099 
00100   return false;
00101 }
00102 
00103 static VectorType *arrayTypeToVecType(Type *ArrayTy) {
00104   return VectorType::get(ArrayTy->getArrayElementType(),
00105                          ArrayTy->getArrayNumElements());
00106 }
00107 
00108 static Value *
00109 calculateVectorIndex(Value *Ptr,
00110                      const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
00111   if (isa<AllocaInst>(Ptr))
00112     return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));
00113 
00114   GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
00115 
00116   auto I = GEPIdx.find(GEP);
00117   return I == GEPIdx.end() ? nullptr : I->second;
00118 }
00119 
00120 static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
00121   // FIXME we only support simple cases
00122   if (GEP->getNumOperands() != 3)
00123     return NULL;
00124 
00125   ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
00126   if (!I0 || !I0->isZero())
00127     return NULL;
00128 
00129   return GEP->getOperand(2);
00130 }
00131 
00132 // Not an instruction handled below to turn into a vector.
00133 //
00134 // TODO: Check isTriviallyVectorizable for calls and handle other
00135 // instructions.
00136 static bool canVectorizeInst(Instruction *Inst, User *User) {
00137   switch (Inst->getOpcode()) {
00138   case Instruction::Load:
00139   case Instruction::BitCast:
00140   case Instruction::AddrSpaceCast:
00141     return true;
00142   case Instruction::Store: {
00143     // Must be the stored pointer operand, not a stored value.
00144     StoreInst *SI = cast<StoreInst>(Inst);
00145     return SI->getPointerOperand() == User;
00146   }
00147   default:
00148     return false;
00149   }
00150 }
00151 
00152 static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
00153   Type *AllocaTy = Alloca->getAllocatedType();
00154 
00155   DEBUG(dbgs() << "Alloca Candidate for vectorization \n");
00156 
00157   // FIXME: There is no reason why we can't support larger arrays, we
00158   // are just being conservative for now.
00159   if (!AllocaTy->isArrayTy() ||
00160       AllocaTy->getArrayElementType()->isVectorTy() ||
00161       AllocaTy->getArrayNumElements() > 4) {
00162 
00163     DEBUG(dbgs() << "  Cannot convert type to vector");
00164     return false;
00165   }
00166 
00167   std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
00168   std::vector<Value*> WorkList;
00169   for (User *AllocaUser : Alloca->users()) {
00170     GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
00171     if (!GEP) {
00172       if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
00173         return false;
00174 
00175       WorkList.push_back(AllocaUser);
00176       continue;
00177     }
00178 
00179     Value *Index = GEPToVectorIndex(GEP);
00180 
00181     // If we can't compute a vector index from this GEP, then we can't
00182     // promote this alloca to vector.
00183     if (!Index) {
00184       DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP << '\n');
00185       return false;
00186     }
00187 
00188     GEPVectorIdx[GEP] = Index;
00189     for (User *GEPUser : AllocaUser->users()) {
00190       if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
00191         return false;
00192 
00193       WorkList.push_back(GEPUser);
00194     }
00195   }
00196 
00197   VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
00198 
00199   DEBUG(dbgs() << "  Converting alloca to vector "
00200         << *AllocaTy << " -> " << *VectorTy << '\n');
00201 
00202   for (std::vector<Value*>::iterator I = WorkList.begin(),
00203                                      E = WorkList.end(); I != E; ++I) {
00204     Instruction *Inst = cast<Instruction>(*I);
00205     IRBuilder<> Builder(Inst);
00206     switch (Inst->getOpcode()) {
00207     case Instruction::Load: {
00208       Value *Ptr = Inst->getOperand(0);
00209       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
00210       Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
00211       Value *VecValue = Builder.CreateLoad(BitCast);
00212       Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
00213       Inst->replaceAllUsesWith(ExtractElement);
00214       Inst->eraseFromParent();
00215       break;
00216     }
00217     case Instruction::Store: {
00218       Value *Ptr = Inst->getOperand(1);
00219       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
00220       Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
00221       Value *VecValue = Builder.CreateLoad(BitCast);
00222       Value *NewVecValue = Builder.CreateInsertElement(VecValue,
00223                                                        Inst->getOperand(0),
00224                                                        Index);
00225       Builder.CreateStore(NewVecValue, BitCast);
00226       Inst->eraseFromParent();
00227       break;
00228     }
00229     case Instruction::BitCast:
00230     case Instruction::AddrSpaceCast:
00231       break;
00232 
00233     default:
00234       Inst->dump();
00235       llvm_unreachable("Inconsistency in instructions promotable to vector");
00236     }
00237   }
00238   return true;
00239 }
00240 
00241 static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
00242   bool Success = true;
00243   for (User *User : Val->users()) {
00244     if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
00245       continue;
00246     if (CallInst *CI = dyn_cast<CallInst>(User)) {
00247       // TODO: We might be able to handle some cases where the callee is a
00248       // constantexpr bitcast of a function.
00249       if (!CI->getCalledFunction())
00250         return false;
00251 
00252       WorkList.push_back(User);
00253       continue;
00254     }
00255 
00256     // FIXME: Correctly handle ptrtoint instructions.
00257     Instruction *UseInst = dyn_cast<Instruction>(User);
00258     if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt)
00259       return false;
00260 
00261     if (StoreInst *SI = dyn_cast_or_null<StoreInst>(UseInst)) {
00262       // Reject if the stored value is not the pointer operand.
00263       if (SI->getPointerOperand() != Val)
00264         return false;
00265     }
00266 
00267     if (!User->getType()->isPointerTy())
00268       continue;
00269 
00270     WorkList.push_back(User);
00271 
00272     Success &= collectUsesWithPtrTypes(User, WorkList);
00273   }
00274   return Success;
00275 }
00276 
00277 void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
00278   if (!I.isStaticAlloca())
00279     return;
00280 
00281   IRBuilder<> Builder(&I);
00282 
00283   // First try to replace the alloca with a vector
00284   Type *AllocaTy = I.getAllocatedType();
00285 
00286   DEBUG(dbgs() << "Trying to promote " << I << '\n');
00287 
00288   if (tryPromoteAllocaToVector(&I))
00289     return;
00290 
00291   DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
00292 
00293   // FIXME: This is the maximum work group size.  We should try to get
00294   // value from the reqd_work_group_size function attribute if it is
00295   // available.
00296   unsigned WorkGroupSize = 256;
00297   int AllocaSize =
00298       WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy);
00299 
00300   if (AllocaSize > LocalMemAvailable) {
00301     DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
00302     return;
00303   }
00304 
00305   std::vector<Value*> WorkList;
00306 
00307   if (!collectUsesWithPtrTypes(&I, WorkList)) {
00308     DEBUG(dbgs() << " Do not know how to convert all uses\n");
00309     return;
00310   }
00311 
00312   DEBUG(dbgs() << "Promoting alloca to local memory\n");
00313   LocalMemAvailable -= AllocaSize;
00314 
00315   Type *GVTy = ArrayType::get(I.getAllocatedType(), 256);
00316   GlobalVariable *GV = new GlobalVariable(
00317       *Mod, GVTy, false, GlobalValue::ExternalLinkage, 0, I.getName(), 0,
00318       GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
00319 
00320   FunctionType *FTy = FunctionType::get(
00321       Type::getInt32Ty(Mod->getContext()), false);
00322   AttributeSet AttrSet;
00323   AttrSet.addAttribute(Mod->getContext(), 0, Attribute::ReadNone);
00324 
00325   Value *ReadLocalSizeY = Mod->getOrInsertFunction(
00326       "llvm.r600.read.local.size.y", FTy, AttrSet);
00327   Value *ReadLocalSizeZ = Mod->getOrInsertFunction(
00328       "llvm.r600.read.local.size.z", FTy, AttrSet);
00329   Value *ReadTIDIGX = Mod->getOrInsertFunction(
00330       "llvm.r600.read.tidig.x", FTy, AttrSet);
00331   Value *ReadTIDIGY = Mod->getOrInsertFunction(
00332       "llvm.r600.read.tidig.y", FTy, AttrSet);
00333   Value *ReadTIDIGZ = Mod->getOrInsertFunction(
00334       "llvm.r600.read.tidig.z", FTy, AttrSet);
00335 
00336   Value *TCntY = Builder.CreateCall(ReadLocalSizeY, {});
00337   Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ, {});
00338   Value *TIdX = Builder.CreateCall(ReadTIDIGX, {});
00339   Value *TIdY = Builder.CreateCall(ReadTIDIGY, {});
00340   Value *TIdZ = Builder.CreateCall(ReadTIDIGZ, {});
00341 
00342   Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ);
00343   Tmp0 = Builder.CreateMul(Tmp0, TIdX);
00344   Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ);
00345   Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
00346   TID = Builder.CreateAdd(TID, TIdZ);
00347 
00348   std::vector<Value*> Indices;
00349   Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext())));
00350   Indices.push_back(TID);
00351 
00352   Value *Offset = Builder.CreateGEP(GVTy, GV, Indices);
00353   I.mutateType(Offset->getType());
00354   I.replaceAllUsesWith(Offset);
00355   I.eraseFromParent();
00356 
00357   for (std::vector<Value*>::iterator i = WorkList.begin(),
00358                                      e = WorkList.end(); i != e; ++i) {
00359     Value *V = *i;
00360     CallInst *Call = dyn_cast<CallInst>(V);
00361     if (!Call) {
00362       Type *EltTy = V->getType()->getPointerElementType();
00363       PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
00364 
00365       // The operand's value should be corrected on its own.
00366       if (isa<AddrSpaceCastInst>(V))
00367         continue;
00368 
00369       // FIXME: It doesn't really make sense to try to do this for all
00370       // instructions.
00371       V->mutateType(NewTy);
00372       continue;
00373     }
00374 
00375     IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
00376     if (!Intr) {
00377       std::vector<Type*> ArgTypes;
00378       for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
00379                                 ArgIdx != ArgEnd; ++ArgIdx) {
00380         ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
00381       }
00382       Function *F = Call->getCalledFunction();
00383       FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
00384                                                 F->isVarArg());
00385       Constant *C = Mod->getOrInsertFunction((F->getName() + ".local").str(),
00386                                              NewType, F->getAttributes());
00387       Function *NewF = cast<Function>(C);
00388       Call->setCalledFunction(NewF);
00389       continue;
00390     }
00391 
00392     Builder.SetInsertPoint(Intr);
00393     switch (Intr->getIntrinsicID()) {
00394     case Intrinsic::lifetime_start:
00395     case Intrinsic::lifetime_end:
00396       // These intrinsics are for address space 0 only
00397       Intr->eraseFromParent();
00398       continue;
00399     case Intrinsic::memcpy: {
00400       MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
00401       Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
00402                            MemCpy->getLength(), MemCpy->getAlignment(),
00403                            MemCpy->isVolatile());
00404       Intr->eraseFromParent();
00405       continue;
00406     }
00407     case Intrinsic::memset: {
00408       MemSetInst *MemSet = cast<MemSetInst>(Intr);
00409       Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
00410                            MemSet->getLength(), MemSet->getAlignment(),
00411                            MemSet->isVolatile());
00412       Intr->eraseFromParent();
00413       continue;
00414     }
00415     case Intrinsic::invariant_start:
00416     case Intrinsic::invariant_end:
00417     case Intrinsic::invariant_group_barrier:
00418       Intr->eraseFromParent();
00419       // FIXME: I think the invariant marker should still theoretically apply,
00420       // but the intrinsics need to be changed to accept pointers with any
00421       // address space.
00422       continue;
00423     default:
00424       Intr->dump();
00425       llvm_unreachable("Don't know how to promote alloca intrinsic use.");
00426     }
00427   }
00428 }
00429 
00430 FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) {
00431   return new AMDGPUPromoteAlloca(ST);
00432 }