LLVM  4.0.0
AMDGPUPromoteAlloca.cpp
Go to the documentation of this file.
1 //===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass eliminates allocas by either converting them into vectors or
11 // by migrating them to local address space.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPU.h"
16 #include "AMDGPUSubtarget.h"
18 #include "llvm/IR/IRBuilder.h"
19 #include "llvm/IR/IntrinsicInst.h"
20 #include "llvm/IR/MDBuilder.h"
21 #include "llvm/Support/Debug.h"
23 
24 #define DEBUG_TYPE "amdgpu-promote-alloca"
25 
26 using namespace llvm;
27 
28 namespace {
29 
30 // FIXME: This can create globals so should be a module pass.
31 class AMDGPUPromoteAlloca : public FunctionPass {
32 private:
33  const TargetMachine *TM;
34  Module *Mod;
35  const DataLayout *DL;
36  MDNode *MaxWorkGroupSizeRange;
37 
38  // FIXME: This should be per-kernel.
39  uint32_t LocalMemLimit;
40  uint32_t CurrentLocalMemUsage;
41 
42  bool IsAMDGCN;
43  bool IsAMDHSA;
44 
45  std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
46  Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
47 
48  /// BaseAlloca is the alloca root the search started from.
49  /// Val may be that alloca or a recursive user of it.
50  bool collectUsesWithPtrTypes(Value *BaseAlloca,
51  Value *Val,
52  std::vector<Value*> &WorkList) const;
53 
54  /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
55  /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
56  /// Returns true if both operands are derived from the same alloca. Val should
57  /// be the same value as one of the input operands of UseInst.
58  bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
59  Instruction *UseInst,
60  int OpIdx0, int OpIdx1) const;
61 
62 public:
63  static char ID;
64 
65  AMDGPUPromoteAlloca(const TargetMachine *TM_ = nullptr) :
67  TM(TM_),
68  Mod(nullptr),
69  DL(nullptr),
70  MaxWorkGroupSizeRange(nullptr),
71  LocalMemLimit(0),
72  CurrentLocalMemUsage(0),
73  IsAMDGCN(false),
74  IsAMDHSA(false) { }
75 
76  bool doInitialization(Module &M) override;
77  bool runOnFunction(Function &F) override;
78 
79  StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
80 
81  void handleAlloca(AllocaInst &I);
82 
83  void getAnalysisUsage(AnalysisUsage &AU) const override {
84  AU.setPreservesCFG();
86  }
87 };
88 
89 } // End anonymous namespace
90 
92 
93 INITIALIZE_TM_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
94  "AMDGPU promote alloca to vector or LDS", false, false)
95 
96 char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
97 
98 
99 bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
100  if (!TM)
101  return false;
102 
103  Mod = &M;
104  DL = &Mod->getDataLayout();
105 
106  // The maximum workitem id.
107  //
108  // FIXME: Should get as subtarget property. Usually runtime enforced max is
109  // 256.
110  MDBuilder MDB(Mod->getContext());
111  MaxWorkGroupSizeRange = MDB.createRange(APInt(32, 0), APInt(32, 2048));
112 
113  const Triple &TT = TM->getTargetTriple();
114 
115  IsAMDGCN = TT.getArch() == Triple::amdgcn;
116  IsAMDHSA = TT.getOS() == Triple::AMDHSA;
117 
118  return false;
119 }
120 
121 bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
122  if (!TM || skipFunction(F))
123  return false;
124 
125  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
126  if (!ST.isPromoteAllocaEnabled())
127  return false;
128 
129  FunctionType *FTy = F.getFunctionType();
130 
131  // If the function has any arguments in the local address space, then it's
132  // possible these arguments require the entire local memory space, so
133  // we cannot use local memory in the pass.
134  for (Type *ParamTy : FTy->params()) {
135  PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
136  if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
137  LocalMemLimit = 0;
138  DEBUG(dbgs() << "Function has local memory argument. Promoting to "
139  "local memory disabled.\n");
140  return false;
141  }
142  }
143 
144  LocalMemLimit = ST.getLocalMemorySize();
145  if (LocalMemLimit == 0)
146  return false;
147 
148  const DataLayout &DL = Mod->getDataLayout();
149 
150  // Check how much local memory is being used by global objects
151  CurrentLocalMemUsage = 0;
152  for (GlobalVariable &GV : Mod->globals()) {
153  if (GV.getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
154  continue;
155 
156  for (const User *U : GV.users()) {
157  const Instruction *Use = dyn_cast<Instruction>(U);
158  if (!Use)
159  continue;
160 
161  if (Use->getParent()->getParent() == &F) {
162  unsigned Align = GV.getAlignment();
163  if (Align == 0)
164  Align = DL.getABITypeAlignment(GV.getValueType());
165 
166  // FIXME: Try to account for padding here. The padding is currently
167  // determined from the inverse order of uses in the function. I'm not
168  // sure if the use list order is in any way connected to this, so the
169  // total reported size is likely incorrect.
170  uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
171  CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
172  CurrentLocalMemUsage += AllocSize;
173  break;
174  }
175  }
176  }
177 
178  unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage);
179 
180  // Restrict local memory usage so that we don't drastically reduce occupancy,
181  // unless it is already significantly reduced.
182 
183  // TODO: Have some sort of hint or other heuristics to guess occupancy based
184  // on other factors..
185  unsigned OccupancyHint = ST.getWavesPerEU(F).second;
186  if (OccupancyHint == 0)
187  OccupancyHint = 7;
188 
189  // Clamp to max value.
190  OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
191 
192  // Check the hint but ignore it if it's obviously wrong from the existing LDS
193  // usage.
194  MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
195 
196 
197  // Round up to the next tier of usage.
198  unsigned MaxSizeWithWaveCount
199  = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy);
200 
201  // Program is possibly broken by using more local mem than available.
202  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
203  return false;
204 
205  LocalMemLimit = MaxSizeWithWaveCount;
206 
207  DEBUG(
208  dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
209  << " Rounding size to " << MaxSizeWithWaveCount
210  << " with a maximum occupancy of " << MaxOccupancy << '\n'
211  << " and " << (LocalMemLimit - CurrentLocalMemUsage)
212  << " available for promotion\n"
213  );
214 
215  BasicBlock &EntryBB = *F.begin();
216  for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) {
218 
219  ++I;
220  if (AI)
221  handleAlloca(*AI);
222  }
223 
224  return true;
225 }
226 
227 std::pair<Value *, Value *>
228 AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
229  if (!IsAMDHSA) {
230  Function *LocalSizeYFn
231  = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
232  Function *LocalSizeZFn
233  = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);
234 
235  CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
236  CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
237 
238  LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
239  LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
240 
241  return std::make_pair(LocalSizeY, LocalSizeZ);
242  }
243 
244  // We must read the size out of the dispatch pointer.
245  assert(IsAMDGCN);
246 
247  // We are indexing into this struct, and want to extract the workgroup_size_*
248  // fields.
249  //
250  // typedef struct hsa_kernel_dispatch_packet_s {
251  // uint16_t header;
252  // uint16_t setup;
253  // uint16_t workgroup_size_x ;
254  // uint16_t workgroup_size_y;
255  // uint16_t workgroup_size_z;
256  // uint16_t reserved0;
257  // uint32_t grid_size_x ;
258  // uint32_t grid_size_y ;
259  // uint32_t grid_size_z;
260  //
261  // uint32_t private_segment_size;
262  // uint32_t group_segment_size;
263  // uint64_t kernel_object;
264  //
265  // #ifdef HSA_LARGE_MODEL
266  // void *kernarg_address;
267  // #elif defined HSA_LITTLE_ENDIAN
268  // void *kernarg_address;
269  // uint32_t reserved1;
270  // #else
271  // uint32_t reserved1;
272  // void *kernarg_address;
273  // #endif
274  // uint64_t reserved2;
275  // hsa_signal_t completion_signal; // uint64_t wrapper
276  // } hsa_kernel_dispatch_packet_t
277  //
278  Function *DispatchPtrFn
279  = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
280 
281  CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
283  DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
284 
285  // Size of the dispatch packet struct.
287 
288  Type *I32Ty = Type::getInt32Ty(Mod->getContext());
289  Value *CastDispatchPtr = Builder.CreateBitCast(
290  DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));
291 
292  // We could do a single 64-bit load here, but it's likely that the basic
293  // 32-bit and extract sequence is already present, and it is probably easier
294  // to CSE this. The loads should be mergable later anyway.
295  Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 1);
296  LoadInst *LoadXY = Builder.CreateAlignedLoad(GEPXY, 4);
297 
298  Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2);
299  LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4);
300 
301  MDNode *MD = llvm::MDNode::get(Mod->getContext(), None);
304  LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
305 
306  // Extract y component. Upper half of LoadZU should be zero already.
307  Value *Y = Builder.CreateLShr(LoadXY, 16);
308 
309  return std::make_pair(Y, LoadZU);
310 }
311 
312 Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
314 
315  switch (N) {
316  case 0:
317  IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_x
318  : Intrinsic::r600_read_tidig_x;
319  break;
320  case 1:
321  IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_y
322  : Intrinsic::r600_read_tidig_y;
323  break;
324 
325  case 2:
326  IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_z
327  : Intrinsic::r600_read_tidig_z;
328  break;
329  default:
330  llvm_unreachable("invalid dimension");
331  }
332 
333  Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
334  CallInst *CI = Builder.CreateCall(WorkitemIdFn);
335  CI->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
336 
337  return CI;
338 }
339 
340 static VectorType *arrayTypeToVecType(Type *ArrayTy) {
341  return VectorType::get(ArrayTy->getArrayElementType(),
342  ArrayTy->getArrayNumElements());
343 }
344 
345 static Value *
347  const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
348  GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
349 
350  auto I = GEPIdx.find(GEP);
351  return I == GEPIdx.end() ? nullptr : I->second;
352 }
353 
355  // FIXME we only support simple cases
356  if (GEP->getNumOperands() != 3)
357  return nullptr;
358 
359  ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
360  if (!I0 || !I0->isZero())
361  return nullptr;
362 
363  return GEP->getOperand(2);
364 }
365 
366 // Not an instruction handled below to turn into a vector.
367 //
368 // TODO: Check isTriviallyVectorizable for calls and handle other
369 // instructions.
370 static bool canVectorizeInst(Instruction *Inst, User *User) {
371  switch (Inst->getOpcode()) {
372  case Instruction::Load:
373  case Instruction::BitCast:
374  case Instruction::AddrSpaceCast:
375  return true;
376  case Instruction::Store: {
377  // Must be the stored pointer operand, not a stored value.
378  StoreInst *SI = cast<StoreInst>(Inst);
379  return SI->getPointerOperand() == User;
380  }
381  default:
382  return false;
383  }
384 }
385 
386 static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
387  ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());
388 
389  DEBUG(dbgs() << "Alloca candidate for vectorization\n");
390 
391  // FIXME: There is no reason why we can't support larger arrays, we
392  // are just being conservative for now.
393  if (!AllocaTy ||
394  AllocaTy->getElementType()->isVectorTy() ||
395  AllocaTy->getNumElements() > 4 ||
396  AllocaTy->getNumElements() < 2) {
397  DEBUG(dbgs() << " Cannot convert type to vector\n");
398  return false;
399  }
400 
401  std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
402  std::vector<Value*> WorkList;
403  for (User *AllocaUser : Alloca->users()) {
405  if (!GEP) {
406  if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
407  return false;
408 
409  WorkList.push_back(AllocaUser);
410  continue;
411  }
412 
413  Value *Index = GEPToVectorIndex(GEP);
414 
415  // If we can't compute a vector index from this GEP, then we can't
416  // promote this alloca to vector.
417  if (!Index) {
418  DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << '\n');
419  return false;
420  }
421 
422  GEPVectorIdx[GEP] = Index;
423  for (User *GEPUser : AllocaUser->users()) {
424  if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
425  return false;
426 
427  WorkList.push_back(GEPUser);
428  }
429  }
430 
431  VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
432 
433  DEBUG(dbgs() << " Converting alloca to vector "
434  << *AllocaTy << " -> " << *VectorTy << '\n');
435 
436  for (Value *V : WorkList) {
437  Instruction *Inst = cast<Instruction>(V);
438  IRBuilder<> Builder(Inst);
439  switch (Inst->getOpcode()) {
440  case Instruction::Load: {
441  Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
442  Value *Ptr = Inst->getOperand(0);
443  Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
444 
445  Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
446  Value *VecValue = Builder.CreateLoad(BitCast);
447  Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
448  Inst->replaceAllUsesWith(ExtractElement);
449  Inst->eraseFromParent();
450  break;
451  }
452  case Instruction::Store: {
453  Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
454 
455  Value *Ptr = Inst->getOperand(1);
456  Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
457  Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
458  Value *VecValue = Builder.CreateLoad(BitCast);
459  Value *NewVecValue = Builder.CreateInsertElement(VecValue,
460  Inst->getOperand(0),
461  Index);
462  Builder.CreateStore(NewVecValue, BitCast);
463  Inst->eraseFromParent();
464  break;
465  }
466  case Instruction::BitCast:
467  case Instruction::AddrSpaceCast:
468  break;
469 
470  default:
471  llvm_unreachable("Inconsistency in instructions promotable to vector");
472  }
473  }
474  return true;
475 }
476 
477 static bool isCallPromotable(CallInst *CI) {
479  if (!II)
480  return false;
481 
482  switch (II->getIntrinsicID()) {
483  case Intrinsic::memcpy:
484  case Intrinsic::memmove:
485  case Intrinsic::memset:
486  case Intrinsic::lifetime_start:
487  case Intrinsic::lifetime_end:
488  case Intrinsic::invariant_start:
489  case Intrinsic::invariant_end:
490  case Intrinsic::invariant_group_barrier:
491  case Intrinsic::objectsize:
492  return true;
493  default:
494  return false;
495  }
496 }
497 
498 bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
499  Value *Val,
500  Instruction *Inst,
501  int OpIdx0,
502  int OpIdx1) const {
503  // Figure out which operand is the one we might not be promoting.
504  Value *OtherOp = Inst->getOperand(OpIdx0);
505  if (Val == OtherOp)
506  OtherOp = Inst->getOperand(OpIdx1);
507 
508  if (isa<ConstantPointerNull>(OtherOp))
509  return true;
510 
511  Value *OtherObj = GetUnderlyingObject(OtherOp, *DL);
512  if (!isa<AllocaInst>(OtherObj))
513  return false;
514 
515  // TODO: We should be able to replace undefs with the right pointer type.
516 
517  // TODO: If we know the other base object is another promotable
518  // alloca, not necessarily this alloca, we can do this. The
519  // important part is both must have the same address space at
520  // the end.
521  if (OtherObj != BaseAlloca) {
522  DEBUG(dbgs() << "Found a binary instruction with another alloca object\n");
523  return false;
524  }
525 
526  return true;
527 }
528 
529 bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
530  Value *BaseAlloca,
531  Value *Val,
532  std::vector<Value*> &WorkList) const {
533 
534  for (User *User : Val->users()) {
535  if (is_contained(WorkList, User))
536  continue;
537 
538  if (CallInst *CI = dyn_cast<CallInst>(User)) {
539  if (!isCallPromotable(CI))
540  return false;
541 
542  WorkList.push_back(User);
543  continue;
544  }
545 
546  Instruction *UseInst = cast<Instruction>(User);
547  if (UseInst->getOpcode() == Instruction::PtrToInt)
548  return false;
549 
550  if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
551  if (LI->isVolatile())
552  return false;
553 
554  continue;
555  }
556 
557  if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
558  if (SI->isVolatile())
559  return false;
560 
561  // Reject if the stored value is not the pointer operand.
562  if (SI->getPointerOperand() != Val)
563  return false;
564  } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
565  if (RMW->isVolatile())
566  return false;
567  } else if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
568  if (CAS->isVolatile())
569  return false;
570  }
571 
572  // Only promote a select if we know that the other select operand
573  // is from another pointer that will also be promoted.
574  if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
575  if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
576  return false;
577 
578  // May need to rewrite constant operands.
579  WorkList.push_back(ICmp);
580  }
581 
582  if (UseInst->getOpcode() == Instruction::AddrSpaceCast) {
583  // Don't collect the users of this.
584  WorkList.push_back(User);
585  continue;
586  }
587 
588  if (!User->getType()->isPointerTy())
589  continue;
590 
591  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
592  // Be conservative if an address could be computed outside the bounds of
593  // the alloca.
594  if (!GEP->isInBounds())
595  return false;
596  }
597 
598  // Only promote a select if we know that the other select operand is from
599  // another pointer that will also be promoted.
600  if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
601  if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
602  return false;
603  }
604 
605  // Repeat for phis.
606  if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
607  // TODO: Handle more complex cases. We should be able to replace loops
608  // over arrays.
609  switch (Phi->getNumIncomingValues()) {
610  case 1:
611  break;
612  case 2:
613  if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
614  return false;
615  break;
616  default:
617  return false;
618  }
619  }
620 
621  WorkList.push_back(User);
622  if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
623  return false;
624  }
625 
626  return true;
627 }
628 
629 // FIXME: Should try to pick the most likely to be profitable allocas first.
630 void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
631  // Array allocations are probably not worth handling, since an allocation of
632  // the array type is the canonical form.
633  if (!I.isStaticAlloca() || I.isArrayAllocation())
634  return;
635 
636  IRBuilder<> Builder(&I);
637 
638  // First try to replace the alloca with a vector
639  Type *AllocaTy = I.getAllocatedType();
640 
641  DEBUG(dbgs() << "Trying to promote " << I << '\n');
642 
643  if (tryPromoteAllocaToVector(&I)) {
644  DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
645  return;
646  }
647 
648  const Function &ContainingFunction = *I.getParent()->getParent();
649 
650  // Don't promote the alloca to LDS for shader calling conventions as the work
651  // item ID intrinsics are not supported for these calling conventions.
652  // Furthermore not all LDS is available for some of the stages.
653  if (AMDGPU::isShader(ContainingFunction.getCallingConv()))
654  return;
655 
656  const AMDGPUSubtarget &ST =
657  TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction);
658  // FIXME: We should also try to get this value from the reqd_work_group_size
659  // function attribute if it is available.
660  unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
661 
662  const DataLayout &DL = Mod->getDataLayout();
663 
664  unsigned Align = I.getAlignment();
665  if (Align == 0)
666  Align = DL.getABITypeAlignment(I.getAllocatedType());
667 
668  // FIXME: This computed padding is likely wrong since it depends on inverse
669  // usage order.
670  //
671  // FIXME: It is also possible that if we're allowed to use all of the memory
672  // could could end up using more than the maximum due to alignment padding.
673 
674  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Align);
675  uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
676  NewSize += AllocSize;
677 
678  if (NewSize > LocalMemLimit) {
679  DEBUG(dbgs() << " " << AllocSize
680  << " bytes of local memory not available to promote\n");
681  return;
682  }
683 
684  CurrentLocalMemUsage = NewSize;
685 
686  std::vector<Value*> WorkList;
687 
688  if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
689  DEBUG(dbgs() << " Do not know how to convert all uses\n");
690  return;
691  }
692 
693  DEBUG(dbgs() << "Promoting alloca to local memory\n");
694 
695  Function *F = I.getParent()->getParent();
696 
697  Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
698  GlobalVariable *GV = new GlobalVariable(
699  *Mod, GVTy, false, GlobalValue::InternalLinkage,
700  UndefValue::get(GVTy),
701  Twine(F->getName()) + Twine('.') + I.getName(),
702  nullptr,
706  GV->setAlignment(I.getAlignment());
707 
708  Value *TCntY, *TCntZ;
709 
710  std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
711  Value *TIdX = getWorkitemID(Builder, 0);
712  Value *TIdY = getWorkitemID(Builder, 1);
713  Value *TIdZ = getWorkitemID(Builder, 2);
714 
715  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
716  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
717  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
718  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
719  TID = Builder.CreateAdd(TID, TIdZ);
720 
721  Value *Indices[] = {
722  Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
723  TID
724  };
725 
726  Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
727  I.mutateType(Offset->getType());
728  I.replaceAllUsesWith(Offset);
729  I.eraseFromParent();
730 
731  for (Value *V : WorkList) {
733  if (!Call) {
734  if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
735  Value *Src0 = CI->getOperand(0);
736  Type *EltTy = Src0->getType()->getPointerElementType();
738 
739  if (isa<ConstantPointerNull>(CI->getOperand(0)))
740  CI->setOperand(0, ConstantPointerNull::get(NewTy));
741 
742  if (isa<ConstantPointerNull>(CI->getOperand(1)))
743  CI->setOperand(1, ConstantPointerNull::get(NewTy));
744 
745  continue;
746  }
747 
748  // The operand's value should be corrected on its own and we don't want to
749  // touch the users.
750  if (isa<AddrSpaceCastInst>(V))
751  continue;
752 
753  Type *EltTy = V->getType()->getPointerElementType();
755 
756  // FIXME: It doesn't really make sense to try to do this for all
757  // instructions.
758  V->mutateType(NewTy);
759 
760  // Adjust the types of any constant operands.
761  if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
762  if (isa<ConstantPointerNull>(SI->getOperand(1)))
763  SI->setOperand(1, ConstantPointerNull::get(NewTy));
764 
765  if (isa<ConstantPointerNull>(SI->getOperand(2)))
766  SI->setOperand(2, ConstantPointerNull::get(NewTy));
767  } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
768  for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
769  if (isa<ConstantPointerNull>(Phi->getIncomingValue(I)))
770  Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy));
771  }
772  }
773 
774  continue;
775  }
776 
777  IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
778  Builder.SetInsertPoint(Intr);
779  switch (Intr->getIntrinsicID()) {
780  case Intrinsic::lifetime_start:
781  case Intrinsic::lifetime_end:
782  // These intrinsics are for address space 0 only
783  Intr->eraseFromParent();
784  continue;
785  case Intrinsic::memcpy: {
786  MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
787  Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
788  MemCpy->getLength(), MemCpy->getAlignment(),
789  MemCpy->isVolatile());
790  Intr->eraseFromParent();
791  continue;
792  }
793  case Intrinsic::memmove: {
794  MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
795  Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getRawSource(),
796  MemMove->getLength(), MemMove->getAlignment(),
797  MemMove->isVolatile());
798  Intr->eraseFromParent();
799  continue;
800  }
801  case Intrinsic::memset: {
802  MemSetInst *MemSet = cast<MemSetInst>(Intr);
803  Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
804  MemSet->getLength(), MemSet->getAlignment(),
805  MemSet->isVolatile());
806  Intr->eraseFromParent();
807  continue;
808  }
809  case Intrinsic::invariant_start:
810  case Intrinsic::invariant_end:
811  case Intrinsic::invariant_group_barrier:
812  Intr->eraseFromParent();
813  // FIXME: I think the invariant marker should still theoretically apply,
814  // but the intrinsics need to be changed to accept pointers with any
815  // address space.
816  continue;
817  case Intrinsic::objectsize: {
818  Value *Src = Intr->getOperand(0);
819  Type *SrcTy = Src->getType()->getPointerElementType();
820  Function *ObjectSize = Intrinsic::getDeclaration(Mod,
821  Intrinsic::objectsize,
823  );
824 
825  CallInst *NewCall
826  = Builder.CreateCall(ObjectSize, { Src, Intr->getOperand(1) });
827  Intr->replaceAllUsesWith(NewCall);
828  Intr->eraseFromParent();
829  continue;
830  }
831  default:
832  Intr->dump();
833  llvm_unreachable("Don't know how to promote alloca intrinsic use.");
834  }
835  }
836 }
837 
839  return new AMDGPUPromoteAlloca(TM);
840 }
unsigned getAlignment() const
OSType getOS() const
getOS - Get the parsed operating system type of this triple.
Definition: Triple.h:279
const NoneType None
Definition: None.h:23
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:76
A parsed version of the target data layout string, and methods for querying it. ...
Definition: DataLayout.h:102
MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition: MDBuilder.cpp:72
AMDGPU specific subclass of TargetSubtarget.
bool isVolatile() const
A Module instance is used to store all the information related to an LLVM module. ...
Definition: Module.h:52
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:51
an instruction that atomically checks whether a specified value is in a memory location, and, if it is, stores a new value there.
Definition: Instructions.h:504
Value * getValue() const
Return the arguments to the instruction.
unsigned getNumOperands() const
Definition: User.h:167
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overridden by passes that need analysis information to do t...
Definition: Pass.cpp:84
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount) const
Return the amount of LDS that can be used that will not restrict the occupancy lower than WaveCount...
This class represents a function call, abstracting a target machine's calling convention.
ArrayRef< Type * > params() const
Definition: DerivedTypes.h:128
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space...
Definition: Type.cpp:655
FunctionPass * createAMDGPUPromoteAlloca(const TargetMachine *TM=nullptr)
This class wraps the llvm.memset intrinsic.
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:100
Metadata node.
Definition: Metadata.h:830
The two locations do not alias at all.
Definition: AliasAnalysis.h:79
uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the next integer (mod 2**64) that is greater than or equal to Value and is a multiple of Alig...
Definition: MathExtras.h:664
CallInst * CreateMemSet(Value *Ptr, Value *Val, uint64_t Size, unsigned Align, bool isVolatile=false, MDNode *TBAATag=nullptr, MDNode *ScopeTag=nullptr, MDNode *NoAliasTag=nullptr)
Create and insert a memset to the specified pointer and the specified value.
Definition: IRBuilder.h:404
unsigned getAddressSpace() const
Return the address space of the Pointer type.
Definition: DerivedTypes.h:471
An instruction for reading from memory.
Definition: Instructions.h:164
int getLocalMemorySize() const
Address space for local memory.
Definition: AMDGPU.h:141
an instruction that atomically reads a memory location, combines it with another value, and then stores the result back.
Definition: Instructions.h:669
Hexagon Common GEP
DispatchPtr(false)
Type * getPointerElementType() const
Definition: Type.h:358
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:165
uint64_t getArrayNumElements() const
Definition: DerivedTypes.h:364
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:195
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:191
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:228
bool isArrayAllocation() const
Return true if there is an allocation size parameter to the allocation instruction that is not 1...
static bool canVectorizeInst(Instruction *Inst, User *User)
static Value * calculateVectorIndex(Value *Ptr, const std::map< GetElementPtrInst *, Value * > &GEPIdx)
This class represents the LLVM 'select' instruction.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
This class wraps the llvm.memmove intrinsic.
Type * getArrayElementType() const
Definition: Type.h:347
A Use represents the edge between a Value definition and its users.
Definition: Use.h:56
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:588
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:813
static bool tryPromoteAllocaToVector(AllocaInst *Alloca)
Class to represent function types.
Definition: DerivedTypes.h:102
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:1362
#define F(x, y, z)
Definition: MD5.cpp:51
Class to represent array types.
Definition: DerivedTypes.h:345
CallInst * CreateMemMove(Value *Dst, Value *Src, uint64_t Size, unsigned Align, bool isVolatile=false, MDNode *TBAATag=nullptr, MDNode *ScopeTag=nullptr, MDNode *NoAliasTag=nullptr)
Create and insert a memmove between the specified pointers.
Definition: IRBuilder.h:443
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:949
ArchType getArch() const
getArch - Get the parsed architecture type of this triple.
Definition: Triple.h:270
An instruction for storing to memory.
Definition: Instructions.h:300
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:401
iterator begin()
Definition: Function.h:535
Value * CreateInBoundsGEP(Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition: IRBuilder.h:1158
Type * getElementType() const
Definition: DerivedTypes.h:336
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block...
Definition: IRBuilder.h:127
Maximum length of the test input libFuzzer tries to guess a good value based on the corpus and reports it always prefer smaller inputs during the corpus shuffle When libFuzzer itself reports a bug this exit code will be used If indicates the maximal total time in seconds to run the fuzzer minimizes the provided crash input Use with etc Experimental Use value profile to guide fuzzing Number of simultaneous worker processes to run the jobs If min(jobs, NumberOfCpuCores()/2)\" is used.") FUZZER_FLAG_INT(reload
Class to represent pointers.
Definition: DerivedTypes.h:443
static GCRegistry::Add< CoreCLRGC > E("coreclr","CoreCLR-compatible GC")
an instruction for type-safe pointer arithmetic to access elements of arrays and structs ...
Definition: Instructions.h:830
static ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
Definition: Constants.cpp:1323
unsigned getMaxWavesPerEU() const
bool isShader(CallingConv::ID cc)
static Value * GEPToVectorIndex(GetElementPtrInst *GEP)
LLVM Basic Block Representation.
Definition: BasicBlock.h:51
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:45
Address space for constant memory (VTX2)
Definition: AMDGPU.h:140
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:219
Address space for private memory.
Definition: AMDGPU.h:138
unsigned getAlignment() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:109
Value * getRawDest() const
Represent the analysis usage information of a pass.
uint32_t Offset
This instruction compares its operands according to the predicate given to the constructor.
uint64_t getNumElements() const
Definition: DerivedTypes.h:335
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:298
Value * getOperand(unsigned i) const
Definition: User.h:145
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:213
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1337
Value * GetUnderlyingObject(Value *V, const DataLayout &DL, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value...
bool isPromoteAllocaEnabled() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1183
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:857
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
void dump() const
Support for debugging, callable in GDB: V->dump()
Definition: AsmWriter.cpp:3540
unsigned getOccupancyWithLocalMemSize(uint32_t Bytes) const
Inverse of getMaxLocalMemWithWaveCount.
#define INITIALIZE_TM_PASS(passName, arg, name, cfg, analysis)
This initializer registers the TargetMachine constructor, so the pass being initialized can use target-dependent interfaces.
unsigned getABITypeAlignment(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:689
#define DEBUG_TYPE
This is the shared class of boolean and integer constants.
Definition: Constants.h:88
uint64_t getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment padding.
Definition: DataLayout.h:408
static bool isCallPromotable(CallInst *CI)
iterator end()
Definition: BasicBlock.h:230
bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size...
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:230
Value * getLength() const
This class wraps the llvm.memcpy intrinsic.
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:198
void setPreservesCFG()
This function should be called by the pass iff it does not: (1) add or remove basic blocks from the function, or (2) modify terminator instructions in any way.
Definition: Pass.cpp:276
Value * CreateConstInBoundsGEP1_64(Value *Ptr, uint64_t Idx0, const Twine &Name="")
Definition: IRBuilder.h:1245
void setOperand(unsigned i, Value *Val)
Definition: User.h:150
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
Class to represent vector types.
Definition: DerivedTypes.h:369
Class for arbitrary precision integers.
Definition: APInt.h:77
iterator_range< user_iterator > users()
Definition: Value.h:370
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1132
CallInst * CreateMemCpy(Value *Dst, Value *Src, uint64_t Size, unsigned Align, bool isVolatile=false, MDNode *TBAATag=nullptr, MDNode *TBAAStructTag=nullptr, MDNode *ScopeTag=nullptr, MDNode *NoAliasTag=nullptr)
Create and insert a memcpy between the specified pointers.
Definition: IRBuilder.h:422
void setUnnamedAddr(UnnamedAddr Val)
Definition: GlobalValue.h:203
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:169
#define I(x, y, z)
Definition: MD5.cpp:54
#define N
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.cpp:230
CallInst * CreateCall(Value *Callee, ArrayRef< Value * > Args=None, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1579
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
Definition: Type.cpp:606
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:287
Rename collisions when linking (static functions).
Definition: GlobalValue.h:56
void mutateType(Type *Ty)
Mutate the type of this Value to be of the specified type.
Definition: Value.h:558
Value * getRawSource() const
Return the arguments to the instruction.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
LoadInst * CreateAlignedLoad(Value *Ptr, unsigned Align, const char *Name)
Definition: IRBuilder.h:1100
LLVM Value Representation.
Definition: Value.h:71
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:111
static VectorType * get(Type *ElementType, unsigned NumElements)
This static method is the primary way to construct a VectorType.
Definition: Type.cpp:631
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:951
#define DEBUG(X)
Definition: Debug.h:100
Primary interface to the complete machine description for the target machine.
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
Definition: Type.cpp:678
char & AMDGPUPromoteAllocaID
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:47
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
void addAttribute(unsigned i, Attribute::AttrKind Kind)
adds the attribute to the list of attributes.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml","ocaml 3.10-compatible collector")
int * Ptr
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition: Instructions.h:102
Value * getPointerOperand()
Definition: Instructions.h:394
const BasicBlock * getParent() const
Definition: Instruction.h:62
void addDereferenceableAttr(unsigned i, uint64_t Bytes)
adds the dereferenceable attribute to the list of attributes.
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:44
an instruction to allocate memory on the stack
Definition: Instructions.h:60
static VectorType * arrayTypeToVecType(Type *ArrayTy)
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.
Definition: STLExtras.h:783