//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass eliminates allocas by either converting them into vectors or
// by migrating them to local address space.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <map>
#include <tuple>
#include <utility>
#include <vector>
61 
62 #define DEBUG_TYPE "amdgpu-promote-alloca"
63 
64 using namespace llvm;
65 
66 namespace {
67 
68 static cl::opt<bool> DisablePromoteAllocaToVector(
69  "disable-promote-alloca-to-vector",
70  cl::desc("Disable promote alloca to vector"),
71  cl::init(false));
72 
73 // FIXME: This can create globals so should be a module pass.
74 class AMDGPUPromoteAlloca : public FunctionPass {
75 private:
76  const TargetMachine *TM;
77  Module *Mod = nullptr;
78  const DataLayout *DL = nullptr;
79  AMDGPUAS AS;
80 
81  // FIXME: This should be per-kernel.
82  uint32_t LocalMemLimit = 0;
83  uint32_t CurrentLocalMemUsage = 0;
84 
85  bool IsAMDGCN = false;
86  bool IsAMDHSA = false;
87 
88  std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
89  Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
90 
91  /// BaseAlloca is the alloca root the search started from.
92  /// Val may be that alloca or a recursive user of it.
93  bool collectUsesWithPtrTypes(Value *BaseAlloca,
94  Value *Val,
95  std::vector<Value*> &WorkList) const;
96 
97  /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
98  /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
99  /// Returns true if both operands are derived from the same alloca. Val should
100  /// be the same value as one of the input operands of UseInst.
101  bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
102  Instruction *UseInst,
103  int OpIdx0, int OpIdx1) const;
104 
105  /// Check whether we have enough local memory for promotion.
106  bool hasSufficientLocalMem(const Function &F);
107 
108 public:
109  static char ID;
110 
111  AMDGPUPromoteAlloca() : FunctionPass(ID) {}
112 
113  bool doInitialization(Module &M) override;
114  bool runOnFunction(Function &F) override;
115 
116  StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
117 
118  bool handleAlloca(AllocaInst &I, bool SufficientLDS);
119 
120  void getAnalysisUsage(AnalysisUsage &AU) const override {
121  AU.setPreservesCFG();
123  }
124 };
125 
126 } // end anonymous namespace
127 
128 char AMDGPUPromoteAlloca::ID = 0;
129 
130 INITIALIZE_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
131  "AMDGPU promote alloca to vector or LDS", false, false)
132 
134 
135 bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
136  Mod = &M;
137  DL = &Mod->getDataLayout();
138 
139  return false;
140 }
141 
143  if (skipFunction(F))
144  return false;
145 
146  if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
147  TM = &TPC->getTM<TargetMachine>();
148  else
149  return false;
150 
151  const Triple &TT = TM->getTargetTriple();
152  IsAMDGCN = TT.getArch() == Triple::amdgcn;
153  IsAMDHSA = TT.getOS() == Triple::AMDHSA;
154 
155  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
156  if (!ST.isPromoteAllocaEnabled())
157  return false;
158 
160 
161  bool SufficientLDS = hasSufficientLocalMem(F);
162  bool Changed = false;
163  BasicBlock &EntryBB = *F.begin();
164  for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) {
166 
167  ++I;
168  if (AI)
169  Changed |= handleAlloca(*AI, SufficientLDS);
170  }
171 
172  return Changed;
173 }
174 
175 std::pair<Value *, Value *>
176 AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
177  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(
178  *Builder.GetInsertBlock()->getParent());
179 
180  if (!IsAMDHSA) {
181  Function *LocalSizeYFn
182  = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
183  Function *LocalSizeZFn
184  = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);
185 
186  CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
187  CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
188 
189  ST.makeLIDRangeMetadata(LocalSizeY);
190  ST.makeLIDRangeMetadata(LocalSizeZ);
191 
192  return std::make_pair(LocalSizeY, LocalSizeZ);
193  }
194 
195  // We must read the size out of the dispatch pointer.
196  assert(IsAMDGCN);
197 
198  // We are indexing into this struct, and want to extract the workgroup_size_*
199  // fields.
200  //
201  // typedef struct hsa_kernel_dispatch_packet_s {
202  // uint16_t header;
203  // uint16_t setup;
204  // uint16_t workgroup_size_x ;
205  // uint16_t workgroup_size_y;
206  // uint16_t workgroup_size_z;
207  // uint16_t reserved0;
208  // uint32_t grid_size_x ;
209  // uint32_t grid_size_y ;
210  // uint32_t grid_size_z;
211  //
212  // uint32_t private_segment_size;
213  // uint32_t group_segment_size;
214  // uint64_t kernel_object;
215  //
216  // #ifdef HSA_LARGE_MODEL
217  // void *kernarg_address;
218  // #elif defined HSA_LITTLE_ENDIAN
219  // void *kernarg_address;
220  // uint32_t reserved1;
221  // #else
222  // uint32_t reserved1;
223  // void *kernarg_address;
224  // #endif
225  // uint64_t reserved2;
226  // hsa_signal_t completion_signal; // uint64_t wrapper
227  // } hsa_kernel_dispatch_packet_t
228  //
229  Function *DispatchPtrFn
230  = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
231 
232  CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
234  DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
235 
236  // Size of the dispatch packet struct.
238 
239  Type *I32Ty = Type::getInt32Ty(Mod->getContext());
240  Value *CastDispatchPtr = Builder.CreateBitCast(
241  DispatchPtr, PointerType::get(I32Ty, AS.CONSTANT_ADDRESS));
242 
243  // We could do a single 64-bit load here, but it's likely that the basic
244  // 32-bit and extract sequence is already present, and it is probably easier
245  // to CSE this. The loads should be mergable later anyway.
246  Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 1);
247  LoadInst *LoadXY = Builder.CreateAlignedLoad(GEPXY, 4);
248 
249  Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2);
250  LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4);
251 
252  MDNode *MD = MDNode::get(Mod->getContext(), None);
255  ST.makeLIDRangeMetadata(LoadZU);
256 
257  // Extract y component. Upper half of LoadZU should be zero already.
258  Value *Y = Builder.CreateLShr(LoadXY, 16);
259 
260  return std::make_pair(Y, LoadZU);
261 }
262 
263 Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
264  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(
265  *Builder.GetInsertBlock()->getParent());
267 
268  switch (N) {
269  case 0:
270  IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_x
271  : Intrinsic::r600_read_tidig_x;
272  break;
273  case 1:
274  IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_y
275  : Intrinsic::r600_read_tidig_y;
276  break;
277 
278  case 2:
279  IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_z
280  : Intrinsic::r600_read_tidig_z;
281  break;
282  default:
283  llvm_unreachable("invalid dimension");
284  }
285 
286  Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
287  CallInst *CI = Builder.CreateCall(WorkitemIdFn);
288  ST.makeLIDRangeMetadata(CI);
289 
290  return CI;
291 }
292 
294  return VectorType::get(ArrayTy->getElementType(),
295  ArrayTy->getNumElements());
296 }
297 
298 static Value *
300  const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
301  GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
302 
303  auto I = GEPIdx.find(GEP);
304  return I == GEPIdx.end() ? nullptr : I->second;
305 }
306 
308  // FIXME we only support simple cases
309  if (GEP->getNumOperands() != 3)
310  return nullptr;
311 
312  ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
313  if (!I0 || !I0->isZero())
314  return nullptr;
315 
316  return GEP->getOperand(2);
317 }
318 
319 // Not an instruction handled below to turn into a vector.
320 //
321 // TODO: Check isTriviallyVectorizable for calls and handle other
322 // instructions.
323 static bool canVectorizeInst(Instruction *Inst, User *User) {
324  switch (Inst->getOpcode()) {
325  case Instruction::Load: {
326  // Currently only handle the case where the Pointer Operand is a GEP.
327  // Also we could not vectorize volatile or atomic loads.
328  LoadInst *LI = cast<LoadInst>(Inst);
329  return isa<GetElementPtrInst>(LI->getPointerOperand()) && LI->isSimple();
330  }
331  case Instruction::BitCast:
332  return true;
333  case Instruction::Store: {
334  // Must be the stored pointer operand, not a stored value, plus
335  // since it should be canonical form, the User should be a GEP.
336  // Also we could not vectorize volatile or atomic stores.
337  StoreInst *SI = cast<StoreInst>(Inst);
338  return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && SI->isSimple();
339  }
340  default:
341  return false;
342  }
343 }
344 
346 
347  if (DisablePromoteAllocaToVector) {
348  LLVM_DEBUG(dbgs() << " Promotion alloca to vector is disabled\n");
349  return false;
350  }
351 
352  ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());
353 
354  LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");
355 
356  // FIXME: There is no reason why we can't support larger arrays, we
357  // are just being conservative for now.
358  // FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or equivalent. Potentially these
359  // could also be promoted but we don't currently handle this case
360  if (!AllocaTy ||
361  AllocaTy->getNumElements() > 16 ||
362  AllocaTy->getNumElements() < 2 ||
364  LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
365  return false;
366  }
367 
368  std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
369  std::vector<Value*> WorkList;
370  for (User *AllocaUser : Alloca->users()) {
372  if (!GEP) {
373  if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
374  return false;
375 
376  WorkList.push_back(AllocaUser);
377  continue;
378  }
379 
380  Value *Index = GEPToVectorIndex(GEP);
381 
382  // If we can't compute a vector index from this GEP, then we can't
383  // promote this alloca to vector.
384  if (!Index) {
385  LLVM_DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP
386  << '\n');
387  return false;
388  }
389 
390  GEPVectorIdx[GEP] = Index;
391  for (User *GEPUser : AllocaUser->users()) {
392  if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
393  return false;
394 
395  WorkList.push_back(GEPUser);
396  }
397  }
398 
399  VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
400 
401  LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> "
402  << *VectorTy << '\n');
403 
404  for (Value *V : WorkList) {
405  Instruction *Inst = cast<Instruction>(V);
406  IRBuilder<> Builder(Inst);
407  switch (Inst->getOpcode()) {
408  case Instruction::Load: {
409  Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS);
410  Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
411  Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
412 
413  Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
414  Value *VecValue = Builder.CreateLoad(BitCast);
415  Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
416  Inst->replaceAllUsesWith(ExtractElement);
417  Inst->eraseFromParent();
418  break;
419  }
420  case Instruction::Store: {
421  Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS);
422 
423  StoreInst *SI = cast<StoreInst>(Inst);
424  Value *Ptr = SI->getPointerOperand();
425  Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
426  Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
427  Value *VecValue = Builder.CreateLoad(BitCast);
428  Value *NewVecValue = Builder.CreateInsertElement(VecValue,
429  SI->getValueOperand(),
430  Index);
431  Builder.CreateStore(NewVecValue, BitCast);
432  Inst->eraseFromParent();
433  break;
434  }
435  case Instruction::BitCast:
436  case Instruction::AddrSpaceCast:
437  break;
438 
439  default:
440  llvm_unreachable("Inconsistency in instructions promotable to vector");
441  }
442  }
443  return true;
444 }
445 
446 static bool isCallPromotable(CallInst *CI) {
448  if (!II)
449  return false;
450 
451  switch (II->getIntrinsicID()) {
452  case Intrinsic::memcpy:
453  case Intrinsic::memmove:
454  case Intrinsic::memset:
455  case Intrinsic::lifetime_start:
456  case Intrinsic::lifetime_end:
457  case Intrinsic::invariant_start:
458  case Intrinsic::invariant_end:
459  case Intrinsic::launder_invariant_group:
460  case Intrinsic::objectsize:
461  return true;
462  default:
463  return false;
464  }
465 }
466 
467 bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
468  Value *Val,
469  Instruction *Inst,
470  int OpIdx0,
471  int OpIdx1) const {
472  // Figure out which operand is the one we might not be promoting.
473  Value *OtherOp = Inst->getOperand(OpIdx0);
474  if (Val == OtherOp)
475  OtherOp = Inst->getOperand(OpIdx1);
476 
477  if (isa<ConstantPointerNull>(OtherOp))
478  return true;
479 
480  Value *OtherObj = GetUnderlyingObject(OtherOp, *DL);
481  if (!isa<AllocaInst>(OtherObj))
482  return false;
483 
484  // TODO: We should be able to replace undefs with the right pointer type.
485 
486  // TODO: If we know the other base object is another promotable
487  // alloca, not necessarily this alloca, we can do this. The
488  // important part is both must have the same address space at
489  // the end.
490  if (OtherObj != BaseAlloca) {
491  LLVM_DEBUG(
492  dbgs() << "Found a binary instruction with another alloca object\n");
493  return false;
494  }
495 
496  return true;
497 }
498 
499 bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
500  Value *BaseAlloca,
501  Value *Val,
502  std::vector<Value*> &WorkList) const {
503 
504  for (User *User : Val->users()) {
505  if (is_contained(WorkList, User))
506  continue;
507 
508  if (CallInst *CI = dyn_cast<CallInst>(User)) {
509  if (!isCallPromotable(CI))
510  return false;
511 
512  WorkList.push_back(User);
513  continue;
514  }
515 
516  Instruction *UseInst = cast<Instruction>(User);
517  if (UseInst->getOpcode() == Instruction::PtrToInt)
518  return false;
519 
520  if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
521  if (LI->isVolatile())
522  return false;
523 
524  continue;
525  }
526 
527  if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
528  if (SI->isVolatile())
529  return false;
530 
531  // Reject if the stored value is not the pointer operand.
532  if (SI->getPointerOperand() != Val)
533  return false;
534  } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
535  if (RMW->isVolatile())
536  return false;
537  } else if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
538  if (CAS->isVolatile())
539  return false;
540  }
541 
542  // Only promote a select if we know that the other select operand
543  // is from another pointer that will also be promoted.
544  if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
545  if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
546  return false;
547 
548  // May need to rewrite constant operands.
549  WorkList.push_back(ICmp);
550  }
551 
552  if (UseInst->getOpcode() == Instruction::AddrSpaceCast) {
553  // Give up if the pointer may be captured.
554  if (PointerMayBeCaptured(UseInst, true, true))
555  return false;
556  // Don't collect the users of this.
557  WorkList.push_back(User);
558  continue;
559  }
560 
561  if (!User->getType()->isPointerTy())
562  continue;
563 
564  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
565  // Be conservative if an address could be computed outside the bounds of
566  // the alloca.
567  if (!GEP->isInBounds())
568  return false;
569  }
570 
571  // Only promote a select if we know that the other select operand is from
572  // another pointer that will also be promoted.
573  if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
574  if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
575  return false;
576  }
577 
578  // Repeat for phis.
579  if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
580  // TODO: Handle more complex cases. We should be able to replace loops
581  // over arrays.
582  switch (Phi->getNumIncomingValues()) {
583  case 1:
584  break;
585  case 2:
586  if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
587  return false;
588  break;
589  default:
590  return false;
591  }
592  }
593 
594  WorkList.push_back(User);
595  if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
596  return false;
597  }
598 
599  return true;
600 }
601 
602 bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
603 
604  FunctionType *FTy = F.getFunctionType();
605  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
606 
607  // If the function has any arguments in the local address space, then it's
608  // possible these arguments require the entire local memory space, so
609  // we cannot use local memory in the pass.
610  for (Type *ParamTy : FTy->params()) {
611  PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
612  if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) {
613  LocalMemLimit = 0;
614  LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
615  "local memory disabled.\n");
616  return false;
617  }
618  }
619 
620  LocalMemLimit = ST.getLocalMemorySize();
621  if (LocalMemLimit == 0)
622  return false;
623 
624  const DataLayout &DL = Mod->getDataLayout();
625 
626  // Check how much local memory is being used by global objects
627  CurrentLocalMemUsage = 0;
628  for (GlobalVariable &GV : Mod->globals()) {
629  if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS)
630  continue;
631 
632  for (const User *U : GV.users()) {
633  const Instruction *Use = dyn_cast<Instruction>(U);
634  if (!Use)
635  continue;
636 
637  if (Use->getParent()->getParent() == &F) {
638  unsigned Align = GV.getAlignment();
639  if (Align == 0)
640  Align = DL.getABITypeAlignment(GV.getValueType());
641 
642  // FIXME: Try to account for padding here. The padding is currently
643  // determined from the inverse order of uses in the function. I'm not
644  // sure if the use list order is in any way connected to this, so the
645  // total reported size is likely incorrect.
646  uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
647  CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
648  CurrentLocalMemUsage += AllocSize;
649  break;
650  }
651  }
652  }
653 
654  unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
655  F);
656 
657  // Restrict local memory usage so that we don't drastically reduce occupancy,
658  // unless it is already significantly reduced.
659 
660  // TODO: Have some sort of hint or other heuristics to guess occupancy based
661  // on other factors..
662  unsigned OccupancyHint = ST.getWavesPerEU(F).second;
663  if (OccupancyHint == 0)
664  OccupancyHint = 7;
665 
666  // Clamp to max value.
667  OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
668 
669  // Check the hint but ignore it if it's obviously wrong from the existing LDS
670  // usage.
671  MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
672 
673 
674  // Round up to the next tier of usage.
675  unsigned MaxSizeWithWaveCount
676  = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
677 
678  // Program is possibly broken by using more local mem than available.
679  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
680  return false;
681 
682  LocalMemLimit = MaxSizeWithWaveCount;
683 
684  LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
685  << " bytes of LDS\n"
686  << " Rounding size to " << MaxSizeWithWaveCount
687  << " with a maximum occupancy of " << MaxOccupancy << '\n'
688  << " and " << (LocalMemLimit - CurrentLocalMemUsage)
689  << " available for promotion\n");
690 
691  return true;
692 }
693 
694 // FIXME: Should try to pick the most likely to be profitable allocas first.
695 bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
696  // Array allocations are probably not worth handling, since an allocation of
697  // the array type is the canonical form.
698  if (!I.isStaticAlloca() || I.isArrayAllocation())
699  return false;
700 
701  IRBuilder<> Builder(&I);
702 
703  // First try to replace the alloca with a vector
704  Type *AllocaTy = I.getAllocatedType();
705 
706  LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
707 
708  if (tryPromoteAllocaToVector(&I, AS))
709  return true; // Promoted to vector.
710 
711  const Function &ContainingFunction = *I.getParent()->getParent();
712  CallingConv::ID CC = ContainingFunction.getCallingConv();
713 
714  // Don't promote the alloca to LDS for shader calling conventions as the work
715  // item ID intrinsics are not supported for these calling conventions.
716  // Furthermore not all LDS is available for some of the stages.
717  switch (CC) {
720  break;
721  default:
722  LLVM_DEBUG(
723  dbgs()
724  << " promote alloca to LDS not supported with calling convention.\n");
725  return false;
726  }
727 
728  // Not likely to have sufficient local memory for promotion.
729  if (!SufficientLDS)
730  return false;
731 
732  const AMDGPUSubtarget &ST =
733  TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction);
734  unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
735 
736  const DataLayout &DL = Mod->getDataLayout();
737 
738  unsigned Align = I.getAlignment();
739  if (Align == 0)
740  Align = DL.getABITypeAlignment(I.getAllocatedType());
741 
742  // FIXME: This computed padding is likely wrong since it depends on inverse
743  // usage order.
744  //
745  // FIXME: It is also possible that if we're allowed to use all of the memory
746  // could could end up using more than the maximum due to alignment padding.
747 
748  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Align);
749  uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
750  NewSize += AllocSize;
751 
752  if (NewSize > LocalMemLimit) {
753  LLVM_DEBUG(dbgs() << " " << AllocSize
754  << " bytes of local memory not available to promote\n");
755  return false;
756  }
757 
758  CurrentLocalMemUsage = NewSize;
759 
760  std::vector<Value*> WorkList;
761 
762  if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
763  LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
764  return false;
765  }
766 
767  LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");
768 
769  Function *F = I.getParent()->getParent();
770 
771  Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
772  GlobalVariable *GV = new GlobalVariable(
773  *Mod, GVTy, false, GlobalValue::InternalLinkage,
774  UndefValue::get(GVTy),
775  Twine(F->getName()) + Twine('.') + I.getName(),
776  nullptr,
778  AS.LOCAL_ADDRESS);
780  GV->setAlignment(I.getAlignment());
781 
782  Value *TCntY, *TCntZ;
783 
784  std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
785  Value *TIdX = getWorkitemID(Builder, 0);
786  Value *TIdY = getWorkitemID(Builder, 1);
787  Value *TIdZ = getWorkitemID(Builder, 2);
788 
789  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
790  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
791  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
792  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
793  TID = Builder.CreateAdd(TID, TIdZ);
794 
795  Value *Indices[] = {
797  TID
798  };
799 
800  Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
801  I.mutateType(Offset->getType());
802  I.replaceAllUsesWith(Offset);
803  I.eraseFromParent();
804 
805  for (Value *V : WorkList) {
806  CallInst *Call = dyn_cast<CallInst>(V);
807  if (!Call) {
808  if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
809  Value *Src0 = CI->getOperand(0);
810  Type *EltTy = Src0->getType()->getPointerElementType();
811  PointerType *NewTy = PointerType::get(EltTy, AS.LOCAL_ADDRESS);
812 
813  if (isa<ConstantPointerNull>(CI->getOperand(0)))
814  CI->setOperand(0, ConstantPointerNull::get(NewTy));
815 
816  if (isa<ConstantPointerNull>(CI->getOperand(1)))
817  CI->setOperand(1, ConstantPointerNull::get(NewTy));
818 
819  continue;
820  }
821 
822  // The operand's value should be corrected on its own and we don't want to
823  // touch the users.
824  if (isa<AddrSpaceCastInst>(V))
825  continue;
826 
827  Type *EltTy = V->getType()->getPointerElementType();
828  PointerType *NewTy = PointerType::get(EltTy, AS.LOCAL_ADDRESS);
829 
830  // FIXME: It doesn't really make sense to try to do this for all
831  // instructions.
832  V->mutateType(NewTy);
833 
834  // Adjust the types of any constant operands.
835  if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
836  if (isa<ConstantPointerNull>(SI->getOperand(1)))
837  SI->setOperand(1, ConstantPointerNull::get(NewTy));
838 
839  if (isa<ConstantPointerNull>(SI->getOperand(2)))
840  SI->setOperand(2, ConstantPointerNull::get(NewTy));
841  } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
842  for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
843  if (isa<ConstantPointerNull>(Phi->getIncomingValue(I)))
844  Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy));
845  }
846  }
847 
848  continue;
849  }
850 
851  IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
852  Builder.SetInsertPoint(Intr);
853  switch (Intr->getIntrinsicID()) {
854  case Intrinsic::lifetime_start:
855  case Intrinsic::lifetime_end:
856  // These intrinsics are for address space 0 only
857  Intr->eraseFromParent();
858  continue;
859  case Intrinsic::memcpy: {
860  MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
861  Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getDestAlignment(),
862  MemCpy->getRawSource(), MemCpy->getSourceAlignment(),
863  MemCpy->getLength(), MemCpy->isVolatile());
864  Intr->eraseFromParent();
865  continue;
866  }
867  case Intrinsic::memmove: {
868  MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
869  Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getDestAlignment(),
870  MemMove->getRawSource(), MemMove->getSourceAlignment(),
871  MemMove->getLength(), MemMove->isVolatile());
872  Intr->eraseFromParent();
873  continue;
874  }
875  case Intrinsic::memset: {
876  MemSetInst *MemSet = cast<MemSetInst>(Intr);
877  Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
878  MemSet->getLength(), MemSet->getDestAlignment(),
879  MemSet->isVolatile());
880  Intr->eraseFromParent();
881  continue;
882  }
883  case Intrinsic::invariant_start:
884  case Intrinsic::invariant_end:
885  case Intrinsic::launder_invariant_group:
886  Intr->eraseFromParent();
887  // FIXME: I think the invariant marker should still theoretically apply,
888  // but the intrinsics need to be changed to accept pointers with any
889  // address space.
890  continue;
891  case Intrinsic::objectsize: {
892  Value *Src = Intr->getOperand(0);
893  Type *SrcTy = Src->getType()->getPointerElementType();
894  Function *ObjectSize = Intrinsic::getDeclaration(Mod,
895  Intrinsic::objectsize,
896  { Intr->getType(), PointerType::get(SrcTy, AS.LOCAL_ADDRESS) }
897  );
898 
899  CallInst *NewCall = Builder.CreateCall(
900  ObjectSize, {Src, Intr->getOperand(1), Intr->getOperand(2)});
901  Intr->replaceAllUsesWith(NewCall);
902  Intr->eraseFromParent();
903  continue;
904  }
905  default:
906  Intr->print(errs());
907  llvm_unreachable("Don't know how to promote alloca intrinsic use.");
908  }
909  }
910  return true;
911 }
912 
914  return new AMDGPUPromoteAlloca();
915 }
bool makeLIDRangeMetadata(Instruction *I) const
Creates value range metadata on an workitemid.* inrinsic call or load.
Value * CreateInBoundsGEP(Value *Ptr, ArrayRef< Value *> IdxList, const Twine &Name="")
Definition: IRBuilder.h:1393
Value * getValueOperand()
Definition: Instructions.h:395
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks &#39;this&#39; from the containing basic block and deletes it.
Definition: Instruction.cpp:68
A parsed version of the target data layout string in and methods for querying it. ...
Definition: DataLayout.h:111
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
bool isSimple() const
Definition: Instructions.h:262
raw_ostream & errs()
This returns a reference to a raw_ostream for standard error.
Value * getPointerOperand(Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
AMDGPU specific subclass of TargetSubtarget.
void addAttribute(unsigned i, Attribute::AttrKind Kind)
adds the attribute to the list of attributes.
Compute iterated dominance frontiers using a linear time algorithm.
Definition: AllocatorList.h:24
A Module instance is used to store all the information related to an LLVM module. ...
Definition: Module.h:63
unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const
Inverse of getMaxLocalMemWithWaveCount.
an instruction that atomically checks whether a specified value is in a memory location, and, if it is, stores a new value there.
Definition: Instructions.h:514
OSType getOS() const
getOS - Get the parsed operating system type of this triple.
Definition: Triple.h:292
bool isPromoteAllocaEnabled() const
This class represents a function call, abstracting a target machine&#39;s calling convention.
This file contains the declarations for metadata subclasses.
AMDGPUAS getAMDGPUAS(const Module &M)
unsigned getSourceAlignment() const
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space...
Definition: Type.cpp:617
The two locations do not alias at all.
Definition: AliasAnalysis.h:85
Value * getValue() const
Calling convention for AMDGPU code object kernels.
Definition: CallingConv.h:201
This class wraps the llvm.memset intrinsic.
FunctionPass * createAMDGPUPromoteAlloca()
Metadata node.
Definition: Metadata.h:862
F(f)
uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the next integer (mod 2**64) that is greater than or equal to Value and is a multiple of Alig...
Definition: MathExtras.h:677
CallInst * CreateMemSet(Value *Ptr, Value *Val, uint64_t Size, unsigned Align, bool isVolatile=false, MDNode *TBAATag=nullptr, MDNode *ScopeTag=nullptr, MDNode *NoAliasTag=nullptr)
Create and insert a memset to the specified pointer and the specified value.
Definition: IRBuilder.h:404
An instruction for reading from memory.
Definition: Instructions.h:164
an instruction that atomically reads a memory location, combines it with another value, and then stores the result back.
Definition: Instructions.h:677
Hexagon Common GEP
Value * getLength() const
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:258
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:264
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static Value * calculateVectorIndex(Value *Ptr, const std::map< GetElementPtrInst *, Value *> &GEPIdx)
static bool canVectorizeInst(Instruction *Inst, User *User)
This class represents the LLVM 'select' instruction.
Type * getPointerElementType() const
Definition: Type.h:373
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
OpenCL uses address spaces to differentiate between various memory regions on the hardware...
Definition: AMDGPU.h:220
unsigned getAlignment() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:109
This class wraps the llvm.memmove intrinsic.
int getLocalMemorySize() const
static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS)
A Use represents the edge between a Value definition and its users.
Definition: Use.h:56
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
Definition: Type.cpp:639
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:731
This file contains the simple types necessary to represent the attributes associated with functions a...
unsigned Intr
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:962
CallInst * CreateMemMove(Value *Dst, unsigned DstAlign, Value *Src, unsigned SrcAlign, uint64_t Size, bool isVolatile=false, MDNode *TBAATag=nullptr, MDNode *ScopeTag=nullptr, MDNode *NoAliasTag=nullptr)
Create and insert a memmove between the specified pointers.
Definition: IRBuilder.h:494
unsigned getDestAlignment() const
uint64_t getNumElements() const
Definition: DerivedTypes.h:359
This file implements a class to represent arbitrary precision integral constant values and operations...
Class to represent function types.
Definition: DerivedTypes.h:103
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:1629
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:92
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:245
ArchType getArch() const
getArch - Get the parsed architecture type of this triple.
Definition: Triple.h:283
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:121
Class to represent array types.
Definition: DerivedTypes.h:369
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
Definition: Type.cpp:608
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:126
An instruction for storing to memory.
Definition: Instructions.h:306
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:439
iterator begin()
Definition: Function.h:642
Function * getDeclaration(Module *M, ID id, ArrayRef< Type *> Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1001
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block...
Definition: IRBuilder.h:127
Value * getOperand(unsigned i) const
Definition: User.h:170
Class to represent pointers.
Definition: DerivedTypes.h:467
Value * getOperand(unsigned i_nocapture) const
An instruction for type-safe pointer arithmetic to access elements of arrays and structs.
Definition: Instructions.h:837
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata *> MDs)
Definition: Metadata.h:1164
static bool runOnFunction(Function &F, bool PostInlining)
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:410
static ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
Definition: Constants.cpp:1368
static Value * GEPToVectorIndex(GetElementPtrInst *GEP)
LLVM Basic Block Representation.
Definition: BasicBlock.h:59
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:46
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:221
ArrayRef< Type * > params() const
Definition: DerivedTypes.h:130
Represent the analysis usage information of a pass.
This instruction compares its operands according to the predicate given to the constructor.
void print(raw_ostream &O, bool IsForDebug=false) const
Implement operator<< on Value.
Definition: AsmWriter.cpp:4035
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:285
Value * getPointerOperand()
Definition: Instructions.h:270
unsigned getAddressSpace() const
Return the address space of the Pointer type.
Definition: DerivedTypes.h:495
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1382
const AMDGPUAS & AS
bool PointerMayBeCaptured(const Value *V, bool ReturnCaptures, bool StoreCaptures)
PointerMayBeCaptured - Return true if this pointer value may be captured by the enclosing function (w...
bool isVolatile() const
Value * GetUnderlyingObject(Value *V, const DataLayout &DL, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value...
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1226
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:996
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition: Instructions.h:102
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:34
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:51
unsigned getNumOperands() const
Definition: User.h:192
#define DEBUG_TYPE
This is the shared class of boolean and integer constants.
Definition: Constants.h:84
static bool isCallPromotable(CallInst *CI)
iterator end()
Definition: BasicBlock.h:266
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:199
Module.h This file contains the declarations for the Module class.
unsigned getABITypeAlignment(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:722
CallInst * CreateMemCpy(Value *Dst, unsigned DstAlign, Value *Src, unsigned SrcAlign, uint64_t Size, bool isVolatile=false, MDNode *TBAATag=nullptr, MDNode *TBAAStructTag=nullptr, MDNode *ScopeTag=nullptr, MDNode *NoAliasTag=nullptr)
Create and insert a memcpy between the specified pointers.
Definition: IRBuilder.h:446
This class wraps the llvm.memcpy intrinsic.
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:286
Value * getRawSource() const
Return the arguments to the instruction.
Value * CreateConstInBoundsGEP1_64(Value *Ptr, uint64_t Idx0, const Twine &Name="")
Definition: IRBuilder.h:1491
SPIR_KERNEL - Calling convention for SPIR kernel functions.
Definition: CallingConv.h:137
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:133
The access may modify the value stored in memory.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:150
Class to represent vector types.
Definition: DerivedTypes.h:393
iterator_range< user_iterator > users()
Definition: Value.h:399
static VectorType * arrayTypeToVecType(ArrayType *ArrayTy)
unsigned getMaxWavesPerEU() const
uint64_t getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:428
void setUnnamedAddr(UnnamedAddr Val)
Definition: GlobalValue.h:215
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:176
StringRef getName() const
Return a constant reference to the value&#39;s name.
Definition: Value.cpp:224
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:108
#define I(x, y, z)
Definition: MD5.cpp:58
#define N
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:193
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
Definition: Type.cpp:568
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:323
Rename collisions when linking (static functions).
Definition: GlobalValue.h:56
void mutateType(Type *Ty)
Mutate the type of this Value to be of the specified type.
Definition: Value.h:603
bool isArrayAllocation() const
Return true if there is an allocation size parameter to the allocation instruction that is not 1...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
void addDereferenceableAttr(unsigned i, uint64_t Bytes)
adds the dereferenceable attribute to the list of attributes.
LoadInst * CreateAlignedLoad(Value *Ptr, unsigned Align, const char *Name)
Provided to resolve 'CreateAlignedLoad(Ptr, Align, "...")' correctly, instead of converting the strin...
Definition: IRBuilder.h:1328
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:565
LLVM Value Representation.
Definition: Value.h:73
static VectorType * get(Type *ElementType, unsigned NumElements)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:593
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1072
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:59
Type * getElementType() const
Definition: DerivedTypes.h:360
char & AMDGPUPromoteAllocaID
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:49
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const
Return the amount of LDS that can be used that will not restrict the occupancy lower than WaveCount...
bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size...
bool isSimple() const
Definition: Instructions.h:387
Address space for local memory.
Definition: AMDGPU.h:232
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
#define LLVM_DEBUG(X)
Definition: Debug.h:119
Address space for constant memory (VTX2)
Definition: AMDGPU.h:231
Value * getPointerOperand()
Definition: Instructions.h:398
Value * getRawDest() const
unsigned PRIVATE_ADDRESS
Address space for private memory.
Definition: AMDGPU.h:222
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:44
const BasicBlock * getParent() const
Definition: Instruction.h:67
an instruction to allocate memory on the stack
Definition: Instructions.h:60
CallInst * CreateCall(Value *Callee, ArrayRef< Value *> Args=None, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1871
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.
Definition: STLExtras.h:967