LLVM  7.0.0svn
AMDGPUPromoteAlloca.cpp
Go to the documentation of this file.
1 //===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass eliminates allocas by either converting them into vectors or
11 // by migrating them to local address space.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPU.h"
16 #include "AMDGPUSubtarget.h"
17 #include "Utils/AMDGPUBaseInfo.h"
18 #include "llvm/ADT/APInt.h"
19 #include "llvm/ADT/None.h"
20 #include "llvm/ADT/STLExtras.h"
21 #include "llvm/ADT/StringRef.h"
22 #include "llvm/ADT/Triple.h"
23 #include "llvm/ADT/Twine.h"
27 #include "llvm/IR/Attributes.h"
28 #include "llvm/IR/BasicBlock.h"
29 #include "llvm/IR/Constant.h"
30 #include "llvm/IR/Constants.h"
31 #include "llvm/IR/DataLayout.h"
32 #include "llvm/IR/DerivedTypes.h"
33 #include "llvm/IR/Function.h"
34 #include "llvm/IR/GlobalValue.h"
35 #include "llvm/IR/GlobalVariable.h"
36 #include "llvm/IR/IRBuilder.h"
37 #include "llvm/IR/Instruction.h"
38 #include "llvm/IR/Instructions.h"
39 #include "llvm/IR/IntrinsicInst.h"
40 #include "llvm/IR/Intrinsics.h"
41 #include "llvm/IR/LLVMContext.h"
42 #include "llvm/IR/Metadata.h"
43 #include "llvm/IR/Module.h"
44 #include "llvm/IR/Type.h"
45 #include "llvm/IR/User.h"
46 #include "llvm/IR/Value.h"
47 #include "llvm/Pass.h"
48 #include "llvm/Support/Casting.h"
49 #include "llvm/Support/Debug.h"
54 #include <algorithm>
55 #include <cassert>
56 #include <cstdint>
57 #include <map>
58 #include <tuple>
59 #include <utility>
60 #include <vector>
61 
62 #define DEBUG_TYPE "amdgpu-promote-alloca"
63 
64 using namespace llvm;
65 
66 namespace {
67 
68 static cl::opt<bool> DisablePromoteAllocaToVector(
69  "disable-promote-alloca-to-vector",
70  cl::desc("Disable promote alloca to vector"),
71  cl::init(false));
72 
73 // FIXME: This can create globals so should be a module pass.
74 class AMDGPUPromoteAlloca : public FunctionPass {
75 private:
76  const TargetMachine *TM;
77  Module *Mod = nullptr;
78  const DataLayout *DL = nullptr;
79  AMDGPUAS AS;
80 
81  // FIXME: This should be per-kernel.
82  uint32_t LocalMemLimit = 0;
83  uint32_t CurrentLocalMemUsage = 0;
84 
85  bool IsAMDGCN = false;
86  bool IsAMDHSA = false;
87 
88  std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
89  Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
90 
91  /// BaseAlloca is the alloca root the search started from.
92  /// Val may be that alloca or a recursive user of it.
93  bool collectUsesWithPtrTypes(Value *BaseAlloca,
94  Value *Val,
95  std::vector<Value*> &WorkList) const;
96 
97  /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
98  /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
99  /// Returns true if both operands are derived from the same alloca. Val should
100  /// be the same value as one of the input operands of UseInst.
101  bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
102  Instruction *UseInst,
103  int OpIdx0, int OpIdx1) const;
104 
105  /// Check whether we have enough local memory for promotion.
106  bool hasSufficientLocalMem(const Function &F);
107 
108 public:
109  static char ID;
110 
111  AMDGPUPromoteAlloca() : FunctionPass(ID) {}
112 
113  bool doInitialization(Module &M) override;
114  bool runOnFunction(Function &F) override;
115 
116  StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
117 
118  bool handleAlloca(AllocaInst &I, bool SufficientLDS);
119 
120  void getAnalysisUsage(AnalysisUsage &AU) const override {
121  AU.setPreservesCFG();
123  }
124 };
125 
126 } // end anonymous namespace
127 
128 char AMDGPUPromoteAlloca::ID = 0;
129 
130 INITIALIZE_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
131  "AMDGPU promote alloca to vector or LDS", false, false)
132 
134 
135 bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
136  Mod = &M;
137  DL = &Mod->getDataLayout();
138 
139  return false;
140 }
141 
143  if (skipFunction(F))
144  return false;
145 
146  if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
147  TM = &TPC->getTM<TargetMachine>();
148  else
149  return false;
150 
151  const Triple &TT = TM->getTargetTriple();
152  IsAMDGCN = TT.getArch() == Triple::amdgcn;
153  IsAMDHSA = TT.getOS() == Triple::AMDHSA;
154 
155  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
156  if (!ST.isPromoteAllocaEnabled())
157  return false;
158 
160 
161  bool SufficientLDS = hasSufficientLocalMem(F);
162  bool Changed = false;
163  BasicBlock &EntryBB = *F.begin();
164  for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) {
166 
167  ++I;
168  if (AI)
169  Changed |= handleAlloca(*AI, SufficientLDS);
170  }
171 
172  return Changed;
173 }
174 
175 std::pair<Value *, Value *>
176 AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
177  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(
178  *Builder.GetInsertBlock()->getParent());
179 
180  if (!IsAMDHSA) {
181  Function *LocalSizeYFn
182  = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
183  Function *LocalSizeZFn
184  = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);
185 
186  CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
187  CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
188 
189  ST.makeLIDRangeMetadata(LocalSizeY);
190  ST.makeLIDRangeMetadata(LocalSizeZ);
191 
192  return std::make_pair(LocalSizeY, LocalSizeZ);
193  }
194 
195  // We must read the size out of the dispatch pointer.
196  assert(IsAMDGCN);
197 
198  // We are indexing into this struct, and want to extract the workgroup_size_*
199  // fields.
200  //
201  // typedef struct hsa_kernel_dispatch_packet_s {
202  // uint16_t header;
203  // uint16_t setup;
204  // uint16_t workgroup_size_x ;
205  // uint16_t workgroup_size_y;
206  // uint16_t workgroup_size_z;
207  // uint16_t reserved0;
208  // uint32_t grid_size_x ;
209  // uint32_t grid_size_y ;
210  // uint32_t grid_size_z;
211  //
212  // uint32_t private_segment_size;
213  // uint32_t group_segment_size;
214  // uint64_t kernel_object;
215  //
216  // #ifdef HSA_LARGE_MODEL
217  // void *kernarg_address;
218  // #elif defined HSA_LITTLE_ENDIAN
219  // void *kernarg_address;
220  // uint32_t reserved1;
221  // #else
222  // uint32_t reserved1;
223  // void *kernarg_address;
224  // #endif
225  // uint64_t reserved2;
226  // hsa_signal_t completion_signal; // uint64_t wrapper
227  // } hsa_kernel_dispatch_packet_t
228  //
229  Function *DispatchPtrFn
230  = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
231 
232  CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
234  DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
235 
236  // Size of the dispatch packet struct.
238 
239  Type *I32Ty = Type::getInt32Ty(Mod->getContext());
240  Value *CastDispatchPtr = Builder.CreateBitCast(
241  DispatchPtr, PointerType::get(I32Ty, AS.CONSTANT_ADDRESS));
242 
243  // We could do a single 64-bit load here, but it's likely that the basic
244  // 32-bit and extract sequence is already present, and it is probably easier
245  // to CSE this. The loads should be mergable later anyway.
246  Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 1);
247  LoadInst *LoadXY = Builder.CreateAlignedLoad(GEPXY, 4);
248 
249  Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2);
250  LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4);
251 
252  MDNode *MD = MDNode::get(Mod->getContext(), None);
255  ST.makeLIDRangeMetadata(LoadZU);
256 
257  // Extract y component. Upper half of LoadZU should be zero already.
258  Value *Y = Builder.CreateLShr(LoadXY, 16);
259 
260  return std::make_pair(Y, LoadZU);
261 }
262 
263 Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
264  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(
265  *Builder.GetInsertBlock()->getParent());
267 
268  switch (N) {
269  case 0:
270  IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_x
271  : Intrinsic::r600_read_tidig_x;
272  break;
273  case 1:
274  IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_y
275  : Intrinsic::r600_read_tidig_y;
276  break;
277 
278  case 2:
279  IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_z
280  : Intrinsic::r600_read_tidig_z;
281  break;
282  default:
283  llvm_unreachable("invalid dimension");
284  }
285 
286  Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
287  CallInst *CI = Builder.CreateCall(WorkitemIdFn);
288  ST.makeLIDRangeMetadata(CI);
289 
290  return CI;
291 }
292 
294  return VectorType::get(ArrayTy->getElementType(),
295  ArrayTy->getNumElements());
296 }
297 
298 static Value *
300  const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
301  GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
302 
303  auto I = GEPIdx.find(GEP);
304  return I == GEPIdx.end() ? nullptr : I->second;
305 }
306 
308  // FIXME we only support simple cases
309  if (GEP->getNumOperands() != 3)
310  return nullptr;
311 
312  ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
313  if (!I0 || !I0->isZero())
314  return nullptr;
315 
316  return GEP->getOperand(2);
317 }
318 
319 // Not an instruction handled below to turn into a vector.
320 //
321 // TODO: Check isTriviallyVectorizable for calls and handle other
322 // instructions.
323 static bool canVectorizeInst(Instruction *Inst, User *User) {
324  switch (Inst->getOpcode()) {
325  case Instruction::Load: {
326  LoadInst *LI = cast<LoadInst>(Inst);
327  // Currently only handle the case where the Pointer Operand is a GEP so check for that case.
328  return isa<GetElementPtrInst>(LI->getPointerOperand()) && !LI->isVolatile();
329  }
330  case Instruction::BitCast:
331  case Instruction::AddrSpaceCast:
332  return true;
333  case Instruction::Store: {
334  // Must be the stored pointer operand, not a stored value, plus
335  // since it should be canonical form, the User should be a GEP.
336  StoreInst *SI = cast<StoreInst>(Inst);
337  return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && !SI->isVolatile();
338  }
339  default:
340  return false;
341  }
342 }
343 
345 
346  if (DisablePromoteAllocaToVector) {
347  DEBUG(dbgs() << " Promotion alloca to vector is disabled\n");
348  return false;
349  }
350 
351  ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());
352 
353  DEBUG(dbgs() << "Alloca candidate for vectorization\n");
354 
355  // FIXME: There is no reason why we can't support larger arrays, we
356  // are just being conservative for now.
357  // FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or equivalent. Potentially these
358  // could also be promoted but we don't currently handle this case
359  if (!AllocaTy ||
360  AllocaTy->getNumElements() > 16 ||
361  AllocaTy->getNumElements() < 2 ||
363  DEBUG(dbgs() << " Cannot convert type to vector\n");
364  return false;
365  }
366 
367  std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
368  std::vector<Value*> WorkList;
369  for (User *AllocaUser : Alloca->users()) {
371  if (!GEP) {
372  if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
373  return false;
374 
375  WorkList.push_back(AllocaUser);
376  continue;
377  }
378 
379  Value *Index = GEPToVectorIndex(GEP);
380 
381  // If we can't compute a vector index from this GEP, then we can't
382  // promote this alloca to vector.
383  if (!Index) {
384  DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << '\n');
385  return false;
386  }
387 
388  GEPVectorIdx[GEP] = Index;
389  for (User *GEPUser : AllocaUser->users()) {
390  if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
391  return false;
392 
393  WorkList.push_back(GEPUser);
394  }
395  }
396 
397  VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
398 
399  DEBUG(dbgs() << " Converting alloca to vector "
400  << *AllocaTy << " -> " << *VectorTy << '\n');
401 
402  for (Value *V : WorkList) {
403  Instruction *Inst = cast<Instruction>(V);
404  IRBuilder<> Builder(Inst);
405  switch (Inst->getOpcode()) {
406  case Instruction::Load: {
407  Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS);
408  Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
409  Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
410 
411  Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
412  Value *VecValue = Builder.CreateLoad(BitCast);
413  Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
414  Inst->replaceAllUsesWith(ExtractElement);
415  Inst->eraseFromParent();
416  break;
417  }
418  case Instruction::Store: {
419  Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS);
420 
421  StoreInst *SI = cast<StoreInst>(Inst);
422  Value *Ptr = SI->getPointerOperand();
423  Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
424  Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
425  Value *VecValue = Builder.CreateLoad(BitCast);
426  Value *NewVecValue = Builder.CreateInsertElement(VecValue,
427  SI->getValueOperand(),
428  Index);
429  Builder.CreateStore(NewVecValue, BitCast);
430  Inst->eraseFromParent();
431  break;
432  }
433  case Instruction::BitCast:
434  case Instruction::AddrSpaceCast:
435  break;
436 
437  default:
438  llvm_unreachable("Inconsistency in instructions promotable to vector");
439  }
440  }
441  return true;
442 }
443 
444 static bool isCallPromotable(CallInst *CI) {
446  if (!II)
447  return false;
448 
449  switch (II->getIntrinsicID()) {
450  case Intrinsic::memcpy:
451  case Intrinsic::memmove:
452  case Intrinsic::memset:
453  case Intrinsic::lifetime_start:
454  case Intrinsic::lifetime_end:
455  case Intrinsic::invariant_start:
456  case Intrinsic::invariant_end:
457  case Intrinsic::invariant_group_barrier:
458  case Intrinsic::objectsize:
459  return true;
460  default:
461  return false;
462  }
463 }
464 
465 bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
466  Value *Val,
467  Instruction *Inst,
468  int OpIdx0,
469  int OpIdx1) const {
470  // Figure out which operand is the one we might not be promoting.
471  Value *OtherOp = Inst->getOperand(OpIdx0);
472  if (Val == OtherOp)
473  OtherOp = Inst->getOperand(OpIdx1);
474 
475  if (isa<ConstantPointerNull>(OtherOp))
476  return true;
477 
478  Value *OtherObj = GetUnderlyingObject(OtherOp, *DL);
479  if (!isa<AllocaInst>(OtherObj))
480  return false;
481 
482  // TODO: We should be able to replace undefs with the right pointer type.
483 
484  // TODO: If we know the other base object is another promotable
485  // alloca, not necessarily this alloca, we can do this. The
486  // important part is both must have the same address space at
487  // the end.
488  if (OtherObj != BaseAlloca) {
489  DEBUG(dbgs() << "Found a binary instruction with another alloca object\n");
490  return false;
491  }
492 
493  return true;
494 }
495 
496 bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
497  Value *BaseAlloca,
498  Value *Val,
499  std::vector<Value*> &WorkList) const {
500 
501  for (User *User : Val->users()) {
502  if (is_contained(WorkList, User))
503  continue;
504 
505  if (CallInst *CI = dyn_cast<CallInst>(User)) {
506  if (!isCallPromotable(CI))
507  return false;
508 
509  WorkList.push_back(User);
510  continue;
511  }
512 
513  Instruction *UseInst = cast<Instruction>(User);
514  if (UseInst->getOpcode() == Instruction::PtrToInt)
515  return false;
516 
517  if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
518  if (LI->isVolatile())
519  return false;
520 
521  continue;
522  }
523 
524  if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
525  if (SI->isVolatile())
526  return false;
527 
528  // Reject if the stored value is not the pointer operand.
529  if (SI->getPointerOperand() != Val)
530  return false;
531  } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
532  if (RMW->isVolatile())
533  return false;
534  } else if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
535  if (CAS->isVolatile())
536  return false;
537  }
538 
539  // Only promote a select if we know that the other select operand
540  // is from another pointer that will also be promoted.
541  if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
542  if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
543  return false;
544 
545  // May need to rewrite constant operands.
546  WorkList.push_back(ICmp);
547  }
548 
549  if (UseInst->getOpcode() == Instruction::AddrSpaceCast) {
550  // Give up if the pointer may be captured.
551  if (PointerMayBeCaptured(UseInst, true, true))
552  return false;
553  // Don't collect the users of this.
554  WorkList.push_back(User);
555  continue;
556  }
557 
558  if (!User->getType()->isPointerTy())
559  continue;
560 
561  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
562  // Be conservative if an address could be computed outside the bounds of
563  // the alloca.
564  if (!GEP->isInBounds())
565  return false;
566  }
567 
568  // Only promote a select if we know that the other select operand is from
569  // another pointer that will also be promoted.
570  if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
571  if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
572  return false;
573  }
574 
575  // Repeat for phis.
576  if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
577  // TODO: Handle more complex cases. We should be able to replace loops
578  // over arrays.
579  switch (Phi->getNumIncomingValues()) {
580  case 1:
581  break;
582  case 2:
583  if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
584  return false;
585  break;
586  default:
587  return false;
588  }
589  }
590 
591  WorkList.push_back(User);
592  if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
593  return false;
594  }
595 
596  return true;
597 }
598 
599 bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
600 
601  FunctionType *FTy = F.getFunctionType();
602  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
603 
604  // If the function has any arguments in the local address space, then it's
605  // possible these arguments require the entire local memory space, so
606  // we cannot use local memory in the pass.
607  for (Type *ParamTy : FTy->params()) {
608  PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
609  if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) {
610  LocalMemLimit = 0;
611  DEBUG(dbgs() << "Function has local memory argument. Promoting to "
612  "local memory disabled.\n");
613  return false;
614  }
615  }
616 
617  LocalMemLimit = ST.getLocalMemorySize();
618  if (LocalMemLimit == 0)
619  return false;
620 
621  const DataLayout &DL = Mod->getDataLayout();
622 
623  // Check how much local memory is being used by global objects
624  CurrentLocalMemUsage = 0;
625  for (GlobalVariable &GV : Mod->globals()) {
626  if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS)
627  continue;
628 
629  for (const User *U : GV.users()) {
630  const Instruction *Use = dyn_cast<Instruction>(U);
631  if (!Use)
632  continue;
633 
634  if (Use->getParent()->getParent() == &F) {
635  unsigned Align = GV.getAlignment();
636  if (Align == 0)
637  Align = DL.getABITypeAlignment(GV.getValueType());
638 
639  // FIXME: Try to account for padding here. The padding is currently
640  // determined from the inverse order of uses in the function. I'm not
641  // sure if the use list order is in any way connected to this, so the
642  // total reported size is likely incorrect.
643  uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
644  CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
645  CurrentLocalMemUsage += AllocSize;
646  break;
647  }
648  }
649  }
650 
651  unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
652  F);
653 
654  // Restrict local memory usage so that we don't drastically reduce occupancy,
655  // unless it is already significantly reduced.
656 
657  // TODO: Have some sort of hint or other heuristics to guess occupancy based
658  // on other factors..
659  unsigned OccupancyHint = ST.getWavesPerEU(F).second;
660  if (OccupancyHint == 0)
661  OccupancyHint = 7;
662 
663  // Clamp to max value.
664  OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
665 
666  // Check the hint but ignore it if it's obviously wrong from the existing LDS
667  // usage.
668  MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
669 
670 
671  // Round up to the next tier of usage.
672  unsigned MaxSizeWithWaveCount
673  = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
674 
675  // Program is possibly broken by using more local mem than available.
676  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
677  return false;
678 
679  LocalMemLimit = MaxSizeWithWaveCount;
680 
681  DEBUG(
682  dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
683  << " Rounding size to " << MaxSizeWithWaveCount
684  << " with a maximum occupancy of " << MaxOccupancy << '\n'
685  << " and " << (LocalMemLimit - CurrentLocalMemUsage)
686  << " available for promotion\n"
687  );
688 
689  return true;
690 }
691 
692 // FIXME: Should try to pick the most likely to be profitable allocas first.
693 bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
694  // Array allocations are probably not worth handling, since an allocation of
695  // the array type is the canonical form.
696  if (!I.isStaticAlloca() || I.isArrayAllocation())
697  return false;
698 
699  IRBuilder<> Builder(&I);
700 
701  // First try to replace the alloca with a vector
702  Type *AllocaTy = I.getAllocatedType();
703 
704  DEBUG(dbgs() << "Trying to promote " << I << '\n');
705 
706  if (tryPromoteAllocaToVector(&I, AS))
707  return true; // Promoted to vector.
708 
709  const Function &ContainingFunction = *I.getParent()->getParent();
710  CallingConv::ID CC = ContainingFunction.getCallingConv();
711 
712  // Don't promote the alloca to LDS for shader calling conventions as the work
713  // item ID intrinsics are not supported for these calling conventions.
714  // Furthermore not all LDS is available for some of the stages.
715  switch (CC) {
718  break;
719  default:
720  DEBUG(dbgs() << " promote alloca to LDS not supported with calling convention.\n");
721  return false;
722  }
723 
724  // Not likely to have sufficient local memory for promotion.
725  if (!SufficientLDS)
726  return false;
727 
728  const AMDGPUSubtarget &ST =
729  TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction);
730  unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
731 
732  const DataLayout &DL = Mod->getDataLayout();
733 
734  unsigned Align = I.getAlignment();
735  if (Align == 0)
736  Align = DL.getABITypeAlignment(I.getAllocatedType());
737 
738  // FIXME: This computed padding is likely wrong since it depends on inverse
739  // usage order.
740  //
741  // FIXME: It is also possible that if we're allowed to use all of the memory
742  // could could end up using more than the maximum due to alignment padding.
743 
744  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Align);
745  uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
746  NewSize += AllocSize;
747 
748  if (NewSize > LocalMemLimit) {
749  DEBUG(dbgs() << " " << AllocSize
750  << " bytes of local memory not available to promote\n");
751  return false;
752  }
753 
754  CurrentLocalMemUsage = NewSize;
755 
756  std::vector<Value*> WorkList;
757 
758  if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
759  DEBUG(dbgs() << " Do not know how to convert all uses\n");
760  return false;
761  }
762 
763  DEBUG(dbgs() << "Promoting alloca to local memory\n");
764 
765  Function *F = I.getParent()->getParent();
766 
767  Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
768  GlobalVariable *GV = new GlobalVariable(
769  *Mod, GVTy, false, GlobalValue::InternalLinkage,
770  UndefValue::get(GVTy),
771  Twine(F->getName()) + Twine('.') + I.getName(),
772  nullptr,
774  AS.LOCAL_ADDRESS);
776  GV->setAlignment(I.getAlignment());
777 
778  Value *TCntY, *TCntZ;
779 
780  std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
781  Value *TIdX = getWorkitemID(Builder, 0);
782  Value *TIdY = getWorkitemID(Builder, 1);
783  Value *TIdZ = getWorkitemID(Builder, 2);
784 
785  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
786  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
787  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
788  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
789  TID = Builder.CreateAdd(TID, TIdZ);
790 
791  Value *Indices[] = {
793  TID
794  };
795 
796  Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
797  I.mutateType(Offset->getType());
798  I.replaceAllUsesWith(Offset);
799  I.eraseFromParent();
800 
801  for (Value *V : WorkList) {
802  CallInst *Call = dyn_cast<CallInst>(V);
803  if (!Call) {
804  if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
805  Value *Src0 = CI->getOperand(0);
806  Type *EltTy = Src0->getType()->getPointerElementType();
807  PointerType *NewTy = PointerType::get(EltTy, AS.LOCAL_ADDRESS);
808 
809  if (isa<ConstantPointerNull>(CI->getOperand(0)))
810  CI->setOperand(0, ConstantPointerNull::get(NewTy));
811 
812  if (isa<ConstantPointerNull>(CI->getOperand(1)))
813  CI->setOperand(1, ConstantPointerNull::get(NewTy));
814 
815  continue;
816  }
817 
818  // The operand's value should be corrected on its own and we don't want to
819  // touch the users.
820  if (isa<AddrSpaceCastInst>(V))
821  continue;
822 
823  Type *EltTy = V->getType()->getPointerElementType();
824  PointerType *NewTy = PointerType::get(EltTy, AS.LOCAL_ADDRESS);
825 
826  // FIXME: It doesn't really make sense to try to do this for all
827  // instructions.
828  V->mutateType(NewTy);
829 
830  // Adjust the types of any constant operands.
831  if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
832  if (isa<ConstantPointerNull>(SI->getOperand(1)))
833  SI->setOperand(1, ConstantPointerNull::get(NewTy));
834 
835  if (isa<ConstantPointerNull>(SI->getOperand(2)))
836  SI->setOperand(2, ConstantPointerNull::get(NewTy));
837  } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
838  for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
839  if (isa<ConstantPointerNull>(Phi->getIncomingValue(I)))
840  Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy));
841  }
842  }
843 
844  continue;
845  }
846 
847  IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
848  Builder.SetInsertPoint(Intr);
849  switch (Intr->getIntrinsicID()) {
850  case Intrinsic::lifetime_start:
851  case Intrinsic::lifetime_end:
852  // These intrinsics are for address space 0 only
853  Intr->eraseFromParent();
854  continue;
855  case Intrinsic::memcpy: {
856  MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
857  Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getDestAlignment(),
858  MemCpy->getRawSource(), MemCpy->getSourceAlignment(),
859  MemCpy->getLength(), MemCpy->isVolatile());
860  Intr->eraseFromParent();
861  continue;
862  }
863  case Intrinsic::memmove: {
864  MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
865  Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getDestAlignment(),
866  MemMove->getRawSource(), MemMove->getSourceAlignment(),
867  MemMove->getLength(), MemMove->isVolatile());
868  Intr->eraseFromParent();
869  continue;
870  }
871  case Intrinsic::memset: {
872  MemSetInst *MemSet = cast<MemSetInst>(Intr);
873  Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
874  MemSet->getLength(), MemSet->getDestAlignment(),
875  MemSet->isVolatile());
876  Intr->eraseFromParent();
877  continue;
878  }
879  case Intrinsic::invariant_start:
880  case Intrinsic::invariant_end:
881  case Intrinsic::invariant_group_barrier:
882  Intr->eraseFromParent();
883  // FIXME: I think the invariant marker should still theoretically apply,
884  // but the intrinsics need to be changed to accept pointers with any
885  // address space.
886  continue;
887  case Intrinsic::objectsize: {
888  Value *Src = Intr->getOperand(0);
889  Type *SrcTy = Src->getType()->getPointerElementType();
890  Function *ObjectSize = Intrinsic::getDeclaration(Mod,
891  Intrinsic::objectsize,
892  { Intr->getType(), PointerType::get(SrcTy, AS.LOCAL_ADDRESS) }
893  );
894 
895  CallInst *NewCall = Builder.CreateCall(
896  ObjectSize, {Src, Intr->getOperand(1), Intr->getOperand(2)});
897  Intr->replaceAllUsesWith(NewCall);
898  Intr->eraseFromParent();
899  continue;
900  }
901  default:
902  Intr->print(errs());
903  llvm_unreachable("Don't know how to promote alloca intrinsic use.");
904  }
905  }
906  return true;
907 }
908 
910  return new AMDGPUPromoteAlloca();
911 }
bool makeLIDRangeMetadata(Instruction *I) const
Creates value range metadata on an workitemid.* inrinsic call or load.
Value * CreateInBoundsGEP(Value *Ptr, ArrayRef< Value *> IdxList, const Twine &Name="")
Definition: IRBuilder.h:1283
Value * getValueOperand()
Definition: Instructions.h:395
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks &#39;this&#39; from the containing basic block and deletes it.
Definition: Instruction.cpp:67
A parsed version of the target data layout string in and methods for querying it. ...
Definition: DataLayout.h:111
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
raw_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AMDGPU specific subclass of TargetSubtarget.
Compute iterated dominance frontiers using a linear time algorithm.
Definition: AllocatorList.h:24
A Module instance is used to store all the information related to an LLVM module. ...
Definition: Module.h:63
unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const
Inverse of getMaxLocalMemWithWaveCount.
an instruction that atomically checks whether a specified value is in a memory location, and, if it is, stores a new value there.
Definition: Instructions.h:514
OSType getOS() const
getOS - Get the parsed operating system type of this triple.
Definition: Triple.h:294
bool isPromoteAllocaEnabled() const
This class represents a function call, abstracting a target machine&#39;s calling convention.
This file contains the declarations for metadata subclasses.
AMDGPUAS getAMDGPUAS(const Module &M)
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space...
Definition: Type.cpp:617
This class wraps the llvm.memset intrinsic.
FunctionPass * createAMDGPUPromoteAlloca()
Metadata node.
Definition: Metadata.h:862
The two locations do not alias at all.
Definition: AliasAnalysis.h:85
F(f)
uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the next integer (mod 2**64) that is greater than or equal to Value and is a multiple of Alig...
Definition: MathExtras.h:677
CallInst * CreateMemSet(Value *Ptr, Value *Val, uint64_t Size, unsigned Align, bool isVolatile=false, MDNode *TBAATag=nullptr, MDNode *ScopeTag=nullptr, MDNode *NoAliasTag=nullptr)
Create and insert a memset to the specified pointer and the specified value.
Definition: IRBuilder.h:405
An instruction for reading from memory.
Definition: Instructions.h:164
an instruction that atomically reads a memory location, combines it with another value, and then stores the result back.
Definition: Instructions.h:677
Hexagon Common GEP
Value * getLength() const
SPIR_KERNEL - Calling convention for SPIR kernel functions.
Definition: CallingConv.h:137
static Constant * getNullValue(Type *Ty)
Constructor to create a &#39;0&#39; constant of arbitrary type.
Definition: Constants.cpp:245
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:252
Value * getRawSource() const
Return the arguments to the instruction.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
bool isVolatile() const
Return true if this is a load from a volatile memory location.
Definition: Instructions.h:217
static Value * calculateVectorIndex(Value *Ptr, const std::map< GetElementPtrInst *, Value *> &GEPIdx)
static Value * getPointerOperand(Instruction &Inst)
static bool canVectorizeInst(Instruction *Inst, User *User)
Address space for local memory.
Definition: AMDGPU.h:226
This class represents the LLVM &#39;select&#39; instruction.
Type * getPointerElementType() const
Definition: Type.h:373
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
OpenCL uses address spaces to differentiate between various memory regions on the hardware...
Definition: AMDGPU.h:214
unsigned getAlignment() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:109
This class wraps the llvm.memmove intrinsic.
int getLocalMemorySize() const
static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS)
A Use represents the edge between a Value definition and its users.
Definition: Use.h:56
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
Definition: Type.cpp:639
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:707
This file contains the simple types necessary to represent the attributes associated with functions a...
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:932
CallInst * CreateMemMove(Value *Dst, unsigned DstAlign, Value *Src, unsigned SrcAlign, uint64_t Size, bool isVolatile=false, MDNode *TBAATag=nullptr, MDNode *ScopeTag=nullptr, MDNode *NoAliasTag=nullptr)
Create and insert a memmove between the specified pointers.
Definition: IRBuilder.h:490
unsigned getDestAlignment() const
uint64_t getNumElements() const
Definition: DerivedTypes.h:359
This file implements a class to represent arbitrary precision integral constant values and operations...
unsigned getSourceAlignment() const
Class to represent function types.
Definition: DerivedTypes.h:103
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:1487
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:91
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:245
ArchType getArch() const
getArch - Get the parsed architecture type of this triple.
Definition: Triple.h:285
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:122
Class to represent array types.
Definition: DerivedTypes.h:369
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
Definition: Type.cpp:608
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:126
An instruction for storing to memory.
Definition: Instructions.h:306
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:439
iterator begin()
Definition: Function.h:634
Function * getDeclaration(Module *M, ID id, ArrayRef< Type *> Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:980
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block...
Definition: IRBuilder.h:128
Value * getOperand(unsigned i) const
Definition: User.h:154
Class to represent pointers.
Definition: DerivedTypes.h:467
an instruction for type-safe pointer arithmetic to access elements of arrays and structs ...
Definition: Instructions.h:837
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata *> MDs)
Definition: Metadata.h:1164
static bool runOnFunction(Function &F, bool PostInlining)
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:406
static ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
Definition: Constants.cpp:1355
static Value * GEPToVectorIndex(GetElementPtrInst *GEP)
LLVM Basic Block Representation.
Definition: BasicBlock.h:59
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:46
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:221
ArrayRef< Type * > params() const
Definition: DerivedTypes.h:130
Represent the analysis usage information of a pass.
This instruction compares its operands according to the predicate given to the constructor.
void print(raw_ostream &O, bool IsForDebug=false) const
Implement operator<< on Value.
Definition: AsmWriter.cpp:3501
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:285
Value * getPointerOperand()
Definition: Instructions.h:270
unsigned getAddressSpace() const
Return the address space of the Pointer type.
Definition: DerivedTypes.h:495
static UndefValue * get(Type *T)
Static factory methods - Return an &#39;undef&#39; object of the specified type.
Definition: Constants.cpp:1369
const AMDGPUAS & AS
bool PointerMayBeCaptured(const Value *V, bool ReturnCaptures, bool StoreCaptures)
PointerMayBeCaptured - Return true if this pointer value may be captured by the enclosing function (w...
bool isVolatile() const
Value * GetUnderlyingObject(Value *V, const DataLayout &DL, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value...
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1222
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:976
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition: Instructions.h:102
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:34
Address space for constant memory (VTX2)
Definition: AMDGPU.h:225
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:51
unsigned getNumOperands() const
Definition: User.h:176
#define DEBUG_TYPE
This is the shared class of boolean and integer constants.
Definition: Constants.h:84
static bool isCallPromotable(CallInst *CI)
iterator end()
Definition: BasicBlock.h:254
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:194
Module.h This file contains the declarations for the Module class.
unsigned getABITypeAlignment(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:724
CallInst * CreateMemCpy(Value *Dst, unsigned DstAlign, Value *Src, unsigned SrcAlign, uint64_t Size, bool isVolatile=false, MDNode *TBAATag=nullptr, MDNode *TBAAStructTag=nullptr, MDNode *ScopeTag=nullptr, MDNode *NoAliasTag=nullptr)
Create and insert a memcpy between the specified pointers.
Definition: IRBuilder.h:423
This class wraps the llvm.memcpy intrinsic.
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:285
Value * CreateConstInBoundsGEP1_64(Value *Ptr, uint64_t Idx0, const Twine &Name="")
Definition: IRBuilder.h:1370
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:145
Class to represent vector types.
Definition: DerivedTypes.h:393
iterator_range< user_iterator > users()
Definition: Value.h:405
The access may modify the value stored in memory.
static VectorType * arrayTypeToVecType(ArrayType *ArrayTy)
unsigned getMaxWavesPerEU() const
uint64_t getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:422
bool isVolatile() const
Return true if this is a store to a volatile memory location.
Definition: Instructions.h:339
void setUnnamedAddr(UnnamedAddr Val)
Definition: GlobalValue.h:215
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:176
StringRef getName() const
Return a constant reference to the value&#39;s name.
Definition: Value.cpp:224
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:108
#define I(x, y, z)
Definition: MD5.cpp:58
#define N
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:193
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
Definition: Type.cpp:568
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:323
Value * getValue() const
Return the arguments to the instruction.
Rename collisions when linking (static functions).
Definition: GlobalValue.h:56
void mutateType(Type *Ty)
Mutate the type of this Value to be of the specified type.
Definition: Value.h:608
bool isArrayAllocation() const
Return true if there is an allocation size parameter to the allocation instruction that is not 1...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
LoadInst * CreateAlignedLoad(Value *Ptr, unsigned Align, const char *Name)
Definition: IRBuilder.h:1225
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:565
LLVM Value Representation.
Definition: Value.h:73
static VectorType * get(Type *ElementType, unsigned NumElements)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:593
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1070
#define DEBUG(X)
Definition: Debug.h:118
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:59
Type * getElementType() const
Definition: DerivedTypes.h:360
char & AMDGPUPromoteAllocaID
Calling convention for AMDGPU code object kernels.
Definition: CallingConv.h:201
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:49
void addAttribute(unsigned i, Attribute::AttrKind Kind)
adds the attribute to the list of attributes.
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const
Return the amount of LDS that can be used that will not restrict the occupancy lower than WaveCount...
bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size...
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
Value * getPointerOperand()
Definition: Instructions.h:398
Value * getRawDest() const
unsigned PRIVATE_ADDRESS
Address space for private memory.
Definition: AMDGPU.h:216
void addDereferenceableAttr(unsigned i, uint64_t Bytes)
adds the dereferenceable attribute to the list of attributes.
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:44
const BasicBlock * getParent() const
Definition: Instruction.h:67
an instruction to allocate memory on the stack
Definition: Instructions.h:60
CallInst * CreateCall(Value *Callee, ArrayRef< Value *> Args=None, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1702
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.
Definition: STLExtras.h:873