1//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Eliminates allocas by either converting them into vectors or by migrating
10// them to local address space.
11//
12// Two passes are exposed by this file:
13// - "promote-alloca-to-vector", which runs early in the pipeline and only
14// promotes to vector. Promotion to vector is almost always profitable
15// except when the alloca is too big and the promotion would result in
16// very high register pressure.
17// - "promote-alloca", which does both promotion to vector and LDS and runs
18// much later in the pipeline. This runs after SROA because promoting to
19// LDS is of course less profitable than getting rid of the alloca or
20// vectorizing it, thus we only want to do it when the only alternative is
21// lowering the alloca to stack.
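//
// As a rough usage sketch (assuming the usual "amdgpu-"-prefixed new-PM
// registrations for these passes), each variant can be exercised on its own:
//   opt -passes=amdgpu-promote-alloca-to-vector -S in.ll
//   opt -passes=amdgpu-promote-alloca -S in.ll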
22//
23// Note that both of them exist for the old and new PMs. The new PM passes are
24// declared here.
25//
26//===----------------------------------------------------------------------===//
27
28#include "AMDGPU.h"
29#include "GCNSubtarget.h"
30#include "Utils/AMDGPUBaseInfo.h"
31#include "llvm/ADT/STLExtras.h"
32#include "llvm/Analysis/CaptureTracking.h"
33#include "llvm/Analysis/InstSimplifyFolder.h"
34#include "llvm/Analysis/InstructionSimplify.h"
35#include "llvm/Analysis/LoopInfo.h"
36#include "llvm/Analysis/ValueTracking.h"
37#include "llvm/CodeGen/TargetPassConfig.h"
38#include "llvm/IR/IRBuilder.h"
39#include "llvm/IR/IntrinsicInst.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
42#include "llvm/IR/PatternMatch.h"
43#include "llvm/InitializePasses.h"
44#include "llvm/Pass.h"
45#include "llvm/Target/TargetMachine.h"
46#include "llvm/Transforms/Utils/SSAUpdater.h"
47
48#define DEBUG_TYPE "amdgpu-promote-alloca"
49
50using namespace llvm;
51
52namespace {
53
54static cl::opt<bool>
55 DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
56 cl::desc("Disable promote alloca to vector"),
57 cl::init(false));
58
59static cl::opt<bool>
60 DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
61 cl::desc("Disable promote alloca to LDS"),
62 cl::init(false));
63
64static cl::opt<unsigned> PromoteAllocaToVectorLimit(
65 "amdgpu-promote-alloca-to-vector-limit",
66 cl::desc("Maximum byte size to consider promote alloca to vector"),
67 cl::init(0));
68
69static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
70 "amdgpu-promote-alloca-to-vector-max-regs",
72 "Maximum vector size (in 32b registers) to use when promoting alloca"),
73 cl::init(32));
74
75// Use up to 1/4 of available register budget for vectorization.
76// FIXME: Increase the limit for whole function budgets? Perhaps x2?
77static cl::opt<unsigned> PromoteAllocaToVectorVGPRRatio(
78 "amdgpu-promote-alloca-to-vector-vgpr-ratio",
79 cl::desc("Ratio of VGPRs to budget for promoting alloca to vectors"),
80 cl::init(4));
81
82static cl::opt<unsigned>
83 LoopUserWeight("promote-alloca-vector-loop-user-weight",
84 cl::desc("The bonus weight of users of allocas within loop "
85 "when sorting profitable allocas"),
86 cl::init(4));
87
88// Shared implementation which can do both promotion to vector and to LDS.
89class AMDGPUPromoteAllocaImpl {
90private:
91 const TargetMachine &TM;
92 LoopInfo &LI;
93 Module *Mod = nullptr;
94 const DataLayout *DL = nullptr;
95
96 // FIXME: This should be per-kernel.
97 uint32_t LocalMemLimit = 0;
98 uint32_t CurrentLocalMemUsage = 0;
99 unsigned MaxVGPRs;
100 unsigned VGPRBudgetRatio;
101 unsigned MaxVectorRegs;
102
103 bool IsAMDGCN = false;
104 bool IsAMDHSA = false;
105
106 std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
107 Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
108
109 /// BaseAlloca is the alloca root the search started from.
110 /// Val may be that alloca or a recursive user of it.
111 bool collectUsesWithPtrTypes(Value *BaseAlloca, Value *Val,
112 std::vector<Value *> &WorkList) const;
113
114 /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
115 /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
116 /// Returns true if both operands are derived from the same alloca. Val should
117 /// be the same value as one of the input operands of UseInst.
118 bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
119 Instruction *UseInst, int OpIdx0,
120 int OpIdx1) const;
121
122 /// Check whether we have enough local memory for promotion.
123 bool hasSufficientLocalMem(const Function &F);
124
125 FixedVectorType *getVectorTypeForAlloca(Type *AllocaTy) const;
126 bool tryPromoteAllocaToVector(AllocaInst &I);
127 bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS);
128
129 void sortAllocasToPromote(SmallVectorImpl<AllocaInst *> &Allocas);
130
131 void setFunctionLimits(const Function &F);
132
133public:
134 AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {
135
136 const Triple &TT = TM.getTargetTriple();
137 IsAMDGCN = TT.isAMDGCN();
138 IsAMDHSA = TT.getOS() == Triple::AMDHSA;
139 }
140
141 bool run(Function &F, bool PromoteToLDS);
142};
143
144// FIXME: This can create globals so should be a module pass.
145class AMDGPUPromoteAlloca : public FunctionPass {
146public:
147 static char ID;
148
149 AMDGPUPromoteAlloca() : FunctionPass(ID) {}
150
151 bool runOnFunction(Function &F) override {
152 if (skipFunction(F))
153 return false;
154 if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
155 return AMDGPUPromoteAllocaImpl(
156 TPC->getTM<TargetMachine>(),
157 getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
158 .run(F, /*PromoteToLDS*/ true);
159 return false;
160 }
161
162 StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
163
164 void getAnalysisUsage(AnalysisUsage &AU) const override {
165 AU.setPreservesCFG();
166 AU.addRequired<LoopInfoWrapperPass>();
167 FunctionPass::getAnalysisUsage(AU);
168 }
169};
170
171static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
172 const Function &F) {
173 if (!TM.getTargetTriple().isAMDGCN())
174 return 128;
175
176 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
177
178 unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
179 // Temporarily check both the attribute and the subtarget feature, until the
180 // latter is removed.
181 if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
182 DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
183
184 unsigned MaxVGPRs = ST.getMaxNumVGPRs(
185 ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first,
186 DynamicVGPRBlockSize);
187
188 // A non-entry function has only 32 caller preserved registers.
189 // Do not promote alloca which will force spilling unless we know the function
190 // will be inlined.
191 if (!F.hasFnAttribute(Attribute::AlwaysInline) &&
192 !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
193 MaxVGPRs = std::min(MaxVGPRs, 32u);
194 return MaxVGPRs;
195}
196
197} // end anonymous namespace
198
199char AMDGPUPromoteAlloca::ID = 0;
200
202 "AMDGPU promote alloca to vector or LDS", false, false)
203// Move LDS uses from functions to kernels before promoting allocas, for an
204// accurate estimate of the LDS available.
205INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
206INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
207INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
208 "AMDGPU promote alloca to vector or LDS", false, false)
209
210char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
211
212PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
213 FunctionAnalysisManager &AM) {
214 auto &LI = AM.getResult<LoopAnalysis>(F);
215 bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true);
216 if (Changed) {
217 PreservedAnalyses PA;
218 PA.preserveSet<CFGAnalyses>();
219 return PA;
220 }
221 return PreservedAnalyses::all();
222}
223
224PreservedAnalyses
225AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
226 auto &LI = AM.getResult<LoopAnalysis>(F);
227 bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false);
228 if (Changed) {
229 PreservedAnalyses PA;
230 PA.preserveSet<CFGAnalyses>();
231 return PA;
232 }
233 return PreservedAnalyses::all();
234}
235
236FunctionPass *llvm::createAMDGPUPromoteAlloca() {
237 return new AMDGPUPromoteAlloca();
238}
239
240static void collectAllocaUses(AllocaInst &Alloca,
241 SmallVectorImpl<Use *> &Uses) {
242 SmallVector<Instruction *, 4> WorkList({&Alloca});
243 while (!WorkList.empty()) {
244 auto *Cur = WorkList.pop_back_val();
245 for (auto &U : Cur->uses()) {
246 Uses.push_back(&U);
247
248 if (isa<GetElementPtrInst>(U.getUser()))
249 WorkList.push_back(cast<Instruction>(U.getUser()));
250 }
251 }
252}
253
254void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
255 SmallVectorImpl<AllocaInst *> &Allocas) {
256 DenseMap<AllocaInst *, unsigned> Scores;
257
258 for (auto *Alloca : Allocas) {
259 LLVM_DEBUG(dbgs() << "Scoring: " << *Alloca << "\n");
260 unsigned &Score = Scores[Alloca];
261 // Increment score by one for each user + a bonus for users within loops.
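// (For example, with the default LoopUserWeight of 4, a user at loop depth 2
// contributes 1 + 4 * 2 = 9 to the score.)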
262 SmallVector<Use *, 8> Uses;
263 collectAllocaUses(*Alloca, Uses);
264 for (auto *U : Uses) {
265 Instruction *Inst = cast<Instruction>(U->getUser());
266 if (isa<GetElementPtrInst>(Inst))
267 continue;
268 unsigned UserScore =
269 1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent()));
270 LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n");
271 Score += UserScore;
272 }
273 LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n");
274 }
275
276 stable_sort(Allocas, [&](AllocaInst *A, AllocaInst *B) {
277 return Scores.at(A) > Scores.at(B);
278 });
279
280 // clang-format off
281 LLVM_DEBUG(
282 dbgs() << "Sorted Worklist:\n";
283 for (auto *A: Allocas)
284 dbgs() << " " << *A << "\n";
285 );
286 // clang-format on
287}
288
289void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
290 // Load per function limits, overriding with global options where appropriate.
291 // R600 register tuples/aliasing are fragile with large vector promotions so
292 // apply architecture specific limit here.
293 const int R600MaxVectorRegs = 16;
294 MaxVectorRegs = F.getFnAttributeAsParsedInteger(
295 "amdgpu-promote-alloca-to-vector-max-regs",
296 IsAMDGCN ? PromoteAllocaToVectorMaxRegs : R600MaxVectorRegs);
297 if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
298 MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
299 VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
300 "amdgpu-promote-alloca-to-vector-vgpr-ratio",
301 PromoteAllocaToVectorVGPRRatio);
302 if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences())
303 VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
304}
305
306bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
307 Mod = F.getParent();
308 DL = &Mod->getDataLayout();
309
310 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
311 if (!ST.isPromoteAllocaEnabled())
312 return false;
313
314 bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F);
315 MaxVGPRs = getMaxVGPRs(CurrentLocalMemUsage, TM, F);
316 setFunctionLimits(F);
317
318 unsigned VectorizationBudget =
319 (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
320 : (MaxVGPRs * 32)) /
321 VGPRBudgetRatio;
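// For example, with MaxVGPRs = 256 and the default VGPR ratio of 4, the budget
// is (256 * 32) / 4 = 2048 bits, i.e. up to 256 bytes of allocas may be
// promoted to vectors (AllocaCost below is measured in bits as well).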
322
323 SmallVector<AllocaInst *, 16> Allocas;
324 for (Instruction &I : F.getEntryBlock()) {
325 if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
326 // Array allocations are probably not worth handling, since an allocation
327 // of the array type is the canonical form.
328 if (!AI->isStaticAlloca() || AI->isArrayAllocation())
329 continue;
330 Allocas.push_back(AI);
331 }
332 }
333
334 sortAllocasToPromote(Allocas);
335
336 bool Changed = false;
337 for (AllocaInst *AI : Allocas) {
338 const unsigned AllocaCost = DL->getTypeSizeInBits(AI->getAllocatedType());
339 // First, check if we have enough budget to vectorize this alloca.
340 if (AllocaCost <= VectorizationBudget) {
341 // If we do, attempt vectorization, otherwise, fall through and try
342 // promoting to LDS instead.
343 if (tryPromoteAllocaToVector(*AI)) {
344 Changed = true;
345 assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
346 "Underflow!");
347 VectorizationBudget -= AllocaCost;
348 LLVM_DEBUG(dbgs() << " Remaining vectorization budget:"
349 << VectorizationBudget << "\n");
350 continue;
351 }
352 } else {
353 LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:"
354 << AllocaCost << ", budget:" << VectorizationBudget
355 << "): " << *AI << "\n");
356 }
357
358 if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
359 Changed = true;
360 }
361
362 // NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains
363 // dangling pointers. If we want to reuse it past this point, the loop above
364 // would need to be updated to remove successfully promoted allocas.
365
366 return Changed;
367}
368
369struct MemTransferInfo {
370 ConstantInt *SrcIndex = nullptr;
371 ConstantInt *DestIndex = nullptr;
372};
373
374// Checks if the instruction I is a memset user of the alloca AI that we can
375// deal with. Currently, only non-volatile memsets that affect the whole alloca
376// are handled.
377static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
378 const DataLayout &DL) {
379 using namespace PatternMatch;
380 // For now we only care about non-volatile memsets that affect the whole type
381 // (start at index 0 and fill the whole alloca).
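// For example, for "%a = alloca [8 x i32], align 4, addrspace(5)" (32 bytes),
// a call like "call void @llvm.memset.p5.i64(ptr addrspace(5) %a, i8 0,
// i64 32, i1 false)" is accepted, while a 16-byte or volatile memset is not.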
382 //
383 // TODO: Now that we moved to PromoteAlloca we could handle any memsets
384 // (except maybe volatile ones?) - we just need to use shufflevector if it
385 // only affects a subset of the vector.
386 const unsigned Size = DL.getTypeStoreSize(AI->getAllocatedType());
387 return I->getOperand(0) == AI &&
388 match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile();
389}
390
391static Value *calculateVectorIndex(
392 Value *Ptr, const std::map<GetElementPtrInst *, WeakTrackingVH> &GEPIdx) {
393 auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts());
394 if (!GEP)
395 return ConstantInt::get(Type::getInt32Ty(Ptr->getContext()), 0);
396
397 auto I = GEPIdx.find(GEP);
398 assert(I != GEPIdx.end() && "Must have entry for GEP!");
399
400 Value *IndexValue = I->second;
401 assert(IndexValue && "index value missing from GEP index map");
402 return IndexValue;
403}
404
406 Type *VecElemTy, const DataLayout &DL,
407 SmallVector<Instruction *> &NewInsts) {
408 // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
409 // helper.
410 LLVMContext &Ctx = GEP->getContext();
411 unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
412 SmallMapVector<Value *, APInt, 4> VarOffsets;
413 APInt ConstOffset(BW, 0);
414
415 // Walk backwards through nested GEPs to collect both constant and variable
416 // offsets, so that nested vector GEP chains can be lowered in one step.
417 //
418 // Given this IR fragment as input:
419 //
420 // %0 = alloca [10 x <2 x i32>], align 8, addrspace(5)
421 // %1 = getelementptr [10 x <2 x i32>], ptr addrspace(5) %0, i32 0, i32 %j
422 // %2 = getelementptr i8, ptr addrspace(5) %1, i32 4
423 // %3 = load i32, ptr addrspace(5) %2, align 4
424 //
425 // Combine both GEP operations in a single pass, producing:
426 // BasePtr = %0
427 // ConstOffset = 4
428 // VarOffsets = { %j -> element_size(<2 x i32>) }
429 //
430 // That lets us emit a single buffer_load directly into a VGPR, without ever
431 // allocating scratch memory for the intermediate pointer.
432 Value *CurPtr = GEP;
433 while (auto *CurGEP = dyn_cast<GetElementPtrInst>(CurPtr)) {
434 if (!CurGEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
435 return nullptr;
436
437 // Move to the next outer pointer.
438 CurPtr = CurGEP->getPointerOperand();
439 }
440
441 assert(CurPtr == Alloca && "GEP not based on alloca");
442
443 int64_t VecElemSize = DL.getTypeAllocSize(VecElemTy);
444 if (VarOffsets.size() > 1)
445 return nullptr;
446
447 APInt IndexQuot;
448 int64_t Rem;
449 APInt::sdivrem(ConstOffset, VecElemSize, IndexQuot, Rem);
450 if (Rem != 0)
451 return nullptr;
452 if (VarOffsets.size() == 0)
453 return ConstantInt::get(Ctx, IndexQuot);
454
455 IRBuilder<> Builder(GEP);
456
457 const auto &VarOffset = VarOffsets.front();
458 APInt OffsetQuot;
459 APInt::sdivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem);
460 if (Rem != 0 || OffsetQuot.isZero())
461 return nullptr;
462
463 Value *Offset = VarOffset.first;
464 if (!isa<IntegerType>(Offset->getType()))
465 return nullptr;
466
467 Offset = Builder.CreateSExtOrTrunc(Offset, Builder.getIntNTy(BW));
468 if (Offset != VarOffset.first)
469 NewInsts.push_back(cast<Instruction>(Offset));
470
471 if (!OffsetQuot.isOne()) {
472 ConstantInt *ConstMul = ConstantInt::get(Ctx, OffsetQuot.sextOrTrunc(BW));
473 Offset = Builder.CreateMul(Offset, ConstMul);
474 if (Instruction *NewInst = dyn_cast<Instruction>(Offset))
475 NewInsts.push_back(NewInst);
476 }
477 if (ConstOffset.isZero())
478 return Offset;
479
480 ConstantInt *ConstIndex = ConstantInt::get(Ctx, IndexQuot.sextOrTrunc(BW));
481 Value *IndexAdd = Builder.CreateAdd(Offset, ConstIndex);
482 if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd))
483 NewInsts.push_back(NewInst);
484 return IndexAdd;
485}
486
487/// Promotes a single user of the alloca to a vector form.
488///
489/// \param Inst Instruction to be promoted.
490/// \param DL Module Data Layout.
491/// \param VectorTy Vectorized Type.
492/// \param VecStoreSize Size of \p VectorTy in bytes.
493/// \param ElementSize Size of \p VectorTy element type in bytes.
494/// \param TransferInfo MemTransferInst info map.
495/// \param GEPVectorIdx GEP -> VectorIdx cache.
496/// \param CurVal Current value of the vector (e.g. last stored value)
497/// \param[out] DeferredLoads \p Inst is added to this vector if it can't
498/// be promoted now. This happens when promoting requires \p
499/// CurVal, but \p CurVal is nullptr.
500/// \return the stored value if \p Inst would have written to the alloca, or
501/// nullptr otherwise.
502static Value *promoteAllocaUserToVector(
503 Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy,
504 unsigned VecStoreSize, unsigned ElementSize,
505 DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo,
506 std::map<GetElementPtrInst *, WeakTrackingVH> &GEPVectorIdx,
507 function_ref<Value *()> GetCurVal) {
508 // Note: we use InstSimplifyFolder because it can leverage the DataLayout
509 // to do more folding, especially in the case of vector splats.
510 IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
511 InstSimplifyFolder(DL));
512 Builder.SetInsertPoint(Inst);
513
514 const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val,
515 Type *PtrTy) -> Value * {
516 assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy));
517 const unsigned Size = DL.getTypeStoreSizeInBits(PtrTy);
518 if (!PtrTy->isVectorTy())
519 return Builder.CreateBitOrPointerCast(Val, Builder.getIntNTy(Size));
520 const unsigned NumPtrElts = cast<FixedVectorType>(PtrTy)->getNumElements();
521 // If we want to cast, e.g. a <2 x ptr> into a <4 x i32>, we need to
522 // first cast the ptr vector to <2 x i64>.
523 assert((Size % NumPtrElts == 0) && "Vector size not divisible");
524 Type *EltTy = Builder.getIntNTy(Size / NumPtrElts);
525 return Builder.CreateBitOrPointerCast(
526 Val, FixedVectorType::get(EltTy, NumPtrElts));
527 };
528
529 Type *VecEltTy = VectorTy->getElementType();
530
531 switch (Inst->getOpcode()) {
532 case Instruction::Load: {
533 Value *CurVal = GetCurVal();
534 Value *Index = calculateVectorIndex(
535 cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx);
536
537 // We're loading the full vector.
538 Type *AccessTy = Inst->getType();
539 TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
540 if (Constant *CI = dyn_cast<Constant>(Index)) {
541 if (CI->isZeroValue() && AccessSize == VecStoreSize) {
542 if (AccessTy->isPtrOrPtrVectorTy())
543 CurVal = CreateTempPtrIntCast(CurVal, AccessTy);
544 else if (CurVal->getType()->isPtrOrPtrVectorTy())
545 CurVal = CreateTempPtrIntCast(CurVal, CurVal->getType());
546 Value *NewVal = Builder.CreateBitOrPointerCast(CurVal, AccessTy);
547 Inst->replaceAllUsesWith(NewVal);
548 return nullptr;
549 }
550 }
551
552 // Loading a subvector.
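// For example, a "load <2 x i32>" from an <8 x i32> promoted alloca at
// variable index %i becomes two extractelements (at %i and %i + 1) that are
// reassembled into a <2 x i32> with insertelement.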
553 if (isa<FixedVectorType>(AccessTy)) {
554 assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
555 const unsigned NumLoadedElts = AccessSize / DL.getTypeStoreSize(VecEltTy);
556 auto *SubVecTy = FixedVectorType::get(VecEltTy, NumLoadedElts);
557 assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
558
559 Value *SubVec = PoisonValue::get(SubVecTy);
560 for (unsigned K = 0; K < NumLoadedElts; ++K) {
561 Value *CurIdx =
562 Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
563 SubVec = Builder.CreateInsertElement(
564 SubVec, Builder.CreateExtractElement(CurVal, CurIdx), K);
565 }
566
567 if (AccessTy->isPtrOrPtrVectorTy())
568 SubVec = CreateTempPtrIntCast(SubVec, AccessTy);
569 else if (SubVecTy->isPtrOrPtrVectorTy())
570 SubVec = CreateTempPtrIntCast(SubVec, SubVecTy);
571
572 SubVec = Builder.CreateBitOrPointerCast(SubVec, AccessTy);
573 Inst->replaceAllUsesWith(SubVec);
574 return nullptr;
575 }
576
577 // We're loading one element.
578 Value *ExtractElement = Builder.CreateExtractElement(CurVal, Index);
579 if (AccessTy != VecEltTy)
580 ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, AccessTy);
581
582 Inst->replaceAllUsesWith(ExtractElement);
583 return nullptr;
584 }
585 case Instruction::Store: {
586 // For stores, it's a bit trickier and it depends on whether we're storing
587 // the full vector or not. If we're storing the full vector, we don't need
588 // to know the current value. If this is a store of a single element, we
589 // need to know the value.
590 auto *SI = cast<StoreInst>(Inst);
591 Value *Index = calculateVectorIndex(SI->getPointerOperand(), GEPVectorIdx);
592 Value *Val = SI->getValueOperand();
593
594 // We're storing the full vector, we can handle this without knowing CurVal.
595 Type *AccessTy = Val->getType();
596 TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
597 if (Constant *CI = dyn_cast<Constant>(Index)) {
598 if (CI->isZeroValue() && AccessSize == VecStoreSize) {
599 if (AccessTy->isPtrOrPtrVectorTy())
600 Val = CreateTempPtrIntCast(Val, AccessTy);
601 else if (VectorTy->isPtrOrPtrVectorTy())
602 Val = CreateTempPtrIntCast(Val, VectorTy);
603 return Builder.CreateBitOrPointerCast(Val, VectorTy);
604 }
605 }
606
607 // Storing a subvector.
608 if (isa<FixedVectorType>(AccessTy)) {
609 assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
610 const unsigned NumWrittenElts =
611 AccessSize / DL.getTypeStoreSize(VecEltTy);
612 const unsigned NumVecElts = VectorTy->getNumElements();
613 auto *SubVecTy = FixedVectorType::get(VecEltTy, NumWrittenElts);
614 assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
615
616 if (SubVecTy->isPtrOrPtrVectorTy())
617 Val = CreateTempPtrIntCast(Val, SubVecTy);
618 else if (AccessTy->isPtrOrPtrVectorTy())
619 Val = CreateTempPtrIntCast(Val, AccessTy);
620
621 Val = Builder.CreateBitOrPointerCast(Val, SubVecTy);
622
623 Value *CurVec = GetCurVal();
624 for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts);
625 K < NumElts; ++K) {
626 Value *CurIdx =
627 Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
628 CurVec = Builder.CreateInsertElement(
629 CurVec, Builder.CreateExtractElement(Val, K), CurIdx);
630 }
631 return CurVec;
632 }
633
634 if (Val->getType() != VecEltTy)
635 Val = Builder.CreateBitOrPointerCast(Val, VecEltTy);
636 return Builder.CreateInsertElement(GetCurVal(), Val, Index);
637 }
638 case Instruction::Call: {
639 if (auto *MTI = dyn_cast<MemTransferInst>(Inst)) {
640 // For memcpy, we need to know curval.
641 ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
642 unsigned NumCopied = Length->getZExtValue() / ElementSize;
643 MemTransferInfo *TI = &TransferInfo[MTI];
644 unsigned SrcBegin = TI->SrcIndex->getZExtValue();
645 unsigned DestBegin = TI->DestIndex->getZExtValue();
646
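// For example, copying 2 elements from index 1 to index 3 of a 6-element
// vector produces the shuffle mask <0, 1, 2, 1, 2, 5>: lanes outside the
// destination range keep their own index, copied lanes read from the source.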
647 SmallVector<int> Mask;
648 for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) {
649 if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
650 Mask.push_back(SrcBegin < VectorTy->getNumElements()
651 ? SrcBegin++
652 : PoisonMaskElem);
653 } else {
654 Mask.push_back(Idx);
655 }
656 }
657
658 return Builder.CreateShuffleVector(GetCurVal(), Mask);
659 }
660
661 if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
662 // For memset, we don't need to know the previous value because we
663 // currently only allow memsets that cover the whole alloca.
664 Value *Elt = MSI->getOperand(1);
665 const unsigned BytesPerElt = DL.getTypeStoreSize(VecEltTy);
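// For example, a memset of byte 0xAB over a <4 x i32> promoted alloca first
// builds a <4 x i8> splat of 0xAB, bitcasts it to the i32 0xABABABAB, and then
// splats that value across the <4 x i32> result below.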
666 if (BytesPerElt > 1) {
667 Value *EltBytes = Builder.CreateVectorSplat(BytesPerElt, Elt);
668
669 // If the element type of the vector is a pointer, we need to first cast
670 // to an integer, then use a PtrCast.
671 if (VecEltTy->isPointerTy()) {
672 Type *PtrInt = Builder.getIntNTy(BytesPerElt * 8);
673 Elt = Builder.CreateBitCast(EltBytes, PtrInt);
674 Elt = Builder.CreateIntToPtr(Elt, VecEltTy);
675 } else
676 Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
677 }
678
679 return Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt);
680 }
681
682 if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
683 if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
684 Intr->replaceAllUsesWith(
685 Builder.getIntN(Intr->getType()->getIntegerBitWidth(),
686 DL.getTypeAllocSize(VectorTy)));
687 return nullptr;
688 }
689 }
690
691 llvm_unreachable("Unsupported call when promoting alloca to vector");
692 }
693
694 default:
695 llvm_unreachable("Inconsistency in instructions promotable to vector");
696 }
697
698 llvm_unreachable("Did not return after promoting instruction!");
699}
700
701static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
702 const DataLayout &DL) {
703 // Access as a vector type can work if the size of the access vector is a
704 // multiple of the size of the alloca's vector element type.
705 //
706 // Examples:
707 // - VecTy = <8 x float>, AccessTy = <4 x float> -> OK
708 // - VecTy = <4 x double>, AccessTy = <2 x float> -> OK
709 // - VecTy = <4 x double>, AccessTy = <3 x float> -> NOT OK
710 // - 3*32 is not a multiple of 64
711 //
712 // We could handle more complicated cases, but it'd make things a lot more
713 // complicated.
714 if (isa<FixedVectorType>(AccessTy)) {
715 TypeSize AccTS = DL.getTypeStoreSize(AccessTy);
716 // If the type size and the store size don't match, we would need to do more
717 // than just bitcast to translate between extracted/insertable subvectors
718 // and the accessed value.
719 if (AccTS * 8 != DL.getTypeSizeInBits(AccessTy))
720 return false;
721 TypeSize VecTS = DL.getTypeStoreSize(VecTy->getElementType());
722 return AccTS.isKnownMultipleOf(VecTS);
723 }
724
725 return CastInst::isBitOrNoopPointerCastable(VecTy->getElementType(), AccessTy,
726 DL);
727}
728
729/// Iterates over an instruction worklist that may contain multiple instructions
730/// from the same basic block, but in a different order.
731template <typename InstContainer>
732static void forEachWorkListItem(const InstContainer &WorkList,
733 std::function<void(Instruction *)> Fn) {
734 // Bucket up uses of the alloca by the block they occur in.
735 // This is important because we have to handle multiple defs/uses in a block
736 // ourselves: SSAUpdater is purely for cross-block references.
737 DenseMap<BasicBlock *, SmallDenseSet<Instruction *>> UsesByBlock;
738 for (Instruction *User : WorkList)
739 UsesByBlock[User->getParent()].insert(User);
740
741 for (Instruction *User : WorkList) {
742 BasicBlock *BB = User->getParent();
743 auto &BlockUses = UsesByBlock[BB];
744
745 // Already processed, skip.
746 if (BlockUses.empty())
747 continue;
748
749 // Only user in the block, directly process it.
750 if (BlockUses.size() == 1) {
751 Fn(User);
752 continue;
753 }
754
755 // Multiple users in the block, do a linear scan to see users in order.
756 for (Instruction &Inst : *BB) {
757 if (!BlockUses.contains(&Inst))
758 continue;
759
760 Fn(&Inst);
761 }
762
763 // Clear the block so we know it's been processed.
764 BlockUses.clear();
765 }
766}
767
768/// Find an insert point after an alloca, after all other allocas clustered at
769/// the start of the block.
770static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
771 BasicBlock::iterator I) {
772 for (BasicBlock::iterator E = BB.end(); I != E && isa<AllocaInst>(*I); ++I)
773 ;
774 return I;
775}
776
777FixedVectorType *
778AMDGPUPromoteAllocaImpl::getVectorTypeForAlloca(Type *AllocaTy) const {
779 if (DisablePromoteAllocaToVector) {
780 LLVM_DEBUG(dbgs() << " Promote alloca to vectors is disabled\n");
781 return nullptr;
782 }
783
784 auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
785 if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
786 uint64_t NumElems = 1;
787 Type *ElemTy;
788 do {
789 NumElems *= ArrayTy->getNumElements();
790 ElemTy = ArrayTy->getElementType();
791 } while ((ArrayTy = dyn_cast<ArrayType>(ElemTy)));
792
793 // Check for array of vectors
794 auto *InnerVectorTy = dyn_cast<FixedVectorType>(ElemTy);
795 if (InnerVectorTy) {
796 NumElems *= InnerVectorTy->getNumElements();
797 ElemTy = InnerVectorTy->getElementType();
798 }
799
800 if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
801 unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
802 if (ElementSize > 0) {
803 unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
804 // Expand vector if required to match padding of inner type,
805 // i.e. odd size subvectors.
806 // Storage size of new vector must match that of alloca for correct
807 // behaviour of byte offsets and GEP computation.
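// For example (with the AMDGPU data layout, where <3 x i32> is padded to a
// 16-byte alloc size), [4 x <3 x i32>] occupies 64 bytes and is therefore
// promoted to <16 x i32> rather than <12 x i32>.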
808 if (NumElems * ElementSize != AllocaSize)
809 NumElems = AllocaSize / ElementSize;
810 if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
811 VectorTy = FixedVectorType::get(ElemTy, NumElems);
812 }
813 }
814 }
815 if (!VectorTy) {
816 LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
817 return nullptr;
818 }
819
820 const unsigned MaxElements =
821 (MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType());
822
823 if (VectorTy->getNumElements() > MaxElements ||
824 VectorTy->getNumElements() < 2) {
825 LLVM_DEBUG(dbgs() << " " << *VectorTy
826 << " has an unsupported number of elements\n");
827 return nullptr;
828 }
829
830 Type *VecEltTy = VectorTy->getElementType();
831 unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
832 if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
833 LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size "
834 "does not match the type's size\n");
835 return nullptr;
836 }
837
838 return VectorTy;
839}
840
841// FIXME: Should try to pick the most likely to be profitable allocas first.
842bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
843 LLVM_DEBUG(dbgs() << "Trying to promote to vectors: " << Alloca << '\n');
844
845 Type *AllocaTy = Alloca.getAllocatedType();
846 FixedVectorType *VectorTy = getVectorTypeForAlloca(AllocaTy);
847 if (!VectorTy)
848 return false;
849
850 std::map<GetElementPtrInst *, WeakTrackingVH> GEPVectorIdx;
851 SmallVector<Instruction *> WorkList;
852 SmallVector<Instruction *> UsersToRemove;
853 SmallVector<Instruction *> DeferredInsts;
854 SmallVector<Instruction *> NewGEPInsts;
855 DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
856
857 const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
858 LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n"
859 << " " << *Inst << "\n");
860 for (auto *Inst : reverse(NewGEPInsts))
861 Inst->eraseFromParent();
862 return false;
863 };
864
865 SmallVector<Use *, 8> Uses;
866 collectAllocaUses(Alloca, Uses);
867
868 LLVM_DEBUG(dbgs() << " Attempting promotion to: " << *VectorTy << "\n");
869
870 Type *VecEltTy = VectorTy->getElementType();
871 unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
872 assert(ElementSize > 0);
873 for (auto *U : Uses) {
874 Instruction *Inst = cast<Instruction>(U->getUser());
875
876 if (Value *Ptr = getLoadStorePointerOperand(Inst)) {
877 // This is a store of the pointer, not to the pointer.
878 if (isa<StoreInst>(Inst) &&
879 U->getOperandNo() != StoreInst::getPointerOperandIndex())
880 return RejectUser(Inst, "pointer is being stored");
881
882 Type *AccessTy = getLoadStoreType(Inst);
883 if (AccessTy->isAggregateType())
884 return RejectUser(Inst, "unsupported load/store as aggregate");
885 assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy());
886
887 // Check that this is a simple access of a vector element.
888 bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple()
889 : cast<StoreInst>(Inst)->isSimple();
890 if (!IsSimple)
891 return RejectUser(Inst, "not a simple load or store");
892
893 Ptr = Ptr->stripPointerCasts();
894
895 // Alloca already accessed as vector.
896 if (Ptr == &Alloca && DL->getTypeStoreSize(Alloca.getAllocatedType()) ==
897 DL->getTypeStoreSize(AccessTy)) {
898 WorkList.push_back(Inst);
899 continue;
900 }
901
902 if (!isSupportedAccessType(VectorTy, AccessTy, *DL))
903 return RejectUser(Inst, "not a supported access type");
904
905 WorkList.push_back(Inst);
906 continue;
907 }
908
909 if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
910 // If we can't compute a vector index from this GEP, then we can't
911 // promote this alloca to vector.
912 Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts);
913 if (!Index)
914 return RejectUser(Inst, "cannot compute vector index for GEP");
915
916 GEPVectorIdx[GEP] = Index;
917 UsersToRemove.push_back(Inst);
918 continue;
919 }
920
921 if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst);
922 MSI && isSupportedMemset(MSI, &Alloca, *DL)) {
923 WorkList.push_back(Inst);
924 continue;
925 }
926
927 if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Inst)) {
928 if (TransferInst->isVolatile())
929 return RejectUser(Inst, "mem transfer inst is volatile");
930
931 ConstantInt *Len = dyn_cast<ConstantInt>(TransferInst->getLength());
932 if (!Len || (Len->getZExtValue() % ElementSize))
933 return RejectUser(Inst, "mem transfer inst length is non-constant or "
934 "not a multiple of the vector element size");
935
936 if (TransferInfo.try_emplace(TransferInst).second) {
937 DeferredInsts.push_back(Inst);
938 WorkList.push_back(Inst);
939 }
940
941 auto getPointerIndexOfAlloca = [&](Value *Ptr) -> ConstantInt * {
942 auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
943 if (Ptr != &Alloca && !GEPVectorIdx.count(GEP))
944 return nullptr;
945
946 return dyn_cast<ConstantInt>(calculateVectorIndex(Ptr, GEPVectorIdx));
947 };
948
949 unsigned OpNum = U->getOperandNo();
950 MemTransferInfo *TI = &TransferInfo[TransferInst];
951 if (OpNum == 0) {
952 Value *Dest = TransferInst->getDest();
953 ConstantInt *Index = getPointerIndexOfAlloca(Dest);
954 if (!Index)
955 return RejectUser(Inst, "could not calculate constant dest index");
956 TI->DestIndex = Index;
957 } else {
958 assert(OpNum == 1);
959 Value *Src = TransferInst->getSource();
960 ConstantInt *Index = getPointerIndexOfAlloca(Src);
961 if (!Index)
962 return RejectUser(Inst, "could not calculate constant src index");
963 TI->SrcIndex = Index;
964 }
965 continue;
966 }
967
968 if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
969 if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
970 WorkList.push_back(Inst);
971 continue;
972 }
973 }
974
975 // Ignore assume-like intrinsics and comparisons used in assumes.
976 if (isAssumeLikeIntrinsic(Inst)) {
977 if (!Inst->use_empty())
978 return RejectUser(Inst, "assume-like intrinsic cannot have any users");
979 UsersToRemove.push_back(Inst);
980 continue;
981 }
982
983 if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) {
984 return isAssumeLikeIntrinsic(cast<Instruction>(U));
985 })) {
986 UsersToRemove.push_back(Inst);
987 continue;
988 }
989
990 return RejectUser(Inst, "unhandled alloca user");
991 }
992
993 while (!DeferredInsts.empty()) {
994 Instruction *Inst = DeferredInsts.pop_back_val();
995 MemTransferInst *TransferInst = cast<MemTransferInst>(Inst);
996 // TODO: Support the case if the pointers are from different alloca or
997 // from different address spaces.
998 MemTransferInfo &Info = TransferInfo[TransferInst];
999 if (!Info.SrcIndex || !Info.DestIndex)
1000 return RejectUser(
1001 Inst, "mem transfer inst is missing constant src and/or dst index");
1002 }
1003
1004 LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> "
1005 << *VectorTy << '\n');
1006 const unsigned VecStoreSize = DL->getTypeStoreSize(VectorTy);
1007
1008 // Alloca is uninitialized memory. Imitate that by making the first value
1009 // undef.
1010 SSAUpdater Updater;
1011 Updater.Initialize(VectorTy, "promotealloca");
1012
1013 BasicBlock *EntryBB = Alloca.getParent();
1014 BasicBlock::iterator InitInsertPos =
1015 skipToNonAllocaInsertPt(*EntryBB, Alloca.getIterator());
1016 // Alloca memory is undefined to begin with, not poison.
1017 Value *AllocaInitValue =
1018 new FreezeInst(PoisonValue::get(VectorTy), "", InitInsertPos);
1019 AllocaInitValue->takeName(&Alloca);
1020
1021 Updater.AddAvailableValue(EntryBB, AllocaInitValue);
1022
1023 // First handle the initial worklist, in basic block order.
1024 //
1025 // Insert a placeholder whenever we need the vector value at the top of a
1026 // basic block.
1027 SmallVector<Instruction *> Placeholders;
1028 forEachWorkListItem(WorkList, [&](Instruction *I) {
1029 BasicBlock *BB = I->getParent();
1030 auto GetCurVal = [&]() -> Value * {
1031 if (Value *CurVal = Updater.FindValueForBlock(BB))
1032 return CurVal;
1033
1034 if (!Placeholders.empty() && Placeholders.back()->getParent() == BB)
1035 return Placeholders.back();
1036
1037 // If the current value in the basic block is not yet known, insert a
1038 // placeholder that we will replace later.
1039 IRBuilder<> Builder(I);
1040 auto *Placeholder = cast<Instruction>(Builder.CreateFreeze(
1041 PoisonValue::get(VectorTy), "promotealloca.placeholder"));
1042 Placeholders.push_back(Placeholder);
1043 return Placeholders.back();
1044 };
1045
1046 Value *Result =
1047 promoteAllocaUserToVector(I, *DL, VectorTy, VecStoreSize, ElementSize,
1048 TransferInfo, GEPVectorIdx, GetCurVal);
1049 if (Result)
1050 Updater.AddAvailableValue(BB, Result);
1051 });
1052
1053 // Now fixup the placeholders.
1054 for (Instruction *Placeholder : Placeholders) {
1055 Placeholder->replaceAllUsesWith(
1056 Updater.GetValueInMiddleOfBlock(Placeholder->getParent()));
1057 Placeholder->eraseFromParent();
1058 }
1059
1060 // Delete all instructions. On the first pass, new dummy loads may have been
1061 // added so we need to collect them too.
1062 DenseSet<Instruction *> InstsToDelete(WorkList.begin(), WorkList.end());
1063 for (Instruction *I : InstsToDelete) {
1064 assert(I->use_empty());
1065 I->eraseFromParent();
1066 }
1067
1068 // Delete all the users that are known to be removable.
1069 for (Instruction *I : reverse(UsersToRemove)) {
1070 I->dropDroppableUses();
1071 assert(I->use_empty());
1072 I->eraseFromParent();
1073 }
1074
1075 // Alloca should now be dead too.
1076 assert(Alloca.use_empty());
1077 Alloca.eraseFromParent();
1078 return true;
1079}
1080
1081std::pair<Value *, Value *>
1082AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
1083 Function &F = *Builder.GetInsertBlock()->getParent();
1084 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
1085
1086 if (!IsAMDHSA) {
1087 CallInst *LocalSizeY =
1088 Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_y, {});
1089 CallInst *LocalSizeZ =
1090 Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_z, {});
1091
1092 ST.makeLIDRangeMetadata(LocalSizeY);
1093 ST.makeLIDRangeMetadata(LocalSizeZ);
1094
1095 return std::pair(LocalSizeY, LocalSizeZ);
1096 }
1097
1098 // We must read the size out of the dispatch pointer.
1099 assert(IsAMDGCN);
1100
1101 // We are indexing into this struct, and want to extract the workgroup_size_*
1102 // fields.
1103 //
1104 // typedef struct hsa_kernel_dispatch_packet_s {
1105 // uint16_t header;
1106 // uint16_t setup;
1107 // uint16_t workgroup_size_x ;
1108 // uint16_t workgroup_size_y;
1109 // uint16_t workgroup_size_z;
1110 // uint16_t reserved0;
1111 // uint32_t grid_size_x ;
1112 // uint32_t grid_size_y ;
1113 // uint32_t grid_size_z;
1114 //
1115 // uint32_t private_segment_size;
1116 // uint32_t group_segment_size;
1117 // uint64_t kernel_object;
1118 //
1119 // #ifdef HSA_LARGE_MODEL
1120 // void *kernarg_address;
1121 // #elif defined HSA_LITTLE_ENDIAN
1122 // void *kernarg_address;
1123 // uint32_t reserved1;
1124 // #else
1125 // uint32_t reserved1;
1126 // void *kernarg_address;
1127 // #endif
1128 // uint64_t reserved2;
1129 // hsa_signal_t completion_signal; // uint64_t wrapper
1130 // } hsa_kernel_dispatch_packet_t
1131 //
1132 CallInst *DispatchPtr =
1133 Builder.CreateIntrinsic(Intrinsic::amdgcn_dispatch_ptr, {});
1134 DispatchPtr->addRetAttr(Attribute::NoAlias);
1135 DispatchPtr->addRetAttr(Attribute::NonNull);
1136 F.removeFnAttr("amdgpu-no-dispatch-ptr");
1137
1138 // Size of the dispatch packet struct.
1139 DispatchPtr->addDereferenceableRetAttr(64);
1140
1141 Type *I32Ty = Type::getInt32Ty(Mod->getContext());
1142
1143 // We could do a single 64-bit load here, but it's likely that the basic
1144 // 32-bit and extract sequence is already present, and it is probably easier
1145 // to CSE this. The loads should be mergeable later anyway.
1146 Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 1);
1147 LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));
1148
1149 Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 2);
1150 LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));
1151
1152 MDNode *MD = MDNode::get(Mod->getContext(), {});
1153 LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
1154 LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
1155 ST.makeLIDRangeMetadata(LoadZU);
1156
1157 // Extract y component. Upper half of LoadZU should be zero already.
1158 Value *Y = Builder.CreateLShr(LoadXY, 16);
1159
1160 return std::pair(Y, LoadZU);
1161}
1162
1163Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
1164 unsigned N) {
1165 Function *F = Builder.GetInsertBlock()->getParent();
1166 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, *F);
1167 Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
1168 StringRef AttrName;
1169
1170 switch (N) {
1171 case 0:
1172 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
1173 : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
1174 AttrName = "amdgpu-no-workitem-id-x";
1175 break;
1176 case 1:
1177 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
1178 : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
1179 AttrName = "amdgpu-no-workitem-id-y";
1180 break;
1181
1182 case 2:
1183 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
1184 : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
1185 AttrName = "amdgpu-no-workitem-id-z";
1186 break;
1187 default:
1188 llvm_unreachable("invalid dimension");
1189 }
1190
1191 Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(Mod, IntrID);
1192 CallInst *CI = Builder.CreateCall(WorkitemIdFn);
1193 ST.makeLIDRangeMetadata(CI);
1194 F->removeFnAttr(AttrName);
1195
1196 return CI;
1197}
1198
1199static bool isCallPromotable(CallInst *CI) {
1200 IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
1201 if (!II)
1202 return false;
1203
1204 switch (II->getIntrinsicID()) {
1205 case Intrinsic::memcpy:
1206 case Intrinsic::memmove:
1207 case Intrinsic::memset:
1208 case Intrinsic::lifetime_start:
1209 case Intrinsic::lifetime_end:
1210 case Intrinsic::invariant_start:
1211 case Intrinsic::invariant_end:
1212 case Intrinsic::launder_invariant_group:
1213 case Intrinsic::strip_invariant_group:
1214 case Intrinsic::objectsize:
1215 return true;
1216 default:
1217 return false;
1218 }
1219}
1220
1221bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
1222 Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0,
1223 int OpIdx1) const {
1224 // Figure out which operand is the one we might not be promoting.
1225 Value *OtherOp = Inst->getOperand(OpIdx0);
1226 if (Val == OtherOp)
1227 OtherOp = Inst->getOperand(OpIdx1);
1228
1229 if (isa<ConstantPointerNull>(OtherOp))
1230 return true;
1231
1232 // TODO: getUnderlyingObject will not work on a vector getelementptr
1233 Value *OtherObj = getUnderlyingObject(OtherOp);
1234 if (!isa<AllocaInst>(OtherObj))
1235 return false;
1236
1237 // TODO: We should be able to replace undefs with the right pointer type.
1238
1239 // TODO: If we know the other base object is another promotable
1240 // alloca, not necessarily this alloca, we can do this. The
1241 // important part is both must have the same address space at
1242 // the end.
1243 if (OtherObj != BaseAlloca) {
1244 LLVM_DEBUG(
1245 dbgs() << "Found a binary instruction with another alloca object\n");
1246 return false;
1247 }
1248
1249 return true;
1250}
1251
1252bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
1253 Value *BaseAlloca, Value *Val, std::vector<Value *> &WorkList) const {
1254
1255 for (User *User : Val->users()) {
1256 if (is_contained(WorkList, User))
1257 continue;
1258
1259 if (CallInst *CI = dyn_cast<CallInst>(User)) {
1260 if (!isCallPromotable(CI))
1261 return false;
1262
1263 WorkList.push_back(User);
1264 continue;
1265 }
1266
1267 Instruction *UseInst = cast<Instruction>(User);
1268 if (UseInst->getOpcode() == Instruction::PtrToInt)
1269 return false;
1270
1271 if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
1272 if (LI->isVolatile())
1273 return false;
1274 continue;
1275 }
1276
1277 if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
1278 if (SI->isVolatile())
1279 return false;
1280
1281 // Reject if the stored value is not the pointer operand.
1282 if (SI->getPointerOperand() != Val)
1283 return false;
1284 continue;
1285 }
1286
1287 if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
1288 if (RMW->isVolatile())
1289 return false;
1290 continue;
1291 }
1292
1293 if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
1294 if (CAS->isVolatile())
1295 return false;
1296 continue;
1297 }
1298
1299 // Only promote a select if we know that the other select operand
1300 // is from another pointer that will also be promoted.
1301 if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
1302 if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
1303 return false;
1304
1305 // May need to rewrite constant operands.
1306 WorkList.push_back(ICmp);
1307 continue;
1308 }
1309
1310 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
1311 // Be conservative if an address could be computed outside the bounds of
1312 // the alloca.
1313 if (!GEP->isInBounds())
1314 return false;
1315 } else if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
1316 // Only promote a select if we know that the other select operand is from
1317 // another pointer that will also be promoted.
1318 if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
1319 return false;
1320 } else if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
1321 // Repeat for phis.
1322
1323 // TODO: Handle more complex cases. We should be able to replace loops
1324 // over arrays.
1325 switch (Phi->getNumIncomingValues()) {
1326 case 1:
1327 break;
1328 case 2:
1329 if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
1330 return false;
1331 break;
1332 default:
1333 return false;
1334 }
1335 } else if (!isa<ExtractElementInst>(User)) {
1336 // Do not promote vector/aggregate type instructions. It is hard to track
1337 // their users.
1338
1339 // Do not promote addrspacecast.
1340 //
1341 // TODO: If we know the address is only observed through flat pointers, we
1342 // could still promote.
1343 return false;
1344 }
1345
1346 WorkList.push_back(User);
1347 if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
1348 return false;
1349 }
1350
1351 return true;
1352}
1353
1354bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
1355
1356 FunctionType *FTy = F.getFunctionType();
1357 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
1358
1359 // If the function has any arguments in the local address space, then it's
1360 // possible these arguments require the entire local memory space, so
1361 // we cannot use local memory in the pass.
1362 for (Type *ParamTy : FTy->params()) {
1363 PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
1364 if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
1365 LocalMemLimit = 0;
1366 LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
1367 "local memory disabled.\n");
1368 return false;
1369 }
1370 }
1371
1372 LocalMemLimit = ST.getAddressableLocalMemorySize();
1373 if (LocalMemLimit == 0)
1374 return false;
1375
1376 SmallVector<const Constant *, 16> Stack;
1377 SmallPtrSet<const Constant *, 8> VisitedConstants;
1378 SmallPtrSet<const GlobalVariable *, 8> UsedLDS;
1379
1380 auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
1381 for (const User *U : Val->users()) {
1382 if (const Instruction *Use = dyn_cast<Instruction>(U)) {
1383 if (Use->getFunction() == &F)
1384 return true;
1385 } else {
1386 const Constant *C = cast<Constant>(U);
1387 if (VisitedConstants.insert(C).second)
1388 Stack.push_back(C);
1389 }
1390 }
1391
1392 return false;
1393 };
1394
1395 for (GlobalVariable &GV : Mod->globals()) {
1396 if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
1397 continue;
1398
1399 if (visitUsers(&GV, &GV)) {
1400 UsedLDS.insert(&GV);
1401 Stack.clear();
1402 continue;
1403 }
1404
1405 // For any ConstantExpr uses, we need to recursively search the users until
1406 // we see a function.
1407 while (!Stack.empty()) {
1408 const Constant *C = Stack.pop_back_val();
1409 if (visitUsers(&GV, C)) {
1410 UsedLDS.insert(&GV);
1411 Stack.clear();
1412 break;
1413 }
1414 }
1415 }
1416
1417 const DataLayout &DL = Mod->getDataLayout();
1418 SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
1419 AllocatedSizes.reserve(UsedLDS.size());
1420
1421 for (const GlobalVariable *GV : UsedLDS) {
1422 Align Alignment =
1423 DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
1424 uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType());
1425
1426 // HIP uses an extern unsized array in local address space for dynamically
1427 // allocated shared memory. In that case, we have to disable the promotion.
1428 if (GV->hasExternalLinkage() && AllocSize == 0) {
1429 LocalMemLimit = 0;
1430 LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
1431 "local memory. Promoting to local memory "
1432 "disabled.\n");
1433 return false;
1434 }
1435
1436 AllocatedSizes.emplace_back(AllocSize, Alignment);
1437 }
1438
1439 // Sort to try to estimate the worst case alignment padding
1440 //
1441 // FIXME: We should really do something to fix the addresses to a more optimal
1442 // value instead
1443 llvm::sort(AllocatedSizes, llvm::less_second());
1444
1445 // Check how much local memory is being used by global objects
1446 CurrentLocalMemUsage = 0;
1447
1448 // FIXME: Try to account for padding here. The real padding and address is
1449 // currently determined from the inverse order of uses in the function when
1450 // legalizing, which could also potentially change. We try to estimate the
1451 // worst case here, but we probably should fix the addresses earlier.
1452 for (auto Alloc : AllocatedSizes) {
1453 CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
1454 CurrentLocalMemUsage += Alloc.first;
1455 }
1456
1457 unsigned MaxOccupancy =
1458 ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), CurrentLocalMemUsage, F)
1459 .second;
1460
1461 // Round up to the next tier of usage.
1462 unsigned MaxSizeWithWaveCount =
1463 ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
1464
1465 // Program may already use more LDS than is usable at maximum occupancy.
1466 if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
1467 return false;
1468
1469 LocalMemLimit = MaxSizeWithWaveCount;
1470
1471 LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
1472 << " bytes of LDS\n"
1473 << " Rounding size to " << MaxSizeWithWaveCount
1474 << " with a maximum occupancy of " << MaxOccupancy << '\n'
1475 << " and " << (LocalMemLimit - CurrentLocalMemUsage)
1476 << " available for promotion\n");
1477
1478 return true;
1479}
1480
1481// FIXME: Should try to pick the most likely to be profitable allocas first.
1482bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
1483 bool SufficientLDS) {
1484 LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << I << '\n');
1485
1486 if (DisablePromoteAllocaToLDS) {
1487 LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n");
1488 return false;
1489 }
1490
1491 const DataLayout &DL = Mod->getDataLayout();
1492 IRBuilder<> Builder(&I);
1493
1494 const Function &ContainingFunction = *I.getFunction();
1495 CallingConv::ID CC = ContainingFunction.getCallingConv();
1496
1497 // Don't promote the alloca to LDS for shader calling conventions as the work
1498 // item ID intrinsics are not supported for these calling conventions.
1499 // Furthermore not all LDS is available for some of the stages.
1500 switch (CC) {
1501 case CallingConv::AMDGPU_KERNEL:
1502 case CallingConv::SPIR_KERNEL:
1503 break;
1504 default:
1505 LLVM_DEBUG(
1506 dbgs()
1507 << " promote alloca to LDS not supported with calling convention.\n");
1508 return false;
1509 }
1510
1511 // Not likely to have sufficient local memory for promotion.
1512 if (!SufficientLDS)
1513 return false;
1514
1515 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction);
1516 unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
1517
1518 Align Alignment =
1519 DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType());
1520
1521 // FIXME: This computed padding is likely wrong since it depends on inverse
1522 // usage order.
1523 //
1524 // FIXME: It is also possible that if we're allowed to use all of the memory
1525 // we could end up using more than the maximum due to alignment padding.
1526
1527 uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
1528 uint32_t AllocSize =
1529 WorkGroupSize * DL.getTypeAllocSize(I.getAllocatedType());
1530 NewSize += AllocSize;
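// For example, a 64-byte alloca in a kernel whose maximum flat workgroup size
// is 256 reserves 256 * 64 = 16384 bytes of LDS here, since every workitem
// needs its own copy.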
1531
1532 if (NewSize > LocalMemLimit) {
1533 LLVM_DEBUG(dbgs() << " " << AllocSize
1534 << " bytes of local memory not available to promote\n");
1535 return false;
1536 }
1537
1538 CurrentLocalMemUsage = NewSize;
1539
1540 std::vector<Value *> WorkList;
1541
1542 if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
1543 LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
1544 return false;
1545 }
1546
1547 LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");
1548
1549 Function *F = I.getFunction();
1550
1551 Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
1552 GlobalVariable *GV = new GlobalVariable(
1553 *Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(GVTy),
1554 Twine(F->getName()) + Twine('.') + I.getName(), nullptr,
1555 GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
1556 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
1557 GV->setAlignment(I.getAlign());
1558
1559 Value *TCntY, *TCntZ;
1560
1561 std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
1562 Value *TIdX = getWorkitemID(Builder, 0);
1563 Value *TIdY = getWorkitemID(Builder, 1);
1564 Value *TIdZ = getWorkitemID(Builder, 2);
1565
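// The linearized workitem id within the workgroup:
//   TID = TIdX * (TCntY * TCntZ) + TIdY * TCntZ + TIdZ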
1566 Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
1567 Tmp0 = Builder.CreateMul(Tmp0, TIdX);
1568 Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
1569 Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
1570 TID = Builder.CreateAdd(TID, TIdZ);
1571
1572 LLVMContext &Context = Mod->getContext();
1573 Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(Context)), TID};
1574
1575 Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
1576 I.mutateType(Offset->getType());
1577 I.replaceAllUsesWith(Offset);
1578 I.eraseFromParent();
1579
1580 SmallVector<IntrinsicInst *> DeferredIntrs;
1581
1582 PointerType *NewPtrTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);
1583
1584 for (Value *V : WorkList) {
1585 CallInst *Call = dyn_cast<CallInst>(V);
1586 if (!Call) {
1587 if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
1588 Value *LHS = CI->getOperand(0);
1589 Value *RHS = CI->getOperand(1);
1590
1591 Type *NewTy = LHS->getType()->getWithNewType(NewPtrTy);
1592 if (isa<ConstantPointerNull>(LHS))
1593 CI->setOperand(0, Constant::getNullValue(NewTy));
1594
1595 if (isa<ConstantPointerNull>(RHS))
1596 CI->setOperand(1, Constant::getNullValue(NewTy));
1597
1598 continue;
1599 }
1600
1601 // The operand's value should be corrected on its own and we don't want to
1602 // touch the users.
1603 if (isa<ConstantPointerNull>(V))
1604 continue;
1605
1606 assert(V->getType()->isPtrOrPtrVectorTy());
1607
1608 Type *NewTy = V->getType()->getWithNewType(NewPtrTy);
1609 V->mutateType(NewTy);
1610
1611 // Adjust the types of any constant operands.
1612 if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
1613 if (isa<ConstantPointerNull>(SI->getOperand(1)))
1614 SI->setOperand(1, Constant::getNullValue(NewTy));
1615
1616 if (isa<ConstantPointerNull>(SI->getOperand(2)))
1617 SI->setOperand(2, Constant::getNullValue(NewTy));
1618 } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
1619 for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
1620 if (isa<ConstantPointerNull>(
1621 Phi->getIncomingValue(I)))
1622 Phi->setIncomingValue(I, Constant::getNullValue(NewTy));
1623 }
1624 }
1625
1626 continue;
1627 }
1628
1629 IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
1630 Builder.SetInsertPoint(Intr);
1631 switch (Intr->getIntrinsicID()) {
1632 case Intrinsic::lifetime_start:
1633 case Intrinsic::lifetime_end:
1634 // These intrinsics are for address space 0 only
1635 Intr->eraseFromParent();
1636 continue;
1637 case Intrinsic::memcpy:
1638 case Intrinsic::memmove:
1639 // These have 2 pointer operands. In case if second pointer also needs
1640 // to be replaced we defer processing of these intrinsics until all
1641 // other values are processed.
1642 DeferredIntrs.push_back(Intr);
1643 continue;
1644 case Intrinsic::memset: {
1645 MemSetInst *MemSet = cast<MemSetInst>(Intr);
1646 Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
1647 MemSet->getLength(), MemSet->getDestAlign(),
1648 MemSet->isVolatile());
1649 Intr->eraseFromParent();
1650 continue;
1651 }
1652 case Intrinsic::invariant_start:
1653 case Intrinsic::invariant_end:
1654 case Intrinsic::launder_invariant_group:
1655 case Intrinsic::strip_invariant_group: {
1656 SmallVector<Value *, 2> Args;
1657 if (Intr->getIntrinsicID() == Intrinsic::invariant_start) {
1658 Args.emplace_back(Intr->getArgOperand(0));
1659 } else if (Intr->getIntrinsicID() == Intrinsic::invariant_end) {
1660 Args.emplace_back(Intr->getArgOperand(0));
1661 Args.emplace_back(Intr->getArgOperand(1));
1662 }
1663 Args.emplace_back(Offset);
1664 Function *F = Intrinsic::getOrInsertDeclaration(
1665 Intr->getModule(), Intr->getIntrinsicID(), Offset->getType());
1666 CallInst *NewIntr =
1667 CallInst::Create(F, Args, Intr->getName(), Intr->getIterator());
1668 Intr->mutateType(NewIntr->getType());
1669 Intr->replaceAllUsesWith(NewIntr);
1670 Intr->eraseFromParent();
1671 continue;
1672 }
1673 case Intrinsic::objectsize: {
1674 Value *Src = Intr->getOperand(0);
1675
1676 CallInst *NewCall = Builder.CreateIntrinsic(
1677 Intrinsic::objectsize,
1678 {Intr->getType(), Src->getType()},
1679 {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
1680 Intr->replaceAllUsesWith(NewCall);
1681 Intr->eraseFromParent();
1682 continue;
1683 }
1684 default:
1685 Intr->print(errs());
1686 llvm_unreachable("Don't know how to promote alloca intrinsic use.");
1687 }
1688 }
1689
1690 for (IntrinsicInst *Intr : DeferredIntrs) {
1691 Builder.SetInsertPoint(Intr);
1692 Intrinsic::ID ID = Intr->getIntrinsicID();
1693 assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);
1694
1695 MemTransferInst *MI = cast<MemTransferInst>(Intr);
1696 auto *B = Builder.CreateMemTransferInst(
1697 ID, MI->getRawDest(), MI->getDestAlign(), MI->getRawSource(),
1698 MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
1699
1700 for (unsigned I = 0; I != 2; ++I) {
1701 if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I)) {
1702 B->addDereferenceableParamAttr(I, Bytes);
1703 }
1704 }
1705
1706 Intr->eraseFromParent();
1707 }
1708
1709 return true;
1710}
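
For reference, below is a minimal sketch of driving both promotion passes through the new pass manager. The wrapper function runPromoteAllocaPasses and the analysis-manager setup are illustrative only and assume an AMDGPU TargetMachine has already been constructed; AMDGPUPromoteAllocaToVectorPass and AMDGPUPromoteAllocaPass are the new-PM passes declared in AMDGPU.h. Scheduling them back to back here is purely for illustration.

#include "AMDGPU.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

// Illustrative helper (not part of this file): run the vector-only promotion
// followed by the vector+LDS promotion over every function in M.
static void runPromoteAllocaPasses(Module &M, TargetMachine &TM) {
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;

  // Standard new-PM analysis registration and proxy wiring.
  PassBuilder PB(&TM);
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  FunctionPassManager FPM;
  FPM.addPass(AMDGPUPromoteAllocaToVectorPass(TM)); // promotion to vector only
  FPM.addPass(AMDGPUPromoteAllocaPass(TM));         // promotion to vector and LDS

  ModulePassManager MPM;
  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
  MPM.run(M, MAM);
}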