1//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Eliminates allocas by either converting them into vectors or by migrating
10// them to local address space.
11//
12// Two passes are exposed by this file:
13// - "promote-alloca-to-vector", which runs early in the pipeline and only
14// promotes to vector. Promotion to vector is almost always profitable
15// except when the alloca is too big and the promotion would result in
16// very high register pressure.
17// - "promote-alloca", which does both promotion to vector and LDS and runs
18// much later in the pipeline. This runs after SROA because promoting to
19// LDS is of course less profitable than getting rid of the alloca or
20// vectorizing it, thus we only want to do it when the only alternative is
21// lowering the alloca to stack.
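//
// As a rough sketch of the first pass (hypothetical IR, simplified; the real
// transformation also freezes the initial poison value and handles many more
// access patterns), promotion to vector turns
//
//   %stack = alloca [4 x float], align 4, addrspace(5)
//   %gep = getelementptr [4 x float], ptr addrspace(5) %stack, i32 0, i32 %i
//   store float %v, ptr addrspace(5) %gep
//   %reload = load float, ptr addrspace(5) %stack
//
// into something like
//
//   %vec = insertelement <4 x float> poison, float %v, i32 %i
//   %reload = extractelement <4 x float> %vec, i32 0
//
// so the scratch allocation and its loads/stores disappear entirely.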
22//
23// Note that both of them exist for the old and new PMs. The new PM passes are
24// declared in AMDGPU.h and the legacy PM ones are declared here.
25//
26//===----------------------------------------------------------------------===//
27
28#include "AMDGPU.h"
29#include "GCNSubtarget.h"
30#include "Utils/AMDGPUBaseInfo.h"
31#include "llvm/ADT/STLExtras.h"
32#include "llvm/Analysis/CaptureTracking.h"
33#include "llvm/Analysis/InstSimplifyFolder.h"
34#include "llvm/Analysis/InstructionSimplify.h"
35#include "llvm/Analysis/LoopInfo.h"
36#include "llvm/Analysis/ValueTracking.h"
37#include "llvm/CodeGen/TargetPassConfig.h"
38#include "llvm/IR/IRBuilder.h"
39#include "llvm/IR/IntrinsicInst.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
42#include "llvm/IR/PatternMatch.h"
43#include "llvm/InitializePasses.h"
44#include "llvm/Pass.h"
45#include "llvm/Target/TargetMachine.h"
46#include "llvm/Transforms/Utils/SSAUpdater.h"
47
48#define DEBUG_TYPE "amdgpu-promote-alloca"
49
50using namespace llvm;
51
52namespace {
53
54static cl::opt<bool>
55 DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
56 cl::desc("Disable promote alloca to vector"),
57 cl::init(false));
58
59static cl::opt<bool>
60 DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
61 cl::desc("Disable promote alloca to LDS"),
62 cl::init(false));
63
64static cl::opt<unsigned> PromoteAllocaToVectorLimit(
65 "amdgpu-promote-alloca-to-vector-limit",
66 cl::desc("Maximum byte size to consider promote alloca to vector"),
67 cl::init(0));
68
69static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
70 "amdgpu-promote-alloca-to-vector-max-regs",
72 "Maximum vector size (in 32b registers) to use when promoting alloca"),
73 cl::init(32));
74
75// Use up to 1/4 of available register budget for vectorization.
76// FIXME: Increase the limit for whole function budgets? Perhaps x2?
77static cl::opt<unsigned> PromoteAllocaToVectorVGPRRatio(
78 "amdgpu-promote-alloca-to-vector-vgpr-ratio",
79 cl::desc("Ratio of VGPRs to budget for promoting alloca to vectors"),
80 cl::init(4));
81
82static cl::opt<unsigned>
83 LoopUserWeight("promote-alloca-vector-loop-user-weight",
84 cl::desc("The bonus weight of users of allocas within loop "
85 "when sorting profitable allocas"),
86 cl::init(4));
87
88// We support vector indices of the form (A * stride) + B
89// All parts are optional.
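// For example, a GEP into an alloca of i32 elements whose byte offset is
// (8 * %i) + 4 would (in this hypothetical case) decompose into VarIndex = %i,
// VarMul = 2 and ConstIndex = 1, i.e. the element index (2 * %i) + 1.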
90struct GEPToVectorIndex {
91 Value *VarIndex = nullptr; // defaults to 0
92 ConstantInt *VarMul = nullptr; // defaults to 1
93 ConstantInt *ConstIndex = nullptr; // defaults to 0
94 Value *Full = nullptr;
95};
96
97struct MemTransferInfo {
98 ConstantInt *SrcIndex = nullptr;
99 ConstantInt *DestIndex = nullptr;
100};
101
102// Analysis for planning the different strategies of alloca promotion.
103struct AllocaAnalysis {
104 AllocaInst *Alloca = nullptr;
105 DenseSet<Value *> Pointers;
106 SmallVector<Use *> Uses;
107 unsigned Score = 0;
108 bool HaveSelectOrPHI = false;
109 struct {
110 FixedVectorType *Ty = nullptr;
111 DenseMap<GetElementPtrInst *, GEPToVectorIndex> GEPVectorIdx;
112 SmallVector<Instruction *> UsersToRemove;
113 SmallVector<Instruction *> Worklist;
114 DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
115 } Vector;
116 struct {
117 bool Enable = false;
118 SmallVector<User *> Worklist;
119 } LDS;
120
121 explicit AllocaAnalysis(AllocaInst *Alloca) : Alloca(Alloca) {}
122};
123
124// Shared implementation which can do both promotion to vector and to LDS.
125class AMDGPUPromoteAllocaImpl {
126private:
127 const TargetMachine &TM;
128 LoopInfo &LI;
129 Module *Mod = nullptr;
130 const DataLayout *DL = nullptr;
131
132 // FIXME: This should be per-kernel.
133 uint32_t LocalMemLimit = 0;
134 uint32_t CurrentLocalMemUsage = 0;
135 unsigned MaxVGPRs;
136 unsigned VGPRBudgetRatio;
137 unsigned MaxVectorRegs;
138
139 bool IsAMDGCN = false;
140 bool IsAMDHSA = false;
141
142 std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
143 Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
144
145 bool collectAllocaUses(AllocaAnalysis &AA) const;
146
147 /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
148 /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
149 /// Returns true if both operands are derived from the same alloca. Val should
150 /// be the same value as one of the input operands of UseInst.
151 bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
152 Instruction *UseInst, int OpIdx0,
153 int OpIdx1) const;
154
155 /// Check whether we have enough local memory for promotion.
156 bool hasSufficientLocalMem(const Function &F);
157
158 FixedVectorType *getVectorTypeForAlloca(Type *AllocaTy) const;
159 void analyzePromoteToVector(AllocaAnalysis &AA) const;
160 void promoteAllocaToVector(AllocaAnalysis &AA);
161 void analyzePromoteToLDS(AllocaAnalysis &AA) const;
162 bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS,
163 SetVector<IntrinsicInst *> &DeferredIntrs);
164 void
165 finishDeferredAllocaToLDSPromotion(SetVector<IntrinsicInst *> &DeferredIntrs);
166
167 void scoreAlloca(AllocaAnalysis &AA) const;
168
169 void setFunctionLimits(const Function &F);
170
171public:
172 AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {
173
174 const Triple &TT = TM.getTargetTriple();
175 IsAMDGCN = TT.isAMDGCN();
176 IsAMDHSA = TT.getOS() == Triple::AMDHSA;
177 }
178
179 bool run(Function &F, bool PromoteToLDS);
180};
181
182// FIXME: This can create globals so should be a module pass.
183class AMDGPUPromoteAlloca : public FunctionPass {
184public:
185 static char ID;
186
187 AMDGPUPromoteAlloca() : FunctionPass(ID) {}
188
189 bool runOnFunction(Function &F) override {
190 if (skipFunction(F))
191 return false;
192 if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
193 return AMDGPUPromoteAllocaImpl(
194 TPC->getTM<TargetMachine>(),
195 getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
196 .run(F, /*PromoteToLDS*/ true);
197 return false;
198 }
199
200 StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
201
202 void getAnalysisUsage(AnalysisUsage &AU) const override {
203 AU.setPreservesCFG();
204 AU.addRequired<LoopInfoWrapperPass>();
205 FunctionPass::getAnalysisUsage(AU);
206 }
207};
208
209static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
210 const Function &F) {
211 if (!TM.getTargetTriple().isAMDGCN())
212 return 128;
213
214 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
215
216 unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
217 // Temporarily check both the attribute and the subtarget feature, until the
218 // latter is removed.
219 if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
220 DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
221
222 unsigned MaxVGPRs = ST.getMaxNumVGPRs(
223 ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first,
224 DynamicVGPRBlockSize);
225
226 // A non-entry function has only 32 caller-preserved registers.
227 // Do not promote an alloca that will force spilling unless we know the
228 // function will be inlined.
229 if (!F.hasFnAttribute(Attribute::AlwaysInline) &&
230 !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
231 MaxVGPRs = std::min(MaxVGPRs, 32u);
232 return MaxVGPRs;
233}
234
235} // end anonymous namespace
236
237char AMDGPUPromoteAlloca::ID = 0;
238
240 "AMDGPU promote alloca to vector or LDS", false, false)
241// Move LDS uses from functions to kernels before promote alloca for accurate
242// estimation of LDS available
243INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
244INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
245INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
246 "AMDGPU promote alloca to vector or LDS", false, false)
247
248char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
249
250PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
251 FunctionAnalysisManager &AM) {
252 auto &LI = AM.getResult<LoopAnalysis>(F);
253 bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true);
254 if (Changed) {
255 PreservedAnalyses PA;
256 PA.preserveSet<CFGAnalyses>();
257 return PA;
258 }
259 return PreservedAnalyses::all();
260}
261
262PreservedAnalyses
263AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
264 auto &LI = AM.getResult<LoopAnalysis>(F);
265 bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false);
266 if (Changed) {
267 PreservedAnalyses PA;
268 PA.preserveSet<CFGAnalyses>();
269 return PA;
270 }
271 return PreservedAnalyses::all();
272}
273
274FunctionPass *llvm::createAMDGPUPromoteAlloca() {
275 return new AMDGPUPromoteAlloca();
276}
277
278bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &AA) const {
279 const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
280 LLVM_DEBUG(dbgs() << " Cannot promote alloca: " << Msg << "\n"
281 << " " << *Inst << "\n");
282 return false;
283 };
284
285 SmallVector<Instruction *, 4> WorkList({AA.Alloca});
286 while (!WorkList.empty()) {
287 auto *Cur = WorkList.pop_back_val();
288 if (find(AA.Pointers, Cur) != AA.Pointers.end())
289 continue;
290 AA.Pointers.insert(Cur);
291 for (auto &U : Cur->uses()) {
292 auto *Inst = cast<Instruction>(U.getUser());
293 if (isa<StoreInst>(Inst)) {
294 if (U.getOperandNo() != StoreInst::getPointerOperandIndex()) {
295 return RejectUser(Inst, "pointer escapes via store");
296 }
297 }
298 AA.Uses.push_back(&U);
299
300 if (isa<GetElementPtrInst>(U.getUser())) {
301 WorkList.push_back(Inst);
302 } else if (auto *SI = dyn_cast<SelectInst>(Inst)) {
303 // Only promote a select if we know that the other select operand is
304 // from another pointer that will also be promoted.
305 if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, SI, 1, 2))
306 return RejectUser(Inst, "select from mixed objects");
307 WorkList.push_back(Inst);
308 AA.HaveSelectOrPHI = true;
309 } else if (auto *Phi = dyn_cast<PHINode>(Inst)) {
310 // Repeat for phis.
311
312 // TODO: Handle more complex cases. We should be able to replace loops
313 // over arrays.
314 switch (Phi->getNumIncomingValues()) {
315 case 1:
316 break;
317 case 2:
318 if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, Phi, 0, 1))
319 return RejectUser(Inst, "phi from mixed objects");
320 break;
321 default:
322 return RejectUser(Inst, "phi with too many operands");
323 }
324
325 WorkList.push_back(Inst);
326 AA.HaveSelectOrPHI = true;
327 }
328 }
329 }
330 return true;
331}
332
333void AMDGPUPromoteAllocaImpl::scoreAlloca(AllocaAnalysis &AA) const {
334 LLVM_DEBUG(dbgs() << "Scoring: " << *AA.Alloca << "\n");
335 unsigned Score = 0;
336 // Increment score by one for each user + a bonus for users within loops.
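 // E.g. with the default loop-user weight of 4, a user at loop depth 2
 // contributes 1 + 4 * 2 = 9 to the score, while a user outside any loop
 // contributes 1.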
337 for (auto *U : AA.Uses) {
338 Instruction *Inst = cast<Instruction>(U->getUser());
339 if (isa<GetElementPtrInst>(Inst) || isa<SelectInst>(Inst) ||
340 isa<PHINode>(Inst))
341 continue;
342 unsigned UserScore =
343 1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent()));
344 LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n");
345 Score += UserScore;
346 }
347 LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n");
348 AA.Score = Score;
349}
350
351void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
352 // Load per function limits, overriding with global options where appropriate.
353 // R600 register tuples/aliasing are fragile with large vector promotions,
354 // so apply an architecture-specific limit here.
355 const int R600MaxVectorRegs = 16;
356 MaxVectorRegs = F.getFnAttributeAsParsedInteger(
357 "amdgpu-promote-alloca-to-vector-max-regs",
358 IsAMDGCN ? PromoteAllocaToVectorMaxRegs : R600MaxVectorRegs);
359 if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
360 MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
361 VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
362 "amdgpu-promote-alloca-to-vector-vgpr-ratio",
363 PromoteAllocaToVectorVGPRRatio);
364 if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences())
365 VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
366}
367
368bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
369 Mod = F.getParent();
370 DL = &Mod->getDataLayout();
371
372 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
373 if (!ST.isPromoteAllocaEnabled())
374 return false;
375
376 bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F);
377 MaxVGPRs = getMaxVGPRs(CurrentLocalMemUsage, TM, F);
378 setFunctionLimits(F);
379
380 unsigned VectorizationBudget =
381 (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
382 : (MaxVGPRs * 32)) /
383 VGPRBudgetRatio;
384
385 std::vector<AllocaAnalysis> Allocas;
386 for (Instruction &I : F.getEntryBlock()) {
387 if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
388 // Array allocations are probably not worth handling, since an allocation
389 // of the array type is the canonical form.
390 if (!AI->isStaticAlloca() || AI->isArrayAllocation())
391 continue;
392
393 LLVM_DEBUG(dbgs() << "Analyzing: " << *AI << '\n');
394
395 AllocaAnalysis AA{AI};
396 if (collectAllocaUses(AA)) {
397 analyzePromoteToVector(AA);
398 if (PromoteToLDS)
399 analyzePromoteToLDS(AA);
400 if (AA.Vector.Ty || AA.LDS.Enable) {
401 scoreAlloca(AA);
402 Allocas.push_back(std::move(AA));
403 }
404 }
405 }
406 }
407
408 stable_sort(Allocas,
409 [](const auto &A, const auto &B) { return A.Score > B.Score; });
410
411 // clang-format off
412 LLVM_DEBUG(
413 dbgs() << "Sorted Worklist:\n";
414 for (const auto &AA : Allocas)
415 dbgs() << " " << *AA.Alloca << "\n";
416 );
417 // clang-format on
418
419 bool Changed = false;
420 SetVector<IntrinsicInst *> DeferredIntrs;
421 for (AllocaAnalysis &AA : Allocas) {
422 if (AA.Vector.Ty) {
423 const unsigned AllocaCost =
424 DL->getTypeSizeInBits(AA.Alloca->getAllocatedType());
425 // First, check if we have enough budget to vectorize this alloca.
426 if (AllocaCost <= VectorizationBudget) {
427 promoteAllocaToVector(AA);
428 Changed = true;
429 assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
430 "Underflow!");
431 VectorizationBudget -= AllocaCost;
432 LLVM_DEBUG(dbgs() << " Remaining vectorization budget:"
433 << VectorizationBudget << "\n");
434 continue;
435 } else {
436 LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:"
437 << AllocaCost << ", budget:" << VectorizationBudget
438 << "): " << *AA.Alloca << "\n");
439 }
440 }
441
442 if (AA.LDS.Enable &&
443 tryPromoteAllocaToLDS(AA, SufficientLDS, DeferredIntrs))
444 Changed = true;
445 }
446 finishDeferredAllocaToLDSPromotion(DeferredIntrs);
447
448 // NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains
449 // dangling pointers. If we want to reuse it past this point, the loop above
450 // would need to be updated to remove successfully promoted allocas.
451
452 return Changed;
453}
454
455// Checks if the instruction I is a memset user of the alloca AI that we can
456// deal with. Currently, only non-volatile memsets that affect the whole alloca
457// are handled.
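// For a 16-byte alloca %a, a (hypothetical) supported user would be
//   call void @llvm.memset.p5.i32(ptr addrspace(5) %a, i8 0, i32 16, i1 false)
// whereas a memset covering only the first 8 bytes is rejected below.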
458static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
459 const DataLayout &DL) {
460 using namespace PatternMatch;
461 // For now we only care about non-volatile memsets that affect the whole type
462 // (start at index 0 and fill the whole alloca).
463 //
464 // TODO: Now that we moved to PromoteAlloca we could handle any memsets
465 // (except maybe volatile ones?) - we just need to use shufflevector if it
466 // only affects a subset of the vector.
467 const unsigned Size = DL.getTypeStoreSize(AI->getAllocatedType());
468 return I->getOperand(0) == AI &&
469 match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile();
470}
471
472static Value *calculateVectorIndex(Value *Ptr, AllocaAnalysis &AA) {
473 IRBuilder<> B(Ptr->getContext());
474
475 Ptr = Ptr->stripPointerCasts();
476 if (Ptr == AA.Alloca)
477 return B.getInt32(0);
478
479 auto *GEP = cast<GetElementPtrInst>(Ptr);
480 auto I = AA.Vector.GEPVectorIdx.find(GEP);
481 assert(I != AA.Vector.GEPVectorIdx.end() && "Must have entry for GEP!");
482
483 if (!I->second.Full) {
484 Value *Result = nullptr;
485 B.SetInsertPoint(GEP);
486
487 if (I->second.VarIndex) {
488 Result = I->second.VarIndex;
489 Result = B.CreateSExtOrTrunc(Result, B.getInt32Ty());
490
491 if (I->second.VarMul)
492 Result = B.CreateMul(Result, I->second.VarMul);
493 }
494
495 if (I->second.ConstIndex) {
496 if (Result)
497 Result = B.CreateAdd(Result, I->second.ConstIndex);
498 else
499 Result = I->second.ConstIndex;
500 }
501
502 if (!Result)
503 Result = B.getInt32(0);
504
505 I->second.Full = Result;
506 }
507
508 return I->second.Full;
509}
510
511static std::optional<GEPToVectorIndex>
512computeGEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
513 Type *VecElemTy, const DataLayout &DL) {
514 // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
515 // helper.
516 LLVMContext &Ctx = GEP->getContext();
517 unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
518 SmallMapVector<Value *, APInt, 4> VarOffsets;
519 APInt ConstOffset(BW, 0);
520
521 // Walk backwards through nested GEPs to collect both constant and variable
522 // offsets, so that nested vector GEP chains can be lowered in one step.
523 //
524 // Given this IR fragment as input:
525 //
526 // %0 = alloca [10 x <2 x i32>], align 8, addrspace(5)
527 // %1 = getelementptr [10 x <2 x i32>], ptr addrspace(5) %0, i32 0, i32 %j
528 // %2 = getelementptr i8, ptr addrspace(5) %1, i32 4
529 // %3 = load i32, ptr addrspace(5) %2, align 4
530 //
531 // Combine both GEP operations in a single pass, producing:
532 // BasePtr = %0
533 // ConstOffset = 4
534 // VarOffsets = { %j -> element_size(<2 x i32>) }
535 //
536 // That lets us emit a single buffer_load directly into a VGPR, without ever
537 // allocating scratch memory for the intermediate pointer.
538 Value *CurPtr = GEP;
539 while (auto *CurGEP = dyn_cast<GetElementPtrInst>(CurPtr)) {
540 if (!CurGEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
541 return {};
542
543 // Move to the next outer pointer.
544 CurPtr = CurGEP->getPointerOperand();
545 }
546
547 assert(CurPtr == Alloca && "GEP not based on alloca");
548
549 int64_t VecElemSize = DL.getTypeAllocSize(VecElemTy);
550 if (VarOffsets.size() > 1)
551 return {};
552
553 APInt IndexQuot;
554 int64_t Rem;
555 APInt::sdivrem(ConstOffset, VecElemSize, IndexQuot, Rem);
556 if (Rem != 0)
557 return {};
558
559 GEPToVectorIndex Result;
560
561 if (!ConstOffset.isZero())
562 Result.ConstIndex = ConstantInt::get(Ctx, IndexQuot.sextOrTrunc(BW));
563
564 if (VarOffsets.empty())
565 return Result;
566
567 const auto &VarOffset = VarOffsets.front();
568 APInt OffsetQuot;
569 APInt::sdivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem);
570 if (Rem != 0 || OffsetQuot.isZero())
571 return {};
572
573 Result.VarIndex = VarOffset.first;
574 auto *OffsetType = dyn_cast<IntegerType>(Result.VarIndex->getType());
575 if (!OffsetType)
576 return {};
577
578 if (!OffsetQuot.isOne())
579 Result.VarMul = ConstantInt::get(Ctx, OffsetQuot.sextOrTrunc(BW));
580
581 return Result;
582}
583
584/// Promotes a single user of the alloca to a vector form.
585///
586/// \param Inst Instruction to be promoted.
587/// \param DL Module Data Layout.
588/// \param AA Alloca Analysis.
589/// \param VecStoreSize Size of the promoted vector type in bytes.
590/// \param ElementSize Size of the promoted vector's element type in bytes.
591/// \param GetCurVal Callback returning the current value of the promoted
592/// vector (e.g. the last stored value). It is only invoked when promoting
593/// \p Inst actually requires the current value, so callers can materialize
594/// it lazily.
595/// \return the stored value if \p Inst would have written to the alloca, or
596/// nullptr otherwise.
597static Value *promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL,
598 AllocaAnalysis &AA,
599 unsigned VecStoreSize,
600 unsigned ElementSize,
601 function_ref<Value *()> GetCurVal) {
602 // Note: we use InstSimplifyFolder because it can leverage the DataLayout
603 // to do more folding, especially in the case of vector splats.
604 IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
605 InstSimplifyFolder(DL));
606 Builder.SetInsertPoint(Inst);
607
608 const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val,
609 Type *PtrTy) -> Value * {
610 assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy));
611 const unsigned Size = DL.getTypeStoreSizeInBits(PtrTy);
612 if (!PtrTy->isVectorTy())
613 return Builder.CreateBitOrPointerCast(Val, Builder.getIntNTy(Size));
614 const unsigned NumPtrElts = cast<FixedVectorType>(PtrTy)->getNumElements();
615 // If we want to cast, e.g. a <2 x ptr> into a <4 x i32>, we need to
616 // first cast the ptr vector to <2 x i64>.
617 assert((Size % NumPtrElts == 0) && "Vector size not divisible");
618 Type *EltTy = Builder.getIntNTy(Size / NumPtrElts);
619 return Builder.CreateBitOrPointerCast(
620 Val, FixedVectorType::get(EltTy, NumPtrElts));
621 };
622
623 Type *VecEltTy = AA.Vector.Ty->getElementType();
624
625 switch (Inst->getOpcode()) {
626 case Instruction::Load: {
627 Value *CurVal = GetCurVal();
628 Value *Index =
629 calculateVectorIndex(cast<LoadInst>(Inst)->getPointerOperand(), AA);
630
631 // We're loading the full vector.
632 Type *AccessTy = Inst->getType();
633 TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
634 if (Constant *CI = dyn_cast<Constant>(Index)) {
635 if (CI->isZeroValue() && AccessSize == VecStoreSize) {
636 if (AccessTy->isPtrOrPtrVectorTy())
637 CurVal = CreateTempPtrIntCast(CurVal, AccessTy);
638 else if (CurVal->getType()->isPtrOrPtrVectorTy())
639 CurVal = CreateTempPtrIntCast(CurVal, CurVal->getType());
640 Value *NewVal = Builder.CreateBitOrPointerCast(CurVal, AccessTy);
641 Inst->replaceAllUsesWith(NewVal);
642 return nullptr;
643 }
644 }
645
646 // Loading a subvector.
647 if (isa<FixedVectorType>(AccessTy)) {
648 assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
649 const unsigned NumLoadedElts = AccessSize / DL.getTypeStoreSize(VecEltTy);
650 auto *SubVecTy = FixedVectorType::get(VecEltTy, NumLoadedElts);
651 assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
652
653 Value *SubVec = PoisonValue::get(SubVecTy);
654 for (unsigned K = 0; K < NumLoadedElts; ++K) {
655 Value *CurIdx =
656 Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
657 SubVec = Builder.CreateInsertElement(
658 SubVec, Builder.CreateExtractElement(CurVal, CurIdx), K);
659 }
660
661 if (AccessTy->isPtrOrPtrVectorTy())
662 SubVec = CreateTempPtrIntCast(SubVec, AccessTy);
663 else if (SubVecTy->isPtrOrPtrVectorTy())
664 SubVec = CreateTempPtrIntCast(SubVec, SubVecTy);
665
666 SubVec = Builder.CreateBitOrPointerCast(SubVec, AccessTy);
667 Inst->replaceAllUsesWith(SubVec);
668 return nullptr;
669 }
670
671 // We're loading one element.
672 Value *ExtractElement = Builder.CreateExtractElement(CurVal, Index);
673 if (AccessTy != VecEltTy)
674 ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, AccessTy);
675
676 Inst->replaceAllUsesWith(ExtractElement);
677 return nullptr;
678 }
679 case Instruction::Store: {
680 // For stores, it's a bit trickier and it depends on whether we're storing
681 // the full vector or not. If we're storing the full vector, we don't need
682 // to know the current value. If this is a store of a single element, we
683 // need to know the value.
684 StoreInst *SI = cast<StoreInst>(Inst);
685 Value *Index = calculateVectorIndex(SI->getPointerOperand(), AA);
686 Value *Val = SI->getValueOperand();
687
688 // We're storing the full vector; we can handle this without knowing CurVal.
689 Type *AccessTy = Val->getType();
690 TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
691 if (Constant *CI = dyn_cast<Constant>(Index)) {
692 if (CI->isZeroValue() && AccessSize == VecStoreSize) {
693 if (AccessTy->isPtrOrPtrVectorTy())
694 Val = CreateTempPtrIntCast(Val, AccessTy);
695 else if (AA.Vector.Ty->isPtrOrPtrVectorTy())
696 Val = CreateTempPtrIntCast(Val, AA.Vector.Ty);
697 return Builder.CreateBitOrPointerCast(Val, AA.Vector.Ty);
698 }
699 }
700
701 // Storing a subvector.
702 if (isa<FixedVectorType>(AccessTy)) {
703 assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
704 const unsigned NumWrittenElts =
705 AccessSize / DL.getTypeStoreSize(VecEltTy);
706 const unsigned NumVecElts = AA.Vector.Ty->getNumElements();
707 auto *SubVecTy = FixedVectorType::get(VecEltTy, NumWrittenElts);
708 assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
709
710 if (SubVecTy->isPtrOrPtrVectorTy())
711 Val = CreateTempPtrIntCast(Val, SubVecTy);
712 else if (AccessTy->isPtrOrPtrVectorTy())
713 Val = CreateTempPtrIntCast(Val, AccessTy);
714
715 Val = Builder.CreateBitOrPointerCast(Val, SubVecTy);
716
717 Value *CurVec = GetCurVal();
718 for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts);
719 K < NumElts; ++K) {
720 Value *CurIdx =
721 Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
722 CurVec = Builder.CreateInsertElement(
723 CurVec, Builder.CreateExtractElement(Val, K), CurIdx);
724 }
725 return CurVec;
726 }
727
728 if (Val->getType() != VecEltTy)
729 Val = Builder.CreateBitOrPointerCast(Val, VecEltTy);
730 return Builder.CreateInsertElement(GetCurVal(), Val, Index);
731 }
732 case Instruction::Call: {
733 if (auto *MTI = dyn_cast<MemTransferInst>(Inst)) {
734 // For memcpy, we need to know curval.
735 ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
736 unsigned NumCopied = Length->getZExtValue() / ElementSize;
737 MemTransferInfo *TI = &AA.Vector.TransferInfo[MTI];
738 unsigned SrcBegin = TI->SrcIndex->getZExtValue();
739 unsigned DestBegin = TI->DestIndex->getZExtValue();
740
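 // For instance, copying 2 elements from index 1 to index 3 of an
 // 8-element vector builds the mask {0, 1, 2, 1, 2, 5, 6, 7}: lanes 3
 // and 4 take the values of lanes 1 and 2, everything else is kept.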
741 SmallVector<int> Mask;
742 for (unsigned Idx = 0; Idx < AA.Vector.Ty->getNumElements(); ++Idx) {
743 if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
744 Mask.push_back(SrcBegin < AA.Vector.Ty->getNumElements()
745 ? SrcBegin++
746 : PoisonMaskElem);
747 } else {
748 Mask.push_back(Idx);
749 }
750 }
751
752 return Builder.CreateShuffleVector(GetCurVal(), Mask);
753 }
754
755 if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
756 // For memset, we don't need to know the previous value because we
757 // currently only allow memsets that cover the whole alloca.
758 Value *Elt = MSI->getOperand(1);
759 const unsigned BytesPerElt = DL.getTypeStoreSize(VecEltTy);
760 if (BytesPerElt > 1) {
761 Value *EltBytes = Builder.CreateVectorSplat(BytesPerElt, Elt);
762
763 // If the element type of the vector is a pointer, we need to first cast
764 // to an integer, then use a PtrCast.
765 if (VecEltTy->isPointerTy()) {
766 Type *PtrInt = Builder.getIntNTy(BytesPerElt * 8);
767 Elt = Builder.CreateBitCast(EltBytes, PtrInt);
768 Elt = Builder.CreateIntToPtr(Elt, VecEltTy);
769 } else
770 Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
771 }
772
773 return Builder.CreateVectorSplat(AA.Vector.Ty->getElementCount(), Elt);
774 }
775
776 if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
777 if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
778 Intr->replaceAllUsesWith(
779 Builder.getIntN(Intr->getType()->getIntegerBitWidth(),
780 DL.getTypeAllocSize(AA.Vector.Ty)));
781 return nullptr;
782 }
783 }
784
785 llvm_unreachable("Unsupported call when promoting alloca to vector");
786 }
787
788 default:
789 llvm_unreachable("Inconsistency in instructions promotable to vector");
790 }
791
792 llvm_unreachable("Did not return after promoting instruction!");
793}
794
795static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
796 const DataLayout &DL) {
797 // Access as a vector type can work if the size of the access vector is a
798 // multiple of the size of the alloca's vector element type.
799 //
800 // Examples:
801 // - VecTy = <8 x float>, AccessTy = <4 x float> -> OK
802 // - VecTy = <4 x double>, AccessTy = <2 x float> -> OK
803 // - VecTy = <4 x double>, AccessTy = <3 x float> -> NOT OK
804 // - 3*32 is not a multiple of 64
805 //
806 // We could handle more complicated cases, but it'd make things a lot more
807 // complicated.
808 if (isa<FixedVectorType>(AccessTy)) {
809 TypeSize AccTS = DL.getTypeStoreSize(AccessTy);
810 // If the type size and the store size don't match, we would need to do more
811 // than just bitcast to translate between an extracted/insertable subvector
812 // and the accessed value.
813 if (AccTS * 8 != DL.getTypeSizeInBits(AccessTy))
814 return false;
815 TypeSize VecTS = DL.getTypeStoreSize(VecTy->getElementType());
816 return AccTS.isKnownMultipleOf(VecTS);
817 }
818
819 return CastInst::isBitOrNoopPointerCastable(VecTy->getElementType(), AccessTy,
820 DL);
821}
822
823/// Iterates over an instruction worklist that may contain multiple instructions
824/// from the same basic block, but in a different order.
825template <typename InstContainer>
826static void forEachWorkListItem(const InstContainer &WorkList,
827 std::function<void(Instruction *)> Fn) {
828 // Bucket up uses of the alloca by the block they occur in.
829 // This is important because we have to handle multiple defs/uses in a block
830 // ourselves: SSAUpdater is purely for cross-block references.
831 SmallDenseMap<BasicBlock *, SmallDenseSet<Instruction *>> UsesByBlock;
832 for (Instruction *User : WorkList)
833 UsesByBlock[User->getParent()].insert(User);
834
835 for (Instruction *User : WorkList) {
836 BasicBlock *BB = User->getParent();
837 auto &BlockUses = UsesByBlock[BB];
838
839 // Already processed, skip.
840 if (BlockUses.empty())
841 continue;
842
843 // Only user in the block, directly process it.
844 if (BlockUses.size() == 1) {
845 Fn(User);
846 continue;
847 }
848
849 // Multiple users in the block, do a linear scan to see users in order.
850 for (Instruction &Inst : *BB) {
851 if (!BlockUses.contains(&Inst))
852 continue;
853
854 Fn(&Inst);
855 }
856
857 // Clear the block so we know it's been processed.
858 BlockUses.clear();
859 }
860}
861
862/// Find an insert point after an alloca, after all other allocas clustered at
863/// the start of the block.
864static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
865 BasicBlock::iterator I) {
866 for (BasicBlock::iterator E = BB.end(); I != E && isa<AllocaInst>(*I); ++I)
867 ;
868 return I;
869}
870
871FixedVectorType *
872AMDGPUPromoteAllocaImpl::getVectorTypeForAlloca(Type *AllocaTy) const {
873 if (DisablePromoteAllocaToVector) {
874 LLVM_DEBUG(dbgs() << " Promote alloca to vectors is disabled\n");
875 return nullptr;
876 }
877
878 auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
879 if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
880 uint64_t NumElems = 1;
881 Type *ElemTy;
882 do {
883 NumElems *= ArrayTy->getNumElements();
884 ElemTy = ArrayTy->getElementType();
885 } while ((ArrayTy = dyn_cast<ArrayType>(ElemTy)));
886
887 // Check for array of vectors
888 auto *InnerVectorTy = dyn_cast<FixedVectorType>(ElemTy);
889 if (InnerVectorTy) {
890 NumElems *= InnerVectorTy->getNumElements();
891 ElemTy = InnerVectorTy->getElementType();
892 }
893
894 if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
895 unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
896 if (ElementSize > 0) {
897 unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
898 // Expand vector if required to match padding of inner type,
899 // i.e. odd size subvectors.
900 // Storage size of new vector must match that of alloca for correct
901 // behaviour of byte offsets and GEP computation.
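 // E.g. a (hypothetical) alloca of [2 x <3 x float>] stores 32 bytes because
 // each <3 x float> element is padded to 16 bytes; 6 elements * 4 bytes != 32,
 // so the promoted type is widened to <8 x float> to keep byte offsets valid.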
902 if (NumElems * ElementSize != AllocaSize)
903 NumElems = AllocaSize / ElementSize;
904 if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
905 VectorTy = FixedVectorType::get(ElemTy, NumElems);
906 }
907 }
908 }
909 if (!VectorTy) {
910 LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
911 return nullptr;
912 }
913
914 const unsigned MaxElements =
915 (MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType());
916
917 if (VectorTy->getNumElements() > MaxElements ||
918 VectorTy->getNumElements() < 2) {
919 LLVM_DEBUG(dbgs() << " " << *VectorTy
920 << " has an unsupported number of elements\n");
921 return nullptr;
922 }
923
924 Type *VecEltTy = VectorTy->getElementType();
925 unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
926 if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
927 LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size "
928 "does not match the type's size\n");
929 return nullptr;
930 }
931
932 return VectorTy;
933}
934
935void AMDGPUPromoteAllocaImpl::analyzePromoteToVector(AllocaAnalysis &AA) const {
936 if (AA.HaveSelectOrPHI) {
937 LLVM_DEBUG(dbgs() << " Cannot convert to vector due to select or phi\n");
938 return;
939 }
940
941 Type *AllocaTy = AA.Alloca->getAllocatedType();
942 AA.Vector.Ty = getVectorTypeForAlloca(AllocaTy);
943 if (!AA.Vector.Ty)
944 return;
945
946 const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
947 LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n"
948 << " " << *Inst << "\n");
949 AA.Vector.Ty = nullptr;
950 };
951
952 Type *VecEltTy = AA.Vector.Ty->getElementType();
953 unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
954 assert(ElementSize > 0);
955 for (auto *U : AA.Uses) {
956 Instruction *Inst = cast<Instruction>(U->getUser());
957
958 if (Value *Ptr = getLoadStorePointerOperand(Inst)) {
959 assert(!isa<StoreInst>(Inst) ||
960 U->getOperandNo() == StoreInst::getPointerOperandIndex());
961
962 Type *AccessTy = getLoadStoreType(Inst);
963 if (AccessTy->isAggregateType())
964 return RejectUser(Inst, "unsupported load/store as aggregate");
965 assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy());
966
967 // Check that this is a simple access of a vector element.
968 bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple()
969 : cast<StoreInst>(Inst)->isSimple();
970 if (!IsSimple)
971 return RejectUser(Inst, "not a simple load or store");
972
973 Ptr = Ptr->stripPointerCasts();
974
975 // Alloca already accessed as vector.
976 if (Ptr == AA.Alloca &&
977 DL->getTypeStoreSize(AA.Alloca->getAllocatedType()) ==
978 DL->getTypeStoreSize(AccessTy)) {
979 AA.Vector.Worklist.push_back(Inst);
980 continue;
981 }
982
983 if (!isSupportedAccessType(AA.Vector.Ty, AccessTy, *DL))
984 return RejectUser(Inst, "not a supported access type");
985
986 AA.Vector.Worklist.push_back(Inst);
987 continue;
988 }
989
990 if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
991 // If we can't compute a vector index from this GEP, then we can't
992 // promote this alloca to vector.
993 auto Index = computeGEPToVectorIndex(GEP, AA.Alloca, VecEltTy, *DL);
994 if (!Index)
995 return RejectUser(Inst, "cannot compute vector index for GEP");
996
997 AA.Vector.GEPVectorIdx[GEP] = std::move(Index.value());
998 AA.Vector.UsersToRemove.push_back(Inst);
999 continue;
1000 }
1001
1002 if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst);
1003 MSI && isSupportedMemset(MSI, AA.Alloca, *DL)) {
1004 AA.Vector.Worklist.push_back(Inst);
1005 continue;
1006 }
1007
1008 if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Inst)) {
1009 if (TransferInst->isVolatile())
1010 return RejectUser(Inst, "mem transfer inst is volatile");
1011
1012 ConstantInt *Len = dyn_cast<ConstantInt>(TransferInst->getLength());
1013 if (!Len || (Len->getZExtValue() % ElementSize))
1014 return RejectUser(Inst, "mem transfer inst length is non-constant or "
1015 "not a multiple of the vector element size");
1016
1017 auto getConstIndexIntoAlloca = [&](Value *Ptr) -> ConstantInt * {
1018 if (Ptr == AA.Alloca)
1019 return ConstantInt::get(Ptr->getContext(), APInt(32, 0));
1020
1021 auto *GEP = cast<GetElementPtrInst>(Ptr);
1022 const auto &GEPI = AA.Vector.GEPVectorIdx.find(GEP)->second;
1023 if (GEPI.VarIndex)
1024 return nullptr;
1025 if (GEPI.ConstIndex)
1026 return GEPI.ConstIndex;
1027 return ConstantInt::get(Ptr->getContext(), APInt(32, 0));
1028 };
1029
1030 MemTransferInfo *TI =
1031 &AA.Vector.TransferInfo.try_emplace(TransferInst).first->second;
1032 unsigned OpNum = U->getOperandNo();
1033 if (OpNum == 0) {
1034 Value *Dest = TransferInst->getDest();
1035 ConstantInt *Index = getConstIndexIntoAlloca(Dest);
1036 if (!Index)
1037 return RejectUser(Inst, "could not calculate constant dest index");
1038 TI->DestIndex = Index;
1039 } else {
1040 assert(OpNum == 1);
1041 Value *Src = TransferInst->getSource();
1042 ConstantInt *Index = getConstIndexIntoAlloca(Src);
1043 if (!Index)
1044 return RejectUser(Inst, "could not calculate constant src index");
1045 TI->SrcIndex = Index;
1046 }
1047 continue;
1048 }
1049
1050 if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
1051 if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
1052 AA.Vector.Worklist.push_back(Inst);
1053 continue;
1054 }
1055 }
1056
1057 // Ignore assume-like intrinsics and comparisons used in assumes.
1058 if (isAssumeLikeIntrinsic(Inst)) {
1059 if (!Inst->use_empty())
1060 return RejectUser(Inst, "assume-like intrinsic cannot have any users");
1061 AA.Vector.UsersToRemove.push_back(Inst);
1062 continue;
1063 }
1064
1065 if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) {
1066 return isAssumeLikeIntrinsic(cast<Instruction>(U));
1067 })) {
1068 AA.Vector.UsersToRemove.push_back(Inst);
1069 continue;
1070 }
1071
1072 return RejectUser(Inst, "unhandled alloca user");
1073 }
1074
1075 // Follow-up check to ensure we've seen both sides of all transfer insts.
1076 for (const auto &Entry : AA.Vector.TransferInfo) {
1077 const MemTransferInfo &TI = Entry.second;
1078 if (!TI.SrcIndex || !TI.DestIndex)
1079 return RejectUser(Entry.first,
1080 "mem transfer inst between different objects");
1081 AA.Vector.Worklist.push_back(Entry.first);
1082 }
1083}
1084
1085void AMDGPUPromoteAllocaImpl::promoteAllocaToVector(AllocaAnalysis &AA) {
1086 LLVM_DEBUG(dbgs() << "Promoting to vectors: " << *AA.Alloca << '\n');
1087 LLVM_DEBUG(dbgs() << " type conversion: " << *AA.Alloca->getAllocatedType()
1088 << " -> " << *AA.Vector.Ty << '\n');
1089 const unsigned VecStoreSize = DL->getTypeStoreSize(AA.Vector.Ty);
1090
1091 Type *VecEltTy = AA.Vector.Ty->getElementType();
1092 const unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
1093
1094 // Alloca is uninitialized memory. Imitate that by making the first value
1095 // a frozen poison value.
1096 SSAUpdater Updater;
1097 Updater.Initialize(AA.Vector.Ty, "promotealloca");
1098
1099 BasicBlock *EntryBB = AA.Alloca->getParent();
1100 BasicBlock::iterator InitInsertPos =
1101 skipToNonAllocaInsertPt(*EntryBB, AA.Alloca->getIterator());
1102 IRBuilder<> Builder(&*InitInsertPos);
1103 Value *AllocaInitValue = Builder.CreateFreeze(PoisonValue::get(AA.Vector.Ty));
1104 AllocaInitValue->takeName(AA.Alloca);
1105
1106 Updater.AddAvailableValue(AA.Alloca->getParent(), AllocaInitValue);
1107
1108 // First handle the initial worklist, in basic block order.
1109 //
1110 // Insert a placeholder whenever we need the vector value at the top of a
1111 // basic block.
1112 SmallVector<Instruction *> Placeholders;
1113 forEachWorkListItem(AA.Vector.Worklist, [&](Instruction *I) {
1114 BasicBlock *BB = I->getParent();
1115 auto GetCurVal = [&]() -> Value * {
1116 if (Value *CurVal = Updater.FindValueForBlock(BB))
1117 return CurVal;
1118
1119 if (!Placeholders.empty() && Placeholders.back()->getParent() == BB)
1120 return Placeholders.back();
1121
1122 // If the current value in the basic block is not yet known, insert a
1123 // placeholder that we will replace later.
1124 IRBuilder<> Builder(I);
1125 auto *Placeholder = cast<Instruction>(Builder.CreateFreeze(
1126 PoisonValue::get(AA.Vector.Ty), "promotealloca.placeholder"));
1127 Placeholders.push_back(Placeholder);
1128 return Placeholders.back();
1129 };
1130
1131 Value *Result = promoteAllocaUserToVector(I, *DL, AA, VecStoreSize,
1132 ElementSize, GetCurVal);
1133 if (Result)
1134 Updater.AddAvailableValue(BB, Result);
1135 });
1136
1137 // Now fixup the placeholders.
1138 for (Instruction *Placeholder : Placeholders) {
1139 Placeholder->replaceAllUsesWith(
1140 Updater.GetValueInMiddleOfBlock(Placeholder->getParent()));
1141 Placeholder->eraseFromParent();
1142 }
1143
1144 // Delete all instructions.
1145 for (Instruction *I : AA.Vector.Worklist) {
1146 assert(I->use_empty());
1147 I->eraseFromParent();
1148 }
1149
1150 // Delete all the users that are known to be removeable.
1151 for (Instruction *I : reverse(AA.Vector.UsersToRemove)) {
1152 I->dropDroppableUses();
1153 assert(I->use_empty());
1154 I->eraseFromParent();
1155 }
1156
1157 // Alloca should now be dead too.
1158 assert(AA.Alloca->use_empty());
1159 AA.Alloca->eraseFromParent();
1160}
1161
1162std::pair<Value *, Value *>
1163AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
1164 Function &F = *Builder.GetInsertBlock()->getParent();
1165 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
1166
1167 if (!IsAMDHSA) {
1168 CallInst *LocalSizeY =
1169 Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_y, {});
1170 CallInst *LocalSizeZ =
1171 Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_z, {});
1172
1173 ST.makeLIDRangeMetadata(LocalSizeY);
1174 ST.makeLIDRangeMetadata(LocalSizeZ);
1175
1176 return std::pair(LocalSizeY, LocalSizeZ);
1177 }
1178
1179 // We must read the size out of the dispatch pointer.
1180 assert(IsAMDGCN);
1181
1182 // We are indexing into this struct, and want to extract the workgroup_size_*
1183 // fields.
1184 //
1185 // typedef struct hsa_kernel_dispatch_packet_s {
1186 // uint16_t header;
1187 // uint16_t setup;
1188 // uint16_t workgroup_size_x ;
1189 // uint16_t workgroup_size_y;
1190 // uint16_t workgroup_size_z;
1191 // uint16_t reserved0;
1192 // uint32_t grid_size_x ;
1193 // uint32_t grid_size_y ;
1194 // uint32_t grid_size_z;
1195 //
1196 // uint32_t private_segment_size;
1197 // uint32_t group_segment_size;
1198 // uint64_t kernel_object;
1199 //
1200 // #ifdef HSA_LARGE_MODEL
1201 // void *kernarg_address;
1202 // #elif defined HSA_LITTLE_ENDIAN
1203 // void *kernarg_address;
1204 // uint32_t reserved1;
1205 // #else
1206 // uint32_t reserved1;
1207 // void *kernarg_address;
1208 // #endif
1209 // uint64_t reserved2;
1210 // hsa_signal_t completion_signal; // uint64_t wrapper
1211 // } hsa_kernel_dispatch_packet_t
1212 //
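 // Concretely, workgroup_size_x and workgroup_size_y share the 32-bit word at
 // byte offset 4 of the packet (x in the low half, y in the high half), and
 // workgroup_size_z is the low half of the word at byte offset 8, with
 // reserved0 (zero) above it. The loads below fetch those two words and shift
 // out the y component.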
1213 CallInst *DispatchPtr =
1214 Builder.CreateIntrinsic(Intrinsic::amdgcn_dispatch_ptr, {});
1215 DispatchPtr->addRetAttr(Attribute::NoAlias);
1216 DispatchPtr->addRetAttr(Attribute::NonNull);
1217 F.removeFnAttr("amdgpu-no-dispatch-ptr");
1218
1219 // Size of the dispatch packet struct.
1220 DispatchPtr->addDereferenceableRetAttr(64);
1221
1222 Type *I32Ty = Type::getInt32Ty(Mod->getContext());
1223
1224 // We could do a single 64-bit load here, but it's likely that the basic
1225 // 32-bit and extract sequence is already present, and it is probably easier
1226 // to CSE this. The loads should be mergeable later anyway.
1227 Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 1);
1228 LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));
1229
1230 Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 2);
1231 LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));
1232
1233 MDNode *MD = MDNode::get(Mod->getContext(), {});
1234 LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
1235 LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
1236 ST.makeLIDRangeMetadata(LoadZU);
1237
1238 // Extract y component. Upper half of LoadZU should be zero already.
1239 Value *Y = Builder.CreateLShr(LoadXY, 16);
1240
1241 return std::pair(Y, LoadZU);
1242}
1243
1244Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
1245 unsigned N) {
1246 Function *F = Builder.GetInsertBlock()->getParent();
1247 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, *F);
1248 Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
1249 StringRef AttrName;
1250
1251 switch (N) {
1252 case 0:
1253 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
1254 : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
1255 AttrName = "amdgpu-no-workitem-id-x";
1256 break;
1257 case 1:
1258 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
1259 : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
1260 AttrName = "amdgpu-no-workitem-id-y";
1261 break;
1262
1263 case 2:
1264 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
1265 : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
1266 AttrName = "amdgpu-no-workitem-id-z";
1267 break;
1268 default:
1269 llvm_unreachable("invalid dimension");
1270 }
1271
1272 Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(Mod, IntrID);
1273 CallInst *CI = Builder.CreateCall(WorkitemIdFn);
1274 ST.makeLIDRangeMetadata(CI);
1275 F->removeFnAttr(AttrName);
1276
1277 return CI;
1278}
1279
1280static bool isCallPromotable(CallInst *CI) {
1281 IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
1282 if (!II)
1283 return false;
1284
1285 switch (II->getIntrinsicID()) {
1286 case Intrinsic::memcpy:
1287 case Intrinsic::memmove:
1288 case Intrinsic::memset:
1289 case Intrinsic::lifetime_start:
1290 case Intrinsic::lifetime_end:
1291 case Intrinsic::invariant_start:
1292 case Intrinsic::invariant_end:
1293 case Intrinsic::launder_invariant_group:
1294 case Intrinsic::strip_invariant_group:
1295 case Intrinsic::objectsize:
1296 return true;
1297 default:
1298 return false;
1299 }
1300}
1301
1302bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
1303 Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0,
1304 int OpIdx1) const {
1305 // Figure out which operand is the one we might not be promoting.
1306 Value *OtherOp = Inst->getOperand(OpIdx0);
1307 if (Val == OtherOp)
1308 OtherOp = Inst->getOperand(OpIdx1);
1309
1310 if (isa<ConstantPointerNull>(OtherOp) || isa<ConstantAggregateZero>(OtherOp))
1311 return true;
1312
1313 // TODO: getUnderlyingObject will not work on a vector getelementptr
1314 Value *OtherObj = getUnderlyingObject(OtherOp);
1315 if (!isa<AllocaInst>(OtherObj))
1316 return false;
1317
1318 // TODO: We should be able to replace undefs with the right pointer type.
1319
1320 // TODO: If we know the other base object is another promotable
1321 // alloca, not necessarily this alloca, we can do this. The
1322 // important part is both must have the same address space at
1323 // the end.
1324 if (OtherObj != BaseAlloca) {
1325 LLVM_DEBUG(
1326 dbgs() << "Found a binary instruction with another alloca object\n");
1327 return false;
1328 }
1329
1330 return true;
1331}
1332
1333void AMDGPUPromoteAllocaImpl::analyzePromoteToLDS(AllocaAnalysis &AA) const {
1334 if (DisablePromoteAllocaToLDS) {
1335 LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n");
1336 return;
1337 }
1338
1339 // Don't promote the alloca to LDS for shader calling conventions as the work
1340 // item ID intrinsics are not supported for these calling conventions.
1341 // Furthermore not all LDS is available for some of the stages.
1342 const Function &ContainingFunction = *AA.Alloca->getFunction();
1343 CallingConv::ID CC = ContainingFunction.getCallingConv();
1344
1345 switch (CC) {
1346 case CallingConv::AMDGPU_KERNEL:
1347 case CallingConv::SPIR_KERNEL:
1348 break;
1349 default:
1350 LLVM_DEBUG(
1351 dbgs()
1352 << " promote alloca to LDS not supported with calling convention.\n");
1353 return;
1354 }
1355
1356 for (Use *Use : AA.Uses) {
1357 auto *User = Use->getUser();
1358
1359 if (CallInst *CI = dyn_cast<CallInst>(User)) {
1360 if (!isCallPromotable(CI))
1361 return;
1362
1363 if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end())
1364 AA.LDS.Worklist.push_back(User);
1365 continue;
1366 }
1367
1368 Instruction *UseInst = cast<Instruction>(User);
1369 if (UseInst->getOpcode() == Instruction::PtrToInt)
1370 return;
1371
1372 if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
1373 if (LI->isVolatile())
1374 return;
1375 continue;
1376 }
1377
1378 if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
1379 if (SI->isVolatile())
1380 return;
1381 continue;
1382 }
1383
1384 if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
1385 if (RMW->isVolatile())
1386 return;
1387 continue;
1388 }
1389
1390 if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
1391 if (CAS->isVolatile())
1392 return;
1393 continue;
1394 }
1395
1396 // Only promote a select if we know that the other select operand
1397 // is from another pointer that will also be promoted.
1398 if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
1399 if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Use->get(), ICmp, 0, 1))
1400 return;
1401
1402 // May need to rewrite constant operands.
1403 if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end())
1404 AA.LDS.Worklist.push_back(ICmp);
1405 continue;
1406 }
1407
1408 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
1409 // Be conservative if an address could be computed outside the bounds of
1410 // the alloca.
1411 if (!GEP->isInBounds())
1412 return;
1414 // Do not promote vector/aggregate type instructions. It is hard to track
1415 // their users.
1416
1417 // Do not promote addrspacecast.
1418 //
1419 // TODO: If we know the address is only observed through flat pointers, we
1420 // could still promote.
1421 return;
1422 }
1423
1424 if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end())
1425 AA.LDS.Worklist.push_back(User);
1426 }
1427
1428 AA.LDS.Enable = true;
1429}
1430
1431bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
1432
1433 FunctionType *FTy = F.getFunctionType();
1434 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
1435
1436 // If the function has any arguments in the local address space, then it's
1437 // possible these arguments require the entire local memory space, so
1438 // we cannot use local memory in the pass.
1439 for (Type *ParamTy : FTy->params()) {
1440 PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
1441 if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
1442 LocalMemLimit = 0;
1443 LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
1444 "local memory disabled.\n");
1445 return false;
1446 }
1447 }
1448
1449 LocalMemLimit = ST.getAddressableLocalMemorySize();
1450 if (LocalMemLimit == 0)
1451 return false;
1452
1453 SmallPtrSet<const GlobalVariable *, 8> UsedLDS;
1454 SmallPtrSet<const Constant *, 8> VisitedConstants;
1455 SmallVector<const Constant *, 8> Stack;
1456
1457 auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
1458 for (const User *U : Val->users()) {
1459 if (const Instruction *Use = dyn_cast<Instruction>(U)) {
1460 if (Use->getFunction() == &F)
1461 return true;
1462 } else {
1463 const Constant *C = cast<Constant>(U);
1464 if (VisitedConstants.insert(C).second)
1465 Stack.push_back(C);
1466 }
1467 }
1468
1469 return false;
1470 };
1471
1472 for (GlobalVariable &GV : Mod->globals()) {
1473 if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
1474 continue;
1475
1476 if (visitUsers(&GV, &GV)) {
1477 UsedLDS.insert(&GV);
1478 Stack.clear();
1479 continue;
1480 }
1481
1482 // For any ConstantExpr uses, we need to recursively search the users until
1483 // we see a function.
1484 while (!Stack.empty()) {
1485 const Constant *C = Stack.pop_back_val();
1486 if (visitUsers(&GV, C)) {
1487 UsedLDS.insert(&GV);
1488 Stack.clear();
1489 break;
1490 }
1491 }
1492 }
1493
1494 const DataLayout &DL = Mod->getDataLayout();
1495 SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
1496 AllocatedSizes.reserve(UsedLDS.size());
1497
1498 for (const GlobalVariable *GV : UsedLDS) {
1499 Align Alignment =
1500 DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
1501 uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType());
1502
1503 // HIP uses an extern unsized array in local address space for dynamically
1504 // allocated shared memory. In that case, we have to disable the promotion.
1505 if (GV->hasExternalLinkage() && AllocSize == 0) {
1506 LocalMemLimit = 0;
1507 LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
1508 "local memory. Promoting to local memory "
1509 "disabled.\n");
1510 return false;
1511 }
1512
1513 AllocatedSizes.emplace_back(AllocSize, Alignment);
1514 }
1515
1516 // Sort to try to estimate the worst case alignment padding
1517 //
1518 // FIXME: We should really do something to fix the addresses to a more optimal
1519 // value instead
1520 llvm::sort(AllocatedSizes, llvm::less_second());
1521
1522 // Check how much local memory is being used by global objects
1523 CurrentLocalMemUsage = 0;
1524
1525 // FIXME: Try to account for padding here. The real padding and address is
1526 // currently determined from the inverse order of uses in the function when
1527 // legalizing, which could also potentially change. We try to estimate the
1528 // worst case here, but we probably should fix the addresses earlier.
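 // For example, two (hypothetical) globals of 1 byte at align 1 and 32 bytes
 // at align 16 are accounted as 1 byte, padded up to offset 16, plus 32 bytes,
 // i.e. 48 bytes in the worst case.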
1529 for (auto Alloc : AllocatedSizes) {
1530 CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
1531 CurrentLocalMemUsage += Alloc.first;
1532 }
1533
1534 unsigned MaxOccupancy =
1535 ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), CurrentLocalMemUsage, F)
1536 .second;
1537
1538 // Round up to the next tier of usage.
1539 unsigned MaxSizeWithWaveCount =
1540 ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
1541
1542 // Program may already use more LDS than is usable at maximum occupancy.
1543 if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
1544 return false;
1545
1546 LocalMemLimit = MaxSizeWithWaveCount;
1547
1548 LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
1549 << " bytes of LDS\n"
1550 << " Rounding size to " << MaxSizeWithWaveCount
1551 << " with a maximum occupancy of " << MaxOccupancy << '\n'
1552 << " and " << (LocalMemLimit - CurrentLocalMemUsage)
1553 << " available for promotion\n");
1554
1555 return true;
1556}
1557
1558// FIXME: Should try to pick the most likely to be profitable allocas first.
1559bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(
1560 AllocaAnalysis &AA, bool SufficientLDS,
1561 SetVector<IntrinsicInst *> &DeferredIntrs) {
1562 LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << *AA.Alloca << '\n');
1563
1564 // Not likely to have sufficient local memory for promotion.
1565 if (!SufficientLDS)
1566 return false;
1567
1568 const DataLayout &DL = Mod->getDataLayout();
1569 IRBuilder<> Builder(AA.Alloca);
1570
1571 const Function &ContainingFunction = *AA.Alloca->getParent()->getParent();
1572 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction);
1573 unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
1574
1575 Align Alignment = DL.getValueOrABITypeAlignment(
1576 AA.Alloca->getAlign(), AA.Alloca->getAllocatedType());
1577
1578 // FIXME: This computed padding is likely wrong since it depends on inverse
1579 // usage order.
1580 //
1581 // FIXME: It is also possible that if we're allowed to use all of the memory,
1582 // we could end up using more than the maximum due to alignment padding.
1583
1584 uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
1585 uint32_t AllocSize =
1586 WorkGroupSize * DL.getTypeAllocSize(AA.Alloca->getAllocatedType());
1587 NewSize += AllocSize;
1588
1589 if (NewSize > LocalMemLimit) {
1590 LLVM_DEBUG(dbgs() << " " << AllocSize
1591 << " bytes of local memory not available to promote\n");
1592 return false;
1593 }
1594
1595 CurrentLocalMemUsage = NewSize;
1596
1597 LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");
1598
1599 Function *F = AA.Alloca->getFunction();
1600
1601 Type *GVTy = ArrayType::get(AA.Alloca->getAllocatedType(), WorkGroupSize);
1602 GlobalVariable *GV = new GlobalVariable(
1603 *Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(GVTy),
1604 Twine(F->getName()) + Twine('.') + AA.Alloca->getName(), nullptr,
1605 GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
1606 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
1607 GV->setAlignment(AA.Alloca->getAlign());
1608
1609 Value *TCntY, *TCntZ;
1610
1611 std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
1612 Value *TIdX = getWorkitemID(Builder, 0);
1613 Value *TIdY = getWorkitemID(Builder, 1);
1614 Value *TIdZ = getWorkitemID(Builder, 2);
1615
1616 Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
1617 Tmp0 = Builder.CreateMul(Tmp0, TIdX);
1618 Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
1619 Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
1620 TID = Builder.CreateAdd(TID, TIdZ);
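 // TID is the linearized workitem id within the workgroup:
 //   TID = TIdX * (TCntY * TCntZ) + TIdY * TCntZ + TIdZ
 // e.g. in a hypothetical 4x2x2 workgroup, workitem (1, 0, 1) gets
 // TID = 1 * 4 + 0 * 2 + 1 = 5, which selects its private slot in the array.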
1621
1622 LLVMContext &Context = Mod->getContext();
1623 Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(Context)), TID};
1624
1625 Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
1626 AA.Alloca->mutateType(Offset->getType());
1627 AA.Alloca->replaceAllUsesWith(Offset);
1628 AA.Alloca->eraseFromParent();
1629
1630 PointerType *NewPtrTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);
1631
1632 for (Value *V : AA.LDS.Worklist) {
1633 CallInst *Call = dyn_cast<CallInst>(V);
1634 if (!Call) {
1635 if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
1636 Value *LHS = CI->getOperand(0);
1637 Value *RHS = CI->getOperand(1);
1638
1639 Type *NewTy = LHS->getType()->getWithNewType(NewPtrTy);
1640 if (isa<ConstantPointerNull>(LHS))
1641 CI->setOperand(0, Constant::getNullValue(NewTy));
1642
1643 if (isa<ConstantPointerNull>(RHS))
1644 CI->setOperand(1, Constant::getNullValue(NewTy));
1645
1646 continue;
1647 }
1648
1649 // The operand's value should be corrected on its own and we don't want to
1650 // touch the users.
1652 continue;
1653
1654 assert(V->getType()->isPtrOrPtrVectorTy());
1655
1656 Type *NewTy = V->getType()->getWithNewType(NewPtrTy);
1657 V->mutateType(NewTy);
1658
1659 // Adjust the types of any constant operands.
1660 if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
1661 if (isa<ConstantPointerNull>(SI->getOperand(1)))
1662 SI->setOperand(1, Constant::getNullValue(NewTy));
1663
1664 if (isa<ConstantPointerNull>(SI->getOperand(2)))
1665 SI->setOperand(2, Constant::getNullValue(NewTy));
1666 } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
1667 for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
1668 if (isa<ConstantPointerNull>(
1669 Phi->getIncomingValue(I)))
1670 Phi->setIncomingValue(I, Constant::getNullValue(NewTy));
1671 }
1672 }
1673
1674 continue;
1675 }
1676
1677 IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
1678 Builder.SetInsertPoint(Intr);
1679 switch (Intr->getIntrinsicID()) {
1680 case Intrinsic::lifetime_start:
1681 case Intrinsic::lifetime_end:
1682 // These intrinsics are for address space 0 only
1683 Intr->eraseFromParent();
1684 continue;
1685 case Intrinsic::memcpy:
1686 case Intrinsic::memmove:
1687 // These have 2 pointer operands. In case if second pointer also needs
1688 // to be replaced we defer processing of these intrinsics until all
1689 // other values are processed.
1690 DeferredIntrs.insert(Intr);
1691 continue;
1692 case Intrinsic::memset: {
1693 MemSetInst *MemSet = cast<MemSetInst>(Intr);
1694 Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
1695 MemSet->getLength(), MemSet->getDestAlign(),
1696 MemSet->isVolatile());
1697 Intr->eraseFromParent();
1698 continue;
1699 }
1700 case Intrinsic::invariant_start:
1701 case Intrinsic::invariant_end:
1702 case Intrinsic::launder_invariant_group:
1703 case Intrinsic::strip_invariant_group: {
1704 SmallVector<Value *, 2> Args;
1705 if (Intr->getIntrinsicID() == Intrinsic::invariant_start) {
1706 Args.emplace_back(Intr->getArgOperand(0));
1707 } else if (Intr->getIntrinsicID() == Intrinsic::invariant_end) {
1708 Args.emplace_back(Intr->getArgOperand(0));
1709 Args.emplace_back(Intr->getArgOperand(1));
1710 }
1711 Args.emplace_back(Offset);
1712 Function *F = Intrinsic::getOrInsertDeclaration(
1713 Intr->getModule(), Intr->getIntrinsicID(), Offset->getType());
1714 CallInst *NewIntr =
1715 CallInst::Create(F, Args, Intr->getName(), Intr->getIterator());
1716 Intr->mutateType(NewIntr->getType());
1717 Intr->replaceAllUsesWith(NewIntr);
1718 Intr->eraseFromParent();
1719 continue;
1720 }
1721 case Intrinsic::objectsize: {
1722 Value *Src = Intr->getOperand(0);
1723
1724 CallInst *NewCall = Builder.CreateIntrinsic(
1725 Intrinsic::objectsize,
1726 {Intr->getType(), NewPtrTy},
1727 {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
1728 Intr->replaceAllUsesWith(NewCall);
1729 Intr->eraseFromParent();
1730 continue;
1731 }
1732 default:
1733 Intr->print(errs());
1734 llvm_unreachable("Don't know how to promote alloca intrinsic use.");
1735 }
1736 }
1737
1738 return true;
1739}
1740
1741void AMDGPUPromoteAllocaImpl::finishDeferredAllocaToLDSPromotion(
1742 SetVector<IntrinsicInst *> &DeferredIntrs) {
1743
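 // The deferred memcpy/memmove calls are only rebuilt once all other worklist
 // values have been rewritten, so both pointer operands already carry their
 // final address space; dereferenceable parameter attributes are copied over
 // from the original call.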
1744 for (IntrinsicInst *Intr : DeferredIntrs) {
1745 IRBuilder<> Builder(Intr);
1746 Builder.SetInsertPoint(Intr);
1747 Intrinsic::ID ID = Intr->getIntrinsicID();
1748 assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);
1749
1750 MemTransferInst *MI = cast<MemTransferInst>(Intr);
1751 auto *B = Builder.CreateMemTransferInst(
1752 ID, MI->getRawDest(), MI->getDestAlign(), MI->getRawSource(),
1753 MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
1754
1755 for (unsigned I = 0; I != 2; ++I) {
1756 if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I)) {
1757 B->addDereferenceableParamAttr(I, Bytes);
1758 }
1759 }
1760
1761 Intr->eraseFromParent();
1762 }
1763}