AMDGPUPromoteAlloca.cpp (LLVM 22.0.0git)
1//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Eliminates allocas by either converting them into vectors or by migrating
10// them to local address space.
11//
12// Two passes are exposed by this file:
13// - "promote-alloca-to-vector", which runs early in the pipeline and only
14// promotes to vector. Promotion to vector is almost always profitable
15// except when the alloca is too big and the promotion would result in
16// very high register pressure.
17// - "promote-alloca", which does both promotion to vector and LDS and runs
18// much later in the pipeline. This runs after SROA because promoting to
19// LDS is of course less profitable than getting rid of the alloca or
20// vectorizing it, thus we only want to do it when the only alternative is
21// lowering the alloca to stack.
22//
23// Note that both of them exist for the old and new PMs. The new PM passes are
24// declared in AMDGPU.h and the legacy PM ones are declared here.
25//
26//===----------------------------------------------------------------------===//
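//
// As a rough illustration (hypothetical IR, not taken from a test), promotion
// to vector turns a pattern such as
//
//   %stack = alloca [4 x i32], align 4, addrspace(5)
//   %gep   = getelementptr [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %i
//   store i32 %v, ptr addrspace(5) %gep
//   %r     = load i32, ptr addrspace(5) %gep
//
// into SSA manipulation of a <4 x i32> vector value (insertelement /
// extractelement), so the scratch allocation disappears entirely.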
27
28#include "AMDGPU.h"
29#include "GCNSubtarget.h"
30#include "Utils/AMDGPUBaseInfo.h"
31#include "llvm/ADT/STLExtras.h"
32#include "llvm/Analysis/CaptureTracking.h"
33#include "llvm/Analysis/InstSimplifyFolder.h"
34#include "llvm/Analysis/InstructionSimplify.h"
35#include "llvm/Analysis/LoopInfo.h"
36#include "llvm/Analysis/ValueTracking.h"
37#include "llvm/CodeGen/TargetPassConfig.h"
38#include "llvm/IR/IRBuilder.h"
39#include "llvm/IR/IntrinsicInst.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
42#include "llvm/IR/PatternMatch.h"
43#include "llvm/InitializePasses.h"
44#include "llvm/Pass.h"
45#include "llvm/Target/TargetMachine.h"
46#include "llvm/Transforms/Utils/SSAUpdater.h"
47
48#define DEBUG_TYPE "amdgpu-promote-alloca"
49
50using namespace llvm;
51
52namespace {
53
54static cl::opt<bool>
55 DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
56 cl::desc("Disable promote alloca to vector"),
57 cl::init(false));
58
59static cl::opt<bool>
60 DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
61 cl::desc("Disable promote alloca to LDS"),
62 cl::init(false));
63
64static cl::opt<unsigned> PromoteAllocaToVectorLimit(
65 "amdgpu-promote-alloca-to-vector-limit",
66 cl::desc("Maximum byte size to consider promote alloca to vector"),
67 cl::init(0));
68
69static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
70 "amdgpu-promote-alloca-to-vector-max-regs",
71 cl::desc(
72 "Maximum vector size (in 32b registers) to use when promoting alloca"),
73 cl::init(32));
74
75// Use up to 1/4 of available register budget for vectorization.
76// FIXME: Increase the limit for whole function budgets? Perhaps x2?
77static cl::opt<unsigned> PromoteAllocaToVectorVGPRRatio(
78 "amdgpu-promote-alloca-to-vector-vgpr-ratio",
79 cl::desc("Ratio of VGPRs to budget for promoting alloca to vectors"),
80 cl::init(4));
81
82static cl::opt<unsigned>
83 LoopUserWeight("promote-alloca-vector-loop-user-weight",
84 cl::desc("The bonus weight of users of allocas within loop "
85 "when sorting profitable allocas"),
86 cl::init(4));
87
88// Shared implementation which can do both promotion to vector and to LDS.
89class AMDGPUPromoteAllocaImpl {
90private:
91 const TargetMachine &TM;
92 LoopInfo &LI;
93 Module *Mod = nullptr;
94 const DataLayout *DL = nullptr;
95
96 // FIXME: This should be per-kernel.
97 uint32_t LocalMemLimit = 0;
98 uint32_t CurrentLocalMemUsage = 0;
99 unsigned MaxVGPRs;
100 unsigned VGPRBudgetRatio;
101 unsigned MaxVectorRegs;
102
103 bool IsAMDGCN = false;
104 bool IsAMDHSA = false;
105
106 std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
107 Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
108
109 /// BaseAlloca is the alloca root the search started from.
110 /// Val may be that alloca or a recursive user of it.
111 bool collectUsesWithPtrTypes(Value *BaseAlloca, Value *Val,
112 std::vector<Value *> &WorkList) const;
113
114 /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
115 /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
116 /// Returns true if both operands are derived from the same alloca. Val should
117 /// be the same value as one of the input operands of UseInst.
118 bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
119 Instruction *UseInst, int OpIdx0,
120 int OpIdx1) const;
121
122 /// Check whether we have enough local memory for promotion.
123 bool hasSufficientLocalMem(const Function &F);
124
125 bool tryPromoteAllocaToVector(AllocaInst &I);
126 bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS);
127
128 void sortAllocasToPromote(SmallVectorImpl<AllocaInst *> &Allocas);
129
130 void setFunctionLimits(const Function &F);
131
132public:
133 AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {
134
135 const Triple &TT = TM.getTargetTriple();
136 IsAMDGCN = TT.isAMDGCN();
137 IsAMDHSA = TT.getOS() == Triple::AMDHSA;
138 }
139
140 bool run(Function &F, bool PromoteToLDS);
141};
142
143// FIXME: This can create globals so should be a module pass.
144class AMDGPUPromoteAlloca : public FunctionPass {
145public:
146 static char ID;
147
148 AMDGPUPromoteAlloca() : FunctionPass(ID) {}
149
150 bool runOnFunction(Function &F) override {
151 if (skipFunction(F))
152 return false;
153 if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
154 return AMDGPUPromoteAllocaImpl(
155 TPC->getTM<TargetMachine>(),
156 getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
157 .run(F, /*PromoteToLDS*/ true);
158 return false;
159 }
160
161 StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
162
163 void getAnalysisUsage(AnalysisUsage &AU) const override {
164 AU.setPreservesCFG();
165 AU.addRequired<LoopInfoWrapperPass>();
166 FunctionPass::getAnalysisUsage(AU);
167 }
168};
169
170static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
171 const Function &F) {
172 if (!TM.getTargetTriple().isAMDGCN())
173 return 128;
174
175 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
176
177 unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
178 // Temporarily check both the attribute and the subtarget feature, until the
179 // latter is removed.
180 if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
181 DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
182
183 unsigned MaxVGPRs = ST.getMaxNumVGPRs(
184 ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first,
185 DynamicVGPRBlockSize);
186
187 // A non-entry function has only 32 caller preserved registers.
188 // Do not promote alloca which will force spilling unless we know the function
189 // will be inlined.
190 if (!F.hasFnAttribute(Attribute::AlwaysInline) &&
191 !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
192 MaxVGPRs = std::min(MaxVGPRs, 32u);
193 return MaxVGPRs;
194}
195
196} // end anonymous namespace
197
198char AMDGPUPromoteAlloca::ID = 0;
199
200INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
201 "AMDGPU promote alloca to vector or LDS", false, false)
202// Move LDS uses from functions to kernels before promote alloca for accurate
203// estimation of LDS available
204INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
205INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
206INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
207 "AMDGPU promote alloca to vector or LDS", false, false)
208
209char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
210
211PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
212 FunctionAnalysisManager &AM) {
213 auto &LI = AM.getResult<LoopAnalysis>(F);
214 bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true);
215 if (Changed) {
216 PreservedAnalyses PA;
217 PA.preserveSet<CFGAnalyses>();
218 return PA;
219 }
220 return PreservedAnalyses::all();
221}
222
223PreservedAnalyses
224AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
225 auto &LI = AM.getResult<LoopAnalysis>(F);
226 bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false);
227 if (Changed) {
228 PreservedAnalyses PA;
229 PA.preserveSet<CFGAnalyses>();
230 return PA;
231 }
232 return PreservedAnalyses::all();
233}
234
235FunctionPass *llvm::createAMDGPUPromoteAlloca() {
236 return new AMDGPUPromoteAlloca();
237}
238
239static void collectAllocaUses(AllocaInst &Alloca,
240 SmallVectorImpl<Use *> &Uses) {
241 SmallVector<Instruction *, 4> WorkList({&Alloca});
242 while (!WorkList.empty()) {
243 auto *Cur = WorkList.pop_back_val();
244 for (auto &U : Cur->uses()) {
245 Uses.push_back(&U);
246
247 if (isa<GetElementPtrInst>(U.getUser()))
248 WorkList.push_back(cast<Instruction>(U.getUser()));
249 }
250 }
251}
252
253void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
254 SmallVectorImpl<AllocaInst *> &Allocas) {
255 DenseMap<AllocaInst *, unsigned> Scores;
256
257 for (auto *Alloca : Allocas) {
258 LLVM_DEBUG(dbgs() << "Scoring: " << *Alloca << "\n");
259 unsigned &Score = Scores[Alloca];
260 // Increment score by one for each user + a bonus for users within loops.
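    // (For example, with the default LoopUserWeight of 4, a single user at
    // loop depth 2 contributes 1 + 4 * 2 = 9 to the alloca's score.)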
261 SmallVector<Use *, 8> Uses;
262 collectAllocaUses(*Alloca, Uses);
263 for (auto *U : Uses) {
264 Instruction *Inst = cast<Instruction>(U->getUser());
265 if (isa<GetElementPtrInst>(Inst))
266 continue;
267 unsigned UserScore =
268 1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent()));
269 LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n");
270 Score += UserScore;
271 }
272 LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n");
273 }
274
275 stable_sort(Allocas, [&](AllocaInst *A, AllocaInst *B) {
276 return Scores.at(A) > Scores.at(B);
277 });
278
279 // clang-format off
280 LLVM_DEBUG(
281 dbgs() << "Sorted Worklist:\n";
282 for (auto *A: Allocas)
283 dbgs() << " " << *A << "\n";
284 );
285 // clang-format on
286}
287
288void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
289 // Load per function limits, overriding with global options where appropriate.
290 // R600 register tuples/aliasing are fragile with large vector promotions so
291 // apply architecture specific limit here.
292 const int R600MaxVectorRegs = 16;
293 MaxVectorRegs = F.getFnAttributeAsParsedInteger(
294 "amdgpu-promote-alloca-to-vector-max-regs",
295 IsAMDGCN ? PromoteAllocaToVectorMaxRegs : R600MaxVectorRegs);
296 if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
297 MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
298 VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
299 "amdgpu-promote-alloca-to-vector-vgpr-ratio",
300 PromoteAllocaToVectorVGPRRatio);
301 if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences())
302 VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
303}
304
305bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
306 Mod = F.getParent();
307 DL = &Mod->getDataLayout();
308
309 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
310 if (!ST.isPromoteAllocaEnabled())
311 return false;
312
313 bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F);
314 MaxVGPRs = getMaxVGPRs(CurrentLocalMemUsage, TM, F);
315 setFunctionLimits(F);
316
317 unsigned VectorizationBudget =
318 (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
319 : (MaxVGPRs * 32)) /
320 VGPRBudgetRatio;
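  // Illustrative arithmetic: with MaxVGPRs = 256 and the default VGPR ratio
  // of 4, the budget is (256 * 32) / 4 = 2048 bits, i.e. the equivalent of
  // 64 32-bit elements shared by all promoted allocas in the function.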
321
322 SmallVector<AllocaInst *, 16> Allocas;
323 for (Instruction &I : F.getEntryBlock()) {
324 if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
325 // Array allocations are probably not worth handling, since an allocation
326 // of the array type is the canonical form.
327 if (!AI->isStaticAlloca() || AI->isArrayAllocation())
328 continue;
329 Allocas.push_back(AI);
330 }
331 }
332
333 sortAllocasToPromote(Allocas);
334
335 bool Changed = false;
336 for (AllocaInst *AI : Allocas) {
337 const unsigned AllocaCost = DL->getTypeSizeInBits(AI->getAllocatedType());
338 // First, check if we have enough budget to vectorize this alloca.
339 if (AllocaCost <= VectorizationBudget) {
340 // If we do, attempt vectorization, otherwise, fall through and try
341 // promoting to LDS instead.
342 if (tryPromoteAllocaToVector(*AI)) {
343 Changed = true;
344 assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
345 "Underflow!");
346 VectorizationBudget -= AllocaCost;
347 LLVM_DEBUG(dbgs() << " Remaining vectorization budget:"
348 << VectorizationBudget << "\n");
349 continue;
350 }
351 } else {
352 LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:"
353 << AllocaCost << ", budget:" << VectorizationBudget
354 << "): " << *AI << "\n");
355 }
356
357 if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
358 Changed = true;
359 }
360
361 // NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains
362 // dangling pointers. If we want to reuse it past this point, the loop above
363 // would need to be updated to remove successfully promoted allocas.
364
365 return Changed;
366}
367
368struct MemTransferInfo {
369 ConstantInt *SrcIndex = nullptr;
370 ConstantInt *DestIndex = nullptr;
371};
372
373// Checks if the instruction I is a memset user of the alloca AI that we can
374// deal with. Currently, only non-volatile memsets that affect the whole alloca
375// are handled.
376static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
377 const DataLayout &DL) {
378 using namespace PatternMatch;
379 // For now we only care about non-volatile memsets that affect the whole type
380 // (start at index 0 and fill the whole alloca).
381 //
382 // TODO: Now that we moved to PromoteAlloca we could handle any memsets
383 // (except maybe volatile ones?) - we just need to use shufflevector if it
384 // only affects a subset of the vector.
385 const unsigned Size = DL.getTypeStoreSize(AI->getAllocatedType());
386 return I->getOperand(0) == AI &&
387 match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile();
388}
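// As a hypothetical example of a memset this helper accepts, for an
// `alloca [4 x i32], addrspace(5)` (16 bytes of store size):
//   call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 16, i1 false)
// i.e. the destination is the alloca itself, the length equals the full store
// size, and the access is not volatile.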
389
390static Value *calculateVectorIndex(
391 Value *Ptr, const std::map<GetElementPtrInst *, WeakTrackingVH> &GEPIdx) {
392 auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts());
393 if (!GEP)
394 return ConstantInt::getNullValue(Type::getInt32Ty(Ptr->getContext()));
395
396 auto I = GEPIdx.find(GEP);
397 assert(I != GEPIdx.end() && "Must have entry for GEP!");
398
399 Value *IndexValue = I->second;
400 assert(IndexValue && "index value missing from GEP index map");
401 return IndexValue;
402}
403
404static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
405 Type *VecElemTy, const DataLayout &DL,
406 SmallVector<Instruction *> &NewInsts) {
407 // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
408 // helper.
409 unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
410 SmallMapVector<Value *, APInt, 4> VarOffsets;
411 APInt ConstOffset(BW, 0);
412
413 // Walk backwards through nested GEPs to collect both constant and variable
414 // offsets, so that nested vector GEP chains can be lowered in one step.
415 //
416 // Given this IR fragment as input:
417 //
418 // %0 = alloca [10 x <2 x i32>], align 8, addrspace(5)
419 // %1 = getelementptr [10 x <2 x i32>], ptr addrspace(5) %0, i32 0, i32 %j
420 // %2 = getelementptr i8, ptr addrspace(5) %1, i32 4
421 // %3 = load i32, ptr addrspace(5) %2, align 4
422 //
423 // Combine both GEP operations in a single pass, producing:
424 // BasePtr = %0
425 // ConstOffset = 4
426 // VarOffsets = { %j -> element_size(<2 x i32>) }
427 //
428 // That lets us emit a single buffer_load directly into a VGPR, without ever
429 // allocating scratch memory for the intermediate pointer.
430 Value *CurPtr = GEP;
431 while (auto *CurGEP = dyn_cast<GetElementPtrInst>(CurPtr)) {
432 if (!CurGEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
433 return nullptr;
434
435 // Move to the next outer pointer.
436 CurPtr = CurGEP->getPointerOperand();
437 }
438
439 assert(CurPtr == Alloca && "GEP not based on alloca");
440
441 unsigned VecElemSize = DL.getTypeAllocSize(VecElemTy);
442 if (VarOffsets.size() > 1)
443 return nullptr;
444
445 APInt IndexQuot;
446 APInt Rem;
447 APInt::sdivrem(ConstOffset, APInt(ConstOffset.getBitWidth(), VecElemSize),
448 IndexQuot, Rem);
449 if (!Rem.isZero())
450 return nullptr;
451 if (VarOffsets.size() == 0)
452 return ConstantInt::get(GEP->getContext(), IndexQuot);
453
454 IRBuilder<> Builder(GEP);
455
456 const auto &VarOffset = VarOffsets.front();
457 APInt OffsetQuot;
458 APInt::sdivrem(VarOffset.second,
459 APInt(VarOffset.second.getBitWidth(), VecElemSize), OffsetQuot,
460 Rem);
461 if (!Rem.isZero() || OffsetQuot.isZero())
462 return nullptr;
463
464 Value *Offset = VarOffset.first;
465 auto *OffsetType = dyn_cast<IntegerType>(Offset->getType());
466 if (!OffsetType)
467 return nullptr;
468
469 if (!OffsetQuot.isOne()) {
470 ConstantInt *ConstMul =
471 ConstantInt::get(OffsetType, OffsetQuot.getSExtValue());
472 Offset = Builder.CreateMul(Offset, ConstMul);
473 if (Instruction *NewInst = dyn_cast<Instruction>(Offset))
474 NewInsts.push_back(NewInst);
475 }
476 if (ConstOffset.isZero())
477 return Offset;
478
479 ConstantInt *ConstIndex =
480 ConstantInt::get(OffsetType, IndexQuot.getSExtValue());
481 Value *IndexAdd = Builder.CreateAdd(Offset, ConstIndex);
482 if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd))
483 NewInsts.push_back(NewInst);
484 return IndexAdd;
485}
486
487/// Promotes a single user of the alloca to a vector form.
488///
489/// \param Inst Instruction to be promoted.
490/// \param DL Module Data Layout.
491/// \param VectorTy Vectorized Type.
492/// \param VecStoreSize Size of \p VectorTy in bytes.
493/// \param ElementSize Size of \p VectorTy element type in bytes.
494/// \param TransferInfo MemTransferInst info map.
495/// \param GEPVectorIdx GEP -> VectorIdx cache.
496/// \param CurVal Current value of the vector (e.g. last stored value)
497/// \param[out] DeferredLoads \p Inst is added to this vector if it can't
498/// be promoted now. This happens when promoting requires \p
499/// CurVal, but \p CurVal is nullptr.
500/// \return the stored value if \p Inst would have written to the alloca, or
501/// nullptr otherwise.
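///
/// As a rough illustration: a scalar store through a GEP with a variable
/// index becomes an insertelement into the tracked vector value, a scalar
/// load becomes an extractelement, and whole-vector accesses are handled with
/// a plain bit/pointer cast of the entire vector.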
502static Value *promoteAllocaUserToVector(
503 Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy,
504 unsigned VecStoreSize, unsigned ElementSize,
505 DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo,
506 std::map<GetElementPtrInst *, WeakTrackingVH> &GEPVectorIdx, Value *CurVal,
507 SmallVectorImpl<LoadInst *> &DeferredLoads) {
508 // Note: we use InstSimplifyFolder because it can leverage the DataLayout
509 // to do more folding, especially in the case of vector splats.
510 IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
511 InstSimplifyFolder(DL));
512 Builder.SetInsertPoint(Inst);
513
514 const auto GetOrLoadCurrentVectorValue = [&]() -> Value * {
515 if (CurVal)
516 return CurVal;
517
518 // If the current value is not known, insert a dummy load and lower it on
519 // the second pass.
520 LoadInst *Dummy =
521 Builder.CreateLoad(VectorTy, PoisonValue::get(Builder.getPtrTy()),
522 "promotealloca.dummyload");
523 DeferredLoads.push_back(Dummy);
524 return Dummy;
525 };
526
527 const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val,
528 Type *PtrTy) -> Value * {
529 assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy));
530 const unsigned Size = DL.getTypeStoreSizeInBits(PtrTy);
531 if (!PtrTy->isVectorTy())
532 return Builder.CreateBitOrPointerCast(Val, Builder.getIntNTy(Size));
533 const unsigned NumPtrElts = cast<FixedVectorType>(PtrTy)->getNumElements();
534 // If we want to cast, e.g. a <2 x ptr> into a <4 x i32>, we need to
535 // first cast the ptr vector to <2 x i64>.
536 assert((Size % NumPtrElts == 0) && "Vector size not divisible");
537 Type *EltTy = Builder.getIntNTy(Size / NumPtrElts);
538 return Builder.CreateBitOrPointerCast(
539 Val, FixedVectorType::get(EltTy, NumPtrElts));
540 };
541
542 Type *VecEltTy = VectorTy->getElementType();
543
544 switch (Inst->getOpcode()) {
545 case Instruction::Load: {
546 // Loads can only be lowered if the value is known.
547 if (!CurVal) {
548 DeferredLoads.push_back(cast<LoadInst>(Inst));
549 return nullptr;
550 }
551
552 Value *Index = calculateVectorIndex(
553 cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx);
554
555 // We're loading the full vector.
556 Type *AccessTy = Inst->getType();
557 TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
558 if (Constant *CI = dyn_cast<Constant>(Index)) {
559 if (CI->isZeroValue() && AccessSize == VecStoreSize) {
560 if (AccessTy->isPtrOrPtrVectorTy())
561 CurVal = CreateTempPtrIntCast(CurVal, AccessTy);
562 else if (CurVal->getType()->isPtrOrPtrVectorTy())
563 CurVal = CreateTempPtrIntCast(CurVal, CurVal->getType());
564 Value *NewVal = Builder.CreateBitOrPointerCast(CurVal, AccessTy);
565 Inst->replaceAllUsesWith(NewVal);
566 return nullptr;
567 }
568 }
569
570 // Loading a subvector.
571 if (isa<FixedVectorType>(AccessTy)) {
572 assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
573 const unsigned NumLoadedElts = AccessSize / DL.getTypeStoreSize(VecEltTy);
574 auto *SubVecTy = FixedVectorType::get(VecEltTy, NumLoadedElts);
575 assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
576
577 Value *SubVec = PoisonValue::get(SubVecTy);
578 for (unsigned K = 0; K < NumLoadedElts; ++K) {
579 Value *CurIdx =
580 Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
581 SubVec = Builder.CreateInsertElement(
582 SubVec, Builder.CreateExtractElement(CurVal, CurIdx), K);
583 }
584
585 if (AccessTy->isPtrOrPtrVectorTy())
586 SubVec = CreateTempPtrIntCast(SubVec, AccessTy);
587 else if (SubVecTy->isPtrOrPtrVectorTy())
588 SubVec = CreateTempPtrIntCast(SubVec, SubVecTy);
589
590 SubVec = Builder.CreateBitOrPointerCast(SubVec, AccessTy);
591 Inst->replaceAllUsesWith(SubVec);
592 return nullptr;
593 }
594
595 // We're loading one element.
596 Value *ExtractElement = Builder.CreateExtractElement(CurVal, Index);
597 if (AccessTy != VecEltTy)
598 ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, AccessTy);
599
600 Inst->replaceAllUsesWith(ExtractElement);
601 return nullptr;
602 }
603 case Instruction::Store: {
604 // For stores, it's a bit trickier and it depends on whether we're storing
605 // the full vector or not. If we're storing the full vector, we don't need
606 // to know the current value. If this is a store of a single element, we
607 // need to know the value.
608 StoreInst *SI = cast<StoreInst>(Inst);
609 Value *Index = calculateVectorIndex(SI->getPointerOperand(), GEPVectorIdx);
610 Value *Val = SI->getValueOperand();
611
612 // We're storing the full vector, we can handle this without knowing CurVal.
613 Type *AccessTy = Val->getType();
614 TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
615 if (Constant *CI = dyn_cast<Constant>(Index)) {
616 if (CI->isZeroValue() && AccessSize == VecStoreSize) {
617 if (AccessTy->isPtrOrPtrVectorTy())
618 Val = CreateTempPtrIntCast(Val, AccessTy);
619 else if (VectorTy->isPtrOrPtrVectorTy())
620 Val = CreateTempPtrIntCast(Val, VectorTy);
621 return Builder.CreateBitOrPointerCast(Val, VectorTy);
622 }
623 }
624
625 // Storing a subvector.
626 if (isa<FixedVectorType>(AccessTy)) {
627 assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
628 const unsigned NumWrittenElts =
629 AccessSize / DL.getTypeStoreSize(VecEltTy);
630 const unsigned NumVecElts = VectorTy->getNumElements();
631 auto *SubVecTy = FixedVectorType::get(VecEltTy, NumWrittenElts);
632 assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
633
634 if (SubVecTy->isPtrOrPtrVectorTy())
635 Val = CreateTempPtrIntCast(Val, SubVecTy);
636 else if (AccessTy->isPtrOrPtrVectorTy())
637 Val = CreateTempPtrIntCast(Val, AccessTy);
638
639 Val = Builder.CreateBitOrPointerCast(Val, SubVecTy);
640
641 Value *CurVec = GetOrLoadCurrentVectorValue();
642 for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts);
643 K < NumElts; ++K) {
644 Value *CurIdx =
645 Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
646 CurVec = Builder.CreateInsertElement(
647 CurVec, Builder.CreateExtractElement(Val, K), CurIdx);
648 }
649 return CurVec;
650 }
651
652 if (Val->getType() != VecEltTy)
653 Val = Builder.CreateBitOrPointerCast(Val, VecEltTy);
654 return Builder.CreateInsertElement(GetOrLoadCurrentVectorValue(), Val,
655 Index);
656 }
657 case Instruction::Call: {
658 if (auto *MTI = dyn_cast<MemTransferInst>(Inst)) {
659 // For memcpy, we need to know curval.
660 ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
661 unsigned NumCopied = Length->getZExtValue() / ElementSize;
662 MemTransferInfo *TI = &TransferInfo[MTI];
663 unsigned SrcBegin = TI->SrcIndex->getZExtValue();
664 unsigned DestBegin = TI->DestIndex->getZExtValue();
665
666 SmallVector<int> Mask;
667 for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) {
668 if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
669 Mask.push_back(SrcBegin < VectorTy->getNumElements()
670 ? SrcBegin++
671 : PoisonMaskElem);
672 } else {
673 Mask.push_back(Idx);
674 }
675 }
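      // Worked example (illustrative): for a <4 x i32> vector with
      // NumCopied = 2, SrcBegin = 0 and DestBegin = 2, the mask becomes
      // {0, 1, 0, 1}, i.e. lanes 2 and 3 receive copies of lanes 0 and 1
      // while lanes 0 and 1 keep their current values.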
676
677 return Builder.CreateShuffleVector(GetOrLoadCurrentVectorValue(), Mask);
678 }
679
680 if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
681 // For memset, we don't need to know the previous value because we
682 // currently only allow memsets that cover the whole alloca.
683 Value *Elt = MSI->getOperand(1);
684 const unsigned BytesPerElt = DL.getTypeStoreSize(VecEltTy);
685 if (BytesPerElt > 1) {
686 Value *EltBytes = Builder.CreateVectorSplat(BytesPerElt, Elt);
687
688 // If the element type of the vector is a pointer, we need to first cast
689 // to an integer, then use a PtrCast.
690 if (VecEltTy->isPointerTy()) {
691 Type *PtrInt = Builder.getIntNTy(BytesPerElt * 8);
692 Elt = Builder.CreateBitCast(EltBytes, PtrInt);
693 Elt = Builder.CreateIntToPtr(Elt, VecEltTy);
694 } else
695 Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
696 }
697
698 return Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt);
699 }
700
701 if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
702 if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
703 Intr->replaceAllUsesWith(
704 Builder.getIntN(Intr->getType()->getIntegerBitWidth(),
705 DL.getTypeAllocSize(VectorTy)));
706 return nullptr;
707 }
708 }
709
710 llvm_unreachable("Unsupported call when promoting alloca to vector");
711 }
712
713 default:
714 llvm_unreachable("Inconsistency in instructions promotable to vector");
715 }
716
717 llvm_unreachable("Did not return after promoting instruction!");
718}
719
720static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
721 const DataLayout &DL) {
722 // Access as a vector type can work if the size of the access vector is a
723 // multiple of the size of the alloca's vector element type.
724 //
725 // Examples:
726 // - VecTy = <8 x float>, AccessTy = <4 x float> -> OK
727 // - VecTy = <4 x double>, AccessTy = <2 x float> -> OK
728 // - VecTy = <4 x double>, AccessTy = <3 x float> -> NOT OK
729 // - 3*32 is not a multiple of 64
730 //
731 // We could handle more complicated cases, but it'd make things a lot more
732 // complicated.
733 if (isa<FixedVectorType>(AccessTy)) {
734 TypeSize AccTS = DL.getTypeStoreSize(AccessTy);
735 // If the type size and the store size don't match, we would need to do more
736 // than just a bitcast to translate between extracted/insertable subvectors
737 // and the accessed value.
738 if (AccTS * 8 != DL.getTypeSizeInBits(AccessTy))
739 return false;
740 TypeSize VecTS = DL.getTypeStoreSize(VecTy->getElementType());
741 return AccTS.isKnownMultipleOf(VecTS);
742 }
743
744 return CastInst::isBitOrNoopPointerCastable(VecTy->getElementType(), AccessTy,
745 DL);
746}
747
748/// Iterates over an instruction worklist that may contain multiple instructions
749/// from the same basic block, but in a different order.
750template <typename InstContainer>
751static void forEachWorkListItem(const InstContainer &WorkList,
752 std::function<void(Instruction *)> Fn) {
753 // Bucket up uses of the alloca by the block they occur in.
754 // This is important because we have to handle multiple defs/uses in a block
755 // ourselves: SSAUpdater is purely for cross-block references.
756 SmallDenseMap<BasicBlock *, SmallDenseSet<Instruction *>> UsesByBlock;
757 for (Instruction *User : WorkList)
758 UsesByBlock[User->getParent()].insert(User);
759
760 for (Instruction *User : WorkList) {
761 BasicBlock *BB = User->getParent();
762 auto &BlockUses = UsesByBlock[BB];
763
764 // Already processed, skip.
765 if (BlockUses.empty())
766 continue;
767
768 // Only user in the block, directly process it.
769 if (BlockUses.size() == 1) {
770 Fn(User);
771 continue;
772 }
773
774 // Multiple users in the block, do a linear scan to see users in order.
775 for (Instruction &Inst : *BB) {
776 if (!BlockUses.contains(&Inst))
777 continue;
778
779 Fn(&Inst);
780 }
781
782 // Clear the block so we know it's been processed.
783 BlockUses.clear();
784 }
785}
786
787/// Find an insert point after an alloca, after all other allocas clustered at
788/// the start of the block.
789static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
790 BasicBlock::iterator I) {
791 for (BasicBlock::iterator E = BB.end(); I != E && isa<AllocaInst>(*I); ++I)
792 ;
793 return I;
794}
795
796// FIXME: Should try to pick the most likely to be profitable allocas first.
797bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
798 LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n');
799
800 if (DisablePromoteAllocaToVector) {
801 LLVM_DEBUG(dbgs() << " Promote alloca to vector is disabled\n");
802 return false;
803 }
804
805 Type *AllocaTy = Alloca.getAllocatedType();
806 auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
807 if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
808 uint64_t NumElems = 1;
809 Type *ElemTy;
810 do {
811 NumElems *= ArrayTy->getNumElements();
812 ElemTy = ArrayTy->getElementType();
813 } while ((ArrayTy = dyn_cast<ArrayType>(ElemTy)));
814
815 // Check for array of vectors
816 auto *InnerVectorTy = dyn_cast<FixedVectorType>(ElemTy);
817 if (InnerVectorTy) {
818 NumElems *= InnerVectorTy->getNumElements();
819 ElemTy = InnerVectorTy->getElementType();
820 }
821
822 if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
823 unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
824 if (ElementSize > 0) {
825 unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
826 // Expand vector if required to match padding of inner type,
827 // i.e. odd size subvectors.
828 // Storage size of new vector must match that of alloca for correct
829 // behaviour of byte offsets and GEP computation.
830 if (NumElems * ElementSize != AllocaSize)
831 NumElems = AllocaSize / ElementSize;
832 if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
833 VectorTy = FixedVectorType::get(ElemTy, NumElems);
834 }
835 }
836 }
837
838 if (!VectorTy) {
839 LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
840 return false;
841 }
842
843 const unsigned MaxElements =
844 (MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType());
845
846 if (VectorTy->getNumElements() > MaxElements ||
847 VectorTy->getNumElements() < 2) {
848 LLVM_DEBUG(dbgs() << " " << *VectorTy
849 << " has an unsupported number of elements\n");
850 return false;
851 }
852
853 std::map<GetElementPtrInst *, WeakTrackingVH> GEPVectorIdx;
854 SmallVector<Instruction *> WorkList;
855 SmallVector<Instruction *> UsersToRemove;
856 SmallVector<Instruction *> DeferredInsts;
857 SmallVector<Instruction *> NewGEPInsts;
858 DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
859
860 const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
861 LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n"
862 << " " << *Inst << "\n");
863 for (auto *Inst : reverse(NewGEPInsts))
864 Inst->eraseFromParent();
865 return false;
866 };
867
868 SmallVector<Use *, 8> Uses;
869 collectAllocaUses(Alloca, Uses);
870
871 LLVM_DEBUG(dbgs() << " Attempting promotion to: " << *VectorTy << "\n");
872
873 Type *VecEltTy = VectorTy->getElementType();
874 unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
875 if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
876 LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size "
877 "does not match the type's size\n");
878 return false;
879 }
880 unsigned ElementSize = ElementSizeInBits / 8;
881 assert(ElementSize > 0);
882 for (auto *U : Uses) {
883 Instruction *Inst = cast<Instruction>(U->getUser());
884
885 if (Value *Ptr = getLoadStorePointerOperand(Inst)) {
886 // This is a store of the pointer, not to the pointer.
887 if (isa<StoreInst>(Inst) &&
888 U->getOperandNo() != StoreInst::getPointerOperandIndex())
889 return RejectUser(Inst, "pointer is being stored");
890
891 Type *AccessTy = getLoadStoreType(Inst);
892 if (AccessTy->isAggregateType())
893 return RejectUser(Inst, "unsupported load/store as aggregate");
894 assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy());
895
896 // Check that this is a simple access of a vector element.
897 bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple()
898 : cast<StoreInst>(Inst)->isSimple();
899 if (!IsSimple)
900 return RejectUser(Inst, "not a simple load or store");
901
902 Ptr = Ptr->stripPointerCasts();
903
904 // Alloca already accessed as vector.
905 if (Ptr == &Alloca && DL->getTypeStoreSize(Alloca.getAllocatedType()) ==
906 DL->getTypeStoreSize(AccessTy)) {
907 WorkList.push_back(Inst);
908 continue;
909 }
910
911 if (!isSupportedAccessType(VectorTy, AccessTy, *DL))
912 return RejectUser(Inst, "not a supported access type");
913
914 WorkList.push_back(Inst);
915 continue;
916 }
917
918 if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
919 // If we can't compute a vector index from this GEP, then we can't
920 // promote this alloca to vector.
921 Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts);
922 if (!Index)
923 return RejectUser(Inst, "cannot compute vector index for GEP");
924
925 GEPVectorIdx[GEP] = Index;
926 UsersToRemove.push_back(Inst);
927 continue;
928 }
929
930 if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst);
931 MSI && isSupportedMemset(MSI, &Alloca, *DL)) {
932 WorkList.push_back(Inst);
933 continue;
934 }
935
936 if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Inst)) {
937 if (TransferInst->isVolatile())
938 return RejectUser(Inst, "mem transfer inst is volatile");
939
940 ConstantInt *Len = dyn_cast<ConstantInt>(TransferInst->getLength());
941 if (!Len || (Len->getZExtValue() % ElementSize))
942 return RejectUser(Inst, "mem transfer inst length is non-constant or "
943 "not a multiple of the vector element size");
944
945 if (TransferInfo.try_emplace(TransferInst).second) {
946 DeferredInsts.push_back(Inst);
947 WorkList.push_back(Inst);
948 }
949
950 auto getPointerIndexOfAlloca = [&](Value *Ptr) -> ConstantInt * {
951 GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
952 if (Ptr != &Alloca && !GEPVectorIdx.count(GEP))
953 return nullptr;
954
955 return dyn_cast<ConstantInt>(calculateVectorIndex(Ptr, GEPVectorIdx));
956 };
957
958 unsigned OpNum = U->getOperandNo();
959 MemTransferInfo *TI = &TransferInfo[TransferInst];
960 if (OpNum == 0) {
961 Value *Dest = TransferInst->getDest();
962 ConstantInt *Index = getPointerIndexOfAlloca(Dest);
963 if (!Index)
964 return RejectUser(Inst, "could not calculate constant dest index");
965 TI->DestIndex = Index;
966 } else {
967 assert(OpNum == 1);
968 Value *Src = TransferInst->getSource();
969 ConstantInt *Index = getPointerIndexOfAlloca(Src);
970 if (!Index)
971 return RejectUser(Inst, "could not calculate constant src index");
972 TI->SrcIndex = Index;
973 }
974 continue;
975 }
976
977 if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
978 if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
979 WorkList.push_back(Inst);
980 continue;
981 }
982 }
983
984 // Ignore assume-like intrinsics and comparisons used in assumes.
985 if (isAssumeLikeIntrinsic(Inst)) {
986 if (!Inst->use_empty())
987 return RejectUser(Inst, "assume-like intrinsic cannot have any users");
988 UsersToRemove.push_back(Inst);
989 continue;
990 }
991
992 if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) {
993 return isAssumeLikeIntrinsic(cast<Instruction>(U));
994 })) {
995 UsersToRemove.push_back(Inst);
996 continue;
997 }
998
999 return RejectUser(Inst, "unhandled alloca user");
1000 }
1001
1002 while (!DeferredInsts.empty()) {
1003 Instruction *Inst = DeferredInsts.pop_back_val();
1004 MemTransferInst *TransferInst = cast<MemTransferInst>(Inst);
1005 // TODO: Support the case if the pointers are from different alloca or
1006 // from different address spaces.
1007 MemTransferInfo &Info = TransferInfo[TransferInst];
1008 if (!Info.SrcIndex || !Info.DestIndex)
1009 return RejectUser(
1010 Inst, "mem transfer inst is missing constant src and/or dst index");
1011 }
1012
1013 LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> "
1014 << *VectorTy << '\n');
1015 const unsigned VecStoreSize = DL->getTypeStoreSize(VectorTy);
1016
1017 // Alloca is uninitialized memory. Imitate that by making the first value
1018 // undef.
1019 SSAUpdater Updater;
1020 Updater.Initialize(VectorTy, "promotealloca");
1021
1022 BasicBlock *EntryBB = Alloca.getParent();
1023 BasicBlock::iterator InitInsertPos =
1024 skipToNonAllocaInsertPt(*EntryBB, Alloca.getIterator());
1025 // Alloca memory is undefined to begin, not poison.
1026 Value *AllocaInitValue =
1027 new FreezeInst(PoisonValue::get(VectorTy), "", InitInsertPos);
1028 AllocaInitValue->takeName(&Alloca);
1029
1030 Updater.AddAvailableValue(EntryBB, AllocaInitValue);
1031
1032 // First handle the initial worklist.
1033 SmallVector<LoadInst *, 4> DeferredLoads;
1034 forEachWorkListItem(WorkList, [&](Instruction *I) {
1035 BasicBlock *BB = I->getParent();
1036 // On the first pass, we only take values that are trivially known, i.e.
1037 // where AddAvailableValue was already called in this block.
1038 Value *Result = promoteAllocaUserToVector(
1039 I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
1040 Updater.FindValueForBlock(BB), DeferredLoads);
1041 if (Result)
1042 Updater.AddAvailableValue(BB, Result);
1043 });
1044
1045 // Then handle deferred loads.
1046 forEachWorkListItem(DeferredLoads, [&](Instruction *I) {
1047 SmallVector<LoadInst *> NewDLs;
1048 BasicBlock *BB = I->getParent();
1049 // On the second pass, we use GetValueInMiddleOfBlock to guarantee we always
1050 // get a value, inserting PHIs as needed.
1051 Value *Result = promoteAllocaUserToVector(
1052 I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
1053 Updater.GetValueInMiddleOfBlock(I->getParent()), NewDLs);
1054 if (Result)
1055 Updater.AddAvailableValue(BB, Result);
1056 assert(NewDLs.empty() && "No more deferred loads should be queued!");
1057 });
1058
1059 // Delete all instructions. On the first pass, new dummy loads may have been
1060 // added so we need to collect them too.
1061 DenseSet<Instruction *> InstsToDelete(WorkList.begin(), WorkList.end());
1062 InstsToDelete.insert_range(DeferredLoads);
1063 for (Instruction *I : InstsToDelete) {
1064 assert(I->use_empty());
1065 I->eraseFromParent();
1066 }
1067
1068 // Delete all the users that are known to be removable.
1069 for (Instruction *I : reverse(UsersToRemove)) {
1070 I->dropDroppableUses();
1071 assert(I->use_empty());
1072 I->eraseFromParent();
1073 }
1074
1075 // Alloca should now be dead too.
1076 assert(Alloca.use_empty());
1077 Alloca.eraseFromParent();
1078 return true;
1079}
1080
1081std::pair<Value *, Value *>
1082AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
1083 Function &F = *Builder.GetInsertBlock()->getParent();
1084 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
1085
1086 if (!IsAMDHSA) {
1087 CallInst *LocalSizeY =
1088 Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_y, {});
1089 CallInst *LocalSizeZ =
1090 Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_z, {});
1091
1092 ST.makeLIDRangeMetadata(LocalSizeY);
1093 ST.makeLIDRangeMetadata(LocalSizeZ);
1094
1095 return std::pair(LocalSizeY, LocalSizeZ);
1096 }
1097
1098 // We must read the size out of the dispatch pointer.
1099 assert(IsAMDGCN);
1100
1101 // We are indexing into this struct, and want to extract the workgroup_size_*
1102 // fields.
1103 //
1104 // typedef struct hsa_kernel_dispatch_packet_s {
1105 // uint16_t header;
1106 // uint16_t setup;
1107 // uint16_t workgroup_size_x ;
1108 // uint16_t workgroup_size_y;
1109 // uint16_t workgroup_size_z;
1110 // uint16_t reserved0;
1111 // uint32_t grid_size_x ;
1112 // uint32_t grid_size_y ;
1113 // uint32_t grid_size_z;
1114 //
1115 // uint32_t private_segment_size;
1116 // uint32_t group_segment_size;
1117 // uint64_t kernel_object;
1118 //
1119 // #ifdef HSA_LARGE_MODEL
1120 // void *kernarg_address;
1121 // #elif defined HSA_LITTLE_ENDIAN
1122 // void *kernarg_address;
1123 // uint32_t reserved1;
1124 // #else
1125 // uint32_t reserved1;
1126 // void *kernarg_address;
1127 // #endif
1128 // uint64_t reserved2;
1129 // hsa_signal_t completion_signal; // uint64_t wrapper
1130 // } hsa_kernel_dispatch_packet_t
1131 //
1132 CallInst *DispatchPtr =
1133 Builder.CreateIntrinsic(Intrinsic::amdgcn_dispatch_ptr, {});
1134 DispatchPtr->addRetAttr(Attribute::NoAlias);
1135 DispatchPtr->addRetAttr(Attribute::NonNull);
1136 F.removeFnAttr("amdgpu-no-dispatch-ptr");
1137
1138 // Size of the dispatch packet struct.
1139 DispatchPtr->addDereferenceableRetAttr(64);
1140
1141 Type *I32Ty = Type::getInt32Ty(Mod->getContext());
1142
1143 // We could do a single 64-bit load here, but it's likely that the basic
1144 // 32-bit load and extract sequence is already present, and it is probably easier
1145 // to CSE this. The loads should be mergeable later anyway.
1146 Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 1);
1147 LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));
1148
1149 Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 2);
1150 LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));
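  // With the packet layout above, LoadXY reads bytes 4-7 (workgroup_size_x in
  // the low half, workgroup_size_y in the high half) and LoadZU reads bytes
  // 8-11 (workgroup_size_z in the low half, the reserved0 field, expected to
  // be zero, in the high half).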
1151
1152 MDNode *MD = MDNode::get(Mod->getContext(), {});
1153 LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
1154 LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
1155 ST.makeLIDRangeMetadata(LoadZU);
1156
1157 // Extract y component. Upper half of LoadZU should be zero already.
1158 Value *Y = Builder.CreateLShr(LoadXY, 16);
1159
1160 return std::pair(Y, LoadZU);
1161}
1162
1163Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
1164 unsigned N) {
1165 Function *F = Builder.GetInsertBlock()->getParent();
1166 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, *F);
1167 Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
1168 StringRef AttrName;
1169
1170 switch (N) {
1171 case 0:
1172 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
1173 : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
1174 AttrName = "amdgpu-no-workitem-id-x";
1175 break;
1176 case 1:
1177 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
1178 : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
1179 AttrName = "amdgpu-no-workitem-id-y";
1180 break;
1181
1182 case 2:
1183 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
1184 : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
1185 AttrName = "amdgpu-no-workitem-id-z";
1186 break;
1187 default:
1188 llvm_unreachable("invalid dimension");
1189 }
1190
1191 Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(Mod, IntrID);
1192 CallInst *CI = Builder.CreateCall(WorkitemIdFn);
1193 ST.makeLIDRangeMetadata(CI);
1194 F->removeFnAttr(AttrName);
1195
1196 return CI;
1197}
1198
1199static bool isCallPromotable(CallInst *CI) {
1200 IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
1201 if (!II)
1202 return false;
1203
1204 switch (II->getIntrinsicID()) {
1205 case Intrinsic::memcpy:
1206 case Intrinsic::memmove:
1207 case Intrinsic::memset:
1208 case Intrinsic::lifetime_start:
1209 case Intrinsic::lifetime_end:
1210 case Intrinsic::invariant_start:
1211 case Intrinsic::invariant_end:
1212 case Intrinsic::launder_invariant_group:
1213 case Intrinsic::strip_invariant_group:
1214 case Intrinsic::objectsize:
1215 return true;
1216 default:
1217 return false;
1218 }
1219}
1220
1221bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
1222 Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0,
1223 int OpIdx1) const {
1224 // Figure out which operand is the one we might not be promoting.
1225 Value *OtherOp = Inst->getOperand(OpIdx0);
1226 if (Val == OtherOp)
1227 OtherOp = Inst->getOperand(OpIdx1);
1228
1229 if (isa<ConstantPointerNull>(OtherOp))
1230 return true;
1231
1232 // TODO: getUnderlyingObject will not work on a vector getelementptr
1233 Value *OtherObj = getUnderlyingObject(OtherOp);
1234 if (!isa<AllocaInst>(OtherObj))
1235 return false;
1236
1237 // TODO: We should be able to replace undefs with the right pointer type.
1238
1239 // TODO: If we know the other base object is another promotable
1240 // alloca, not necessarily this alloca, we can do this. The
1241 // important part is both must have the same address space at
1242 // the end.
1243 if (OtherObj != BaseAlloca) {
1244 LLVM_DEBUG(
1245 dbgs() << "Found a binary instruction with another alloca object\n");
1246 return false;
1247 }
1248
1249 return true;
1250}
1251
1252bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
1253 Value *BaseAlloca, Value *Val, std::vector<Value *> &WorkList) const {
1254
1255 for (User *User : Val->users()) {
1256 if (is_contained(WorkList, User))
1257 continue;
1258
1259 if (CallInst *CI = dyn_cast<CallInst>(User)) {
1260 if (!isCallPromotable(CI))
1261 return false;
1262
1263 WorkList.push_back(User);
1264 continue;
1265 }
1266
1267 Instruction *UseInst = cast<Instruction>(User);
1268 if (UseInst->getOpcode() == Instruction::PtrToInt)
1269 return false;
1270
1271 if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
1272 if (LI->isVolatile())
1273 return false;
1274 continue;
1275 }
1276
1277 if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
1278 if (SI->isVolatile())
1279 return false;
1280
1281 // Reject if the stored value is not the pointer operand.
1282 if (SI->getPointerOperand() != Val)
1283 return false;
1284 continue;
1285 }
1286
1287 if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
1288 if (RMW->isVolatile())
1289 return false;
1290 continue;
1291 }
1292
1293 if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
1294 if (CAS->isVolatile())
1295 return false;
1296 continue;
1297 }
1298
1299 // Only promote a select if we know that the other select operand
1300 // is from another pointer that will also be promoted.
1301 if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
1302 if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
1303 return false;
1304
1305 // May need to rewrite constant operands.
1306 WorkList.push_back(ICmp);
1307 continue;
1308 }
1309
1310 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
1311 // Be conservative if an address could be computed outside the bounds of
1312 // the alloca.
1313 if (!GEP->isInBounds())
1314 return false;
1315 } else if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
1316 // Only promote a select if we know that the other select operand is from
1317 // another pointer that will also be promoted.
1318 if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
1319 return false;
1320 } else if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
1321 // Repeat for phis.
1322
1323 // TODO: Handle more complex cases. We should be able to replace loops
1324 // over arrays.
1325 switch (Phi->getNumIncomingValues()) {
1326 case 1:
1327 break;
1328 case 2:
1329 if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
1330 return false;
1331 break;
1332 default:
1333 return false;
1334 }
1335 } else if (!isa<ExtractElementInst>(User)) {
1336 // Do not promote vector/aggregate type instructions. It is hard to track
1337 // their users.
1338
1339 // Do not promote addrspacecast.
1340 //
1341 // TODO: If we know the address is only observed through flat pointers, we
1342 // could still promote.
1343 return false;
1344 }
1345
1346 WorkList.push_back(User);
1347 if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
1348 return false;
1349 }
1350
1351 return true;
1352}
1353
1354bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
1355
1356 FunctionType *FTy = F.getFunctionType();
1357 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
1358
1359 // If the function has any arguments in the local address space, then it's
1360 // possible these arguments require the entire local memory space, so
1361 // we cannot use local memory in the pass.
1362 for (Type *ParamTy : FTy->params()) {
1363 PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
1364 if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
1365 LocalMemLimit = 0;
1366 LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
1367 "local memory disabled.\n");
1368 return false;
1369 }
1370 }
1371
1372 LocalMemLimit = ST.getAddressableLocalMemorySize();
1373 if (LocalMemLimit == 0)
1374 return false;
1375
1376 SmallVector<const Constant *, 16> Stack;
1377 SmallPtrSet<const Constant *, 8> VisitedConstants;
1378 SmallPtrSet<const GlobalVariable *, 8> UsedLDS;
1379
1380 auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
1381 for (const User *U : Val->users()) {
1382 if (const Instruction *Use = dyn_cast<Instruction>(U)) {
1383 if (Use->getParent()->getParent() == &F)
1384 return true;
1385 } else {
1386 const Constant *C = cast<Constant>(U);
1387 if (VisitedConstants.insert(C).second)
1388 Stack.push_back(C);
1389 }
1390 }
1391
1392 return false;
1393 };
1394
1395 for (GlobalVariable &GV : Mod->globals()) {
1396 if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
1397 continue;
1398
1399 if (visitUsers(&GV, &GV)) {
1400 UsedLDS.insert(&GV);
1401 Stack.clear();
1402 continue;
1403 }
1404
1405 // For any ConstantExpr uses, we need to recursively search the users until
1406 // we see a function.
1407 while (!Stack.empty()) {
1408 const Constant *C = Stack.pop_back_val();
1409 if (visitUsers(&GV, C)) {
1410 UsedLDS.insert(&GV);
1411 Stack.clear();
1412 break;
1413 }
1414 }
1415 }
1416
1417 const DataLayout &DL = Mod->getDataLayout();
1418 SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
1419 AllocatedSizes.reserve(UsedLDS.size());
1420
1421 for (const GlobalVariable *GV : UsedLDS) {
1422 Align Alignment =
1423 DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
1424 uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType());
1425
1426 // HIP uses an extern unsized array in local address space for dynamically
1427 // allocated shared memory. In that case, we have to disable the promotion.
1428 if (GV->hasExternalLinkage() && AllocSize == 0) {
1429 LocalMemLimit = 0;
1430 LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
1431 "local memory. Promoting to local memory "
1432 "disabled.\n");
1433 return false;
1434 }
1435
1436 AllocatedSizes.emplace_back(AllocSize, Alignment);
1437 }
1438
1439 // Sort to try to estimate the worst case alignment padding
1440 //
1441 // FIXME: We should really do something to fix the addresses to a more optimal
1442 // value instead
1443 llvm::sort(AllocatedSizes, llvm::less_second());
1444
1445 // Check how much local memory is being used by global objects
1446 CurrentLocalMemUsage = 0;
1447
1448 // FIXME: Try to account for padding here. The real padding and address is
1449 // currently determined from the inverse order of uses in the function when
1450 // legalizing, which could also potentially change. We try to estimate the
1451 // worst case here, but we probably should fix the addresses earlier.
1452 for (auto Alloc : AllocatedSizes) {
1453 CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
1454 CurrentLocalMemUsage += Alloc.first;
1455 }
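  // Illustrative estimate: two LDS globals of (size 4, align 4) and
  // (size 16, align 16), visited in that order, give
  // alignTo(0, 4) + 4 = 4 and then alignTo(4, 16) + 16 = 32 bytes in total.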
1456
1457 unsigned MaxOccupancy =
1458 ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), CurrentLocalMemUsage, F)
1459 .second;
1460
1461 // Round up to the next tier of usage.
1462 unsigned MaxSizeWithWaveCount =
1463 ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
1464
1465 // Program may already use more LDS than is usable at maximum occupancy.
1466 if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
1467 return false;
1468
1469 LocalMemLimit = MaxSizeWithWaveCount;
1470
1471 LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
1472 << " bytes of LDS\n"
1473 << " Rounding size to " << MaxSizeWithWaveCount
1474 << " with a maximum occupancy of " << MaxOccupancy << '\n'
1475 << " and " << (LocalMemLimit - CurrentLocalMemUsage)
1476 << " available for promotion\n");
1477
1478 return true;
1479}
1480
1481// FIXME: Should try to pick the most likely to be profitable allocas first.
1482bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
1483 bool SufficientLDS) {
1484 LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << I << '\n');
1485
1486 if (DisablePromoteAllocaToLDS) {
1487 LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n");
1488 return false;
1489 }
1490
1491 const DataLayout &DL = Mod->getDataLayout();
1492 IRBuilder<> Builder(&I);
1493
1494 const Function &ContainingFunction = *I.getParent()->getParent();
1495 CallingConv::ID CC = ContainingFunction.getCallingConv();
1496
1497 // Don't promote the alloca to LDS for shader calling conventions as the work
1498 // item ID intrinsics are not supported for these calling conventions.
1499 // Furthermore not all LDS is available for some of the stages.
1500 switch (CC) {
1501 case CallingConv::AMDGPU_KERNEL:
1502 case CallingConv::SPIR_KERNEL:
1503 break;
1504 default:
1505 LLVM_DEBUG(
1506 dbgs()
1507 << " promote alloca to LDS not supported with calling convention.\n");
1508 return false;
1509 }
1510
1511 // Not likely to have sufficient local memory for promotion.
1512 if (!SufficientLDS)
1513 return false;
1514
1515 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction);
1516 unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
1517
1518 Align Alignment =
1519 DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType());
1520
1521 // FIXME: This computed padding is likely wrong since it depends on inverse
1522 // usage order.
1523 //
1524 // FIXME: It is also possible that if we're allowed to use all of the memory
1525 // we could end up using more than the maximum due to alignment padding.
1526
1527 uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
1528 uint32_t AllocSize =
1529 WorkGroupSize * DL.getTypeAllocSize(I.getAllocatedType());
1530 NewSize += AllocSize;
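  // For example, a 16-byte alloca in a kernel with a maximum flat workgroup
  // size of 256 reserves 256 * 16 = 4096 bytes of LDS.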
1531
1532 if (NewSize > LocalMemLimit) {
1533 LLVM_DEBUG(dbgs() << " " << AllocSize
1534 << " bytes of local memory not available to promote\n");
1535 return false;
1536 }
1537
1538 CurrentLocalMemUsage = NewSize;
1539
1540 std::vector<Value *> WorkList;
1541
1542 if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
1543 LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
1544 return false;
1545 }
1546
1547 LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");
1548
1549 Function *F = I.getParent()->getParent();
1550
1551 Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
1552 GlobalVariable *GV = new GlobalVariable(
1553 *Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(GVTy),
1554 Twine(F->getName()) + Twine('.') + I.getName(), nullptr,
1555 GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
1556 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
1557 GV->setAlignment(I.getAlign());
1558
1559 Value *TCntY, *TCntZ;
1560
1561 std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
1562 Value *TIdX = getWorkitemID(Builder, 0);
1563 Value *TIdY = getWorkitemID(Builder, 1);
1564 Value *TIdZ = getWorkitemID(Builder, 2);
1565
1566 Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
1567 Tmp0 = Builder.CreateMul(Tmp0, TIdX);
1568 Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
1569 Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
1570 TID = Builder.CreateAdd(TID, TIdZ);
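  // TID is the linearized workitem index within the workgroup:
  //   TID = TIdX * (TCntY * TCntZ) + TIdY * TCntZ + TIdZ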
1571
1572 LLVMContext &Context = Mod->getContext();
1573 Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(Context)), TID};
1574
1575 Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
1576 I.mutateType(Offset->getType());
1577 I.replaceAllUsesWith(Offset);
1578 I.eraseFromParent();
1579
1580 SmallVector<IntrinsicInst *> DeferredIntrs;
1581
1582 PointerType *NewPtrTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);
1583
1584 for (Value *V : WorkList) {
1585 CallInst *Call = dyn_cast<CallInst>(V);
1586 if (!Call) {
1587 if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
1588 Value *LHS = CI->getOperand(0);
1589 Value *RHS = CI->getOperand(1);
1590
1591 Type *NewTy = LHS->getType()->getWithNewType(NewPtrTy);
1592 if (isa<ConstantPointerNull>(LHS))
1593 CI->setOperand(0, Constant::getNullValue(NewTy));
1594
1595 if (isa<ConstantPointerNull>(RHS))
1596 CI->setOperand(1, Constant::getNullValue(NewTy));
1597
1598 continue;
1599 }
1600
1601 // The operand's value should be corrected on its own and we don't want to
1602 // touch the users.
1604 continue;
1605
1606 assert(V->getType()->isPtrOrPtrVectorTy());
1607
1608 Type *NewTy = V->getType()->getWithNewType(NewPtrTy);
1609 V->mutateType(NewTy);
1610
1611 // Adjust the types of any constant operands.
1612 if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
1613 if (isa<ConstantPointerNull>(SI->getOperand(1)))
1614 SI->setOperand(1, Constant::getNullValue(NewTy));
1615
1616 if (isa<ConstantPointerNull>(SI->getOperand(2)))
1617 SI->setOperand(2, Constant::getNullValue(NewTy));
1618 } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
1619 for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
1620 if (isa<ConstantPointerNull>(
1621 Phi->getIncomingValue(I)))
1622 Phi->setIncomingValue(I, Constant::getNullValue(NewTy));
1623 }
1624 }
1625
1626 continue;
1627 }
1628
1629 IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
1630 Builder.SetInsertPoint(Intr);
1631 switch (Intr->getIntrinsicID()) {
1632 case Intrinsic::lifetime_start:
1633 case Intrinsic::lifetime_end:
1634 // These intrinsics are for address space 0 only
1635 Intr->eraseFromParent();
1636 continue;
1637 case Intrinsic::memcpy:
1638 case Intrinsic::memmove:
1639 // These have 2 pointer operands. In case if second pointer also needs
1640 // to be replaced we defer processing of these intrinsics until all
1641 // other values are processed.
1642 DeferredIntrs.push_back(Intr);
1643 continue;
1644 case Intrinsic::memset: {
1645 MemSetInst *MemSet = cast<MemSetInst>(Intr);
1646 Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
1647 MemSet->getLength(), MemSet->getDestAlign(),
1648 MemSet->isVolatile());
1649 Intr->eraseFromParent();
1650 continue;
1651 }
1652 case Intrinsic::invariant_start:
1653 case Intrinsic::invariant_end:
1654 case Intrinsic::launder_invariant_group:
1655 case Intrinsic::strip_invariant_group: {
1656 SmallVector<Value *> Args;
1657 if (Intr->getIntrinsicID() == Intrinsic::invariant_start) {
1658 Args.emplace_back(Intr->getArgOperand(0));
1659 } else if (Intr->getIntrinsicID() == Intrinsic::invariant_end) {
1660 Args.emplace_back(Intr->getArgOperand(0));
1661 Args.emplace_back(Intr->getArgOperand(1));
1662 }
1663 Args.emplace_back(Offset);
1664 Function *F = Intrinsic::getOrInsertDeclaration(
1665 Intr->getModule(), Intr->getIntrinsicID(), Offset->getType());
1666 CallInst *NewIntr =
1667 CallInst::Create(F, Args, Intr->getName(), Intr->getIterator());
1668 Intr->mutateType(NewIntr->getType());
1669 Intr->replaceAllUsesWith(NewIntr);
1670 Intr->eraseFromParent();
1671 continue;
1672 }
1673 case Intrinsic::objectsize: {
1674 Value *Src = Intr->getOperand(0);
1675
1676 CallInst *NewCall = Builder.CreateIntrinsic(
1677 Intrinsic::objectsize,
1678 {Intr->getType(), Src->getType()},
1679 {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
1680 Intr->replaceAllUsesWith(NewCall);
1681 Intr->eraseFromParent();
1682 continue;
1683 }
1684 default:
1685 Intr->print(errs());
1686 llvm_unreachable("Don't know how to promote alloca intrinsic use.");
1687 }
1688 }
1689
1690 for (IntrinsicInst *Intr : DeferredIntrs) {
1691 Builder.SetInsertPoint(Intr);
1692 Intrinsic::ID ID = Intr->getIntrinsicID();
1693 assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);
1694
1695 MemTransferInst *MI = cast<MemTransferInst>(Intr);
1696 auto *B = Builder.CreateMemTransferInst(
1697 ID, MI->getRawDest(), MI->getDestAlign(), MI->getRawSource(),
1698 MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
1699
1700 for (unsigned I = 0; I != 2; ++I) {
1701 if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I)) {
1702 B->addDereferenceableParamAttr(I, Bytes);
1703 }
1704 }
1705
1706 Intr->eraseFromParent();
1707 }
1708
1709 return true;
1710}
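Editor's note: the deferred-intrinsic loop above re-emits memcpy/memmove calls once both pointer operands carry their final LDS pointer types. Below is a minimal, self-contained sketch of that pattern; it is not part of this file, the helper name reemitMemTransfer is hypothetical, and only IRBuilder/CallBase calls that appear in the listing above are used.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"

using namespace llvm;

// Re-emit a memcpy/memmove whose pointer operands were already retyped,
// then copy the dereferenceable byte counts of both pointer arguments,
// since CreateMemTransferInst does not carry parameter attributes over.
static void reemitMemTransfer(MemTransferInst *MI) {
  IRBuilder<> Builder(MI);
  Intrinsic::ID ID = MI->getIntrinsicID();
  CallInst *NewMI = Builder.CreateMemTransferInst(
      ID, MI->getRawDest(), MI->getDestAlign(), MI->getRawSource(),
      MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
  for (unsigned I = 0; I != 2; ++I)
    if (uint64_t Bytes = MI->getParamDereferenceableBytes(I))
      NewMI->addDereferenceableParamAttr(I, Bytes);
  MI->eraseFromParent();
}

Recreating the call, rather than mutating the old one in place, keeps the intrinsic's mangled signature consistent with the new address spaces, which is why the attribute copy has to be done explicitly.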