LLVM 20.0.0git
AMDGPUAttributor.cpp
Go to the documentation of this file.
1//===- AMDGPUAttributor.cpp -----------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file This pass uses Attributor framework to deduce AMDGPU attributes.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AMDGPU.h"
14#include "GCNSubtarget.h"
18#include "llvm/IR/IntrinsicsAMDGPU.h"
19#include "llvm/IR/IntrinsicsR600.h"
22
23#define DEBUG_TYPE "amdgpu-attributor"
24
25namespace llvm {
27} // namespace llvm
28
29using namespace llvm;
30
32 "amdgpu-kernarg-preload-count",
33 cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0));
34
36 "amdgpu-indirect-call-specialization-threshold",
38 "A threshold controls whether an indirect call will be specialized"),
39 cl::init(3));
40
41#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,
42
44#include "AMDGPUAttributes.def"
46};
47
48#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,
49
52#include "AMDGPUAttributes.def"
54};
55
56#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
57static constexpr std::pair<ImplicitArgumentMask, StringLiteral>
59#include "AMDGPUAttributes.def"
60};
61
62// We do not need to note the x workitem or workgroup id because they are always
63// initialized.
64//
65// TODO: We should not add the attributes if the known compile time workgroup
66// size is 1 for y/z.
68intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
69 bool HasApertureRegs, bool SupportsGetDoorBellID,
70 unsigned CodeObjectVersion) {
71 switch (ID) {
72 case Intrinsic::amdgcn_workitem_id_x:
73 NonKernelOnly = true;
74 return WORKITEM_ID_X;
75 case Intrinsic::amdgcn_workgroup_id_x:
76 NonKernelOnly = true;
77 return WORKGROUP_ID_X;
78 case Intrinsic::amdgcn_workitem_id_y:
79 case Intrinsic::r600_read_tidig_y:
80 return WORKITEM_ID_Y;
81 case Intrinsic::amdgcn_workitem_id_z:
82 case Intrinsic::r600_read_tidig_z:
83 return WORKITEM_ID_Z;
84 case Intrinsic::amdgcn_workgroup_id_y:
85 case Intrinsic::r600_read_tgid_y:
86 return WORKGROUP_ID_Y;
87 case Intrinsic::amdgcn_workgroup_id_z:
88 case Intrinsic::r600_read_tgid_z:
89 return WORKGROUP_ID_Z;
90 case Intrinsic::amdgcn_lds_kernel_id:
91 return LDS_KERNEL_ID;
92 case Intrinsic::amdgcn_dispatch_ptr:
93 return DISPATCH_PTR;
94 case Intrinsic::amdgcn_dispatch_id:
95 return DISPATCH_ID;
96 case Intrinsic::amdgcn_implicitarg_ptr:
97 return IMPLICIT_ARG_PTR;
98 // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to access
99 // queue_ptr.
100 case Intrinsic::amdgcn_queue_ptr:
101 NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
102 return QUEUE_PTR;
103 case Intrinsic::amdgcn_is_shared:
104 case Intrinsic::amdgcn_is_private:
105 if (HasApertureRegs)
106 return NOT_IMPLICIT_INPUT;
107 // Under V5, we need implicitarg_ptr + offsets to access private_base or
108 // shared_base. For pre-V5, however, need to access them through queue_ptr +
109 // offsets.
110 return CodeObjectVersion >= AMDGPU::AMDHSA_COV5 ? IMPLICIT_ARG_PTR
111 : QUEUE_PTR;
112 case Intrinsic::trap:
113 if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4.
114 return CodeObjectVersion >= AMDGPU::AMDHSA_COV4 ? NOT_IMPLICIT_INPUT
115 : QUEUE_PTR;
116 NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
117 return QUEUE_PTR;
118 default:
119 return NOT_IMPLICIT_INPUT;
120 }
121}
122
123static bool castRequiresQueuePtr(unsigned SrcAS) {
124 return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
125}
126
127static bool isDSAddress(const Constant *C) {
128 const GlobalValue *GV = dyn_cast<GlobalValue>(C);
129 if (!GV)
130 return false;
131 unsigned AS = GV->getAddressSpace();
133}
134
135/// Returns true if the function requires the implicit argument be passed
136/// regardless of the function contents.
137static bool funcRequiresHostcallPtr(const Function &F) {
138 // Sanitizers require the hostcall buffer passed in the implicit arguments.
139 return F.hasFnAttribute(Attribute::SanitizeAddress) ||
140 F.hasFnAttribute(Attribute::SanitizeThread) ||
141 F.hasFnAttribute(Attribute::SanitizeMemory) ||
142 F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
143 F.hasFnAttribute(Attribute::SanitizeMemTag);
144}
145
146namespace {
147class AMDGPUInformationCache : public InformationCache {
148public:
149 AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
150 BumpPtrAllocator &Allocator,
153 CodeObjectVersion(AMDGPU::getAMDHSACodeObjectVersion(M)) {}
154
156
/// Bit flags describing what was found while walking a constant.
enum ConstantStatus : uint8_t {
  NONE = 0x0,
  /// The constant is (or contains) a data-share global.
  DS_GLOBAL = 0x1,
  /// An addrspacecast constant expression whose source is private.
  ADDR_SPACE_CAST_PRIVATE_TO_FLAT = 0x2,
  /// An addrspacecast constant expression whose source is local.
  ADDR_SPACE_CAST_LOCAL_TO_FLAT = 0x4,
  /// Union of the two addrspacecast flags above.
  ADDR_SPACE_CAST_BOTH_TO_FLAT =
      ADDR_SPACE_CAST_PRIVATE_TO_FLAT | ADDR_SPACE_CAST_LOCAL_TO_FLAT
};
165
166 /// Check if the subtarget has aperture regs.
167 bool hasApertureRegs(Function &F) {
168 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
169 return ST.hasApertureRegs();
170 }
171
172 /// Check if the subtarget supports GetDoorbellID.
173 bool supportsGetDoorbellID(Function &F) {
174 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
175 return ST.supportsGetDoorbellID();
176 }
177
178 std::optional<std::pair<unsigned, unsigned>>
179 getFlatWorkGroupSizeAttr(const Function &F) const {
180 auto R = AMDGPU::getIntegerPairAttribute(F, "amdgpu-flat-work-group-size");
181 if (!R)
182 return std::nullopt;
183 return std::make_pair(R->first, *(R->second));
184 }
185
186 std::pair<unsigned, unsigned>
187 getDefaultFlatWorkGroupSize(const Function &F) const {
188 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
189 return ST.getDefaultFlatWorkGroupSize(F.getCallingConv());
190 }
191
192 std::pair<unsigned, unsigned>
193 getMaximumFlatWorkGroupRange(const Function &F) {
194 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
195 return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
196 }
197
198 SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) {
199 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
200 return ST.getMaxNumWorkGroups(F);
201 }
202
203 /// Get code object version.
204 unsigned getCodeObjectVersion() const { return CodeObjectVersion; }
205
206 /// Get the effective value of "amdgpu-waves-per-eu" for the function,
207 /// accounting for the interaction with the passed value to use for
208 /// "amdgpu-flat-work-group-size".
209 std::pair<unsigned, unsigned>
210 getWavesPerEU(const Function &F,
211 std::pair<unsigned, unsigned> FlatWorkGroupSize) {
212 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
213 return ST.getWavesPerEU(F, FlatWorkGroupSize);
214 }
215
216 std::optional<std::pair<unsigned, unsigned>>
217 getWavesPerEUAttr(const Function &F) {
218 auto Val = AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu",
219 /*OnlyFirstRequired=*/true);
220 if (!Val)
221 return std::nullopt;
222 if (!Val->second) {
223 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
224 Val->second = ST.getMaxWavesPerEU();
225 }
226 return std::make_pair(Val->first, *(Val->second));
227 }
228
229 std::pair<unsigned, unsigned>
230 getEffectiveWavesPerEU(const Function &F,
231 std::pair<unsigned, unsigned> WavesPerEU,
232 std::pair<unsigned, unsigned> FlatWorkGroupSize) {
233 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
234 return ST.getEffectiveWavesPerEU(WavesPerEU, FlatWorkGroupSize);
235 }
236
237 unsigned getMaxWavesPerEU(const Function &F) {
238 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
239 return ST.getMaxWavesPerEU();
240 }
241
242private:
243 /// Check if the ConstantExpr \p CE uses an addrspacecast from private or
244 /// local to flat. These casts may require the queue pointer.
245 static uint8_t visitConstExpr(const ConstantExpr *CE) {
247
248 if (CE->getOpcode() == Instruction::AddrSpaceCast) {
249 unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
250 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS)
251 Status |= ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
252 else if (SrcAS == AMDGPUAS::LOCAL_ADDRESS)
253 Status |= ADDR_SPACE_CAST_LOCAL_TO_FLAT;
254 }
255
256 return Status;
257 }
258
259 /// Get the constant access bitmap for \p C.
260 uint8_t getConstantAccess(const Constant *C,
262 auto It = ConstantStatus.find(C);
263 if (It != ConstantStatus.end())
264 return It->second;
265
266 uint8_t Result = 0;
267 if (isDSAddress(C))
268 Result = DS_GLOBAL;
269
270 if (const auto *CE = dyn_cast<ConstantExpr>(C))
271 Result |= visitConstExpr(CE);
272
273 for (const Use &U : C->operands()) {
274 const auto *OpC = dyn_cast<Constant>(U);
275 if (!OpC || !Visited.insert(OpC).second)
276 continue;
277
278 Result |= getConstantAccess(OpC, Visited);
279 }
280 return Result;
281 }
282
283public:
284 /// Returns true if \p Fn needs the queue pointer because of \p C.
285 bool needsQueuePtr(const Constant *C, Function &Fn) {
286 bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
287 bool HasAperture = hasApertureRegs(Fn);
288
289 // No need to explore the constants.
290 if (!IsNonEntryFunc && HasAperture)
291 return false;
292
294 uint8_t Access = getConstantAccess(C, Visited);
295
296 // We need to trap on DS globals in non-entry functions.
297 if (IsNonEntryFunc && (Access & DS_GLOBAL))
298 return true;
299
300 return !HasAperture && (Access & ADDR_SPACE_CAST_BOTH_TO_FLAT);
301 }
302
303 bool checkConstForAddrSpaceCastFromPrivate(const Constant *C) {
305 uint8_t Access = getConstantAccess(C, Visited);
306 return Access & ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
307 }
308
309private:
310 /// Used to determine if the Constant needs the queue pointer.
312 const unsigned CodeObjectVersion;
313};
314
315struct AAAMDAttributes
316 : public StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
317 AbstractAttribute> {
320
321 AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
322
323 /// Create an abstract attribute view for the position \p IRP.
324 static AAAMDAttributes &createForPosition(const IRPosition &IRP,
325 Attributor &A);
326
327 /// See AbstractAttribute::getName().
328 const std::string getName() const override { return "AAAMDAttributes"; }
329
330 /// See AbstractAttribute::getIdAddr().
331 const char *getIdAddr() const override { return &ID; }
332
333 /// This function should return true if the type of the \p AA is
334 /// AAAMDAttributes.
335 static bool classof(const AbstractAttribute *AA) {
336 return (AA->getIdAddr() == &ID);
337 }
338
339 /// Unique ID (due to the unique address)
340 static const char ID;
341};
342const char AAAMDAttributes::ID = 0;
343
344struct AAUniformWorkGroupSize
345 : public StateWrapper<BooleanState, AbstractAttribute> {
347 AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
348
349 /// Create an abstract attribute view for the position \p IRP.
350 static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
351 Attributor &A);
352
353 /// See AbstractAttribute::getName().
354 const std::string getName() const override {
355 return "AAUniformWorkGroupSize";
356 }
357
358 /// See AbstractAttribute::getIdAddr().
359 const char *getIdAddr() const override { return &ID; }
360
361 /// This function should return true if the type of the \p AA is
362 /// AAUniformWorkGroupSize.
363 static bool classof(const AbstractAttribute *AA) {
364 return (AA->getIdAddr() == &ID);
365 }
366
367 /// Unique ID (due to the unique address)
368 static const char ID;
369};
370const char AAUniformWorkGroupSize::ID = 0;
371
372struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
373 AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
374 : AAUniformWorkGroupSize(IRP, A) {}
375
376 void initialize(Attributor &A) override {
377 Function *F = getAssociatedFunction();
378 CallingConv::ID CC = F->getCallingConv();
379
381 return;
382
383 bool InitialValue = false;
384 if (F->hasFnAttribute("uniform-work-group-size"))
385 InitialValue =
386 F->getFnAttribute("uniform-work-group-size").getValueAsString() ==
387 "true";
388
389 if (InitialValue)
390 indicateOptimisticFixpoint();
391 else
392 indicatePessimisticFixpoint();
393 }
394
395 ChangeStatus updateImpl(Attributor &A) override {
396 ChangeStatus Change = ChangeStatus::UNCHANGED;
397
398 auto CheckCallSite = [&](AbstractCallSite CS) {
399 Function *Caller = CS.getInstruction()->getFunction();
400 LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
401 << "->" << getAssociatedFunction()->getName() << "\n");
402
403 const auto *CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
404 *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
405 if (!CallerInfo || !CallerInfo->isValidState())
406 return false;
407
408 Change = Change | clampStateAndIndicateChange(this->getState(),
409 CallerInfo->getState());
410
411 return true;
412 };
413
414 bool AllCallSitesKnown = true;
415 if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
416 return indicatePessimisticFixpoint();
417
418 return Change;
419 }
420
421 ChangeStatus manifest(Attributor &A) override {
423 LLVMContext &Ctx = getAssociatedFunction()->getContext();
424
425 AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size",
426 getAssumed() ? "true" : "false"));
427 return A.manifestAttrs(getIRPosition(), AttrList,
428 /* ForceReplace */ true);
429 }
430
431 bool isValidState() const override {
432 // This state is always valid, even when the state is false.
433 return true;
434 }
435
436 const std::string getAsStr(Attributor *) const override {
437 return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
438 }
439
440 /// See AbstractAttribute::trackStatistics()
441 void trackStatistics() const override {}
442};
443
444AAUniformWorkGroupSize &
445AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
446 Attributor &A) {
448 return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
450 "AAUniformWorkGroupSize is only valid for function position");
451}
452
453struct AAAMDAttributesFunction : public AAAMDAttributes {
454 AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
455 : AAAMDAttributes(IRP, A) {}
456
457 void initialize(Attributor &A) override {
458 Function *F = getAssociatedFunction();
459
460 // If the function requires the implicit arg pointer due to sanitizers,
461 // assume it's needed even if explicitly marked as not requiring it.
462 const bool NeedsHostcall = funcRequiresHostcallPtr(*F);
463 if (NeedsHostcall) {
464 removeAssumedBits(IMPLICIT_ARG_PTR);
465 removeAssumedBits(HOSTCALL_PTR);
466 }
467
468 for (auto Attr : ImplicitAttrs) {
469 if (NeedsHostcall &&
470 (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR))
471 continue;
472
473 if (F->hasFnAttribute(Attr.second))
474 addKnownBits(Attr.first);
475 }
476
477 if (F->isDeclaration())
478 return;
479
480 // Ignore functions with graphics calling conventions, these are currently
481 // not allowed to have kernel arguments.
482 if (AMDGPU::isGraphics(F->getCallingConv())) {
483 indicatePessimisticFixpoint();
484 return;
485 }
486 }
487
488 ChangeStatus updateImpl(Attributor &A) override {
489 Function *F = getAssociatedFunction();
490 // The current assumed state used to determine a change.
491 auto OrigAssumed = getAssumed();
492
493 // Check for Intrinsics and propagate attributes.
494 const AACallEdges *AAEdges = A.getAAFor<AACallEdges>(
495 *this, this->getIRPosition(), DepClassTy::REQUIRED);
496 if (!AAEdges || !AAEdges->isValidState() ||
497 AAEdges->hasNonAsmUnknownCallee())
498 return indicatePessimisticFixpoint();
499
500 bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());
501
502 bool NeedsImplicit = false;
503 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
504 bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
505 bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F);
506 unsigned COV = InfoCache.getCodeObjectVersion();
507
508 for (Function *Callee : AAEdges->getOptimisticEdges()) {
509 Intrinsic::ID IID = Callee->getIntrinsicID();
510 if (IID == Intrinsic::not_intrinsic) {
511 const AAAMDAttributes *AAAMD = A.getAAFor<AAAMDAttributes>(
512 *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
513 if (!AAAMD || !AAAMD->isValidState())
514 return indicatePessimisticFixpoint();
515 *this &= *AAAMD;
516 continue;
517 }
518
519 bool NonKernelOnly = false;
520 ImplicitArgumentMask AttrMask =
521 intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
522 HasApertureRegs, SupportsGetDoorbellID, COV);
523 if (AttrMask != NOT_IMPLICIT_INPUT) {
524 if ((IsNonEntryFunc || !NonKernelOnly))
525 removeAssumedBits(AttrMask);
526 }
527 }
528
529 // Need implicitarg_ptr to acess queue_ptr, private_base, and shared_base.
530 if (NeedsImplicit)
531 removeAssumedBits(IMPLICIT_ARG_PTR);
532
533 if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) {
534 // Under V5, we need implicitarg_ptr + offsets to access private_base or
535 // shared_base. We do not actually need queue_ptr.
536 if (COV >= 5)
537 removeAssumedBits(IMPLICIT_ARG_PTR);
538 else
539 removeAssumedBits(QUEUE_PTR);
540 }
541
542 if (funcRetrievesMultigridSyncArg(A, COV)) {
543 assert(!isAssumed(IMPLICIT_ARG_PTR) &&
544 "multigrid_sync_arg needs implicitarg_ptr");
545 removeAssumedBits(MULTIGRID_SYNC_ARG);
546 }
547
548 if (funcRetrievesHostcallPtr(A, COV)) {
549 assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
550 removeAssumedBits(HOSTCALL_PTR);
551 }
552
553 if (funcRetrievesHeapPtr(A, COV)) {
554 assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
555 removeAssumedBits(HEAP_PTR);
556 }
557
558 if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A, COV)) {
559 assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
560 removeAssumedBits(QUEUE_PTR);
561 }
562
563 if (isAssumed(LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) {
564 removeAssumedBits(LDS_KERNEL_ID);
565 }
566
567 if (isAssumed(DEFAULT_QUEUE) && funcRetrievesDefaultQueue(A, COV))
568 removeAssumedBits(DEFAULT_QUEUE);
569
570 if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(A, COV))
571 removeAssumedBits(COMPLETION_ACTION);
572
573 if (isAssumed(FLAT_SCRATCH_INIT) && needFlatScratchInit(A))
574 removeAssumedBits(FLAT_SCRATCH_INIT);
575
576 return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
577 : ChangeStatus::UNCHANGED;
578 }
579
580 ChangeStatus manifest(Attributor &A) override {
582 LLVMContext &Ctx = getAssociatedFunction()->getContext();
583
584 for (auto Attr : ImplicitAttrs) {
585 if (isKnown(Attr.first))
586 AttrList.push_back(Attribute::get(Ctx, Attr.second));
587 }
588
589 return A.manifestAttrs(getIRPosition(), AttrList,
590 /* ForceReplace */ true);
591 }
592
593 const std::string getAsStr(Attributor *) const override {
594 std::string Str;
596 OS << "AMDInfo[";
597 for (auto Attr : ImplicitAttrs)
598 if (isAssumed(Attr.first))
599 OS << ' ' << Attr.second;
600 OS << " ]";
601 return OS.str();
602 }
603
604 /// See AbstractAttribute::trackStatistics()
605 void trackStatistics() const override {}
606
607private:
608 bool checkForQueuePtr(Attributor &A) {
609 Function *F = getAssociatedFunction();
610 bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());
611
612 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
613
614 bool NeedsQueuePtr = false;
615
616 auto CheckAddrSpaceCasts = [&](Instruction &I) {
617 unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
618 if (castRequiresQueuePtr(SrcAS)) {
619 NeedsQueuePtr = true;
620 return false;
621 }
622 return true;
623 };
624
625 bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
626
627 // `checkForAllInstructions` is much more cheaper than going through all
628 // instructions, try it first.
629
630 // The queue pointer is not needed if aperture regs is present.
631 if (!HasApertureRegs) {
632 bool UsedAssumedInformation = false;
633 A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
634 {Instruction::AddrSpaceCast},
635 UsedAssumedInformation);
636 }
637
638 // If we found that we need the queue pointer, nothing else to do.
639 if (NeedsQueuePtr)
640 return true;
641
642 if (!IsNonEntryFunc && HasApertureRegs)
643 return false;
644
645 for (BasicBlock &BB : *F) {
646 for (Instruction &I : BB) {
647 for (const Use &U : I.operands()) {
648 if (const auto *C = dyn_cast<Constant>(U)) {
649 if (InfoCache.needsQueuePtr(C, *F))
650 return true;
651 }
652 }
653 }
654 }
655
656 return false;
657 }
658
659 bool funcRetrievesMultigridSyncArg(Attributor &A, unsigned COV) {
661 AA::RangeTy Range(Pos, 8);
662 return funcRetrievesImplicitKernelArg(A, Range);
663 }
664
665 bool funcRetrievesHostcallPtr(Attributor &A, unsigned COV) {
667 AA::RangeTy Range(Pos, 8);
668 return funcRetrievesImplicitKernelArg(A, Range);
669 }
670
671 bool funcRetrievesDefaultQueue(Attributor &A, unsigned COV) {
673 AA::RangeTy Range(Pos, 8);
674 return funcRetrievesImplicitKernelArg(A, Range);
675 }
676
677 bool funcRetrievesCompletionAction(Attributor &A, unsigned COV) {
679 AA::RangeTy Range(Pos, 8);
680 return funcRetrievesImplicitKernelArg(A, Range);
681 }
682
683 bool funcRetrievesHeapPtr(Attributor &A, unsigned COV) {
684 if (COV < 5)
685 return false;
687 return funcRetrievesImplicitKernelArg(A, Range);
688 }
689
690 bool funcRetrievesQueuePtr(Attributor &A, unsigned COV) {
691 if (COV < 5)
692 return false;
694 return funcRetrievesImplicitKernelArg(A, Range);
695 }
696
697 bool funcRetrievesImplicitKernelArg(Attributor &A, AA::RangeTy Range) {
698 // Check if this is a call to the implicitarg_ptr builtin and it
699 // is used to retrieve the hostcall pointer. The implicit arg for
700 // hostcall is not used only if every use of the implicitarg_ptr
701 // is a load that clearly does not retrieve any byte of the
702 // hostcall pointer. We check this by tracing all the uses of the
703 // initial call to the implicitarg_ptr intrinsic.
704 auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
705 auto &Call = cast<CallBase>(I);
706 if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
707 return true;
708
709 const auto *PointerInfoAA = A.getAAFor<AAPointerInfo>(
710 *this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);
711 if (!PointerInfoAA || !PointerInfoAA->getState().isValidState())
712 return false;
713
714 return PointerInfoAA->forallInterferingAccesses(
715 Range, [](const AAPointerInfo::Access &Acc, bool IsExact) {
716 return Acc.getRemoteInst()->isDroppable();
717 });
718 };
719
720 bool UsedAssumedInformation = false;
721 return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this,
722 UsedAssumedInformation);
723 }
724
725 bool funcRetrievesLDSKernelId(Attributor &A) {
726 auto DoesNotRetrieve = [&](Instruction &I) {
727 auto &Call = cast<CallBase>(I);
728 return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id;
729 };
730 bool UsedAssumedInformation = false;
731 return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this,
732 UsedAssumedInformation);
733 }
734
735 // Returns true if FlatScratchInit is needed, i.e., no-flat-scratch-init is
736 // not to be set.
737 bool needFlatScratchInit(Attributor &A) {
738 assert(isAssumed(FLAT_SCRATCH_INIT)); // only called if the bit is still set
739
740 // Check all AddrSpaceCast instructions. FlatScratchInit is needed if
741 // there is a cast from PRIVATE_ADDRESS.
742 auto AddrSpaceCastNotFromPrivate = [](Instruction &I) {
743 return cast<AddrSpaceCastInst>(I).getSrcAddressSpace() !=
745 };
746
747 bool UsedAssumedInformation = false;
748 if (!A.checkForAllInstructions(AddrSpaceCastNotFromPrivate, *this,
749 {Instruction::AddrSpaceCast},
750 UsedAssumedInformation))
751 return true;
752
753 // Check for addrSpaceCast from PRIVATE_ADDRESS in constant expressions
754 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
755
756 Function *F = getAssociatedFunction();
757 for (Instruction &I : instructions(F)) {
758 for (const Use &U : I.operands()) {
759 if (const auto *C = dyn_cast<Constant>(U)) {
760 if (InfoCache.checkConstForAddrSpaceCastFromPrivate(C))
761 return true;
762 }
763 }
764 }
765
766 // Finally check callees.
767
768 // This is called on each callee; false means callee shouldn't have
769 // no-flat-scratch-init.
770 auto CheckForNoFlatScratchInit = [&](Instruction &I) {
771 const auto &CB = cast<CallBase>(I);
772 const Function *Callee = CB.getCalledFunction();
773
774 // Callee == 0 for inline asm or indirect call with known callees.
775 // In the latter case, updateImpl() already checked the callees and we
776 // know their FLAT_SCRATCH_INIT bit is set.
777 // If function has indirect call with unknown callees, the bit is
778 // already removed in updateImpl() and execution won't reach here.
779 if (!Callee)
780 return true;
781
782 return Callee->getIntrinsicID() !=
783 Intrinsic::amdgcn_addrspacecast_nonnull;
784 };
785
786 UsedAssumedInformation = false;
787 // If any callee is false (i.e. need FlatScratchInit),
788 // checkForAllCallLikeInstructions returns false, in which case this
789 // function returns true.
790 return !A.checkForAllCallLikeInstructions(CheckForNoFlatScratchInit, *this,
791 UsedAssumedInformation);
792 }
793};
794
795AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
796 Attributor &A) {
798 return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
799 llvm_unreachable("AAAMDAttributes is only valid for function position");
800}
801
802/// Base class to derive different size ranges.
803struct AAAMDSizeRangeAttribute
804 : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
806
807 StringRef AttrName;
808
809 AAAMDSizeRangeAttribute(const IRPosition &IRP, Attributor &A,
810 StringRef AttrName)
811 : Base(IRP, 32), AttrName(AttrName) {}
812
813 /// See AbstractAttribute::trackStatistics()
814 void trackStatistics() const override {}
815
816 template <class AttributeImpl> ChangeStatus updateImplImpl(Attributor &A) {
818
819 auto CheckCallSite = [&](AbstractCallSite CS) {
820 Function *Caller = CS.getInstruction()->getFunction();
821 LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
822 << "->" << getAssociatedFunction()->getName() << '\n');
823
824 const auto *CallerInfo = A.getAAFor<AttributeImpl>(
826 if (!CallerInfo || !CallerInfo->isValidState())
827 return false;
828
829 Change |=
830 clampStateAndIndicateChange(this->getState(), CallerInfo->getState());
831
832 return true;
833 };
834
835 bool AllCallSitesKnown = true;
836 if (!A.checkForAllCallSites(CheckCallSite, *this,
837 /*RequireAllCallSites=*/true,
838 AllCallSitesKnown))
839 return indicatePessimisticFixpoint();
840
841 return Change;
842 }
843
844 /// Clamp the assumed range to the default value ([Min, Max]) and emit the
845 /// attribute if it is not same as default.
847 emitAttributeIfNotDefaultAfterClamp(Attributor &A,
848 std::pair<unsigned, unsigned> Default) {
849 auto [Min, Max] = Default;
850 unsigned Lower = getAssumed().getLower().getZExtValue();
851 unsigned Upper = getAssumed().getUpper().getZExtValue();
852
853 // Clamp the range to the default value.
854 if (Lower < Min)
855 Lower = Min;
856 if (Upper > Max + 1)
857 Upper = Max + 1;
858
859 // No manifest if the value is invalid or same as default after clamp.
860 if ((Lower == Min && Upper == Max + 1) || (Upper < Lower))
862
863 Function *F = getAssociatedFunction();
864 LLVMContext &Ctx = F->getContext();
865 SmallString<10> Buffer;
866 raw_svector_ostream OS(Buffer);
867 OS << Lower << ',' << Upper - 1;
868 return A.manifestAttrs(getIRPosition(),
869 {Attribute::get(Ctx, AttrName, OS.str())},
870 /*ForceReplace=*/true);
871 }
872
873 const std::string getAsStr(Attributor *) const override {
874 std::string Str;
876 OS << getName() << '[';
877 OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
878 OS << ']';
879 return OS.str();
880 }
881};
882
883/// Propagate amdgpu-flat-work-group-size attribute.
884struct AAAMDFlatWorkGroupSize : public AAAMDSizeRangeAttribute {
885 AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
886 : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-flat-work-group-size") {}
887
888 void initialize(Attributor &A) override {
889 Function *F = getAssociatedFunction();
890 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
891
892 bool HasAttr = false;
893 auto Range = InfoCache.getDefaultFlatWorkGroupSize(*F);
894 auto MaxRange = InfoCache.getMaximumFlatWorkGroupRange(*F);
895
896 if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F)) {
897 // We only consider an attribute that is not max range because the front
898 // end always emits the attribute, unfortunately, and sometimes it emits
899 // the max range.
900 if (*Attr != MaxRange) {
901 Range = *Attr;
902 HasAttr = true;
903 }
904 }
905
906 // We don't want to directly clamp the state if it's the max range because
907 // that is basically the worst state.
908 if (Range == MaxRange)
909 return;
910
911 auto [Min, Max] = Range;
912 ConstantRange CR(APInt(32, Min), APInt(32, Max + 1));
913 IntegerRangeState IRS(CR);
914 clampStateAndIndicateChange(this->getState(), IRS);
915
916 if (HasAttr || AMDGPU::isEntryFunctionCC(F->getCallingConv()))
917 indicateOptimisticFixpoint();
918 }
919
920 ChangeStatus updateImpl(Attributor &A) override {
921 return updateImplImpl<AAAMDFlatWorkGroupSize>(A);
922 }
923
924 /// Create an abstract attribute view for the position \p IRP.
925 static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
926 Attributor &A);
927
928 ChangeStatus manifest(Attributor &A) override {
929 Function *F = getAssociatedFunction();
930 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
931 return emitAttributeIfNotDefaultAfterClamp(
932 A, InfoCache.getMaximumFlatWorkGroupRange(*F));
933 }
934
935 /// See AbstractAttribute::getName()
936 const std::string getName() const override {
937 return "AAAMDFlatWorkGroupSize";
938 }
939
940 /// See AbstractAttribute::getIdAddr()
941 const char *getIdAddr() const override { return &ID; }
942
943 /// This function should return true if the type of the \p AA is
944 /// AAAMDFlatWorkGroupSize
945 static bool classof(const AbstractAttribute *AA) {
946 return (AA->getIdAddr() == &ID);
947 }
948
949 /// Unique ID (due to the unique address)
950 static const char ID;
951};
952
953const char AAAMDFlatWorkGroupSize::ID = 0;
954
955AAAMDFlatWorkGroupSize &
956AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
957 Attributor &A) {
959 return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
961 "AAAMDFlatWorkGroupSize is only valid for function position");
962}
963
964struct TupleDecIntegerRangeState : public AbstractState {
966
967 bool isValidState() const override {
968 return X.isValidState() && Y.isValidState() && Z.isValidState();
969 }
970
971 bool isAtFixpoint() const override {
972 return X.isAtFixpoint() && Y.isAtFixpoint() && Z.isAtFixpoint();
973 }
974
976 return X.indicateOptimisticFixpoint() | Y.indicateOptimisticFixpoint() |
977 Z.indicateOptimisticFixpoint();
978 }
979
981 return X.indicatePessimisticFixpoint() | Y.indicatePessimisticFixpoint() |
982 Z.indicatePessimisticFixpoint();
983 }
984
985 TupleDecIntegerRangeState operator^=(const TupleDecIntegerRangeState &Other) {
986 X ^= Other.X;
987 Y ^= Other.Y;
988 Z ^= Other.Z;
989 return *this;
990 }
991
992 bool operator==(const TupleDecIntegerRangeState &Other) const {
993 return X == Other.X && Y == Other.Y && Z == Other.Z;
994 }
995
996 TupleDecIntegerRangeState &getAssumed() { return *this; }
997 const TupleDecIntegerRangeState &getAssumed() const { return *this; }
998};
999
1000using AAAMDMaxNumWorkgroupsState =
1002
1003/// Propagate amdgpu-max-num-workgroups attribute.
1004struct AAAMDMaxNumWorkgroups
1005 : public StateWrapper<TupleDecIntegerRangeState, AbstractAttribute> {
1007
1008 AAAMDMaxNumWorkgroups(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
1009
1010 void initialize(Attributor &A) override {
1011 Function *F = getAssociatedFunction();
1012 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
1013
1014 SmallVector<unsigned> MaxNumWorkgroups = InfoCache.getMaxNumWorkGroups(*F);
1015
1016 X.takeKnownMinimum(MaxNumWorkgroups[0]);
1017 Y.takeKnownMinimum(MaxNumWorkgroups[1]);
1018 Z.takeKnownMinimum(MaxNumWorkgroups[2]);
1019
1020 if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
1021 indicatePessimisticFixpoint();
1022 }
1023
1024 ChangeStatus updateImpl(Attributor &A) override {
1026
1027 auto CheckCallSite = [&](AbstractCallSite CS) {
1028 Function *Caller = CS.getInstruction()->getFunction();
1029 LLVM_DEBUG(dbgs() << "[AAAMDMaxNumWorkgroups] Call " << Caller->getName()
1030 << "->" << getAssociatedFunction()->getName() << '\n');
1031
1032 const auto *CallerInfo = A.getAAFor<AAAMDMaxNumWorkgroups>(
1034 if (!CallerInfo || !CallerInfo->isValidState())
1035 return false;
1036
1037 Change |=
1038 clampStateAndIndicateChange(this->getState(), CallerInfo->getState());
1039 return true;
1040 };
1041
1042 bool AllCallSitesKnown = true;
1043 if (!A.checkForAllCallSites(CheckCallSite, *this,
1044 /*RequireAllCallSites=*/true,
1045 AllCallSitesKnown))
1046 return indicatePessimisticFixpoint();
1047
1048 return Change;
1049 }
1050
1051 /// Create an abstract attribute view for the position \p IRP.
1052 static AAAMDMaxNumWorkgroups &createForPosition(const IRPosition &IRP,
1053 Attributor &A);
1054
1055 ChangeStatus manifest(Attributor &A) override {
1056 Function *F = getAssociatedFunction();
1057 LLVMContext &Ctx = F->getContext();
1058 SmallString<32> Buffer;
1059 raw_svector_ostream OS(Buffer);
1060 OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed();
1061
1062 // TODO: Should annotate loads of the group size for this to do anything
1063 // useful.
1064 return A.manifestAttrs(
1065 getIRPosition(),
1066 {Attribute::get(Ctx, "amdgpu-max-num-workgroups", OS.str())},
1067 /* ForceReplace= */ true);
1068 }
1069
1070 const std::string getName() const override { return "AAAMDMaxNumWorkgroups"; }
1071
1072 const std::string getAsStr(Attributor *) const override {
1073 std::string Buffer = "AAAMDMaxNumWorkgroupsState[";
1074 raw_string_ostream OS(Buffer);
1075 OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed()
1076 << ']';
1077 return OS.str();
1078 }
1079
1080 const char *getIdAddr() const override { return &ID; }
1081
1082 /// This function should return true if the type of the \p AA is
1083 /// AAAMDMaxNumWorkgroups
1084 static bool classof(const AbstractAttribute *AA) {
1085 return (AA->getIdAddr() == &ID);
1086 }
1087
1088 void trackStatistics() const override {}
1089
1090 /// Unique ID (due to the unique address)
1091 static const char ID;
1092};
1093
// Out-of-line definition of the class ID. Only the address of this object is
// used (getIdAddr() returns &ID and classof() compares against it), so the
// stored value itself carries no meaning.
1094 const char AAAMDMaxNumWorkgroups::ID = 0;
1095
1096AAAMDMaxNumWorkgroups &
1097AAAMDMaxNumWorkgroups::createForPosition(const IRPosition &IRP, Attributor &A) {
1099 return *new (A.Allocator) AAAMDMaxNumWorkgroups(IRP, A);
1100 llvm_unreachable("AAAMDMaxNumWorkgroups is only valid for function position");
1101}
1102
1103/// Propagate amdgpu-waves-per-eu attribute.
1104struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
1105 AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A)
1106 : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {}
1107
1108 void initialize(Attributor &A) override {
1109 Function *F = getAssociatedFunction();
1110 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
1111
1112 auto TakeRange = [&](std::pair<unsigned, unsigned> R) {
1113 auto [Min, Max] = R;
1114 ConstantRange Range(APInt(32, Min), APInt(32, Max + 1));
1115 IntegerRangeState RangeState(Range);
1116 clampStateAndIndicateChange(this->getState(), RangeState);
1117 indicateOptimisticFixpoint();
1118 };
1119
1120 std::pair<unsigned, unsigned> MaxWavesPerEURange{
1121 1U, InfoCache.getMaxWavesPerEU(*F)};
1122
1123 // If the attribute exists, we will honor it if it is not the default.
1124 if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) {
1125 if (*Attr != MaxWavesPerEURange) {
1126 TakeRange(*Attr);
1127 return;
1128 }
1129 }
1130
1131 // Unlike AAAMDFlatWorkGroupSize, it's getting trickier here. Since the
1132 // calculation of waves per EU involves flat work group size, we can't
1133 // simply use an assumed flat work group size as a start point, because the
1134 // update of flat work group size is in an inverse direction of waves per
1135 // EU. However, we can still do something if it is an entry function. Since
1136 // an entry function is a terminal node, and flat work group size either
1137 // from attribute or default will be used anyway, we can take that value and
1138 // calculate the waves per EU based on it. This result can't be updated by
1139 // no means, but that could still allow us to propagate it.
1140 if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
1141 std::pair<unsigned, unsigned> FlatWorkGroupSize;
1142 if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F))
1143 FlatWorkGroupSize = *Attr;
1144 else
1145 FlatWorkGroupSize = InfoCache.getDefaultFlatWorkGroupSize(*F);
1146 TakeRange(InfoCache.getEffectiveWavesPerEU(*F, MaxWavesPerEURange,
1147 FlatWorkGroupSize));
1148 }
1149 }
1150
1151 ChangeStatus updateImpl(Attributor &A) override {
1152 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
1153 ChangeStatus Change = ChangeStatus::UNCHANGED;
1154
1155 auto CheckCallSite = [&](AbstractCallSite CS) {
1156 Function *Caller = CS.getInstruction()->getFunction();
1157 Function *Func = getAssociatedFunction();
1158 LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
1159 << "->" << Func->getName() << '\n');
1160
1161 const auto *CallerInfo = A.getAAFor<AAAMDWavesPerEU>(
1162 *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
1163 const auto *AssumedGroupSize = A.getAAFor<AAAMDFlatWorkGroupSize>(
1164 *this, IRPosition::function(*Func), DepClassTy::REQUIRED);
1165 if (!CallerInfo || !AssumedGroupSize || !CallerInfo->isValidState() ||
1166 !AssumedGroupSize->isValidState())
1167 return false;
1168
1169 unsigned Min, Max;
1170 std::tie(Min, Max) = InfoCache.getEffectiveWavesPerEU(
1171 *Caller,
1172 {CallerInfo->getAssumed().getLower().getZExtValue(),
1173 CallerInfo->getAssumed().getUpper().getZExtValue() - 1},
1174 {AssumedGroupSize->getAssumed().getLower().getZExtValue(),
1175 AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1});
1176 ConstantRange CallerRange(APInt(32, Min), APInt(32, Max + 1));
1177 IntegerRangeState CallerRangeState(CallerRange);
1178 Change |= clampStateAndIndicateChange(this->getState(), CallerRangeState);
1179
1180 return true;
1181 };
1182
1183 bool AllCallSitesKnown = true;
1184 if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
1185 return indicatePessimisticFixpoint();
1186
1187 return Change;
1188 }
1189
1190 /// Create an abstract attribute view for the position \p IRP.
1191 static AAAMDWavesPerEU &createForPosition(const IRPosition &IRP,
1192 Attributor &A);
1193
1194 ChangeStatus manifest(Attributor &A) override {
1195 Function *F = getAssociatedFunction();
1196 auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
1197 return emitAttributeIfNotDefaultAfterClamp(
1198 A, {1U, InfoCache.getMaxWavesPerEU(*F)});
1199 }
1200
1201 /// See AbstractAttribute::getName()
1202 const std::string getName() const override { return "AAAMDWavesPerEU"; }
1203
1204 /// See AbstractAttribute::getIdAddr()
1205 const char *getIdAddr() const override { return &ID; }
1206
1207 /// This function should return true if the type of the \p AA is
1208 /// AAAMDWavesPerEU
1209 static bool classof(const AbstractAttribute *AA) {
1210 return (AA->getIdAddr() == &ID);
1211 }
1212
1213 /// Unique ID (due to the unique address)
1214 static const char ID;
1215};
1216
// Out-of-line definition of the class ID. Only the address of this object is
// used (getIdAddr() returns &ID and classof() compares against it), so the
// stored value itself carries no meaning.
1217 const char AAAMDWavesPerEU::ID = 0;
1218
1219AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
1220 Attributor &A) {
1222 return *new (A.Allocator) AAAMDWavesPerEU(IRP, A);
1223 llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
1224}
1225
1226static bool inlineAsmUsesAGPRs(const InlineAsm *IA) {
1227 for (const auto &CI : IA->ParseConstraints()) {
1228 for (StringRef Code : CI.Codes) {
1229 Code.consume_front("{");
1230 if (Code.starts_with("a"))
1231 return true;
1232 }
1233 }
1234
1235 return false;
1236}
1237
1238struct AAAMDGPUNoAGPR
1239 : public IRAttribute<Attribute::NoUnwind,
1240 StateWrapper<BooleanState, AbstractAttribute>,
1241 AAAMDGPUNoAGPR> {
1242 AAAMDGPUNoAGPR(const IRPosition &IRP, Attributor &A) : IRAttribute(IRP) {}
1243
1244 static AAAMDGPUNoAGPR &createForPosition(const IRPosition &IRP,
1245 Attributor &A) {
1247 return *new (A.Allocator) AAAMDGPUNoAGPR(IRP, A);
1248 llvm_unreachable("AAAMDGPUNoAGPR is only valid for function position");
1249 }
1250
1251 void initialize(Attributor &A) override {
1252 Function *F = getAssociatedFunction();
1253 if (F->hasFnAttribute("amdgpu-no-agpr"))
1254 indicateOptimisticFixpoint();
1255 }
1256
1257 const std::string getAsStr(Attributor *A) const override {
1258 return getAssumed() ? "amdgpu-no-agpr" : "amdgpu-maybe-agpr";
1259 }
1260
1261 void trackStatistics() const override {}
1262
1263 ChangeStatus updateImpl(Attributor &A) override {
1264 // TODO: Use AACallEdges, but then we need a way to inspect asm edges.
1265
1266 auto CheckForNoAGPRs = [&](Instruction &I) {
1267 const auto &CB = cast<CallBase>(I);
1268 const Value *CalleeOp = CB.getCalledOperand();
1269 const Function *Callee = dyn_cast<Function>(CalleeOp);
1270 if (!Callee) {
1271 if (const InlineAsm *IA = dyn_cast<InlineAsm>(CalleeOp))
1272 return !inlineAsmUsesAGPRs(IA);
1273 return false;
1274 }
1275
1276 // Some intrinsics may use AGPRs, but if we have a choice, we are not
1277 // required to use AGPRs.
1278 if (Callee->isIntrinsic())
1279 return true;
1280
1281 // TODO: Handle callsite attributes
1282 const auto *CalleeInfo = A.getAAFor<AAAMDGPUNoAGPR>(
1284 return CalleeInfo && CalleeInfo->isValidState() &&
1285 CalleeInfo->getAssumed();
1286 };
1287
1288 bool UsedAssumedInformation = false;
1289 if (!A.checkForAllCallLikeInstructions(CheckForNoAGPRs, *this,
1290 UsedAssumedInformation))
1291 return indicatePessimisticFixpoint();
1293 }
1294
1295 ChangeStatus manifest(Attributor &A) override {
1296 if (!getAssumed())
1298 LLVMContext &Ctx = getAssociatedFunction()->getContext();
1299 return A.manifestAttrs(getIRPosition(),
1300 {Attribute::get(Ctx, "amdgpu-no-agpr")});
1301 }
1302
1303 const std::string getName() const override { return "AAAMDGPUNoAGPR"; }
1304 const char *getIdAddr() const override { return &ID; }
1305
1306 /// This function should return true if the type of the \p AA is
1307 /// AAAMDGPUNoAGPRs
1308 static bool classof(const AbstractAttribute *AA) {
1309 return (AA->getIdAddr() == &ID);
1310 }
1311
1312 static const char ID;
1313};
1314
// Out-of-line definition of the class ID. Only the address of this object is
// used (getIdAddr() returns &ID and classof() compares against it), so the
// stored value itself carries no meaning.
1315 const char AAAMDGPUNoAGPR::ID = 0;
1316
1317static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
1318 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
1319 for (unsigned I = 0;
1320 I < F.arg_size() &&
1321 I < std::min(KernargPreloadCount.getValue(), ST.getMaxNumUserSGPRs());
1322 ++I) {
1323 Argument &Arg = *F.getArg(I);
1324 // Check for incompatible attributes.
1325 if (Arg.hasByRefAttr() || Arg.hasNestAttr())
1326 break;
1327
1328 Arg.addAttr(Attribute::InReg);
1329 }
1330}
1331
1332static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
1334 SetVector<Function *> Functions;
1335 for (Function &F : M) {
1336 if (!F.isIntrinsic())
1337 Functions.insert(&F);
1338 }
1339
1340 CallGraphUpdater CGUpdater;
1342 AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, TM);
1344 {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
1345 &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
1346 &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
1350
1351 AttributorConfig AC(CGUpdater);
1352 AC.IsClosedWorldModule = Options.IsClosedWorld;
1353 AC.Allowed = &Allowed;
1354 AC.IsModulePass = true;
1355 AC.DefaultInitializeLiveInternals = false;
1356 AC.IndirectCalleeSpecializationCallback =
1357 [](Attributor &A, const AbstractAttribute &AA, CallBase &CB,
1358 Function &Callee, unsigned NumAssumedCallees) {
1359 return !AMDGPU::isEntryFunctionCC(Callee.getCallingConv()) &&
1360 (NumAssumedCallees <= IndirectCallSpecializationThreshold);
1361 };
1362 AC.IPOAmendableCB = [](const Function &F) {
1363 return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
1364 };
1365
1366 Attributor A(Functions, InfoCache, AC);
1367
1368 LLVM_DEBUG(dbgs() << "[AMDGPUAttributor] Module " << M.getName() << " is "
1369 << (AC.IsClosedWorldModule ? "" : "not ")
1370 << "assumed to be a closed world.\n");
1371
1372 for (auto *F : Functions) {
1373 A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(*F));
1374 A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(*F));
1375 A.getOrCreateAAFor<AAAMDMaxNumWorkgroups>(IRPosition::function(*F));
1376 A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(*F));
1377 CallingConv::ID CC = F->getCallingConv();
1379 A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(*F));
1380 A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(*F));
1381 } else if (CC == CallingConv::AMDGPU_KERNEL) {
1382 addPreloadKernArgHint(*F, TM);
1383 }
1384
1385 for (auto &I : instructions(F)) {
1386 if (auto *LI = dyn_cast<LoadInst>(&I)) {
1387 A.getOrCreateAAFor<AAAddressSpace>(
1388 IRPosition::value(*LI->getPointerOperand()));
1389 } else if (auto *SI = dyn_cast<StoreInst>(&I)) {
1390 A.getOrCreateAAFor<AAAddressSpace>(
1391 IRPosition::value(*SI->getPointerOperand()));
1392 } else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I)) {
1393 A.getOrCreateAAFor<AAAddressSpace>(
1394 IRPosition::value(*RMW->getPointerOperand()));
1395 } else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(&I)) {
1396 A.getOrCreateAAFor<AAAddressSpace>(
1397 IRPosition::value(*CmpX->getPointerOperand()));
1398 }
1399 }
1400 }
1401
1402 ChangeStatus Change = A.run();
1403 return Change == ChangeStatus::CHANGED;
1404}
1405
1406class AMDGPUAttributorLegacy : public ModulePass {
1407public:
1408 AMDGPUAttributorLegacy() : ModulePass(ID) {}
1409
1410 /// doInitialization - Virtual method overridden by subclasses to do
1411 /// any necessary initialization before any pass is run.
1412 bool doInitialization(Module &) override {
1413 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
1414 if (!TPC)
1415 report_fatal_error("TargetMachine is required");
1416
1417 TM = &TPC->getTM<TargetMachine>();
1418 return false;
1419 }
1420
1421 bool runOnModule(Module &M) override {
1422 AnalysisGetter AG(this);
1423 return runImpl(M, AG, *TM, /*Options=*/{});
1424 }
1425
1426 void getAnalysisUsage(AnalysisUsage &AU) const override {
1428 }
1429
1430 StringRef getPassName() const override { return "AMDGPU Attributor"; }
1432 static char ID;
1433};
1434} // namespace
1435
1438
1441 AnalysisGetter AG(FAM);
1442
1443 // TODO: Probably preserves CFG
1444 return runImpl(M, AG, TM, Options) ? PreservedAnalyses::none()
1446}
1447
// Definition of the pass ID that the AMDGPUAttributorLegacy constructor
// passes to ModulePass.
1448 char AMDGPUAttributorLegacy::ID = 0;
1449
1451 return new AMDGPUAttributorLegacy();
1452}
1453INITIALIZE_PASS_BEGIN(AMDGPUAttributorLegacy, DEBUG_TYPE, "AMDGPU Attributor",
1454 false, false)
1456INITIALIZE_PASS_END(AMDGPUAttributorLegacy, DEBUG_TYPE, "AMDGPU Attributor",
1457 false, false)
static cl::opt< unsigned > KernargPreloadCount("amdgpu-kernarg-preload-count", cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0))
static bool isDSAddress(const Constant *C)
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static cl::opt< unsigned > IndirectCallSpecializationThreshold("amdgpu-indirect-call-specialization-threshold", cl::desc("A threshold controls whether an indirect call will be specialized"), cl::init(3))
static ImplicitArgumentMask intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit, bool HasApertureRegs, bool SupportsGetDoorBellID, unsigned CodeObjectVersion)
ImplicitArgumentMask
@ NOT_IMPLICIT_INPUT
@ ALL_ARGUMENT_MASK
static bool funcRequiresHostcallPtr(const Function &F)
Returns true if the function requires the implicit argument be passed regardless of the function cont...
ImplicitArgumentPositions
@ LAST_ARG_POS
static bool castRequiresQueuePtr(unsigned SrcAS)
Expand Atomic instructions
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
This file declares an analysis pass that computes CycleInfo for LLVM IR, specialized from GenericCycl...
DXIL Resource Access
#define LLVM_DEBUG(...)
Definition: Debug.h:106
@ Default
Definition: DwarfDebug.cpp:87
std::optional< std::vector< StOtherPiece > > Other
Definition: ELFYAML.cpp:1315
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
AMD GCN specific subclass of TargetSubtarget.
#define DEBUG_TYPE
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:57
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
static StringRef getName(Value *V)
Basic Register Allocator
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, ArrayRef< StringLiteral > StandardNames)
Initialize the set of available library functions based on the specified target triple.
Target-Independent Code Generator Pass Configuration Options pass.
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
Class for arbitrary precision integers.
Definition: APInt.h:78
AbstractCallSite.
This class represents a conversion between pointers from one address space to another.
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:410
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
bool hasByRefAttr() const
Return true if this argument has the byref attribute.
Definition: Function.cpp:149
void addAttr(Attribute::AttrKind Kind)
Definition: Function.cpp:331
bool hasNestAttr() const
Return true if this argument has the nest attribute.
Definition: Function.cpp:278
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:95
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
Allocate memory in an ever growing pool, as if by bump-pointer.
Definition: Allocator.h:66
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1120
Wrapper to unify "old style" CallGraph and "new style" LazyCallGraph.
A constant value that is initialized with an expression using other constant values.
Definition: Constants.h:1108
This class represents a range of values.
Definition: ConstantRange.h:47
This is an important base class in LLVM.
Definition: Constant.h:42
Legacy analysis pass which computes a CycleInfo.
Definition: CycleAnalysis.h:25
Implements a dense probed hash-table based set.
Definition: DenseSet.h:278
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition: Function.h:277
unsigned getAddressSpace() const
Definition: GlobalValue.h:205
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:567
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...
Definition: Pass.h:251
virtual bool runOnModule(Module &M)=0
runOnModule - Virtual method overridden by subclasses to process the module being operated on.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
PassRegistry - This class manages the registration and initialization of the pass subsystem as appli...
Definition: PassRegistry.h:37
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overridden by passes that need analysis information to do t...
Definition: Pass.cpp:98
virtual bool doInitialization(Module &)
doInitialization - Virtual method overridden by subclasses to do any necessary initialization before ...
Definition: Pass.h:119
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Definition: Pass.cpp:81
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition: Analysis.h:114
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
A vector that has set insertion semantics.
Definition: SetVector.h:57
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:363
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
bool isDroppable() const
A droppable user is a user for which uses can be dropped without affecting correctness and should be ...
Definition: User.cpp:115
LLVM Value Representation.
Definition: Value.h:74
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:691
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI)
bool isEntryFunctionCC(CallingConv::ID CC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
unsigned getDefaultQueueImplicitArgPosition(unsigned CodeObjectVersion)
unsigned getHostcallImplicitArgPosition(unsigned CodeObjectVersion)
unsigned getCompletionActionImplicitArgPosition(unsigned CodeObjectVersion)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
unsigned getMultigridSyncArgImplicitArgPosition(unsigned CodeObjectVersion)
bool isGraphics(CallingConv::ID cc)
E & operator^=(E &LHS, E RHS)
Definition: BitmaskEnum.h:195
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ CE
Windows NT (Windows on ARM)
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
NodeAddr< FuncNode * > Func
Definition: RDFGraph.h:393
NodeAddr< CodeNode * > Code
Definition: RDFGraph.h:388
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
void initializeCycleInfoWrapperPassPass(PassRegistry &)
@ NONE
Definition: Attributor.h:6476
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
@ CGSCC
Definition: Attributor.h:6478
Pass * createAMDGPUAttributorLegacyPass()
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
ChangeStatus clampStateAndIndicateChange(StateType &S, const StateType &R)
Helper function to clamp a state S of type StateType with the information in R and indicate/return if...
Definition: Attributor.h:3464
ChangeStatus
{
Definition: Attributor.h:489
@ REQUIRED
The target cannot be valid if the source is not.
@ Default
The result values are uniform if and only if all operands are uniform.
An abstract interface for address space information.
Definition: Attributor.h:6286
static const char ID
Unique ID (due to the unique address)
Definition: Attributor.h:6322
An abstract state for querying live call edges.
Definition: Attributor.h:5487
virtual const SetVector< Function * > & getOptimisticEdges() const =0
Get the optimistic edges.
static const char ID
Unique ID (due to the unique address)
Definition: Attributor.h:5530
virtual bool hasNonAsmUnknownCallee() const =0
Is there any call with a unknown callee, excluding any inline asm.
static const char ID
Unique ID (due to the unique address)
Definition: Attributor.h:6441
static const char ID
Unique ID (due to the unique address)
Definition: Attributor.h:4343
An access description.
Definition: Attributor.h:5988
Instruction * getRemoteInst() const
Return the actual instruction that causes the access.
Definition: Attributor.h:6088
An abstract interface for struct information.
Definition: Attributor.h:5755
virtual bool forallInterferingAccesses(AA::RangeTy Range, function_ref< bool(const Access &, bool)> CB) const =0
Call CB on all accesses that might interfere with Range and return true if all such accesses were kno...
static const char ID
Unique ID (due to the unique address)
Definition: Attributor.h:6204
static const char ID
Unique ID (due to the unique address)
Definition: Attributor.h:5304
static const char ID
Unique ID (due to the unique address)
Definition: Attributor.h:5341
static const char ID
Unique ID (due to the unique address)
Definition: Attributor.h:6274
Helper to represent an access offset and size, with logic to deal with uncertainty and check for over...
Definition: Attributor.h:237
Base struct for all "concrete attribute" deductions.
Definition: Attributor.h:3284
virtual const char * getIdAddr() const =0
This function should return the address of the ID of the AbstractAttribute.
An interface to query the internal state of an abstract attribute.
Definition: Attributor.h:2604
virtual ChangeStatus indicatePessimisticFixpoint()=0
Indicate that the abstract state should converge to the pessimistic state.
virtual bool isAtFixpoint() const =0
Return if this abstract state is fixed, thus does not need to be updated if information changes as it...
virtual bool isValidState() const =0
Return if this abstract state is in a valid state.
virtual ChangeStatus indicateOptimisticFixpoint()=0
Indicate that the abstract state should converge to the optimistic state.
Wrapper for FunctionAnalysisManager.
Definition: Attributor.h:1127
Configuration for the Attributor.
Definition: Attributor.h:1422
The fixpoint analysis framework that orchestrates the attribute deduction.
Definition: Attributor.h:1516
Class to accumulate and hold information about a callee.
Specialization of the integer state for a decreasing value, hence 0 is the best state and ~0u the wor...
Definition: Attributor.h:2854
Helper class that provides common functionality to manifest IR attributes.
Definition: Attributor.h:3189
ChangeStatus manifest(Attributor &A) override
See AbstractAttribute::manifest(...).
Definition: Attributor.h:3221
Helper to describe and deal with positions in the LLVM-IR.
Definition: Attributor.h:586
static const IRPosition callsite_returned(const CallBase &CB)
Create a position describing the returned value of CB.
Definition: Attributor.h:654
static const IRPosition value(const Value &V, const CallBaseContext *CBContext=nullptr)
Create a position describing the value of V.
Definition: Attributor.h:610
@ IRP_FUNCTION
An attribute for a function (scope).
Definition: Attributor.h:598
static const IRPosition function(const Function &F, const CallBaseContext *CBContext=nullptr)
Create a position describing the function scope of F.
Definition: Attributor.h:629
Kind getPositionKind() const
Return the associated position kind.
Definition: Attributor.h:882
Data structure to hold cached (LLVM-IR) information.
Definition: Attributor.h:1203
State for an integer range.
Definition: Attributor.h:2930
bool isValidState() const override
See AbstractState::isValidState() NOTE: For now we simply pretend that the worst possible state is in...
Definition: Attributor.h:2663
Helper to tie an abstract state implementation to an abstract attribute.
Definition: Attributor.h:3173
StateType & getState() override
See AbstractAttribute::getState(...).
Definition: Attributor.h:3181