65#define DEBUG_TYPE "amdgpu-split-module"
70 "amdgpu-module-splitting-large-function-threshold",
cl::init(2.0f),
73 "consider a function as large and needing special treatment when the "
74 "cost of importing it into a partition"
75 "exceeds the average cost of a partition by this factor; e;g. 2.0 "
76 "means if the function and its dependencies is 2 times bigger than "
77 "an average partition; 0 disables large functions handling entirely"));
80 "amdgpu-module-splitting-large-function-merge-overlap",
cl::init(0.8f),
83 "defines how much overlap between two large function's dependencies "
84 "is needed to put them in the same partition"));
87 "amdgpu-module-splitting-no-externalize-globals",
cl::Hidden,
88 cl::desc(
"disables externalization of global variable with local linkage; "
89 "may cause globals to be duplicated which increases binary size"));
92 LogDirOpt(
"amdgpu-module-splitting-log-dir",
cl::Hidden,
93 cl::desc(
"output directory for AMDGPU module splitting logs"));
96 LogPrivate(
"amdgpu-module-splitting-log-private",
cl::Hidden,
97 cl::desc(
"hash value names before printing them in the AMDGPU "
98 "module splitting logs"));
109 static bool HideNames;
114 HideNames = LogPrivate;
116 const auto EV = sys::Process::GetEnv(
"AMD_SPLIT_MODULE_LOG_PRIVATE");
117 HideNames = (EV.value_or(
"0") !=
"0");
122 return V.getName().str();
123 return toHex(
SHA256::hash(arrayRefFromStringRef(V.getName())),
160class SplitModuleLogger {
162 SplitModuleLogger(
const Module &M) {
163 std::string LogDir = LogDirOpt;
181 "': " + Err.message(),
185 FileOS = std::make_unique<raw_fd_ostream>(Fd,
true);
188 bool hasLogFile()
const {
return FileOS !=
nullptr; }
191 assert(FileOS &&
"no logfile!");
198 operator bool()
const {
203 std::unique_ptr<raw_fd_ostream> FileOS;
206template <
typename Ty>
207static SplitModuleLogger &
operator<<(SplitModuleLogger &SML,
const Ty &Val) {
209 !std::is_same_v<Ty, Value>,
210 "do not print values to logs directly, use handleName instead!");
212 if (SML.hasLogFile())
213 SML.logfile() << Val;
224calculateFunctionCosts(SplitModuleLogger &SML, GetTTIFn GetTTI,
Module &M,
226 CostType ModuleCost = 0;
227 CostType KernelCost = 0;
230 if (Fn.isDeclaration())
234 const auto &
TTI = GetTTI(Fn);
235 for (
const auto &BB : Fn) {
236 for (
const auto &
I : BB) {
243 assert((FnCost + CostVal) >= FnCost &&
"Overflow!");
250 CostMap[&Fn] = FnCost;
251 assert((ModuleCost + FnCost) >= ModuleCost &&
"Overflow!");
252 ModuleCost += FnCost;
255 KernelCost += FnCost;
258 CostType FnCost = (ModuleCost - KernelCost);
259 SML <<
"=> Total Module Cost: " << ModuleCost <<
'\n'
260 <<
" => KernelCost: " << KernelCost <<
" ("
261 <<
format(
"%0.2f", (
float(KernelCost) / ModuleCost) * 100) <<
"%)\n"
262 <<
" => FnsCost: " << FnCost <<
" ("
263 <<
format(
"%0.2f", (
float(FnCost) / ModuleCost) * 100) <<
"%)\n";
268static bool canBeIndirectlyCalled(
const Function &
F) {
271 return !
F.hasLocalLinkage() ||
272 F.hasAddressTaken(
nullptr,
289static void addAllIndirectCallDependencies(
const Module &M,
291 for (
const auto &Fn : M) {
292 if (canBeIndirectlyCalled(Fn))
309static void addAllDependencies(SplitModuleLogger &SML,
const CallGraph &CG,
312 bool &HadIndirectCall) {
317 while (!WorkList.empty()) {
319 assert(!CurFn.isDeclaration());
325 for (
auto &CGEntry : *CG[&CurFn]) {
326 auto *CGNode = CGEntry.second;
327 auto *Callee = CGNode->getFunction();
340 SML <<
"Indirect call detected in " <<
getName(CurFn)
341 <<
" - treating all non-entrypoint functions as "
342 "potential dependencies\n";
345 addAllIndirectCallDependencies(M, Fns);
346 HadIndirectCall =
true;
350 if (Callee->isDeclaration())
353 auto [It, Inserted] = Fns.
insert(Callee);
355 WorkList.push_back(Callee);
363struct FunctionWithDependencies {
364 FunctionWithDependencies(SplitModuleLogger &SML,
CallGraph &CG,
371 addAllDependencies(SML, CG, *Fn, Dependencies,
373 TotalCost = FnCosts.
at(Fn);
374 for (
const auto *Dep : Dependencies) {
375 TotalCost += FnCosts.
at(Dep);
379 HasNonDuplicatableDependecy |=
380 (Dep->hasExternalLinkage() || !Dep->isDefinitionExact());
387 bool HasIndirectCall =
false;
389 bool HasNonDuplicatableDependecy =
false;
391 CostType TotalCost = 0;
395 bool isLarge(CostType Threshold)
const {
396 return TotalCost > Threshold && !Dependencies.
empty();
406 for (
const auto *
F :
A) {
414 unsigned NumCommon = 0;
415 for (
const auto *
F :
B) {
419 auto [It, Inserted] =
Total.insert(
F);
424 return static_cast<float>(NumCommon) /
Total.size();
435static std::vector<DenseSet<const Function *>>
436doPartitioning(SplitModuleLogger &SML,
Module &M,
unsigned NumParts,
441 SML <<
"\n--Partitioning Starts--\n";
450 const CostType LargeFnThreshold =
451 LargeFnFactor ? CostType(((ModuleCost / NumParts) * LargeFnFactor))
452 : std::numeric_limits<CostType>::max();
454 std::vector<DenseSet<const Function *>> Partitions;
455 Partitions.resize(NumParts);
466 auto ComparePartitions = [](
const std::pair<PartitionID, CostType> &a,
467 const std::pair<PartitionID, CostType> &b) {
471 if (a.second == b.second)
472 return a.first < b.first;
473 return a.second > b.second;
480 std::vector<std::pair<PartitionID, CostType>> BalancingQueue;
481 for (
unsigned I = 0;
I < NumParts; ++
I)
482 BalancingQueue.push_back(std::make_pair(
I, 0));
486 const auto AssignToPartition = [&](PartitionID PID,
487 const FunctionWithDependencies &
FWD) {
488 auto &FnsInPart = Partitions[PID];
489 FnsInPart.insert(
FWD.Fn);
490 FnsInPart.insert(
FWD.Dependencies.begin(),
FWD.Dependencies.end());
492 SML <<
"assign " <<
getName(*
FWD.Fn) <<
" to P" << PID <<
"\n -> ";
493 if (!
FWD.Dependencies.empty()) {
494 SML <<
FWD.Dependencies.size() <<
" dependencies added\n";
499 for (
auto &[QueuePID,
Cost] :
reverse(BalancingQueue)) {
500 if (QueuePID == PID) {
501 CostType NewCost = 0;
502 for (
auto *Fn : Partitions[PID])
503 NewCost += FnCosts.
at(Fn);
505 SML <<
"[Updating P" << PID <<
" Cost]:" <<
Cost <<
" -> " << NewCost;
507 SML <<
" (" <<
unsigned(((
float(NewCost) /
Cost) - 1) * 100)
516 sort(BalancingQueue, ComparePartitions);
519 for (
auto &CurFn : WorkList) {
523 if (CurFn.HasIndirectCall) {
524 SML <<
"Function with indirect call(s): " <<
getName(*CurFn.Fn)
525 <<
" defaulting to P0\n";
526 AssignToPartition(0, CurFn);
534 if (CurFn.HasNonDuplicatableDependecy) {
535 SML <<
"Function with externally visible dependency "
536 <<
getName(*CurFn.Fn) <<
" defaulting to P0\n";
537 AssignToPartition(0, CurFn);
542 if (CurFn.isLarge(LargeFnThreshold)) {
543 assert(LargeFnOverlapForMerge >= 0.0f && LargeFnOverlapForMerge <= 1.0f);
544 SML <<
"Large Function: " <<
getName(*CurFn.Fn)
545 <<
" - looking for partition with at least "
546 <<
format(
"%0.2f", LargeFnOverlapForMerge * 100) <<
"% overlap\n";
548 bool Assigned =
false;
549 for (
const auto &[PID, Fns] :
enumerate(Partitions)) {
550 float Overlap = calculateOverlap(CurFn.Dependencies, Fns);
551 SML <<
" => " <<
format(
"%0.2f", Overlap * 100) <<
"% overlap with P"
553 if (Overlap > LargeFnOverlapForMerge) {
554 SML <<
" selecting P" << PID <<
'\n';
555 AssignToPartition(PID, CurFn);
565 auto [PID, CurCost] = BalancingQueue.back();
566 AssignToPartition(PID, CurFn);
572 for (
auto *Fn : Part)
574 SML <<
"P" <<
Idx <<
" has a total cost of " <<
Cost <<
" ("
575 <<
format(
"%0.2f", (
float(
Cost) / ModuleCost) * 100)
576 <<
"% of source module)\n";
579 SML <<
"--Partitioning Done--\n\n";
585 for (
const auto &Part : Partitions)
586 AllFunctions.
insert(Part.begin(), Part.end());
607 GV.
setName(
"__llvmsplit_unnamed");
610static bool hasDirectCaller(
const Function &Fn) {
611 for (
auto &U : Fn.
uses()) {
612 if (
auto *CB = dyn_cast<CallBase>(U.getUser()); CB && CB->isCallee(&U))
618static void splitAMDGPUModule(
619 GetTTIFn GetTTI,
Module &M,
unsigned N,
620 function_ref<
void(std::unique_ptr<Module> MPart)> ModuleCallback) {
622 SplitModuleLogger SML(M);
640 SML <<
"[externalize] " << Fn.
getName()
641 <<
" because its address is taken\n";
649 if (!NoExternalizeGlobals) {
650 for (
auto &GV : M.globals()) {
652 SML <<
"[externalize] GV " << GV.
getName() <<
'\n';
660 const CostType ModuleCost = calculateFunctionCosts(SML, GetTTI, M, FnCosts);
674 for (
const auto &
FWD : WorkList) {
676 SeenFunctions.
insert(
FWD.Dependencies.begin(),
FWD.Dependencies.end());
683 !SeenFunctions.
count(&Fn) && !hasDirectCaller(Fn)) {
684 WorkList.emplace_back(SML, CG, FnCosts, &Fn);
690 sort(WorkList, [&](
auto &
A,
auto &
B) {
693 if (
A.TotalCost ==
B.TotalCost)
694 return A.Fn->getName() <
B.Fn->getName();
695 return A.TotalCost >
B.TotalCost;
700 for (
const auto &
FWD : WorkList) {
701 SML <<
"[root] " <<
getName(*
FWD.Fn) <<
" (totalCost:" <<
FWD.TotalCost
702 <<
" indirect:" <<
FWD.HasIndirectCall
703 <<
" hasNonDuplicatableDep:" <<
FWD.HasNonDuplicatableDependecy
707 SortedDepNames.
reserve(
FWD.Dependencies.size());
708 for (
const auto *Dep :
FWD.Dependencies)
710 sort(SortedDepNames);
712 for (
const auto &
Name : SortedDepNames)
713 SML <<
" [dependency] " <<
Name <<
'\n';
718 auto Partitions = doPartitioning(SML, M,
N, ModuleCost, FnCosts, WorkList);
719 assert(Partitions.size() ==
N);
724 const auto NeedsConservativeImport = [&](
const GlobalValue *GV) {
727 const auto *Var = dyn_cast<GlobalVariable>(GV);
728 return Var && Var->hasLocalLinkage();
731 SML <<
"Creating " <<
N <<
" modules...\n";
732 unsigned TotalFnImpls = 0;
733 for (
unsigned I = 0;
I <
N; ++
I) {
734 const auto &FnsInPart = Partitions[
I];
737 std::unique_ptr<Module> MPart(
740 if (
const auto *Fn = dyn_cast<Function>(GV))
741 return FnsInPart.contains(Fn);
743 if (NeedsConservativeImport(GV))
752 if (NeedsConservativeImport(&GV) && GV.
use_empty())
756 unsigned NumAllFns = 0, NumKernels = 0;
757 for (
auto &Cur : *MPart) {
758 if (!Cur.isDeclaration()) {
764 TotalFnImpls += NumAllFns;
765 SML <<
" - Module " <<
I <<
" with " << NumAllFns <<
" functions ("
766 << NumKernels <<
" kernels)\n";
767 ModuleCallback(std::move(MPart));
770 SML << TotalFnImpls <<
" function definitions across all modules ("
771 <<
format(
"%0.2f", (
float(TotalFnImpls) / FnCosts.
size()) * 100)
772 <<
"% of original module)\n";
783 splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
The AMDGPU TargetMachine interface definition for hw codegen targets.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
This file provides interfaces used to build and manipulate a call graph, which is a very useful tool ...
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file defines the DenseMap class.
Module.h This file contains the declarations for the Module class.
FunctionAnalysisManager FAM
ModuleAnalysisManager MAM
Provides a library for accessing information about this process and other processes on the operating ...
static StringRef getName(Value *V)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the SmallVector class.
static void externalize(GlobalValue *GV)
PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM)
A container for analyses that lazily runs them and caches their results.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
The basic data container for the call graph of a Module of IR.
CallGraphNode * getCallsExternalNode() const
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Implements a dense probed hash-table based set.
bool hasAddressTaken(const User **=nullptr, bool IgnoreCallbackUses=false, bool IgnoreAssumeLikeCalls=true, bool IngoreLLVMUsed=false, bool IgnoreARCAttachedCall=false, bool IgnoreCastedDirectCall=false) const
hasAddressTaken - returns true if there are any uses of this function other than direct calls or invo...
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
bool hasLocalLinkage() const
void setLinkage(LinkageTypes LT)
Module * getParent()
Get the module that this global value is contained inside of...
void eraseFromParent()
This method unlinks 'this' from the containing module and deletes it.
@ HiddenVisibility
The GV is hidden.
void setVisibility(VisibilityTypes V)
@ ExternalLinkage
Externally visible function.
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
static InstructionCost getMax()
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
A Module instance is used to store all the information related to an LLVM module.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
static std::array< uint8_t, 32 > hash(ArrayRef< uint8_t > Data)
Returns a raw 256-bit SHA256 hash for the given data.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
StringRef str() const
Explicit conversion to StringRef.
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Analysis pass providing the TargetTransformInfo.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
LLVM Value Representation.
void setName(const Twine &Name)
Change the name of the value.
iterator_range< use_iterator > uses()
StringRef getName() const
Return a constant reference to the value's name.
int getNumOccurrences() const
std::pair< iterator, bool > insert(const ValueT &V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
An efficient, type-erasing, non-owning reference to a callable.
This class implements an extremely fast bulk output stream that can only output to a stream.
static std::optional< std::string > GetEnv(StringRef name)
bool isEntryFunctionCC(CallingConv::ID CC)
initializer< Ty > init(const Ty &Val)
std::error_code createUniqueFile(const Twine &Model, int &ResultFD, SmallVectorImpl< char > &ResultPath, OpenFlags Flags=OF_None, unsigned Mode=all_read|all_write)
Create a uniquely named file.
void append(SmallVectorImpl< char > &path, const Twine &a, const Twine &b="", const Twine &c="", const Twine &d="")
Append to path.
This is an optimization pass for GlobalISel generic memory operations.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are are tuples (A,...
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
bool DebugFlag
This boolean is set to true if the '-debug' command line option is specified.
auto reverse(ContainerTy &&C)
void sort(IteratorTy Start, IteratorTy End)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
bool isEntryPoint(const Function &F)
bool isCurrentDebugType(const char *Type)
isCurrentDebugType - Return true if the specified string is the debug type specified on the command l...
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
void call_once(once_flag &flag, Function &&F, Args &&... ArgList)
Execute the function specified as a parameter once.
std::unique_ptr< Module > CloneModule(const Module &M)
Return an exact copy of the specified module.
The llvm::once_flag structure.