69#define DEBUG_TYPE "amdgpu-split-module"
75 "amdgpu-module-splitting-max-depth",
77 "maximum search depth. 0 forces a greedy approach. "
78 "warning: the algorithm is up to O(2^N), where N is the max depth."),
81static cl::opt<float> LargeFnFactor(
84 "when max depth is reached and we can no longer branch out, this "
85 "value determines if a function is worth merging into an already "
86 "existing partition to reduce code duplication. This is a factor "
87 "of the ideal partition size, e.g. 2.0 means we consider the "
88 "function for merging if its cost (including its callees) is 2x the "
89 "size of an ideal partition."));
91static cl::opt<float> LargeFnOverlapForMerge(
93 cl::desc(
"when a function is considered for merging into a partition that "
94 "already contains some of its callees, do the merge if at least "
95 "n% of the code it can reach is already present inside the "
96 "partition; e.g. 0.7 means only merge >70%"));
98static cl::opt<bool> NoExternalizeGlobals(
99 "amdgpu-module-splitting-no-externalize-globals",
cl::Hidden,
100 cl::desc(
"disables externalization of global variable with local linkage; "
101 "may cause globals to be duplicated which increases binary size"));
103static cl::opt<bool> NoExternalizeOnAddrTaken(
104 "amdgpu-module-splitting-no-externalize-address-taken",
cl::Hidden,
106 "disables externalization of functions whose addresses are taken"));
108static cl::opt<std::string>
109 ModuleDotCfgOutput(
"amdgpu-module-splitting-print-module-dotcfg",
111 cl::desc(
"output file to write out the dotgraph "
112 "representation of the input module"));
114static cl::opt<std::string> PartitionSummariesOutput(
115 "amdgpu-module-splitting-print-partition-summaries",
cl::Hidden,
116 cl::desc(
"output file to write out a summary of "
117 "the partitions created for each module"));
121 UseLockFile(
"amdgpu-module-splitting-serial-execution",
cl::Hidden,
122 cl::desc(
"use a lock file so only one process in the system "
123 "can run this pass at once. useful to avoid mangled "
124 "debug output in multithreaded environments."));
127 DebugProposalSearch(
"amdgpu-module-splitting-debug-proposal-search",
129 cl::desc(
"print all proposals received and whether "
130 "they were rejected or accepted"));
133struct SplitModuleTimer : NamedRegionTimer {
134 SplitModuleTimer(StringRef
Name, StringRef
Desc)
144using FunctionsCostMap = DenseMap<const Function *, CostType>;
145using GetTTIFn = function_ref<
const TargetTransformInfo &(Function &)>;
146static constexpr unsigned InvalidPID = -1;
151static auto formatRatioOf(CostType Num, CostType Dem) {
152 CostType DemOr1 = Dem ? Dem : 1;
153 return format(
"%0.2f", (
static_cast<double>(Num) / DemOr1) * 100);
164static bool isNonCopyable(
const Function &
F) {
165 return F.hasExternalLinkage() || !
F.isDefinitionExact() ||
171 if (GV.hasLocalLinkage()) {
179 GV.setName(
"__llvmsplit_unnamed");
188static CostType calculateFunctionCosts(GetTTIFn GetTTI,
Module &M,
189 FunctionsCostMap &CostMap) {
190 SplitModuleTimer SMT(
"calculateFunctionCosts",
"cost analysis");
192 LLVM_DEBUG(
dbgs() <<
"[cost analysis] calculating function costs\n");
193 CostType ModuleCost = 0;
194 [[maybe_unused]] CostType KernelCost = 0;
197 if (Fn.isDeclaration())
201 const auto &
TTI = GetTTI(Fn);
202 for (
const auto &BB : Fn) {
203 for (
const auto &
I : BB) {
210 assert((FnCost + CostVal) >= FnCost &&
"Overflow!");
217 CostMap[&Fn] = FnCost;
218 assert((ModuleCost + FnCost) >= ModuleCost &&
"Overflow!");
219 ModuleCost += FnCost;
222 KernelCost += FnCost;
230 const CostType FnCost = ModuleCost - KernelCost;
231 dbgs() <<
" - total module cost is " << ModuleCost <<
". kernels cost "
232 <<
"" << KernelCost <<
" ("
233 <<
format(
"%0.2f", (
float(KernelCost) / ModuleCost) * 100)
234 <<
"% of the module), functions cost " << FnCost <<
" ("
235 <<
format(
"%0.2f", (
float(FnCost) / ModuleCost) * 100)
236 <<
"% of the module)\n";
243static bool canBeIndirectlyCalled(
const Function &
F) {
246 return !
F.hasLocalLinkage() ||
247 F.hasAddressTaken(
nullptr,
275 enum class EdgeKind :
uint8_t {
293 Edge(
Node *Src,
Node *Dst, EdgeKind Kind)
294 : Src(Src), Dst(Dst), Kind(Kind) {}
301 using EdgesVec = SmallVector<const Edge *, 0>;
303 using nodes_iterator =
const Node *
const *;
305 SplitGraph(
const Module &M,
const FunctionsCostMap &CostMap,
307 : M(M), CostMap(CostMap), ModuleCost(ModuleCost) {}
309 void buildGraph(CallGraph &CG);
312 bool verifyGraph()
const;
315 bool empty()
const {
return Nodes.empty(); }
316 const iterator_range<nodes_iterator>
nodes()
const {
317 return {Nodes.begin(), Nodes.end()};
321 unsigned getNumNodes()
const {
return Nodes.size(); }
322 BitVector createNodesBitVector()
const {
return BitVector(Nodes.size()); }
324 const Module &getModule()
const {
return M; }
326 CostType getModuleCost()
const {
return ModuleCost; }
327 CostType getCost(
const Function &
F)
const {
return CostMap.at(&
F); }
331 CostType calculateCost(
const BitVector &BV)
const;
336 Node &
getNode(DenseMap<const GlobalValue *, Node *> &Cache,
337 const GlobalValue &GV);
340 const Edge &createEdge(
Node &Src,
Node &Dst, EdgeKind EK);
343 const FunctionsCostMap &CostMap;
347 SmallVector<Node *> Nodes;
349 SpecificBumpPtrAllocator<Node> NodesPool;
355 std::is_trivially_destructible_v<Edge>,
356 "Edge must be trivially destructible to use the BumpPtrAllocator");
372class SplitGraph::Node {
373 friend class SplitGraph;
376 Node(
unsigned ID,
const GlobalValue &GV, CostType IndividualCost,
378 : ID(ID), GV(GV), IndividualCost(IndividualCost),
379 IsNonCopyable(IsNonCopyable), IsEntryFnCC(
false), IsGraphEntry(
false) {
380 if (
auto *Fn = dyn_cast<Function>(&GV))
386 unsigned getID()
const {
return ID; }
388 const Function &
getFunction()
const {
return cast<Function>(GV); }
392 CostType getIndividualCost()
const {
return IndividualCost; }
394 bool isNonCopyable()
const {
return IsNonCopyable; }
395 bool isEntryFunctionCC()
const {
return IsEntryFnCC; }
401 bool isGraphEntryPoint()
const {
return IsGraphEntry; }
403 StringRef
getName()
const {
return GV.getName(); }
405 bool hasAnyIncomingEdges()
const {
return IncomingEdges.size(); }
406 bool hasAnyIncomingEdgesOfKind(EdgeKind EK)
const {
407 return any_of(IncomingEdges, [&](
const auto *
E) {
return E->Kind == EK; });
410 bool hasAnyOutgoingEdges()
const {
return OutgoingEdges.size(); }
411 bool hasAnyOutgoingEdgesOfKind(EdgeKind EK)
const {
412 return any_of(OutgoingEdges, [&](
const auto *
E) {
return E->Kind == EK; });
415 iterator_range<edges_iterator> incoming_edges()
const {
416 return IncomingEdges;
419 iterator_range<edges_iterator> outgoing_edges()
const {
420 return OutgoingEdges;
423 bool shouldFollowIndirectCalls()
const {
return isEntryFunctionCC(); }
430 void visitAllDependencies(std::function<
void(
const Node &)> Visitor)
const;
439 void getDependencies(BitVector &BV)
const {
440 visitAllDependencies([&](
const Node &
N) { BV.set(
N.getID()); });
444 void markAsGraphEntry() { IsGraphEntry =
true; }
447 const GlobalValue &GV;
448 CostType IndividualCost;
449 bool IsNonCopyable : 1;
450 bool IsEntryFnCC : 1;
451 bool IsGraphEntry : 1;
455 EdgesVec IncomingEdges;
456 EdgesVec OutgoingEdges;
459void SplitGraph::Node::visitAllDependencies(
460 std::function<
void(
const Node &)> Visitor)
const {
461 const bool FollowIndirect = shouldFollowIndirectCalls();
464 DenseSet<const Node *> Seen;
465 SmallVector<const Node *, 8> WorkList({
this});
466 while (!WorkList.empty()) {
467 const Node *CurN = WorkList.pop_back_val();
468 if (
auto [It, Inserted] = Seen.insert(CurN); !Inserted)
473 for (
const Edge *
E : CurN->outgoing_edges()) {
474 if (!FollowIndirect &&
E->Kind == EdgeKind::IndirectCall)
476 WorkList.push_back(
E->Dst);
488static bool handleCalleesMD(
const Instruction &
I,
489 SetVector<Function *> &Callees) {
490 auto *MD =
I.getMetadata(LLVMContext::MD_callees);
494 for (
const auto &
Op : MD->operands()) {
495 Function *Callee = mdconst::extract_or_null<Function>(
Op);
498 Callees.insert(Callee);
504void SplitGraph::buildGraph(CallGraph &CG) {
505 SplitModuleTimer SMT(
"buildGraph",
"graph construction");
508 <<
"[build graph] constructing graph representation of the input\n");
516 DenseMap<const GlobalValue *, Node *> Cache;
517 SmallVector<const Function *> FnsWithIndirectCalls, IndirectlyCallableFns;
518 for (
const Function &Fn : M) {
519 if (Fn.isDeclaration())
523 SetVector<const Function *> DirectCallees;
524 bool CallsExternal =
false;
525 for (
auto &CGEntry : *CG[&Fn]) {
526 auto *CGNode = CGEntry.second;
527 if (
auto *Callee = CGNode->getFunction()) {
528 if (!Callee->isDeclaration())
529 DirectCallees.insert(Callee);
530 }
else if (CGNode == CG.getCallsExternalNode())
531 CallsExternal =
true;
538 Fn.printAsOperand(
dbgs());
539 dbgs() <<
" - analyzing function\n");
541 SetVector<Function *> KnownCallees;
542 bool HasUnknownIndirectCall =
false;
545 const auto *CB = dyn_cast<CallBase>(&Inst);
546 if (!CB || CB->getCalledFunction())
551 if (CB->isInlineAsm()) {
556 if (handleCalleesMD(Inst, KnownCallees))
560 KnownCallees.clear();
564 HasUnknownIndirectCall =
true;
568 if (HasUnknownIndirectCall) {
570 FnsWithIndirectCalls.push_back(&Fn);
571 }
else if (!KnownCallees.empty())
572 DirectCallees.insert(KnownCallees.begin(), KnownCallees.end());
576 for (
const auto *Callee : DirectCallees)
577 createEdge(
N,
getNode(Cache, *Callee), EdgeKind::DirectCall);
579 if (canBeIndirectlyCalled(Fn))
580 IndirectlyCallableFns.push_back(&Fn);
584 for (
const Function *Fn : FnsWithIndirectCalls) {
585 for (
const Function *Candidate : IndirectlyCallableFns) {
588 createEdge(Src, Dst, EdgeKind::IndirectCall);
593 SmallVector<Node *, 16> CandidateEntryPoints;
594 BitVector NodesReachableByKernels = createNodesBitVector();
595 for (
Node *
N : Nodes) {
597 if (
N->isEntryFunctionCC()) {
598 N->markAsGraphEntry();
599 N->getDependencies(NodesReachableByKernels);
600 }
else if (!
N->hasAnyIncomingEdgesOfKind(EdgeKind::DirectCall))
601 CandidateEntryPoints.push_back(
N);
604 for (
Node *
N : CandidateEntryPoints) {
610 if (!NodesReachableByKernels.test(
N->getID()))
611 N->markAsGraphEntry();
620bool SplitGraph::verifyGraph()
const {
621 unsigned ExpectedID = 0;
623 DenseSet<const Node *> SeenNodes;
624 DenseSet<const Function *> SeenFunctionNodes;
625 for (
const Node *
N : Nodes) {
626 if (
N->getID() != (ExpectedID++)) {
627 errs() <<
"Node IDs are incorrect!\n";
631 if (!SeenNodes.insert(
N).second) {
632 errs() <<
"Node seen more than once!\n";
637 errs() <<
"getNode doesn't return the right node\n";
641 for (
const Edge *
E :
N->IncomingEdges) {
642 if (!
E->Src || !
E->Dst || (
E->Dst !=
N) ||
643 (
find(
E->Src->OutgoingEdges,
E) ==
E->Src->OutgoingEdges.end())) {
644 errs() <<
"ill-formed incoming edges\n";
649 for (
const Edge *
E :
N->OutgoingEdges) {
650 if (!
E->Src || !
E->Dst || (
E->Src !=
N) ||
651 (
find(
E->Dst->IncomingEdges,
E) ==
E->Dst->IncomingEdges.end())) {
652 errs() <<
"ill-formed outgoing edges\n";
657 const Function &Fn =
N->getFunction();
659 if (
N->hasAnyIncomingEdges()) {
660 errs() <<
"Kernels cannot have incoming edges\n";
665 if (Fn.isDeclaration()) {
666 errs() <<
"declarations shouldn't have nodes!\n";
670 auto [It, Inserted] = SeenFunctionNodes.insert(&Fn);
672 errs() <<
"one function has multiple nodes!\n";
677 if (ExpectedID != Nodes.size()) {
678 errs() <<
"Node IDs out of sync!\n";
682 if (createNodesBitVector().size() != getNumNodes()) {
683 errs() <<
"nodes bit vector doesn't have the right size!\n";
688 BitVector BV = createNodesBitVector();
690 if (
N->isGraphEntryPoint())
691 N->getDependencies(BV);
695 for (
const auto &Fn : M) {
696 if (!Fn.isDeclaration()) {
697 if (!SeenFunctionNodes.contains(&Fn)) {
698 errs() <<
"Fn has no associated node in the graph!\n";
705 errs() <<
"not all nodes are reachable through the graph's entry points!\n";
713CostType SplitGraph::calculateCost(
const BitVector &BV)
const {
715 for (
unsigned NodeID : BV.set_bits())
721SplitGraph::getNode(DenseMap<const GlobalValue *, Node *> &Cache,
722 const GlobalValue &GV) {
723 auto &
N = Cache[&GV];
728 bool NonCopyable =
false;
729 if (
const Function *Fn = dyn_cast<Function>(&GV)) {
730 NonCopyable = isNonCopyable(*Fn);
731 Cost = CostMap.at(Fn);
733 N =
new (NodesPool.Allocate())
Node(Nodes.size(), GV,
Cost, NonCopyable);
739const SplitGraph::Edge &SplitGraph::createEdge(
Node &Src,
Node &Dst,
741 const Edge *
E =
new (EdgesPool.Allocate<Edge>(1)) Edge(&Src, &Dst, EK);
742 Src.OutgoingEdges.push_back(
E);
743 Dst.IncomingEdges.push_back(
E);
762 SplitProposal(
const SplitGraph &SG,
unsigned MaxPartitions) : SG(&SG) {
763 Partitions.resize(MaxPartitions, {0, SG.createNodesBitVector()});
766 void setName(StringRef NewName) { Name = NewName; }
767 StringRef
getName()
const {
return Name; }
769 const BitVector &operator[](
unsigned PID)
const {
770 return Partitions[PID].second;
773 void add(
unsigned PID,
const BitVector &BV) {
774 Partitions[PID].second |= BV;
778 void print(raw_ostream &
OS)
const;
783 unsigned findCheapestPartition()
const;
786 void calculateScores();
789 void verifyCompleteness()
const;
801 double getCodeSizeScore()
const {
return CodeSizeScore; }
815 double getBottleneckScore()
const {
return BottleneckScore; }
818 void updateScore(
unsigned PID) {
820 for (
auto &[PCost, Nodes] : Partitions) {
822 PCost = SG->calculateCost(Nodes);
828 double CodeSizeScore = 0.0;
830 double BottleneckScore = 0.0;
832 CostType TotalCost = 0;
834 const SplitGraph *SG =
nullptr;
837 std::vector<std::pair<CostType, BitVector>> Partitions;
840void SplitProposal::print(raw_ostream &
OS)
const {
843 OS <<
"[proposal] " <<
Name <<
", total cost:" << TotalCost
844 <<
", code size score:" <<
format(
"%0.3f", CodeSizeScore)
845 <<
", bottleneck score:" <<
format(
"%0.3f", BottleneckScore) <<
'\n';
846 for (
const auto &[PID, Part] :
enumerate(Partitions)) {
847 const auto &[
Cost, NodeIDs] = Part;
848 OS <<
" - P" << PID <<
" nodes:" << NodeIDs.count() <<
" cost: " <<
Cost
849 <<
'|' << formatRatioOf(
Cost, SG->getModuleCost()) <<
"%\n";
853unsigned SplitProposal::findCheapestPartition()
const {
854 assert(!Partitions.empty());
855 CostType CurCost = std::numeric_limits<CostType>::max();
856 unsigned CurPID = InvalidPID;
858 if (Part.first <= CurCost) {
860 CurCost = Part.first;
863 assert(CurPID != InvalidPID);
867void SplitProposal::calculateScores() {
868 if (Partitions.empty())
872 CostType LargestPCost = 0;
873 for (
auto &[PCost, Nodes] : Partitions) {
874 if (PCost > LargestPCost)
875 LargestPCost = PCost;
878 CostType ModuleCost = SG->getModuleCost();
879 CodeSizeScore = double(TotalCost) / ModuleCost;
880 assert(CodeSizeScore >= 0.0);
882 BottleneckScore = double(LargestPCost) / ModuleCost;
884 CodeSizeScore = std::ceil(CodeSizeScore * 100.0) / 100.0;
885 BottleneckScore = std::ceil(BottleneckScore * 100.0) / 100.0;
889void SplitProposal::verifyCompleteness()
const {
890 if (Partitions.empty())
893 BitVector Result = Partitions[0].second;
896 assert(Result.all() &&
"some nodes are missing from this proposal!");
915class RecursiveSearchSplitting {
917 using SubmitProposalFn = function_ref<void(SplitProposal)>;
919 RecursiveSearchSplitting(
const SplitGraph &SG,
unsigned NumParts,
920 SubmitProposalFn SubmitProposal);
925 struct WorkListEntry {
926 WorkListEntry(
const BitVector &BV) : Cluster(BV) {}
928 unsigned NumNonEntryNodes = 0;
929 CostType TotalCost = 0;
930 CostType CostExcludingGraphEntryPoints = 0;
937 void setupWorkList();
948 void pickPartition(
unsigned Depth,
unsigned Idx, SplitProposal SP);
955 std::pair<unsigned, CostType>
956 findMostSimilarPartition(
const WorkListEntry &Entry,
const SplitProposal &SP);
958 const SplitGraph &SG;
960 SubmitProposalFn SubmitProposal;
964 CostType LargeClusterThreshold = 0;
965 unsigned NumProposalsSubmitted = 0;
966 SmallVector<WorkListEntry> WorkList;
969RecursiveSearchSplitting::RecursiveSearchSplitting(
970 const SplitGraph &SG,
unsigned NumParts, SubmitProposalFn SubmitProposal)
971 : SG(SG), NumParts(NumParts), SubmitProposal(SubmitProposal) {
978 LargeClusterThreshold =
979 (LargeFnFactor != 0.0)
980 ? CostType(((SG.getModuleCost() / NumParts) * LargeFnFactor))
981 : std::numeric_limits<CostType>::max();
982 LLVM_DEBUG(
dbgs() <<
"[recursive search] large cluster threshold set at "
983 << LargeClusterThreshold <<
"\n");
986void RecursiveSearchSplitting::run() {
988 SplitModuleTimer SMT(
"recursive_search_prepare",
"preparing worklist");
993 SplitModuleTimer SMT(
"recursive_search_pick",
"partitioning");
994 SplitProposal SP(SG, NumParts);
995 pickPartition(0, 0, SP);
999void RecursiveSearchSplitting::setupWorkList() {
1007 EquivalenceClasses<unsigned> NodeEC;
1008 for (
const SplitGraph::Node *
N : SG.nodes()) {
1009 if (!
N->isGraphEntryPoint())
1012 NodeEC.insert(
N->getID());
1013 N->visitAllDependencies([&](
const SplitGraph::Node &Dep) {
1014 if (&Dep !=
N && Dep.isNonCopyable())
1015 NodeEC.unionSets(
N->getID(), Dep.getID());
1019 for (
auto I = NodeEC.begin(),
E = NodeEC.end();
I !=
E; ++
I) {
1023 BitVector Cluster = SG.createNodesBitVector();
1024 for (
auto MI = NodeEC.member_begin(
I);
MI != NodeEC.member_end(); ++
MI) {
1025 const SplitGraph::Node &
N = SG.getNode(*
MI);
1026 if (
N.isGraphEntryPoint())
1027 N.getDependencies(Cluster);
1029 WorkList.emplace_back(std::move(Cluster));
1033 for (WorkListEntry &Entry : WorkList) {
1034 for (
unsigned NodeID : Entry.Cluster.set_bits()) {
1035 const SplitGraph::Node &
N = SG.getNode(NodeID);
1036 const CostType
Cost =
N.getIndividualCost();
1038 Entry.TotalCost +=
Cost;
1039 if (!
N.isGraphEntryPoint()) {
1040 Entry.CostExcludingGraphEntryPoints +=
Cost;
1041 ++Entry.NumNonEntryNodes;
1046 stable_sort(WorkList, [](
const WorkListEntry &
A,
const WorkListEntry &
B) {
1047 if (
A.TotalCost !=
B.TotalCost)
1048 return A.TotalCost >
B.TotalCost;
1050 if (
A.CostExcludingGraphEntryPoints !=
B.CostExcludingGraphEntryPoints)
1051 return A.CostExcludingGraphEntryPoints >
B.CostExcludingGraphEntryPoints;
1053 if (
A.NumNonEntryNodes !=
B.NumNonEntryNodes)
1054 return A.NumNonEntryNodes >
B.NumNonEntryNodes;
1056 return A.Cluster.count() >
B.Cluster.count();
1060 dbgs() <<
"[recursive search] worklist:\n";
1062 dbgs() <<
" - [" <<
Idx <<
"]: ";
1063 for (
unsigned NodeID : Entry.Cluster.set_bits())
1064 dbgs() << NodeID <<
" ";
1065 dbgs() <<
"(total_cost:" << Entry.TotalCost
1066 <<
", cost_excl_entries:" << Entry.CostExcludingGraphEntryPoints
1072void RecursiveSearchSplitting::pickPartition(
unsigned Depth,
unsigned Idx,
1074 while (
Idx < WorkList.size()) {
1077 const WorkListEntry &Entry = WorkList[
Idx];
1078 const BitVector &Cluster = Entry.Cluster;
1082 const unsigned CheapestPID = SP.findCheapestPartition();
1083 assert(CheapestPID != InvalidPID);
1087 const auto [MostSimilarPID, SimilarDepsCost] =
1088 findMostSimilarPartition(Entry, SP);
1092 unsigned SinglePIDToTry = InvalidPID;
1093 if (MostSimilarPID == InvalidPID)
1094 SinglePIDToTry = CheapestPID;
1095 else if (MostSimilarPID == CheapestPID)
1096 SinglePIDToTry = CheapestPID;
1100 if (Entry.CostExcludingGraphEntryPoints > LargeClusterThreshold) {
1102 assert(SimilarDepsCost && Entry.CostExcludingGraphEntryPoints);
1103 const double Ratio =
static_cast<double>(SimilarDepsCost) /
1104 Entry.CostExcludingGraphEntryPoints;
1105 assert(Ratio >= 0.0 && Ratio <= 1.0);
1106 if (Ratio > LargeFnOverlapForMerge) {
1111 SinglePIDToTry = MostSimilarPID;
1114 SinglePIDToTry = CheapestPID;
1121 if (SinglePIDToTry != InvalidPID) {
1124 SP.add(SinglePIDToTry, Cluster);
1129 assert(MostSimilarPID != InvalidPID);
1138 SplitProposal BranchSP = SP;
1140 <<
" [lb] " <<
Idx <<
"=P" << CheapestPID <<
"? ");
1141 BranchSP.add(CheapestPID, Cluster);
1142 pickPartition(
Depth + 1,
Idx + 1, BranchSP);
1147 SplitProposal BranchSP = SP;
1149 <<
" [ms] " <<
Idx <<
"=P" << MostSimilarPID <<
"? ");
1150 BranchSP.add(MostSimilarPID, Cluster);
1151 pickPartition(
Depth + 1,
Idx + 1, BranchSP);
1161 "Search got out of bounds?");
1162 SP.setName(
"recursive_search (depth=" + std::to_string(
Depth) +
") #" +
1163 std::to_string(NumProposalsSubmitted++));
1168std::pair<unsigned, CostType>
1169RecursiveSearchSplitting::findMostSimilarPartition(
const WorkListEntry &Entry,
1170 const SplitProposal &SP) {
1171 if (!Entry.NumNonEntryNodes)
1172 return {InvalidPID, 0};
1177 unsigned ChosenPID = InvalidPID;
1178 CostType ChosenCost = 0;
1179 for (
unsigned PID = 0; PID < NumParts; ++PID) {
1180 BitVector BV = SP[PID];
1181 BV &= Entry.Cluster;
1186 const CostType
Cost = SG.calculateCost(BV);
1188 if (ChosenPID == InvalidPID || ChosenCost <
Cost ||
1189 (ChosenCost ==
Cost && PID > ChosenPID)) {
1195 return {ChosenPID, ChosenCost};
1202const SplitGraph::Node *mapEdgeToDst(
const SplitGraph::Edge *
E) {
1206using SplitGraphEdgeDstIterator =
1207 mapped_iterator<SplitGraph::edges_iterator,
decltype(&mapEdgeToDst)>;
1222 return {
Ref->outgoing_edges().begin(), mapEdgeToDst};
1225 return {
Ref->outgoing_edges().end(), mapEdgeToDst};
1229 return G.nodes().begin();
1232 return G.nodes().end();
1240 return SG.getModule().getName().str();
1244 return N->getName().str();
1248 const SplitGraph &SG) {
1250 if (
N->isEntryFunctionCC())
1251 Result +=
"entry-fn-cc ";
1252 if (
N->isNonCopyable())
1253 Result +=
"non-copyable ";
1254 Result +=
"cost:" + std::to_string(
N->getIndividualCost());
1259 const SplitGraph &SG) {
1260 return N->hasAnyIncomingEdges() ?
"" :
"color=\"red\"";
1264 SplitGraphEdgeDstIterator EI,
1265 const SplitGraph &SG) {
1267 switch ((*EI.getCurrent())->Kind) {
1268 case SplitGraph::EdgeKind::DirectCall:
1270 case SplitGraph::EdgeKind::IndirectCall:
1271 return "style=\"dashed\"";
1286static bool needsConservativeImport(
const GlobalValue *GV) {
1287 if (
const auto *Var = dyn_cast<GlobalVariable>(GV))
1288 return Var->hasLocalLinkage();
1289 return isa<GlobalAlias>(GV);
1294static void printPartitionSummary(raw_ostream &
OS,
unsigned N,
const Module &M,
1295 unsigned PartCost,
unsigned ModuleCost) {
1296 OS <<
"*** Partition P" <<
N <<
" ***\n";
1298 for (
const auto &Fn : M) {
1299 if (!Fn.isDeclaration())
1300 OS <<
" - [function] " << Fn.getName() <<
"\n";
1303 for (
const auto &GV :
M.globals()) {
1304 if (GV.hasInitializer())
1305 OS <<
" - [global] " << GV.getName() <<
"\n";
1308 OS <<
"Partition contains " << formatRatioOf(PartCost, ModuleCost)
1309 <<
"% of the source\n";
1312static void evaluateProposal(SplitProposal &Best, SplitProposal New) {
1313 SplitModuleTimer SMT(
"proposal_evaluation",
"proposal ranking algorithm");
1316 New.verifyCompleteness();
1317 if (DebugProposalSearch)
1321 const double CurBScore = Best.getBottleneckScore();
1322 const double CurCSScore = Best.getCodeSizeScore();
1323 const double NewBScore =
New.getBottleneckScore();
1324 const double NewCSScore =
New.getCodeSizeScore();
1336 bool IsBest =
false;
1337 if (NewBScore < CurBScore)
1339 else if (NewBScore == CurBScore)
1340 IsBest = (NewCSScore < CurCSScore);
1343 Best = std::move(New);
1347 dbgs() <<
"[search] new best proposal!\n";
1349 dbgs() <<
"[search] discarding - not profitable\n";
1354static std::unique_ptr<Module> cloneAll(
const Module &M) {
1356 return CloneModule(M, VMap, [&](
const GlobalValue *GV) {
return true; });
1360static void writeDOTGraph(
const SplitGraph &SG) {
1361 if (ModuleDotCfgOutput.empty())
1365 raw_fd_ostream
OS(ModuleDotCfgOutput, EC);
1367 errs() <<
"[" DEBUG_TYPE "]: cannot open '" << ModuleDotCfgOutput
1368 <<
"' - DOTGraph will not be printed\n";
1371 SG.getModule().getName());
1374static void splitAMDGPUModule(
1375 GetTTIFn GetTTI, Module &M,
unsigned NumParts,
1376 function_ref<
void(std::unique_ptr<Module> MPart)> ModuleCallback) {
1396 if (!NoExternalizeOnAddrTaken) {
1397 for (
auto &Fn : M) {
1400 if (Fn.hasLocalLinkage() && Fn.hasAddressTaken()) {
1402 dbgs() <<
" because its address is taken\n");
1410 if (!NoExternalizeGlobals) {
1411 for (
auto &GV :
M.globals()) {
1412 if (GV.hasLocalLinkage())
1413 LLVM_DEBUG(
dbgs() <<
"[externalize] GV " << GV.getName() <<
'\n');
1420 FunctionsCostMap FnCosts;
1421 const CostType ModuleCost = calculateFunctionCosts(GetTTI, M, FnCosts);
1425 SplitGraph SG(M, FnCosts, ModuleCost);
1431 <<
"[!] no nodes in graph, input is empty - no splitting possible\n");
1432 ModuleCallback(cloneAll(M));
1437 dbgs() <<
"[graph] nodes:\n";
1438 for (
const SplitGraph::Node *
N : SG.nodes()) {
1439 dbgs() <<
" - [" <<
N->getID() <<
"]: " <<
N->getName() <<
" "
1440 << (
N->isGraphEntryPoint() ?
"(entry)" :
"") <<
" "
1441 << (
N->isNonCopyable() ?
"(noncopyable)" :
"") <<
"\n";
1449 std::optional<SplitProposal> Proposal;
1450 const auto EvaluateProposal = [&](SplitProposal SP) {
1451 SP.calculateScores();
1453 Proposal = std::move(SP);
1455 evaluateProposal(*Proposal, std::move(SP));
1460 RecursiveSearchSplitting(SG, NumParts, EvaluateProposal).run();
1461 LLVM_DEBUG(
if (Proposal)
dbgs() <<
"[search done] selected proposal: "
1462 << Proposal->getName() <<
"\n";);
1465 LLVM_DEBUG(
dbgs() <<
"[!] no proposal made, no splitting possible!\n");
1466 ModuleCallback(cloneAll(M));
1472 std::optional<raw_fd_ostream> SummariesOS;
1473 if (!PartitionSummariesOutput.empty()) {
1475 SummariesOS.emplace(PartitionSummariesOutput, EC);
1477 errs() <<
"[" DEBUG_TYPE "]: cannot open '" << PartitionSummariesOutput
1478 <<
"' - Partition summaries will not be printed\n";
1481 for (
unsigned PID = 0; PID < NumParts; ++PID) {
1482 SplitModuleTimer SMT2(
"modules_creation",
1483 "creating modules for each partition");
1486 DenseSet<const Function *> FnsInPart;
1487 for (
unsigned NodeID : (*Proposal)[PID].set_bits())
1488 FnsInPart.insert(&SG.getNode(NodeID).getFunction());
1491 CostType PartCost = 0;
1492 std::unique_ptr<Module> MPart(
1495 if (
const auto *Fn = dyn_cast<Function>(GV)) {
1496 if (FnsInPart.contains(Fn)) {
1497 PartCost += SG.getCost(*Fn);
1504 return needsConservativeImport(GV) || PID == 0;
1512 if (needsConservativeImport(&GV) && GV.use_empty())
1513 GV.eraseFromParent();
1517 printPartitionSummary(*SummariesOS, PID, *MPart, PartCost, ModuleCost);
1520 printPartitionSummary(
dbgs(), PID, *MPart, PartCost, ModuleCost));
1522 ModuleCallback(std::move(MPart));
1529 SplitModuleTimer SMT(
1530 "total",
"total pass runtime (incl. potentially waiting for lockfile)");
1552 dbgs() <<
"[amdgpu-split-module] unable to acquire lockfile, debug "
1553 "output may be mangled by other processes\n");
1567 <<
"[amdgpu-split-module] unable to acquire lockfile, debug "
1568 "output may be mangled by other processes\n");
1576 splitAMDGPUModule(TTIGetter, M,
N, ModuleCallback);
1584 splitAMDGPUModule(TTIGetter, M,
N, ModuleCallback);
static msgpack::DocNode getNode(msgpack::DocNode DN, msgpack::Type Type, MCValue Val)
The AMDGPU TargetMachine interface definition for hw codegen targets.
Unify divergent function exit nodes
This file defines the BumpPtrAllocator interface.
Expand Atomic instructions
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file provides interfaces used to build and manipulate a call graph, which is a very useful tool ...
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file defines the DenseMap class.
Generic implementation of equivalence classes through the use Tarjan's efficient union-find algorithm...
static Function * getFunction(Constant *C)
This file defines the little GraphTraits<X> template class that should be specialized by classes that...
Module.h This file contains the declarations for the Module class.
static const unsigned MaxDepth
Machine Check Debug Module
FunctionAnalysisManager FAM
ModuleAnalysisManager MAM
static StringRef getName(Value *V)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the SmallVector class.
static void externalize(GlobalValue *GV)
PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM)
A container for analyses that lazily runs them and caches their results.
@ HiddenVisibility
The GV is hidden.
@ ExternalLinkage
Externally visible function.
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
static InstructionCost getMax()
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
Class that manages the creation of a lock file to aid implicit coordination between different process...
@ LFS_Error
An error occurred while trying to create or find the lock file.
@ LFS_Owned
The lock file has been created and is owned by this instance of the object.
@ LFS_Shared
The lock file already exists and is owned by some other instance.
std::error_code unsafeRemoveLockFile()
Remove the lock file.
WaitForUnlockResult waitForUnlock(const unsigned MaxSeconds=90)
For a shared lock, wait until the owner releases the lock.
@ Res_Success
The lock was released successfully.
@ Res_Timeout
Reached timeout while waiting for the owner to release the lock.
@ Res_OwnerDied
Owner died while holding the lock.
A Module instance is used to store all the information related to an LLVM module.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
StringRef str() const
Explicit conversion to StringRef.
Analysis pass providing the TargetTransformInfo.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool isEntryFunctionCC(CallingConv::ID CC)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
initializer< Ty > init(const Ty &Val)
void system_temp_directory(bool erasedOnReboot, SmallVectorImpl< char > &result)
Get the typical temporary directory for the system, e.g., "/var/tmp" or "C:/TEMP".
void append(SmallVectorImpl< char > &path, const Twine &a, const Twine &b="", const Twine &c="", const Twine &d="")
Append to path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
bool TimePassesIsEnabled
If the user specifies the -time-passes argument on an LLVM tool command line then the value of this b...
raw_ostream & WriteGraph(raw_ostream &O, const GraphType &G, bool ShortNames=false, const Twine &Title="")
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
BumpPtrAllocatorImpl BumpPtrAllocator
The standard BumpPtrAllocator which just uses the default template parameters.
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
@ Ref
The access may reference the value stored in memory.
DWARFExpression::Operation Op
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
std::unique_ptr< Module > CloneModule(const Module &M)
Return an exact copy of the specified module.
static std::string getEdgeAttributes(const SplitGraph::Node *N, SplitGraphEdgeDstIterator EI, const SplitGraph &SG)
static std::string getGraphName(const SplitGraph &SG)
DOTGraphTraits(bool IsSimple=false)
static std::string getNodeAttributes(const SplitGraph::Node *N, const SplitGraph &SG)
static std::string getNodeDescription(const SplitGraph::Node *N, const SplitGraph &SG)
std::string getNodeLabel(const SplitGraph::Node *N, const SplitGraph &SG)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
const SplitGraph::Edge * EdgeRef
static NodeRef getEntryNode(NodeRef N)
SplitGraph::nodes_iterator nodes_iterator
SplitGraphEdgeDstIterator ChildIteratorType
static nodes_iterator nodes_end(const SplitGraph &G)
static ChildIteratorType child_begin(NodeRef Ref)
static nodes_iterator nodes_begin(const SplitGraph &G)
const SplitGraph::Node * NodeRef
static ChildIteratorType child_end(NodeRef Ref)