#define DEBUG_TYPE "amdgpu-split-module"
75 "amdgpu-module-splitting-max-depth",
77 "maximum search depth. 0 forces a greedy approach. "
78 "warning: the algorithm is up to O(2^N), where N is the max depth."),
81static cl::opt<float> LargeFnFactor(
84 "when max depth is reached and we can no longer branch out, this "
85 "value determines if a function is worth merging into an already "
86 "existing partition to reduce code duplication. This is a factor "
87 "of the ideal partition size, e.g. 2.0 means we consider the "
88 "function for merging if its cost (including its callees) is 2x the "
89 "size of an ideal partition."));
static cl::opt<float> LargeFnOverlapForMerge(
    /* flag name and default elided in this excerpt */
    cl::desc("when a function is considered for merging into a partition that "
             "already contains some of its callees, do the merge if at least "
             "n% of the code it can reach is already present inside the "
             "partition; e.g. 0.7 means only merge >70%"));
99 "amdgpu-module-splitting-no-externalize-globals",
cl::Hidden,
100 cl::desc(
"disables externalization of global variable with local linkage; "
101 "may cause globals to be duplicated which increases binary size"));
104 "amdgpu-module-splitting-no-externalize-address-taken",
cl::Hidden,
106 "disables externalization of functions whose addresses are taken"));
static cl::opt<std::string>
    ModuleDotCfgOutput("amdgpu-module-splitting-print-module-dotcfg",
                       cl::Hidden,
                       cl::desc("output file to write out the dotgraph "
                                "representation of the input module"));

static cl::opt<std::string> PartitionSummariesOutput(
    "amdgpu-module-splitting-print-partition-summaries", cl::Hidden,
    cl::desc("output file to write out a summary of "
             "the partitions created for each module"));
static cl::opt<bool>
    UseLockFile("amdgpu-module-splitting-serial-execution", cl::Hidden,
                cl::desc("use a lock file so only one process in the system "
                         "can run this pass at once. useful to avoid mangled "
                         "debug output in multithreaded environments."));

static cl::opt<bool>
    DebugProposalSearch("amdgpu-module-splitting-debug-proposal-search",
                        cl::Hidden,
                        cl::desc("print all proposals received and whether "
                                 "they were rejected or accepted"));
struct SplitModuleTimer : NamedRegionTimer {
  SplitModuleTimer(StringRef Name, StringRef Desc)
      : NamedRegionTimer(Name, Desc, DEBUG_TYPE, "AMDGPU Module Splitting",
                         TimePassesIsEnabled) {}
};
using FunctionsCostMap = DenseMap<const Function *, CostType>;
using GetTTIFn = function_ref<const TargetTransformInfo &(Function &)>;

static constexpr unsigned InvalidPID = -1;
static auto formatRatioOf(CostType Num, CostType Dem) {
  CostType DemOr1 = Dem ? Dem : 1;
  return format("%0.2f", (static_cast<double>(Num) / DemOr1) * 100);
}
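// For example, formatRatioOf(25, 200) formats "12.50", and a zero denominator
// is clamped to 1 so formatRatioOf(0, 0) prints "0.00" instead of dividing by
// zero.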
static bool isNonCopyable(const Function &F) {
  return F.hasExternalLinkage() || !F.isDefinitionExact() ||
         /* remaining condition elided in this excerpt */;
}

static void externalize(GlobalValue &GV) {
  if (GV.hasLocalLinkage()) {
    GV.setLinkage(GlobalValue::ExternalLinkage);
    GV.setVisibility(GlobalValue::HiddenVisibility);
  }

  // Unnamed entities must be named so they can be referenced across modules.
  if (!GV.hasName())
    GV.setName("__llvmsplit_unnamed");
}
static CostType calculateFunctionCosts(GetTTIFn GetTTI, Module &M,
                                       FunctionsCostMap &CostMap) {
  SplitModuleTimer SMT("calculateFunctionCosts", "cost analysis");

  LLVM_DEBUG(dbgs() << "[cost analysis] calculating function costs\n");
  CostType ModuleCost = 0;
  [[maybe_unused]] CostType KernelCost = 0;

  for (auto &Fn : M) {
    if (Fn.isDeclaration())
      continue;

    CostType FnCost = 0;
    const auto &TTI = GetTTI(Fn);
    for (const auto &BB : Fn) {
      for (const auto &I : BB) {
        auto Cost =
            TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
        // Exact handling of invalid costs is elided in this excerpt.
        CostType CostVal = Cost.isValid() /*...*/;
        assert((FnCost + CostVal) >= FnCost && "Overflow!");
        FnCost += CostVal;
      }
    }

    CostMap[&Fn] = FnCost;
    assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!");
    ModuleCost += FnCost;

    if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv()))
      KernelCost += FnCost;
  }
  LLVM_DEBUG({
    const CostType FnCost = ModuleCost - KernelCost;
    dbgs() << " - total module cost is " << ModuleCost << ". kernels cost "
           << KernelCost << " ("
           << format("%0.2f", (float(KernelCost) / ModuleCost) * 100)
           << "% of the module), functions cost " << FnCost << " ("
           << format("%0.2f", (float(FnCost) / ModuleCost) * 100)
           << "% of the module)\n";
  });

  return ModuleCost;
}
static bool canBeIndirectlyCalled(const Function &F) {
  return !F.hasLocalLinkage() ||
         F.hasAddressTaken(nullptr /* remaining flags elided in this excerpt */);
}
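// SplitGraph models the input module as a graph: one node per defined
// function, a DirectCall edge for every resolved call, and an IndirectCall
// edge for every potential indirect call target. Node costs come from
// calculateFunctionCosts, so the cost of any set of nodes (a partition) can be
// computed by summing individual node costs.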
class SplitGraph {
public:
  class Node;

  enum class EdgeKind : uint8_t {
    DirectCall,
    IndirectCall,
  };

  /// An edge between two nodes of the graph.
  struct Edge {
    Edge(Node *Src, Node *Dst, EdgeKind Kind)
        : Src(Src), Dst(Dst), Kind(Kind) {}

    Node *Src;
    Node *Dst;
    EdgeKind Kind;
  };

  using EdgesVec = SmallVector<const Edge *, 0>;
  using edges_iterator = EdgesVec::const_iterator;
  using nodes_iterator = const Node *const *;

  SplitGraph(const Module &M, const FunctionsCostMap &CostMap,
             CostType ModuleCost)
      : M(M), CostMap(CostMap), ModuleCost(ModuleCost) {}

  void buildGraph(CallGraph &CG);

  bool verifyGraph() const;

  bool empty() const { return Nodes.empty(); }
  unsigned getNumNodes() const { return Nodes.size(); }
  BitVector createNodesBitVector() const { return BitVector(Nodes.size()); }

  // Node accessors (definitions elided in this excerpt).
  iterator_range<nodes_iterator> nodes() const;
  const Node &getNode(unsigned ID) const;

  const Module &getModule() const { return M; }

  CostType getModuleCost() const { return ModuleCost; }
  CostType getCost(const Function &F) const { return CostMap.at(&F); }

  /// Sums the cost of every node whose bit is set in \p BV.
  CostType calculateCost(const BitVector &BV) const;

private:
  /// Retrieves the node for \p GV in \p Cache, or creates a new node for it
  /// and updates \p Cache.
  Node &getNode(DenseMap<const GlobalValue *, Node *> &Cache,
                const GlobalValue &GV);

  const Edge &createEdge(Node &Src, Node &Dst, EdgeKind EK);

  const Module &M;
  const FunctionsCostMap &CostMap;
  CostType ModuleCost;

  // Nodes and edges live in bump allocators and are never freed individually.
  SmallVector<Node *> Nodes;
  SpecificBumpPtrAllocator<Node> NodesPool;

  static_assert(std::is_trivially_destructible_v<Edge>,
                "Edge must be trivially destructible to use the BumpPtrAllocator");

  BumpPtrAllocator EdgesPool;
};
class SplitGraph::Node {
  friend class SplitGraph;

public:
  Node(unsigned ID, const GlobalValue &GV, CostType IndividualCost,
       bool IsNonCopyable)
      : ID(ID), GV(GV), IndividualCost(IndividualCost),
        IsNonCopyable(IsNonCopyable), IsEntryFnCC(false), IsGraphEntry(false) {
    // Constructor body elided in this excerpt.
  }

  unsigned getID() const { return ID; }
  const Function &getFunction() const { return cast<Function>(GV); }
  CostType getIndividualCost() const { return IndividualCost; }
  bool isNonCopyable() const { return IsNonCopyable; }
  bool isEntryFunctionCC() const { return IsEntryFnCC; }

  /// \returns whether this node is a graph entry point: a kernel, or a
  /// function that is not reachable from any kernel.
  bool isGraphEntryPoint() const { return IsGraphEntry; }

  StringRef getName() const { return GV.getName(); }

  bool hasAnyIncomingEdges() const { return IncomingEdges.size(); }
  bool hasAnyIncomingEdgesOfKind(EdgeKind EK) const {
    return any_of(IncomingEdges, [&](const auto *E) { return E->Kind == EK; });
  }

  bool hasAnyOutgoingEdges() const { return OutgoingEdges.size(); }
  bool hasAnyOutgoingEdgesOfKind(EdgeKind EK) const {
    return any_of(OutgoingEdges, [&](const auto *E) { return E->Kind == EK; });
  }

  iterator_range<edges_iterator> incoming_edges() const {
    return IncomingEdges;
  }

  iterator_range<edges_iterator> outgoing_edges() const {
    return OutgoingEdges;
  }

  bool shouldFollowIndirectCalls() const { return isEntryFunctionCC(); }

  /// Visits all nodes reachable from this node, including this node itself.
  void visitAllDependencies(std::function<void(const Node &)> Visitor) const;

  /// Sets the bit of every dependency of this node in \p BV.
  void getDependencies(BitVector &BV) const {
    visitAllDependencies([&](const Node &N) { BV.set(N.getID()); });
  }

private:
  void markAsGraphEntry() { IsGraphEntry = true; }

  unsigned ID;
  const GlobalValue &GV;
  CostType IndividualCost;
  bool IsNonCopyable : 1;
  bool IsEntryFnCC : 1;
  bool IsGraphEntry : 1;

  EdgesVec IncomingEdges;
  EdgesVec OutgoingEdges;
};
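// visitAllDependencies below is a simple worklist traversal over outgoing
// edges. Indirect-call edges are only followed for entry-CC (kernel) nodes
// (see shouldFollowIndirectCalls), so a regular function's dependency set only
// covers what it can reach through direct calls.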
void SplitGraph::Node::visitAllDependencies(
    std::function<void(const Node &)> Visitor) const {
  const bool FollowIndirect = shouldFollowIndirectCalls();

  DenseSet<const Node *> Seen;
  SmallVector<const Node *, 8> WorkList({this});
  while (!WorkList.empty()) {
    const Node *CurN = WorkList.pop_back_val();
    if (auto [It, Inserted] = Seen.insert(CurN); !Inserted)
      continue;

    Visitor(*CurN);

    for (const Edge *E : CurN->outgoing_edges()) {
      if (!FollowIndirect && E->Kind == EdgeKind::IndirectCall)
        continue;
      WorkList.push_back(E->Dst);
    }
  }
}
/// Collects the possible callees of a call site from its !callees metadata, if
/// present. Returns false if the metadata is missing or cannot be fully
/// interpreted.
static bool handleCalleesMD(const Instruction &I,
                            SetVector<Function *> &Callees) {
  auto *MD = I.getMetadata(LLVMContext::MD_callees);
  if (!MD)
    return false;

  for (const auto &Op : MD->operands()) {
    Function *Callee = mdconst::extract_or_null<Function>(Op);
    if (!Callee)
      return false;
    Callees.insert(Callee);
  }

  return true;
}
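// In IR, such a call site looks roughly like this (illustrative names only):
//   call void %fptr(i32 %x), !callees !0
//   !0 = !{ptr @impl_a, ptr @impl_b}
// which lets the splitter treat the indirect call as if it could only reach
// @impl_a and @impl_b.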
void SplitGraph::buildGraph(CallGraph &CG) {
  SplitModuleTimer SMT("buildGraph", "graph construction");
  LLVM_DEBUG(
      dbgs()
      << "[build graph] constructing graph representation of the input\n");

  // Create one node per defined function and an edge for each direct call;
  // call sites the call graph cannot resolve are analyzed below.
  DenseMap<const GlobalValue *, Node *> Cache;
  SmallVector<const Function *> FnsWithIndirectCalls, IndirectlyCallableFns;
  for (const Function &Fn : M) {
    if (Fn.isDeclaration())
      continue;

    SetVector<const Function *> DirectCallees;
    bool CallsExternal = false;
    for (auto &CGEntry : *CG[&Fn]) {
      auto *CGNode = CGEntry.second;
      if (auto *Callee = CGNode->getFunction()) {
        if (!Callee->isDeclaration())
          DirectCallees.insert(Callee);
      } else if (CGNode == CG.getCallsExternalNode())
        CallsExternal = true;
    }

    Node &N = getNode(Cache, Fn);
    if (CallsExternal) {
      LLVM_DEBUG(dbgs() << "  [!] callgraph is incomplete for ";
                 Fn.printAsOperand(dbgs());
                 dbgs() << " - analyzing function\n");

      SetVector<Function *> KnownCallees;
      bool HasUnknownIndirectCall = false;
      for (const auto &Inst : instructions(Fn)) {
        // Look only at unresolved call sites.
        const auto *CB = dyn_cast<CallBase>(&Inst);
        if (!CB || CB->getCalledFunction())
          continue;

        if (CB->isInlineAsm()) {
          LLVM_DEBUG(dbgs() << "    found inline assembly\n");
          continue;
        }

        if (handleCalleesMD(Inst, KnownCallees))
          continue;

        // A single unknown indirect call makes the partial information
        // gathered so far useless.
        KnownCallees.clear();
        HasUnknownIndirectCall = true;
        break;
      }

      if (HasUnknownIndirectCall) {
        LLVM_DEBUG(dbgs() << "    indirect call found\n");
        FnsWithIndirectCalls.push_back(&Fn);
      } else if (!KnownCallees.empty())
        DirectCallees.insert_range(KnownCallees);
    }

    for (const auto *Callee : DirectCallees)
      createEdge(N, getNode(Cache, *Callee), EdgeKind::DirectCall);

    if (canBeIndirectlyCalled(Fn))
      IndirectlyCallableFns.push_back(&Fn);
  }

  // Add an edge from every function with unresolved indirect calls to every
  // function that may be called indirectly.
  for (const Function *Fn : FnsWithIndirectCalls) {
    for (const Function *Candidate : IndirectlyCallableFns) {
      Node &Src = getNode(Cache, *Fn);
      Node &Dst = getNode(Cache, *Candidate);
      createEdge(Src, Dst, EdgeKind::IndirectCall);
    }
  }
  // Mark graph entry points: every kernel, plus any function that is not
  // reachable from a kernel through direct calls.
  SmallVector<Node *, 16> CandidateEntryPoints;
  BitVector NodesReachableByKernels = createNodesBitVector();
  for (Node *N : Nodes) {
    if (N->isEntryFunctionCC()) {
      N->markAsGraphEntry();
      N->getDependencies(NodesReachableByKernels);
    } else if (!N->hasAnyIncomingEdgesOfKind(EdgeKind::DirectCall))
      CandidateEntryPoints.push_back(N);
  }

  for (Node *N : CandidateEntryPoints) {
    if (!NodesReachableByKernels.test(N->getID()))
      N->markAsGraphEntry();
  }
}
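// verifyGraph is a consistency check over the finished graph: node IDs must be
// dense and unique, incoming/outgoing edge lists must mirror each other,
// kernels must have no incoming edges, and every defined function must be
// reachable from at least one graph entry point.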
bool SplitGraph::verifyGraph() const {
  unsigned ExpectedID = 0;

  DenseSet<const Node *> SeenNodes;
  DenseSet<const Function *> SeenFunctionNodes;
  for (const Node *N : Nodes) {
    if (N->getID() != (ExpectedID++)) {
      errs() << "Node IDs are incorrect!\n";
      return false;
    }

    if (!SeenNodes.insert(N).second) {
      errs() << "Node seen more than once!\n";
      return false;
    }

    if (&getNode(N->getID()) != N) {
      errs() << "getNode doesn't return the right node\n";
      return false;
    }

    for (const Edge *E : N->IncomingEdges) {
      if (!E->Src || !E->Dst || (E->Dst != N) ||
          (find(E->Src->OutgoingEdges, E) == E->Src->OutgoingEdges.end())) {
        errs() << "ill-formed incoming edges\n";
        return false;
      }
    }

    for (const Edge *E : N->OutgoingEdges) {
      if (!E->Src || !E->Dst || (E->Src != N) ||
          (find(E->Dst->IncomingEdges, E) == E->Dst->IncomingEdges.end())) {
        errs() << "ill-formed outgoing edges\n";
        return false;
      }
    }

    const Function &Fn = N->getFunction();
    if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv())) {
      if (N->hasAnyIncomingEdges()) {
        errs() << "Kernels cannot have incoming edges\n";
        return false;
      }
    }

    if (Fn.isDeclaration()) {
      errs() << "declarations shouldn't have nodes!\n";
      return false;
    }

    auto [It, Inserted] = SeenFunctionNodes.insert(&Fn);
    if (!Inserted) {
      errs() << "one function has multiple nodes!\n";
      return false;
    }
  }

  if (ExpectedID != Nodes.size()) {
    errs() << "Node IDs out of sync!\n";
    return false;
  }

  if (createNodesBitVector().size() != getNumNodes()) {
    errs() << "nodes bit vector doesn't have the right size!\n";
    return false;
  }

  // Check that all defined functions are reachable through the graph's entry
  // points.
  BitVector BV = createNodesBitVector();
  for (const Node *N : Nodes) {
    if (N->isGraphEntryPoint())
      N->getDependencies(BV);
  }

  for (const auto &Fn : M) {
    if (!Fn.isDeclaration()) {
      if (!SeenFunctionNodes.contains(&Fn)) {
        errs() << "Fn has no associated node in the graph!\n";
        return false;
      }
    }
  }

  if (!BV.all()) {
    errs() << "not all nodes are reachable through the graph's entry points!\n";
    return false;
  }

  return true;
}
CostType SplitGraph::calculateCost(const BitVector &BV) const {
  CostType Cost = 0;
  for (unsigned NodeID : BV.set_bits())
    Cost += getNode(NodeID).getIndividualCost();
  return Cost;
}

SplitGraph::Node &
SplitGraph::getNode(DenseMap<const GlobalValue *, Node *> &Cache,
                    const GlobalValue &GV) {
  auto &N = Cache[&GV];
  if (N)
    return *N;

  CostType Cost = 0;
  bool NonCopyable = false;
  if (const Function *Fn = dyn_cast<Function>(&GV)) {
    NonCopyable = isNonCopyable(*Fn);
    Cost = CostMap.at(Fn);
  }
  N = new (NodesPool.Allocate()) Node(Nodes.size(), GV, Cost, NonCopyable);
  Nodes.push_back(N);
  return *N;
}

const SplitGraph::Edge &SplitGraph::createEdge(Node &Src, Node &Dst,
                                               EdgeKind EK) {
  const Edge *E = new (EdgesPool.Allocate<Edge>(1)) Edge(&Src, &Dst, EK);
  Src.OutgoingEdges.push_back(E);
  Dst.IncomingEdges.push_back(E);
  return *E;
}
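// A SplitProposal is one candidate assignment of clusters to partitions: for
// each partition it stores the set of node IDs it contains plus that
// partition's accumulated cost. Proposals are compared by how much code is
// duplicated overall (CodeSizeScore) and by how large the biggest partition is
// (BottleneckScore).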
class SplitProposal {
public:
  SplitProposal(const SplitGraph &SG, unsigned MaxPartitions) : SG(&SG) {
    Partitions.resize(MaxPartitions, {0, SG.createNodesBitVector()});
  }

  void setName(StringRef NewName) { Name = NewName; }
  StringRef getName() const { return Name; }

  const BitVector &operator[](unsigned PID) const {
    return Partitions[PID].second;
  }

  void add(unsigned PID, const BitVector &BV) {
    Partitions[PID].second |= BV;
    updateScore(PID);
  }

  void print(raw_ostream &OS) const;

  /// \returns the PID of the partition with the lowest cost.
  unsigned findCheapestPartition() const;

  void calculateScores();

  void verifyCompleteness() const;

  /// \returns the total cost of all partitions relative to the module cost;
  /// values above 1.0 indicate duplicated code.
  double getCodeSizeScore() const { return CodeSizeScore; }

  /// \returns the cost of the largest partition relative to the module cost.
  double getBottleneckScore() const { return BottleneckScore; }

private:
  void updateScore(unsigned PID) {
    assert(SG);
    for (auto &[PCost, Nodes] : Partitions) {
      TotalCost -= PCost;
      PCost = SG->calculateCost(Nodes);
      TotalCost += PCost;
    }
  }

  double CodeSizeScore = 0.0;
  double BottleneckScore = 0.0;
  CostType TotalCost = 0;
  const SplitGraph *SG = nullptr;
  std::string Name;
  std::vector<std::pair<CostType, BitVector>> Partitions;
};
void SplitProposal::print(raw_ostream &OS) const {
  OS << "[proposal] " << Name << ", total cost:" << TotalCost
     << ", code size score:" << format("%0.3f", CodeSizeScore)
     << ", bottleneck score:" << format("%0.3f", BottleneckScore) << '\n';
  for (const auto &[PID, Part] : enumerate(Partitions)) {
    const auto &[Cost, NodeIDs] = Part;
    OS << "  - P" << PID << " nodes:" << NodeIDs.count() << " cost: " << Cost
       << '|' << formatRatioOf(Cost, SG->getModuleCost()) << "%\n";
  }
}
unsigned SplitProposal::findCheapestPartition() const {
  assert(!Partitions.empty());
  CostType CurCost = std::numeric_limits<CostType>::max();
  unsigned CurPID = InvalidPID;
  for (const auto &[Idx, Part] : enumerate(Partitions)) {
    if (Part.first <= CurCost) {
      CurPID = Idx;
      CurCost = Part.first;
    }
  }
  assert(CurPID != InvalidPID);
  return CurPID;
}
void SplitProposal::calculateScores() {
  if (Partitions.empty())
    return;

  CostType LargestPCost = 0;
  for (auto &[PCost, Nodes] : Partitions) {
    if (PCost > LargestPCost)
      LargestPCost = PCost;
  }

  CostType ModuleCost = SG->getModuleCost();
  CodeSizeScore = double(TotalCost) / ModuleCost;
  assert(CodeSizeScore >= 0.0);

  BottleneckScore = double(LargestPCost) / ModuleCost;

  // Round up to two decimals; proposals whose scores differ only marginally
  // then compare as equal.
  CodeSizeScore = std::ceil(CodeSizeScore * 100.0) / 100.0;
  BottleneckScore = std::ceil(BottleneckScore * 100.0) / 100.0;
}
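// As an illustration: for a module of cost 1000, a proposal whose partitions
// together cost 1200 (because some code is duplicated) and whose largest
// partition costs 400 gets CodeSizeScore 1.20 and BottleneckScore 0.40
// (numbers are illustrative only).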
void SplitProposal::verifyCompleteness() const {
  if (Partitions.empty())
    return;

  BitVector Result = Partitions[0].second;
  for (const auto &P : drop_begin(Partitions))
    Result |= P.second;
  assert(Result.all() && "some nodes are missing from this proposal!");
}
class RecursiveSearchSplitting {
public:
  using SubmitProposalFn = function_ref<void(SplitProposal)>;

  RecursiveSearchSplitting(const SplitGraph &SG, unsigned NumParts,
                           SubmitProposalFn SubmitProposal);

  void run();

private:
  /// A cluster of nodes to assign to a partition, with cached cost
  /// information used for sorting and merging decisions.
  struct WorkListEntry {
    WorkListEntry(const BitVector &BV) : Cluster(BV) {}

    unsigned NumNonEntryNodes = 0;
    CostType TotalCost = 0;
    CostType CostExcludingGraphEntryPoints = 0;
    BitVector Cluster;
  };

  void setupWorkList();

  void pickPartition(unsigned Depth, unsigned Idx, SplitProposal SP);

  std::pair<unsigned, CostType>
  findMostSimilarPartition(const WorkListEntry &Entry, const SplitProposal &SP);

  const SplitGraph &SG;
  unsigned NumParts;
  SubmitProposalFn SubmitProposal;

  CostType LargeClusterThreshold = 0;
  unsigned NumProposalsSubmitted = 0;
  SmallVector<WorkListEntry> WorkList;
};

RecursiveSearchSplitting::RecursiveSearchSplitting(
    const SplitGraph &SG, unsigned NumParts, SubmitProposalFn SubmitProposal)
    : SG(SG), NumParts(NumParts), SubmitProposal(SubmitProposal) {
  // Reject unreasonably deep searches; the number of proposals explored can
  // grow as 2^MaxDepth (the exact limit check is elided in this excerpt).
  if (/*...*/)
    report_fatal_error("[amdgpu-split-module] search depth of " +
                       Twine(MaxDepth) + " is too high!");

  LargeClusterThreshold =
      (LargeFnFactor != 0.0)
          ? CostType(((SG.getModuleCost() / NumParts) * LargeFnFactor))
          : std::numeric_limits<CostType>::max();
  LLVM_DEBUG(dbgs() << "[recursive search] large cluster threshold set at "
                    << LargeClusterThreshold << "\n");
}
void RecursiveSearchSplitting::run() {
  {
    SplitModuleTimer SMT("recursive_search_prepare", "preparing worklist");
    setupWorkList();
  }

  {
    SplitModuleTimer SMT("recursive_search_pick", "partitioning");
    SplitProposal SP(SG, NumParts);
    pickPartition(0, 0, SP);
  }
}
void RecursiveSearchSplitting::setupWorkList() {
  // Build one cluster per group of graph entry points that must stay together
  // because they share non-copyable dependencies.
  EquivalenceClasses<unsigned> NodeEC;
  for (const SplitGraph::Node *N : SG.nodes()) {
    if (!N->isGraphEntryPoint())
      continue;

    NodeEC.insert(N->getID());
    N->visitAllDependencies([&](const SplitGraph::Node &Dep) {
      if (&Dep != N && Dep.isNonCopyable())
        NodeEC.unionSets(N->getID(), Dep.getID());
    });
  }

  for (const auto &Node : NodeEC) {
    if (!Node->isLeader())
      continue;

    BitVector Cluster = SG.createNodesBitVector();
    for (unsigned M : NodeEC.members(*Node)) {
      const SplitGraph::Node &N = SG.getNode(M);
      if (N.isGraphEntryPoint())
        N.getDependencies(Cluster);
    }
    WorkList.emplace_back(std::move(Cluster));
  }

  // Calculate costs and other useful information for each cluster.
  for (WorkListEntry &Entry : WorkList) {
    for (unsigned NodeID : Entry.Cluster.set_bits()) {
      const SplitGraph::Node &N = SG.getNode(NodeID);
      const CostType Cost = N.getIndividualCost();

      Entry.TotalCost += Cost;
      if (!N.isGraphEntryPoint()) {
        Entry.CostExcludingGraphEntryPoints += Cost;
        ++Entry.NumNonEntryNodes;
      }
    }
  }

  stable_sort(WorkList, [](const WorkListEntry &A, const WorkListEntry &B) {
    if (A.TotalCost != B.TotalCost)
      return A.TotalCost > B.TotalCost;

    if (A.CostExcludingGraphEntryPoints != B.CostExcludingGraphEntryPoints)
      return A.CostExcludingGraphEntryPoints > B.CostExcludingGraphEntryPoints;

    if (A.NumNonEntryNodes != B.NumNonEntryNodes)
      return A.NumNonEntryNodes > B.NumNonEntryNodes;

    return A.Cluster.count() > B.Cluster.count();
  });

  LLVM_DEBUG({
    dbgs() << "[recursive search] worklist:\n";
    for (const auto &[Idx, Entry] : enumerate(WorkList)) {
      dbgs() << "  - [" << Idx << "]: ";
      for (unsigned NodeID : Entry.Cluster.set_bits())
        dbgs() << NodeID << " ";
      dbgs() << "(total_cost:" << Entry.TotalCost
             << ", cost_excl_entries:" << Entry.CostExcludingGraphEntryPoints
             << ")\n";
    }
  });
}
void RecursiveSearchSplitting::pickPartition(unsigned Depth, unsigned Idx,
                                             SplitProposal SP) {
  while (Idx < WorkList.size()) {
    // Step 1: determine the candidate partitions for this cluster.
    const WorkListEntry &Entry = WorkList[Idx];
    const BitVector &Cluster = Entry.Cluster;

    const unsigned CheapestPID = SP.findCheapestPartition();
    assert(CheapestPID != InvalidPID);

    const auto [MostSimilarPID, SimilarDepsCost] =
        findMostSimilarPartition(Entry, SP);

    // Step 2: decide whether a single partition is good enough or whether to
    // branch out.
    unsigned SinglePIDToTry = InvalidPID;
    if (MostSimilarPID == InvalidPID)
      SinglePIDToTry = CheapestPID;
    else if (MostSimilarPID == CheapestPID)
      SinglePIDToTry = CheapestPID;
    else if (Depth >= MaxDepth) {
      if (Entry.CostExcludingGraphEntryPoints > LargeClusterThreshold) {
        // Large cluster: merge it into the most similar partition only if the
        // overlap is high enough.
        assert(SimilarDepsCost && Entry.CostExcludingGraphEntryPoints);
        const double Ratio = static_cast<double>(SimilarDepsCost) /
                             Entry.CostExcludingGraphEntryPoints;
        assert(Ratio >= 0.0 && Ratio <= 1.0);
        if (Ratio > LargeFnOverlapForMerge)
          SinglePIDToTry = MostSimilarPID;
      } else
        SinglePIDToTry = CheapestPID;
    }

    // Step 3: a single partition was chosen; assign the cluster and continue
    // with the next one without increasing the depth.
    if (SinglePIDToTry != InvalidPID) {
      LLVM_DEBUG(dbgs() << Idx << "=P" << SinglePIDToTry << ' ');
      SP.add(SinglePIDToTry, Cluster);
      ++Idx;
      continue;
    }

    // Step 4: branch out and explore both candidates.
    assert(MostSimilarPID != InvalidPID);

    {
      SplitProposal BranchSP = SP;
      LLVM_DEBUG(dbgs() << " [lb] " << Idx << "=P" << CheapestPID << "? ");
      BranchSP.add(CheapestPID, Cluster);
      pickPartition(Depth + 1, Idx + 1, BranchSP);
    }

    {
      SplitProposal BranchSP = SP;
      LLVM_DEBUG(dbgs() << " [ms] " << Idx << "=P" << MostSimilarPID << "? ");
      BranchSP.add(MostSimilarPID, Cluster);
      pickPartition(Depth + 1, Idx + 1, BranchSP);
    }

    return;
  }

  assert(Idx == WorkList.size());
  assert(NumProposalsSubmitted <= (2u << MaxDepth) &&
         "Search got out of bounds?");
  SP.setName("recursive_search (depth=" + std::to_string(Depth) + ") #" +
             std::to_string(NumProposalsSubmitted++));
  SubmitProposal(SP);
}
std::pair<unsigned, CostType>
RecursiveSearchSplitting::findMostSimilarPartition(const WorkListEntry &Entry,
                                                   const SplitProposal &SP) {
  if (!Entry.NumNonEntryNodes)
    return {InvalidPID, 0};

  // Pick the partition that already contains the most expensive subset of this
  // cluster, preferring higher PIDs on ties.
  unsigned ChosenPID = InvalidPID;
  CostType ChosenCost = 0;
  for (unsigned PID = 0; PID < NumParts; ++PID) {
    BitVector BV = SP[PID];
    BV &= Entry.Cluster;

    if (BV.none())
      continue;

    const CostType Cost = SG.calculateCost(BV);

    if (ChosenPID == InvalidPID || ChosenCost < Cost ||
        (ChosenCost == Cost && PID > ChosenPID)) {
      ChosenPID = PID;
      ChosenCost = Cost;
    }
  }

  return {ChosenPID, ChosenCost};
}
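// GraphTraits / DOTGraphTraits adapters so the split graph can be rendered via
// WriteGraph (see writeDOTGraph below): nodes are labeled with the function
// name, dashed edges mark potential indirect calls, and nodes without incoming
// edges are drawn in red.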
const SplitGraph::Node *mapEdgeToDst(const SplitGraph::Edge *E) {
  return E->Dst;
}

using SplitGraphEdgeDstIterator =
    mapped_iterator<SplitGraph::edges_iterator, decltype(&mapEdgeToDst)>;

template <> struct GraphTraits<SplitGraph> {
  using NodeRef = const SplitGraph::Node *;
  using EdgeRef = const SplitGraph::Edge *;
  using nodes_iterator = SplitGraph::nodes_iterator;
  using ChildEdgeIteratorType = SplitGraph::edges_iterator;
  using ChildIteratorType = SplitGraphEdgeDstIterator;

  static NodeRef getEntryNode(NodeRef N) { return N; }

  static ChildIteratorType child_begin(NodeRef Ref) {
    return {Ref->outgoing_edges().begin(), mapEdgeToDst};
  }
  static ChildIteratorType child_end(NodeRef Ref) {
    return {Ref->outgoing_edges().end(), mapEdgeToDst};
  }

  static nodes_iterator nodes_begin(const SplitGraph &G) {
    return G.nodes().begin();
  }
  static nodes_iterator nodes_end(const SplitGraph &G) {
    return G.nodes().end();
  }
};

template <> struct DOTGraphTraits<SplitGraph> : public DefaultDOTGraphTraits {
  DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}

  static std::string getGraphName(const SplitGraph &SG) {
    return SG.getModule().getName().str();
  }

  std::string getNodeLabel(const SplitGraph::Node *N, const SplitGraph &SG) {
    return N->getName().str();
  }

  static std::string getNodeDescription(const SplitGraph::Node *N,
                                        const SplitGraph &SG) {
    std::string Result;
    if (N->isEntryFunctionCC())
      Result += "entry-fn-cc ";
    if (N->isNonCopyable())
      Result += "non-copyable ";
    Result += "cost:" + std::to_string(N->getIndividualCost());
    return Result;
  }

  static std::string getNodeAttributes(const SplitGraph::Node *N,
                                       const SplitGraph &SG) {
    return N->hasAnyIncomingEdges() ? "" : "color=\"red\"";
  }

  static std::string getEdgeAttributes(const SplitGraph::Node *N,
                                       SplitGraphEdgeDstIterator EI,
                                       const SplitGraph &SG) {
    switch ((*EI.getCurrent())->Kind) {
    case SplitGraph::EdgeKind::DirectCall:
      return "";
    case SplitGraph::EdgeKind::IndirectCall:
      return "style=\"dashed\"";
    }
    llvm_unreachable("unhandled SplitGraph::EdgeKind");
  }
};
/// \returns whether \p GV must be imported into every partition to stay
/// conservative: global variables with local linkage and aliases.
static bool needsConservativeImport(const GlobalValue *GV) {
  if (const auto *Var = dyn_cast<GlobalVariable>(GV))
    return Var->hasLocalLinkage();
  return isa<GlobalAlias>(GV);
}

/// Prints a summary of partition \p N, represented by module \p M, to \p OS.
static void printPartitionSummary(raw_ostream &OS, unsigned N, const Module &M,
                                  unsigned PartCost, unsigned ModuleCost) {
  OS << "*** Partition P" << N << " ***\n";

  for (const auto &Fn : M) {
    if (!Fn.isDeclaration())
      OS << " - [function] " << Fn.getName() << "\n";
  }

  for (const auto &GV : M.globals()) {
    if (GV.hasInitializer())
      OS << " - [global] " << GV.getName() << "\n";
  }

  OS << "Partition contains " << formatRatioOf(PartCost, ModuleCost)
     << "% of the source\n";
}
/// Keeps \p Best up to date: \p New replaces it when it has a strictly lower
/// bottleneck score, or an equal bottleneck score and a lower code size score.
static void evaluateProposal(SplitProposal &Best, SplitProposal New) {
  SplitModuleTimer SMT("proposal_evaluation", "proposal ranking algorithm");

  LLVM_DEBUG({
    New.verifyCompleteness();
    if (DebugProposalSearch)
      New.print(dbgs());
  });

  const double CurBScore = Best.getBottleneckScore();
  const double CurCSScore = Best.getCodeSizeScore();
  const double NewBScore = New.getBottleneckScore();
  const double NewCSScore = New.getCodeSizeScore();

  // The bottleneck score is compared first; the code size score only breaks
  // ties.
  bool IsBest = false;
  if (NewBScore < CurBScore)
    IsBest = true;
  else if (NewBScore == CurBScore)
    IsBest = (NewCSScore < CurCSScore);

  if (IsBest)
    Best = std::move(New);

  LLVM_DEBUG(if (DebugProposalSearch) {
    if (IsBest)
      dbgs() << "[search] new best proposal!\n";
    else
      dbgs() << "[search] discarding - not profitable\n";
  });
}
/// Clones the module without filtering anything out, used when no splitting is
/// performed.
static std::unique_ptr<Module> cloneAll(const Module &M) {
  ValueToValueMapTy VMap;
  return CloneModule(M, VMap, [&](const GlobalValue *GV) { return true; });
}

/// Writes a DOT rendering of the split graph to the file named by
/// -amdgpu-module-splitting-print-module-dotcfg, if set.
static void writeDOTGraph(const SplitGraph &SG) {
  if (ModuleDotCfgOutput.empty())
    return;

  std::error_code EC;
  raw_fd_ostream OS(ModuleDotCfgOutput, EC);
  if (EC) {
    errs() << "[" DEBUG_TYPE "]: cannot open '" << ModuleDotCfgOutput
           << "' - DOTGraph will not be printed\n";
    return;
  }
  WriteGraph(OS, SG, /*ShortNames=*/false, SG.getModule().getName());
}
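// The resulting file is ordinary Graphviz input, so it can be rendered with,
// for example: dot -Tsvg module.dot -o module.svg (illustrative command only).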
/// Performs the module splitting and calls \p ModuleCallback for each
/// partition module created.
static void splitAMDGPUModule(
    GetTTIFn GetTTI, Module &M, unsigned NumParts,
    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
  // Externalize local functions whose address is taken (unless disabled).
  if (!NoExternalizeOnAddrTaken) {
    for (auto &Fn : M) {
      if (Fn.hasLocalLinkage() && Fn.hasAddressTaken()) {
        LLVM_DEBUG(dbgs() << "[externalize] " << Fn.getName()
                          << " because its address is taken\n");
        externalize(Fn);
      }
    }
  }

  // Externalize local globals so duplicated functions keep referring to a
  // single copy (unless disabled).
  if (!NoExternalizeGlobals) {
    for (auto &GV : M.globals()) {
      if (GV.hasLocalLinkage())
        LLVM_DEBUG(dbgs() << "[externalize] GV " << GV.getName() << '\n');
      externalize(GV);
    }
  }

  FunctionsCostMap FnCosts;
  const CostType ModuleCost = calculateFunctionCosts(GetTTI, M, FnCosts);

  SplitGraph SG(M, FnCosts, ModuleCost);
  CallGraph CG(M);
  SG.buildGraph(CG);

  if (SG.empty()) {
    LLVM_DEBUG(
        dbgs()
        << "[!] no nodes in graph, input is empty - no splitting possible\n");
    ModuleCallback(cloneAll(M));
    return;
  }

  LLVM_DEBUG({
    dbgs() << "[graph] nodes:\n";
    for (const SplitGraph::Node *N : SG.nodes()) {
      dbgs() << " - [" << N->getID() << "]: " << N->getName() << " "
             << (N->isGraphEntryPoint() ? "(entry)" : "") << " "
             << (N->isNonCopyable() ? "(noncopyable)" : "") << "\n";
    }
  });
  writeDOTGraph(SG);

  std::optional<SplitProposal> Proposal;
  const auto EvaluateProposal = [&](SplitProposal SP) {
    SP.calculateScores();
    if (!Proposal)
      Proposal = std::move(SP);
    else
      evaluateProposal(*Proposal, std::move(SP));
  };

  RecursiveSearchSplitting(SG, NumParts, EvaluateProposal).run();
  LLVM_DEBUG(if (Proposal) dbgs() << "[search done] selected proposal: "
                                  << Proposal->getName() << "\n";);

  if (!Proposal) {
    LLVM_DEBUG(dbgs() << "[!] no proposal made, no splitting possible!\n");
    ModuleCallback(cloneAll(M));
    return;
  }
  std::optional<raw_fd_ostream> SummariesOS;
  if (!PartitionSummariesOutput.empty()) {
    std::error_code EC;
    SummariesOS.emplace(PartitionSummariesOutput, EC);
    if (EC)
      errs() << "[" DEBUG_TYPE "]: cannot open '" << PartitionSummariesOutput
             << "' - Partition summaries will not be printed\n";
  }

  // The first partition created imports every non-function global value; later
  // partitions only import globals that must be kept conservatively.
  bool ImportAllGVs = true;
  for (unsigned PID = 0; PID < NumParts; ++PID) {
    SplitModuleTimer SMT2("modules_creation",
                          "creating modules for each partition");

    // Collect the functions assigned to this partition.
    DenseSet<const Function *> FnsInPart;
    for (unsigned NodeID : (*Proposal)[PID].set_bits())
      FnsInPart.insert(&SG.getNode(NodeID).getFunction());

    if (FnsInPart.empty()) {
      LLVM_DEBUG(dbgs() << "[split] P" << PID
                        << " is empty, not creating module\n");
      continue;
    }

    // Clone the module, keeping only this partition's functions plus whatever
    // must be imported conservatively.
    ValueToValueMapTy VMap;
    CostType PartCost = 0;
    std::unique_ptr<Module> MPart(
        CloneModule(M, VMap, [&](const GlobalValue *GV) {
          if (const auto *Fn = dyn_cast<Function>(GV)) {
            if (FnsInPart.contains(Fn)) {
              PartCost += SG.getCost(*Fn);
              return true;
            }
            return false;
          }
          return ImportAllGVs || needsConservativeImport(GV);
        }));
    ImportAllGVs = false;

    // Drop conservatively imported globals that ended up unused.
    for (auto &GV : make_early_inc_range(MPart->global_values())) {
      if (needsConservativeImport(&GV) && GV.use_empty())
        GV.eraseFromParent();
    }

    if (SummariesOS)
      printPartitionSummary(*SummariesOS, PID, *MPart, PartCost, ModuleCost);
    LLVM_DEBUG(
        printPartitionSummary(dbgs(), PID, *MPart, PartCost, ModuleCost));

    ModuleCallback(std::move(MPart));
  }
}
PreservedAnalyses AMDGPUSplitModulePass::run(Module &M,
                                             ModuleAnalysisManager &MAM) {
  SplitModuleTimer SMT(
      "total", "total pass runtime (incl. potentially waiting for lockfile)");

  FunctionAnalysisManager &FAM =
      MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  const auto TTIGetter = [&FAM](Function &F) -> const TargetTransformInfo & {
    return FAM.getResult<TargetIRAnalysis>(F);
  };

  if (UseLockFile) {
    // Create and acquire a lock file in the system's temporary directory so
    // only one process runs this pass at a time (lock management elided in
    // this excerpt).
    // ...
    if (/* the lock could not be acquired */) {
      LLVM_DEBUG(
          dbgs() << "[amdgpu-split-module] unable to acquire lockfile, debug "
                    "output may be mangled by other processes\n");
    } else if (!Owned) {
      // ...
      LLVM_DEBUG(
          dbgs() << "[amdgpu-split-module] unable to acquire lockfile, debug "
                    "output may be mangled by other processes\n");
    }
    // ...
    splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
    return PreservedAnalyses::none();
  }

  splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
  return PreservedAnalyses::none();
}