70#define DEBUG_TYPE "amdgpu-split-module"
76 "amdgpu-module-splitting-max-depth",
78 "maximum search depth. 0 forces a greedy approach. "
79 "warning: the algorithm is up to O(2^N), where N is the max depth."),
82static cl::opt<float> LargeFnFactor(
85 "when max depth is reached and we can no longer branch out, this "
86 "value determines if a function is worth merging into an already "
87 "existing partition to reduce code duplication. This is a factor "
88 "of the ideal partition size, e.g. 2.0 means we consider the "
89 "function for merging if its cost (including its callees) is 2x the "
90 "size of an ideal partition."));
92static cl::opt<float> LargeFnOverlapForMerge(
94 cl::desc(
"when a function is considered for merging into a partition that "
95 "already contains some of its callees, do the merge if at least "
96 "n% of the code it can reach is already present inside the "
97 "partition; e.g. 0.7 means only merge >70%"));
100 "amdgpu-module-splitting-no-externalize-globals",
cl::Hidden,
101 cl::desc(
"disables externalization of global variable with local linkage; "
102 "may cause globals to be duplicated which increases binary size"));
105 "amdgpu-module-splitting-no-externalize-address-taken",
cl::Hidden,
107 "disables externalization of functions whose addresses are taken"));
110 ModuleDotCfgOutput(
"amdgpu-module-splitting-print-module-dotcfg",
112 cl::desc(
"output file to write out the dotgraph "
113 "representation of the input module"));
116 "amdgpu-module-splitting-print-partition-summaries",
cl::Hidden,
117 cl::desc(
"output file to write out a summary of "
118 "the partitions created for each module"));
122 UseLockFile(
"amdgpu-module-splitting-serial-execution",
cl::Hidden,
123 cl::desc(
"use a lock file so only one process in the system "
124 "can run this pass at once. useful to avoid mangled "
125 "debug output in multithreaded environments."));
128 DebugProposalSearch(
"amdgpu-module-splitting-debug-proposal-search",
130 cl::desc(
"print all proposals received and whether "
131 "they were rejected or accepted"));
134struct SplitModuleTimer : NamedRegionTimer {
135 SplitModuleTimer(StringRef Name, StringRef
Desc)
136 : NamedRegionTimer(Name,
Desc,
DEBUG_TYPE,
"AMDGPU Module Splitting",
145using FunctionsCostMap = DenseMap<const Function *, CostType>;
146using GetTTIFn = function_ref<
const TargetTransformInfo &(Function &)>;
147static constexpr unsigned InvalidPID = -1;
152static auto formatRatioOf(CostType Num, CostType Dem) {
153 CostType DemOr1 = Dem ? Dem : 1;
154 return format(
"%0.2f", (
static_cast<double>(Num) / DemOr1) * 100);
165static bool isNonCopyable(
const Function &
F) {
166 return F.hasExternalLinkage() || !
F.isDefinitionExact() ||
172 if (GV.hasLocalLinkage()) {
180 GV.setName(
"__llvmsplit_unnamed");
189static CostType calculateFunctionCosts(GetTTIFn GetTTI,
Module &M,
190 FunctionsCostMap &CostMap) {
191 SplitModuleTimer SMT(
"calculateFunctionCosts",
"cost analysis");
193 LLVM_DEBUG(
dbgs() <<
"[cost analysis] calculating function costs\n");
194 CostType ModuleCost = 0;
195 [[maybe_unused]] CostType KernelCost = 0;
198 if (Fn.isDeclaration())
202 const auto &
TTI = GetTTI(Fn);
203 for (
const auto &BB : Fn) {
204 for (
const auto &
I : BB) {
209 CostType CostVal = Cost.isValid()
212 assert((FnCost + CostVal) >= FnCost &&
"Overflow!");
219 CostMap[&Fn] = FnCost;
220 assert((ModuleCost + FnCost) >= ModuleCost &&
"Overflow!");
221 ModuleCost += FnCost;
224 KernelCost += FnCost;
232 const CostType FnCost = ModuleCost - KernelCost;
233 dbgs() <<
" - total module cost is " << ModuleCost <<
". kernels cost "
234 <<
"" << KernelCost <<
" ("
235 <<
format(
"%0.2f", (
float(KernelCost) / ModuleCost) * 100)
236 <<
"% of the module), functions cost " << FnCost <<
" ("
237 <<
format(
"%0.2f", (
float(FnCost) / ModuleCost) * 100)
238 <<
"% of the module)\n";
245static bool canBeIndirectlyCalled(
const Function &
F) {
248 return !
F.hasLocalLinkage() ||
249 F.hasAddressTaken(
nullptr,
277 enum class EdgeKind :
uint8_t {
296 : Src(Src), Dst(Dst), Kind(Kind) {}
303 using EdgesVec = SmallVector<const Edge *, 0>;
305 using nodes_iterator =
const Node *
const *;
307 SplitGraph(
const Module &M,
const FunctionsCostMap &CostMap,
309 : M(M), CostMap(CostMap), ModuleCost(ModuleCost) {}
311 void buildGraph(CallGraph &CG);
314 bool verifyGraph()
const;
317 bool empty()
const {
return Nodes.empty(); }
321 unsigned getNumNodes()
const {
return Nodes.size(); }
322 BitVector createNodesBitVector()
const {
return BitVector(Nodes.size()); }
324 const Module &getModule()
const {
return M; }
326 CostType getModuleCost()
const {
return ModuleCost; }
327 CostType
getCost(
const Function &
F)
const {
return CostMap.at(&
F); }
331 CostType calculateCost(
const BitVector &BV)
const;
336 Node &
getNode(DenseMap<const GlobalValue *, Node *> &Cache,
337 const GlobalValue &GV);
340 const Edge &createEdge(
Node &Src,
Node &Dst, EdgeKind EK);
343 const FunctionsCostMap &CostMap;
349 SpecificBumpPtrAllocator<Node> NodesPool;
355 std::is_trivially_destructible_v<Edge>,
356 "Edge must be trivially destructible to use the BumpPtrAllocator");
372class SplitGraph::Node {
373 friend class SplitGraph;
376 Node(
unsigned ID,
const GlobalValue &GV, CostType IndividualCost,
378 :
ID(
ID), GV(GV), IndividualCost(IndividualCost),
379 IsNonCopyable(IsNonCopyable), IsEntryFnCC(
false), IsGraphEntry(
false) {
386 unsigned getID()
const {
return ID; }
392 CostType getIndividualCost()
const {
return IndividualCost; }
394 bool isNonCopyable()
const {
return IsNonCopyable; }
395 bool isEntryFunctionCC()
const {
return IsEntryFnCC; }
401 bool isGraphEntryPoint()
const {
return IsGraphEntry; }
403 StringRef
getName()
const {
return GV.getName(); }
405 bool hasAnyIncomingEdges()
const {
return IncomingEdges.size(); }
406 bool hasAnyIncomingEdgesOfKind(EdgeKind EK)
const {
407 return any_of(IncomingEdges, [&](
const auto *
E) {
return E->Kind == EK; });
410 bool hasAnyOutgoingEdges()
const {
return OutgoingEdges.size(); }
411 bool hasAnyOutgoingEdgesOfKind(EdgeKind EK)
const {
412 return any_of(OutgoingEdges, [&](
const auto *
E) {
return E->Kind == EK; });
416 return IncomingEdges;
420 return OutgoingEdges;
423 bool shouldFollowIndirectCalls()
const {
return isEntryFunctionCC(); }
430 void visitAllDependencies(std::function<
void(
const Node &)> Visitor)
const;
439 void getDependencies(BitVector &BV)
const {
440 visitAllDependencies([&](
const Node &
N) { BV.set(
N.getID()); });
444 void markAsGraphEntry() { IsGraphEntry =
true; }
447 const GlobalValue &GV;
448 CostType IndividualCost;
449 bool IsNonCopyable : 1;
450 bool IsEntryFnCC : 1;
451 bool IsGraphEntry : 1;
455 EdgesVec IncomingEdges;
456 EdgesVec OutgoingEdges;
459void SplitGraph::Node::visitAllDependencies(
460 std::function<
void(
const Node &)> Visitor)
const {
461 const bool FollowIndirect = shouldFollowIndirectCalls();
464 DenseSet<const Node *> Seen;
465 SmallVector<const Node *, 8> WorkList({
this});
466 while (!WorkList.empty()) {
467 const Node *CurN = WorkList.pop_back_val();
468 if (
auto [It, Inserted] = Seen.insert(CurN); !Inserted)
473 for (
const Edge *
E : CurN->outgoing_edges()) {
474 if (!FollowIndirect &&
E->Kind == EdgeKind::IndirectCall)
476 WorkList.push_back(
E->Dst);
488static bool handleCalleesMD(
const Instruction &
I,
489 SetVector<Function *> &Callees) {
490 auto *MD =
I.getMetadata(LLVMContext::MD_callees);
494 for (
const auto &Op : MD->operands()) {
495 Function *Callee = mdconst::extract_or_null<Function>(Op);
498 Callees.insert(Callee);
504void SplitGraph::buildGraph(CallGraph &CG) {
505 SplitModuleTimer SMT(
"buildGraph",
"graph construction");
508 <<
"[build graph] constructing graph representation of the input\n");
516 DenseMap<const GlobalValue *, Node *> Cache;
517 SmallVector<const Function *> FnsWithIndirectCalls, IndirectlyCallableFns;
518 for (
const Function &Fn : M) {
519 if (Fn.isDeclaration())
523 SetVector<const Function *> DirectCallees;
524 bool CallsExternal =
false;
525 for (
auto &CGEntry : *CG[&Fn]) {
526 auto *CGNode = CGEntry.second;
527 if (
auto *Callee = CGNode->getFunction()) {
528 if (!Callee->isDeclaration())
529 DirectCallees.insert(Callee);
530 }
else if (CGNode == CG.getCallsExternalNode())
531 CallsExternal =
true;
537 LLVM_DEBUG(dbgs() <<
" [!] callgraph is incomplete for ";
538 Fn.printAsOperand(dbgs());
539 dbgs() <<
" - analyzing function\n");
541 SetVector<Function *> KnownCallees;
542 bool HasUnknownIndirectCall =
false;
545 const auto *CB = dyn_cast<CallBase>(&Inst);
546 if (!CB || CB->getCalledFunction())
551 if (CB->isInlineAsm()) {
552 LLVM_DEBUG(dbgs() <<
" found inline assembly\n");
556 if (handleCalleesMD(Inst, KnownCallees))
560 KnownCallees.clear();
564 HasUnknownIndirectCall =
true;
568 if (HasUnknownIndirectCall) {
569 LLVM_DEBUG(dbgs() <<
" indirect call found\n");
570 FnsWithIndirectCalls.push_back(&Fn);
571 }
else if (!KnownCallees.empty())
572 DirectCallees.insert_range(KnownCallees);
576 for (
const auto *Callee : DirectCallees)
577 createEdge(
N,
getNode(Cache, *Callee), EdgeKind::DirectCall);
579 if (canBeIndirectlyCalled(Fn))
580 IndirectlyCallableFns.push_back(&Fn);
584 for (
const Function *Fn : FnsWithIndirectCalls) {
585 for (
const Function *Candidate : IndirectlyCallableFns) {
588 createEdge(Src, Dst, EdgeKind::IndirectCall);
593 SmallVector<Node *, 16> CandidateEntryPoints;
594 BitVector NodesReachableByKernels = createNodesBitVector();
595 for (
Node *
N : Nodes) {
597 if (
N->isEntryFunctionCC()) {
598 N->markAsGraphEntry();
599 N->getDependencies(NodesReachableByKernels);
600 }
else if (!
N->hasAnyIncomingEdgesOfKind(EdgeKind::DirectCall))
601 CandidateEntryPoints.push_back(
N);
604 for (
Node *
N : CandidateEntryPoints) {
610 if (!NodesReachableByKernels.test(
N->getID()))
611 N->markAsGraphEntry();
620bool SplitGraph::verifyGraph()
const {
621 unsigned ExpectedID = 0;
623 DenseSet<const Node *> SeenNodes;
624 DenseSet<const Function *> SeenFunctionNodes;
625 for (
const Node *
N : Nodes) {
626 if (
N->getID() != (ExpectedID++)) {
627 errs() <<
"Node IDs are incorrect!\n";
631 if (!SeenNodes.insert(
N).second) {
632 errs() <<
"Node seen more than once!\n";
637 errs() <<
"getNode doesn't return the right node\n";
641 for (
const Edge *
E :
N->IncomingEdges) {
642 if (!
E->Src || !
E->Dst || (
E->Dst !=
N) ||
643 (
find(
E->Src->OutgoingEdges,
E) ==
E->Src->OutgoingEdges.end())) {
644 errs() <<
"ill-formed incoming edges\n";
649 for (
const Edge *
E :
N->OutgoingEdges) {
650 if (!
E->Src || !
E->Dst || (
E->Src !=
N) ||
651 (
find(
E->Dst->IncomingEdges,
E) ==
E->Dst->IncomingEdges.end())) {
652 errs() <<
"ill-formed outgoing edges\n";
657 const Function &Fn =
N->getFunction();
658 if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv())) {
659 if (
N->hasAnyIncomingEdges()) {
660 errs() <<
"Kernels cannot have incoming edges\n";
665 if (Fn.isDeclaration()) {
666 errs() <<
"declarations shouldn't have nodes!\n";
670 auto [It, Inserted] = SeenFunctionNodes.insert(&Fn);
672 errs() <<
"one function has multiple nodes!\n";
677 if (ExpectedID != Nodes.size()) {
678 errs() <<
"Node IDs out of sync!\n";
682 if (createNodesBitVector().size() != getNumNodes()) {
683 errs() <<
"nodes bit vector doesn't have the right size!\n";
688 BitVector BV = createNodesBitVector();
690 if (
N->isGraphEntryPoint())
691 N->getDependencies(BV);
695 for (
const auto &Fn : M) {
696 if (!Fn.isDeclaration()) {
697 if (!SeenFunctionNodes.contains(&Fn)) {
698 errs() <<
"Fn has no associated node in the graph!\n";
705 errs() <<
"not all nodes are reachable through the graph's entry points!\n";
713CostType SplitGraph::calculateCost(
const BitVector &BV)
const {
715 for (
unsigned NodeID : BV.set_bits())
716 Cost +=
getNode(NodeID).getIndividualCost();
721SplitGraph::getNode(DenseMap<const GlobalValue *, Node *> &Cache,
722 const GlobalValue &GV) {
723 auto &
N = Cache[&GV];
728 bool NonCopyable =
false;
729 if (
const Function *Fn = dyn_cast<Function>(&GV)) {
730 NonCopyable = isNonCopyable(*Fn);
731 Cost = CostMap.at(Fn);
733 N =
new (NodesPool.Allocate())
Node(Nodes.size(), GV, Cost, NonCopyable);
739const SplitGraph::Edge &SplitGraph::createEdge(
Node &Src,
Node &Dst,
741 const Edge *
E =
new (EdgesPool.Allocate<Edge>(1))
Edge(&Src, &Dst, EK);
742 Src.OutgoingEdges.push_back(
E);
743 Dst.IncomingEdges.push_back(
E);
762 SplitProposal(
const SplitGraph &SG,
unsigned MaxPartitions) : SG(&SG) {
763 Partitions.resize(MaxPartitions, {0, SG.createNodesBitVector()});
766 void setName(StringRef NewName) { Name = NewName; }
767 StringRef
getName()
const {
return Name; }
769 const BitVector &operator[](
unsigned PID)
const {
770 return Partitions[PID].second;
773 void add(
unsigned PID,
const BitVector &BV) {
774 Partitions[PID].second |= BV;
778 void print(raw_ostream &OS)
const;
783 unsigned findCheapestPartition()
const;
786 void calculateScores();
789 void verifyCompleteness()
const;
801 double getCodeSizeScore()
const {
return CodeSizeScore; }
815 double getBottleneckScore()
const {
return BottleneckScore; }
818 void updateScore(
unsigned PID) {
820 for (
auto &[PCost, Nodes] : Partitions) {
822 PCost = SG->calculateCost(Nodes);
828 double CodeSizeScore = 0.0;
830 double BottleneckScore = 0.0;
832 CostType TotalCost = 0;
834 const SplitGraph *SG =
nullptr;
837 std::vector<std::pair<CostType, BitVector>> Partitions;
840void SplitProposal::print(raw_ostream &OS)
const {
843 OS <<
"[proposal] " << Name <<
", total cost:" << TotalCost
844 <<
", code size score:" << format(
"%0.3f", CodeSizeScore)
845 <<
", bottleneck score:" << format(
"%0.3f", BottleneckScore) <<
'\n';
846 for (
const auto &[PID, Part] : enumerate(Partitions)) {
847 const auto &[Cost, NodeIDs] = Part;
848 OS <<
" - P" << PID <<
" nodes:" << NodeIDs.count() <<
" cost: " << Cost
849 <<
'|' << formatRatioOf(Cost, SG->getModuleCost()) <<
"%\n";
853unsigned SplitProposal::findCheapestPartition()
const {
854 assert(!Partitions.empty());
855 CostType CurCost = std::numeric_limits<CostType>::max();
856 unsigned CurPID = InvalidPID;
857 for (
const auto &[Idx, Part] : enumerate(Partitions)) {
858 if (Part.first <= CurCost) {
860 CurCost = Part.first;
863 assert(CurPID != InvalidPID);
867void SplitProposal::calculateScores() {
868 if (Partitions.empty())
872 CostType LargestPCost = 0;
873 for (
auto &[PCost, Nodes] : Partitions) {
874 if (PCost > LargestPCost)
875 LargestPCost = PCost;
878 CostType ModuleCost = SG->getModuleCost();
879 CodeSizeScore = double(TotalCost) / ModuleCost;
880 assert(CodeSizeScore >= 0.0);
882 BottleneckScore = double(LargestPCost) / ModuleCost;
884 CodeSizeScore = std::ceil(CodeSizeScore * 100.0) / 100.0;
885 BottleneckScore = std::ceil(BottleneckScore * 100.0) / 100.0;
889void SplitProposal::verifyCompleteness()
const {
890 if (Partitions.empty())
893 BitVector Result = Partitions[0].second;
894 for (
const auto &
P : drop_begin(Partitions))
896 assert(Result.all() &&
"some nodes are missing from this proposal!");
915class RecursiveSearchSplitting {
917 using SubmitProposalFn = function_ref<void(SplitProposal)>;
919 RecursiveSearchSplitting(
const SplitGraph &SG,
unsigned NumParts,
920 SubmitProposalFn SubmitProposal);
925 struct WorkListEntry {
926 WorkListEntry(
const BitVector &BV) : Cluster(BV) {}
928 unsigned NumNonEntryNodes = 0;
929 CostType TotalCost = 0;
930 CostType CostExcludingGraphEntryPoints = 0;
937 void setupWorkList();
948 void pickPartition(
unsigned Depth,
unsigned Idx, SplitProposal SP);
955 std::pair<unsigned, CostType>
956 findMostSimilarPartition(
const WorkListEntry &Entry,
const SplitProposal &SP);
958 const SplitGraph &SG;
960 SubmitProposalFn SubmitProposal;
964 CostType LargeClusterThreshold = 0;
965 unsigned NumProposalsSubmitted = 0;
966 SmallVector<WorkListEntry> WorkList;
969RecursiveSearchSplitting::RecursiveSearchSplitting(
970 const SplitGraph &SG,
unsigned NumParts, SubmitProposalFn SubmitProposal)
971 : SG(SG), NumParts(NumParts), SubmitProposal(SubmitProposal) {
976 report_fatal_error(
"[amdgpu-split-module] search depth of " +
977 Twine(MaxDepth) +
" is too high!");
978 LargeClusterThreshold =
979 (LargeFnFactor != 0.0)
980 ? CostType(((SG.getModuleCost() / NumParts) * LargeFnFactor))
981 : std::numeric_limits<CostType>::max();
982 LLVM_DEBUG(dbgs() <<
"[recursive search] large cluster threshold set at "
983 << LargeClusterThreshold <<
"\n");
986void RecursiveSearchSplitting::run() {
988 SplitModuleTimer SMT(
"recursive_search_prepare",
"preparing worklist");
993 SplitModuleTimer SMT(
"recursive_search_pick",
"partitioning");
994 SplitProposal SP(SG, NumParts);
995 pickPartition(0, 0, std::move(SP));
999void RecursiveSearchSplitting::setupWorkList() {
1007 EquivalenceClasses<unsigned> NodeEC;
1008 for (
const SplitGraph::Node *
N : SG.nodes()) {
1009 if (!
N->isGraphEntryPoint())
1012 NodeEC.insert(
N->getID());
1013 N->visitAllDependencies([&](
const SplitGraph::Node &Dep) {
1014 if (&Dep !=
N && Dep.isNonCopyable())
1015 NodeEC.unionSets(
N->getID(), Dep.getID());
1019 for (
const auto &
Node : NodeEC) {
1020 if (!
Node->isLeader())
1023 BitVector Cluster = SG.createNodesBitVector();
1024 for (
unsigned M : NodeEC.members(*
Node)) {
1025 const SplitGraph::Node &
N = SG.getNode(M);
1026 if (
N.isGraphEntryPoint())
1027 N.getDependencies(Cluster);
1029 WorkList.emplace_back(std::move(Cluster));
1033 for (WorkListEntry &Entry : WorkList) {
1034 for (
unsigned NodeID : Entry.Cluster.set_bits()) {
1035 const SplitGraph::Node &
N = SG.getNode(NodeID);
1036 const CostType Cost =
N.getIndividualCost();
1038 Entry.TotalCost += Cost;
1039 if (!
N.isGraphEntryPoint()) {
1040 Entry.CostExcludingGraphEntryPoints += Cost;
1041 ++Entry.NumNonEntryNodes;
1046 stable_sort(WorkList, [](
const WorkListEntry &
A,
const WorkListEntry &
B) {
1047 if (
A.TotalCost !=
B.TotalCost)
1048 return A.TotalCost >
B.TotalCost;
1050 if (
A.CostExcludingGraphEntryPoints !=
B.CostExcludingGraphEntryPoints)
1051 return A.CostExcludingGraphEntryPoints >
B.CostExcludingGraphEntryPoints;
1053 if (
A.NumNonEntryNodes !=
B.NumNonEntryNodes)
1054 return A.NumNonEntryNodes >
B.NumNonEntryNodes;
1056 return A.Cluster.count() >
B.Cluster.count();
1060 dbgs() <<
"[recursive search] worklist:\n";
1061 for (
const auto &[Idx, Entry] : enumerate(WorkList)) {
1062 dbgs() <<
" - [" << Idx <<
"]: ";
1063 for (
unsigned NodeID : Entry.Cluster.set_bits())
1064 dbgs() << NodeID <<
" ";
1065 dbgs() <<
"(total_cost:" << Entry.TotalCost
1066 <<
", cost_excl_entries:" << Entry.CostExcludingGraphEntryPoints
1072void RecursiveSearchSplitting::pickPartition(
unsigned Depth,
unsigned Idx,
1074 while (Idx < WorkList.size()) {
1077 const WorkListEntry &Entry = WorkList[Idx];
1078 const BitVector &Cluster = Entry.Cluster;
1082 const unsigned CheapestPID = SP.findCheapestPartition();
1083 assert(CheapestPID != InvalidPID);
1087 const auto [MostSimilarPID, SimilarDepsCost] =
1088 findMostSimilarPartition(Entry, SP);
1092 unsigned SinglePIDToTry = InvalidPID;
1093 if (MostSimilarPID == InvalidPID)
1094 SinglePIDToTry = CheapestPID;
1095 else if (MostSimilarPID == CheapestPID)
1096 SinglePIDToTry = CheapestPID;
1097 else if (Depth >= MaxDepth) {
1100 if (Entry.CostExcludingGraphEntryPoints > LargeClusterThreshold) {
1102 assert(SimilarDepsCost && Entry.CostExcludingGraphEntryPoints);
1103 const double Ratio =
static_cast<double>(SimilarDepsCost) /
1104 Entry.CostExcludingGraphEntryPoints;
1105 assert(Ratio >= 0.0 && Ratio <= 1.0);
1106 if (Ratio > LargeFnOverlapForMerge) {
1111 SinglePIDToTry = MostSimilarPID;
1114 SinglePIDToTry = CheapestPID;
1121 if (SinglePIDToTry != InvalidPID) {
1122 LLVM_DEBUG(dbgs() << Idx <<
"=P" << SinglePIDToTry <<
' ');
1124 SP.add(SinglePIDToTry, Cluster);
1129 assert(MostSimilarPID != InvalidPID);
1138 SplitProposal BranchSP = SP;
1140 <<
" [lb] " << Idx <<
"=P" << CheapestPID <<
"? ");
1141 BranchSP.add(CheapestPID, Cluster);
1142 pickPartition(Depth + 1, Idx + 1, std::move(BranchSP));
1147 SplitProposal BranchSP = SP;
1149 <<
" [ms] " << Idx <<
"=P" << MostSimilarPID <<
"? ");
1150 BranchSP.add(MostSimilarPID, Cluster);
1151 pickPartition(Depth + 1, Idx + 1, std::move(BranchSP));
1159 assert(Idx == WorkList.size());
1160 assert(NumProposalsSubmitted <= (2u << MaxDepth) &&
1161 "Search got out of bounds?");
1162 SP.setName(
"recursive_search (depth=" + std::to_string(Depth) +
") #" +
1163 std::to_string(NumProposalsSubmitted++));
1165 SubmitProposal(std::move(SP));
1168std::pair<unsigned, CostType>
1169RecursiveSearchSplitting::findMostSimilarPartition(
const WorkListEntry &Entry,
1170 const SplitProposal &SP) {
1171 if (!Entry.NumNonEntryNodes)
1172 return {InvalidPID, 0};
1177 unsigned ChosenPID = InvalidPID;
1178 CostType ChosenCost = 0;
1179 for (
unsigned PID = 0; PID < NumParts; ++PID) {
1180 BitVector BV = SP[PID];
1181 BV &= Entry.Cluster;
1186 const CostType Cost = SG.calculateCost(BV);
1188 if (ChosenPID == InvalidPID || ChosenCost < Cost ||
1189 (ChosenCost == Cost && PID > ChosenPID)) {
1195 return {ChosenPID, ChosenCost};
1202const SplitGraph::Node *mapEdgeToDst(
const SplitGraph::Edge *
E) {
1206using SplitGraphEdgeDstIterator =
1207 mapped_iterator<SplitGraph::edges_iterator,
decltype(&mapEdgeToDst)>;
1222 return {
Ref->outgoing_edges().begin(), mapEdgeToDst};
1225 return {
Ref->outgoing_edges().end(), mapEdgeToDst};
1229 return G.nodes().begin();
1232 return G.nodes().end();
1240 return SG.getModule().getName().str();
1244 return N->getName().str();
1248 const SplitGraph &SG) {
1250 if (
N->isEntryFunctionCC())
1251 Result +=
"entry-fn-cc ";
1252 if (
N->isNonCopyable())
1253 Result +=
"non-copyable ";
1254 Result +=
"cost:" + std::to_string(
N->getIndividualCost());
1259 const SplitGraph &SG) {
1260 return N->hasAnyIncomingEdges() ?
"" :
"color=\"red\"";
1264 SplitGraphEdgeDstIterator EI,
1265 const SplitGraph &SG) {
1267 switch ((*EI.getCurrent())->Kind) {
1268 case SplitGraph::EdgeKind::DirectCall:
1270 case SplitGraph::EdgeKind::IndirectCall:
1271 return "style=\"dashed\"";
1286static bool needsConservativeImport(
const GlobalValue *GV) {
1287 if (
const auto *Var = dyn_cast<GlobalVariable>(GV))
1288 return Var->hasLocalLinkage();
1289 if (
const auto *GA = dyn_cast<GlobalAlias>(GV))
1290 return GA->hasLocalLinkage();
1296static void printPartitionSummary(raw_ostream &OS,
unsigned N,
const Module &M,
1297 unsigned PartCost,
unsigned ModuleCost) {
1298 OS <<
"*** Partition P" <<
N <<
" ***\n";
1300 for (
const auto &Fn : M) {
1301 if (!Fn.isDeclaration())
1302 OS <<
" - [function] " << Fn.getName() <<
"\n";
1305 for (
const auto &GV :
M.globals()) {
1306 if (GV.hasInitializer())
1307 OS <<
" - [global] " << GV.getName() <<
"\n";
1310 OS <<
"Partition contains " << formatRatioOf(PartCost, ModuleCost)
1311 <<
"% of the source\n";
1314static void evaluateProposal(SplitProposal &Best, SplitProposal New) {
1315 SplitModuleTimer SMT(
"proposal_evaluation",
"proposal ranking algorithm");
1318 New.verifyCompleteness();
1319 if (DebugProposalSearch)
1323 const double CurBScore = Best.getBottleneckScore();
1324 const double CurCSScore = Best.getCodeSizeScore();
1325 const double NewBScore =
New.getBottleneckScore();
1326 const double NewCSScore =
New.getCodeSizeScore();
1338 bool IsBest =
false;
1339 if (NewBScore < CurBScore)
1341 else if (NewBScore == CurBScore)
1342 IsBest = (NewCSScore < CurCSScore);
1345 Best = std::move(New);
1349 dbgs() <<
"[search] new best proposal!\n";
1351 dbgs() <<
"[search] discarding - not profitable\n";
1356static std::unique_ptr<Module> cloneAll(
const Module &M) {
1358 return CloneModule(M, VMap, [&](
const GlobalValue *GV) {
return true; });
1362static void writeDOTGraph(
const SplitGraph &SG) {
1363 if (ModuleDotCfgOutput.empty())
1367 raw_fd_ostream OS(ModuleDotCfgOutput, EC);
1369 errs() <<
"[" DEBUG_TYPE "]: cannot open '" << ModuleDotCfgOutput
1370 <<
"' - DOTGraph will not be printed\n";
1373 SG.getModule().getName());
1376static void splitAMDGPUModule(
1377 GetTTIFn GetTTI,
Module &M,
unsigned NumParts,
1378 function_ref<
void(std::unique_ptr<Module> MPart)> ModuleCallback) {
1398 if (!NoExternalizeOnAddrTaken) {
1399 for (
auto &Fn : M) {
1400 if (Fn.hasLocalLinkage() && Fn.hasAddressTaken()) {
1402 dbgs() <<
" because its address is taken\n");
1410 if (!NoExternalizeGlobals) {
1411 for (
auto &GV :
M.globals()) {
1412 if (GV.hasLocalLinkage())
1413 LLVM_DEBUG(
dbgs() <<
"[externalize] GV " << GV.getName() <<
'\n');
1418 for (
auto &GA :
M.aliases()) {
1419 if (GA.hasLocalLinkage()) {
1420 LLVM_DEBUG(
dbgs() <<
"[externalize] alias " << GA.getName() <<
'\n');
1427 FunctionsCostMap FnCosts;
1428 const CostType ModuleCost = calculateFunctionCosts(GetTTI, M, FnCosts);
1432 SplitGraph SG(M, FnCosts, ModuleCost);
1438 <<
"[!] no nodes in graph, input is empty - no splitting possible\n");
1439 ModuleCallback(cloneAll(M));
1444 dbgs() <<
"[graph] nodes:\n";
1445 for (
const SplitGraph::Node *
N : SG.nodes()) {
1446 dbgs() <<
" - [" <<
N->getID() <<
"]: " <<
N->getName() <<
" "
1447 << (
N->isGraphEntryPoint() ?
"(entry)" :
"") <<
" "
1448 << (
N->isNonCopyable() ?
"(noncopyable)" :
"") <<
"\n";
1456 std::optional<SplitProposal> Proposal;
1457 const auto EvaluateProposal = [&](SplitProposal
SP) {
1458 SP.calculateScores();
1460 Proposal = std::move(SP);
1462 evaluateProposal(*Proposal, std::move(SP));
1467 RecursiveSearchSplitting(SG, NumParts, EvaluateProposal).run();
1468 LLVM_DEBUG(
if (Proposal)
dbgs() <<
"[search done] selected proposal: "
1469 << Proposal->getName() <<
"\n";);
1472 LLVM_DEBUG(
dbgs() <<
"[!] no proposal made, no splitting possible!\n");
1473 ModuleCallback(cloneAll(M));
1479 std::optional<raw_fd_ostream> SummariesOS;
1480 if (!PartitionSummariesOutput.empty()) {
1482 SummariesOS.emplace(PartitionSummariesOutput, EC);
1484 errs() <<
"[" DEBUG_TYPE "]: cannot open '" << PartitionSummariesOutput
1485 <<
"' - Partition summaries will not be printed\n";
1490 bool ImportAllGVs =
true;
1492 for (
unsigned PID = 0; PID < NumParts; ++PID) {
1493 SplitModuleTimer SMT2(
"modules_creation",
1494 "creating modules for each partition");
1497 DenseSet<const Function *> FnsInPart;
1498 for (
unsigned NodeID : (*Proposal)[PID].set_bits())
1499 FnsInPart.insert(&SG.getNode(NodeID).getFunction());
1502 if (FnsInPart.empty()) {
1504 <<
" is empty, not creating module\n");
1509 CostType PartCost = 0;
1510 std::unique_ptr<Module> MPart(
1513 if (
const auto *Fn = dyn_cast<Function>(GV)) {
1514 if (FnsInPart.contains(Fn)) {
1515 PartCost += SG.getCost(*Fn);
1522 if (
const auto *GA = dyn_cast<GlobalAlias>(GV)) {
1523 if (
const auto *Fn = dyn_cast<Function>(GA->getAliaseeObject()))
1524 return FnsInPart.contains(Fn);
1528 return ImportAllGVs || needsConservativeImport(GV);
1531 ImportAllGVs =
false;
1535 if (needsConservativeImport(&GV) && GV.use_empty())
1536 GV.eraseFromParent();
1540 printPartitionSummary(*SummariesOS, PID, *MPart, PartCost, ModuleCost);
1543 printPartitionSummary(
dbgs(), PID, *MPart, PartCost, ModuleCost));
1545 ModuleCallback(std::move(MPart));
1552 SplitModuleTimer SMT(
1553 "total",
"total pass runtime (incl. potentially waiting for lockfile)");
1576 dbgs() <<
"[amdgpu-split-module] unable to acquire lockfile, debug "
1577 "output may be mangled by other processes\n");
1578 }
else if (!Owned) {
1587 <<
"[amdgpu-split-module] unable to acquire lockfile, debug "
1588 "output may be mangled by other processes\n");
1594 splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
1602 splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static msgpack::DocNode getNode(msgpack::DocNode DN, msgpack::Type Type, MCValue Val)
The AMDGPU TargetMachine interface definition for hw codegen targets.
Unify divergent function exit nodes
This file defines the BumpPtrAllocator interface.
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
Expand Atomic instructions
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file provides interfaces used to build and manipulate a call graph, which is a very useful tool ...
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
static InstructionCost getCost(Instruction &Inst, TTI::TargetCostKind CostKind, TargetTransformInfo &TTI)
This file defines the DenseMap class.
Generic implementation of equivalence classes through the use Tarjan's efficient union-find algorithm...
This file defines the little GraphTraits<X> template class that should be specialized by classes that...
Module.h This file contains the declarations for the Module class.
Machine Check Debug Module
FunctionAnalysisManager FAM
ModuleAnalysisManager MAM
static StringRef getName(Value *V)
std::pair< BasicBlock *, BasicBlock * > Edge
This file defines the SmallVector class.
static void externalize(GlobalValue *GV)
static const BasicSubtargetSubTypeKV * find(StringRef S, ArrayRef< BasicSubtargetSubTypeKV > A)
Find KV in array using binary search.
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM)
Lightweight error class with error context and mandatory checking.
@ HiddenVisibility
The GV is hidden.
@ ExternalLinkage
Externally visible function.
static InstructionCost getMax()
Class that manages the creation of a lock file to aid implicit coordination between different process...
std::error_code unsafeUnlock() override
Remove the lock file.
WaitForUnlockResult waitForUnlockFor(std::chrono::seconds MaxSeconds) override
For a shared lock, wait until the owner releases the lock.
Expected< bool > tryLock() override
Tries to acquire the lock without blocking.
A Module instance is used to store all the information related to an LLVM module.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
StringRef str() const
Explicit conversion to StringRef.
Analysis pass providing the TargetTransformInfo.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
template class LLVM_TEMPLATE_ABI opt< bool >
template class LLVM_TEMPLATE_ABI opt< unsigned >
initializer< Ty > init(const Ty &Val)
template class LLVM_TEMPLATE_ABI opt< std::string >
LLVM_ABI void system_temp_directory(bool erasedOnReboot, SmallVectorImpl< char > &result)
Get the typical temporary directory for the system, e.g., "/var/tmp" or "C:/TEMP".
LLVM_ABI void append(SmallVectorImpl< char > &path, const Twine &a, const Twine &b="", const Twine &c="", const Twine &d="")
Append to path.
This is an optimization pass for GlobalISel generic memory operations.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
InnerAnalysisManagerProxy< FunctionAnalysisManager, Module > FunctionAnalysisManagerModuleProxy
Provide the FunctionAnalysisManager to Module proxy.
LLVM_ABI bool TimePassesIsEnabled
If the user specifies the -time-passes argument on an LLVM tool command line then the value of this b...
raw_ostream & WriteGraph(raw_ostream &O, const GraphType &G, bool ShortNames=false, const Twine &Title="")
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
@ Success
The lock was released successfully.
@ OwnerDied
Owner died while holding the lock.
@ Timeout
Reached timeout while waiting for the owner to release the lock.
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Ref
The access may reference the value stored in memory.
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
BumpPtrAllocatorImpl<> BumpPtrAllocator
The standard BumpPtrAllocator which just uses the default template parameters.
LLVM_ABI std::unique_ptr< Module > CloneModule(const Module &M)
Return an exact copy of the specified module.
void consumeError(Error Err)
Consume a Error without doing anything.
AnalysisManager< Module > ModuleAnalysisManager
Convenience typedef for the Module analysis manager.
static std::string getEdgeAttributes(const SplitGraph::Node *N, SplitGraphEdgeDstIterator EI, const SplitGraph &SG)
static std::string getGraphName(const SplitGraph &SG)
DOTGraphTraits(bool IsSimple=false)
static std::string getNodeAttributes(const SplitGraph::Node *N, const SplitGraph &SG)
static std::string getNodeDescription(const SplitGraph::Node *N, const SplitGraph &SG)
std::string getNodeLabel(const SplitGraph::Node *N, const SplitGraph &SG)
DefaultDOTGraphTraits(bool simple=false)
const SplitGraph::Edge * EdgeRef
static NodeRef getEntryNode(NodeRef N)
SplitGraph::nodes_iterator nodes_iterator
SplitGraph::edges_iterator ChildEdgeIteratorType
SplitGraphEdgeDstIterator ChildIteratorType
static nodes_iterator nodes_end(const SplitGraph &G)
static ChildIteratorType child_begin(NodeRef Ref)
static nodes_iterator nodes_begin(const SplitGraph &G)
const SplitGraph::Node * NodeRef
static ChildIteratorType child_end(NodeRef Ref)