21#define DEBUG_TYPE "balanced-partitioning"
// NOTE(review): extraction fragment — the enclosing function signature (a
// node dump routine, per the index entry `void dump(raw_ostream &OS) const`)
// and the trailing formatv arguments are not visible here.
// Renders "{ID=<Id> Utilities={u1,u2,...} Bucket=<b>}" into OS; "{{" in the
// format string is an escaped literal '{' for llvm::formatv.
24 OS <<
formatv(
"{{ID={0} Utilities={{{1:$[,]}} Bucket={2}}",
Id,
// Submit a task to the underlying thread pool (threaded builds only).
// The wrapper lambda runs F, then decrements NumActiveThreads; the last task
// to finish takes mtx and sets IsFinishedSpawning so wait() can observe
// completion. NOTE(review): this extract is missing interior lines (e.g. the
// matching increment of NumActiveThreads and, presumably, a cv notification
// after setting the flag — confirm against the full file).
28template <
typename Func>
29void BalancedPartitioning::BPThreadPool::async(Func &&
F) {
30#if LLVM_ENABLE_THREADS
33 TheThreadPool.async([=]() {
38 if (--NumActiveThreads == 0) {
41 std::unique_lock<std::mutex> lock(mtx);
42 assert(!IsFinishedSpawning);
43 IsFinishedSpawning =
true;
// Block until every task spawned via async() has completed: wait on cv under
// mtx for IsFinishedSpawning, then assert that no threads remain active.
// NOTE(review): the non-threaded (#else) branch and the function close are
// not visible in this extract.
53void BalancedPartitioning::BPThreadPool::wait() {
54#if LLVM_ENABLE_THREADS
58 std::unique_lock<std::mutex> lock(mtx);
59 cv.wait(lock, [&]() {
return IsFinishedSpawning; });
60 assert(IsFinishedSpawning && NumActiveThreads == 0);
// Constructor fragment: precompute log2(I) for I in [1, LOG_CACHE_SIZE) into
// the Log2Cache member so hot-path cost evaluation avoids repeated std::log2
// calls. NOTE(review): index 0 is skipped (log2(0) is -inf); Log2Cache[0]
// presumably keeps its zero-initialized value — confirm in the full file.
74 for (
unsigned I = 1;
I < LOG_CACHE_SIZE;
I++)
75 Log2Cache[
I] = std::log2(
I);
// run() fragment: the partitioning entry point (per the index entry
// `void run(std::vector<BPFunctionNode> &Nodes) const`).
// Visible logic: optionally set up a BPThreadPool when threads are enabled,
// record each node's original position in InputOrderIndex (used later as a
// deterministic tie-breaker), then kick off the recursive bisection at depth
// 0 with root bucket 1 and offset 0. The lambda captures the node range by
// value and TP by reference. NOTE(review): this extract is gapped — the
// non-threaded execution path, TP->wait(), and the surrounding call for the
// final comparator (L44, sorting nodes by their assigned Bucket — presumably
// llvm::stable_sort) are missing lines here.
81 "Partitioning %d nodes using depth %d and %d iterations per split\n",
83 std::optional<BPThreadPool> TP;
84#if LLVM_ENABLE_THREADS
87 TP.emplace(TheThreadPool);
91 for (
unsigned I = 0;
I < Nodes.size();
I++)
92 Nodes[
I].InputOrderIndex =
I;
95 auto BisectTask = [=, &TP]() {
96 bisect(NodesRange, 0, 1, 0, TP);
99 TP->async(std::move(BisectTask));
106 return L.Bucket < R.Bucket;
// Recursive bisection step. Base case: a single node or RecDepth >=
// Config.SplitDepth — restore the original input order (sort by
// InputOrderIndex) and assign final buckets (the assignment statement itself
// is a missing line; L61 shows only trailing arguments of that call).
// Recursive case: derive child buckets 2*RootBucket and 2*RootBucket+1,
// seed a deterministic RNG from RootBucket, split the range in half via
// split(), locally optimize with runIterations(), then recurse on the two
// halves around NodesMid. When a thread pool is available, shallow
// (RecDepth < Config.TaskSplitDepth) and large-enough (>= 4 nodes) subtasks
// are dispatched asynchronously; the recursion lambdas capture the subranges
// by value and TP by reference. NOTE(review): several interior lines
// (the partition computing NodesMid, the LeftNodes/RightNodes definitions,
// and the synchronous else-branch) are missing from this extract.
112void BalancedPartitioning::bisect(
const FunctionNodeRange Nodes,
113 unsigned RecDepth,
unsigned RootBucket,
115 std::optional<BPThreadPool> &TP)
const {
116 unsigned NumNodes = std::distance(Nodes.begin(), Nodes.end());
117 if (NumNodes <= 1 || RecDepth >= Config.
SplitDepth) {
120 llvm::sort(Nodes, [](
const auto &L,
const auto &R) {
121 return L.InputOrderIndex < R.InputOrderIndex;
123 for (
auto &
N : Nodes)
129 NumNodes, RootBucket));
131 std::mt19937 RNG(RootBucket);
133 unsigned LeftBucket = 2 * RootBucket;
134 unsigned RightBucket = 2 * RootBucket + 1;
137 split(Nodes, LeftBucket);
139 runIterations(Nodes, LeftBucket, RightBucket, RNG);
144 unsigned MidOffset =
Offset + std::distance(Nodes.begin(), NodesMid);
149 auto LeftRecTask = [=, &TP]() {
150 bisect(LeftNodes, RecDepth + 1, LeftBucket,
Offset, TP);
152 auto RightRecTask = [=, &TP]() {
153 bisect(RightNodes, RecDepth + 1, RightBucket, MidOffset, TP);
156 if (TP && RecDepth < Config.TaskSplitDepth && NumNodes >= 4) {
157 TP->async(std::move(LeftRecTask));
158 TP->async(std::move(RightRecTask));
// Optimize the bucket assignment of a node range between LeftBucket and
// RightBucket. Visible phases:
//  1. Count how many nodes reference each utility node (UtilityNodeIndex).
//  2. Drop utility nodes that are unhelpful for splitting — those referenced
//     by exactly one node or by all nodes (L93 is the predicate body; the
//     surrounding erase/remove call is a missing line — presumably
//     llvm::erase_if per the index entry).
//  3. Renumber the surviving utility nodes to dense indices [0, size) by
//     reusing UtilityNodeIndex as an id map (insert returns the existing
//     slot on duplicates).
//  4. Initialize per-utility-node signatures, branching on whether each
//     node currently sits in LeftBucket (the count updates themselves are
//     missing lines in this extract).
//  5. Repeat runIteration() until it moves zero nodes (the loop header and
//     the statement after the `== 0` test — presumably a break — are
//     missing lines; iteration cap is Config.IterationsPerSplit per the
//     index).
165void BalancedPartitioning::runIterations(
const FunctionNodeRange Nodes,
167 unsigned RightBucket,
168 std::mt19937 &RNG)
const {
169 unsigned NumNodes = std::distance(Nodes.begin(), Nodes.end());
171 for (
auto &
N : Nodes)
172 for (
auto &UN :
N.UtilityNodes)
173 ++UtilityNodeIndex[UN];
176 for (
auto &
N : Nodes)
178 return UtilityNodeIndex[UN] == 1 || UtilityNodeIndex[UN] == NumNodes;
182 UtilityNodeIndex.
clear();
183 for (
auto &
N : Nodes)
184 for (
auto &UN :
N.UtilityNodes)
185 UN = UtilityNodeIndex.
insert({UN, UtilityNodeIndex.
size()}).first->second;
189 for (
auto &
N : Nodes) {
190 for (
auto &UN :
N.UtilityNodes) {
192 if (
N.Bucket == LeftBucket) {
201 unsigned NumMovedNodes =
202 runIteration(Nodes, LeftBucket, RightBucket,
Signatures, RNG);
203 if (NumMovedNodes == 0)
// One local-search pass; returns the number of nodes moved between buckets.
// Phase 1 (signature refresh): for each signature with a stale cache,
// recompute the log-gap cost of the current (LeftCount, RightCount) split
// and cache the cost delta of moving one element L->R (CachedGainLR) and
// R->L (CachedGainRL). The guards around the L-1 / R-1 computations (needed
// to avoid unsigned wraparound when a side is empty) are missing lines in
// this extract — the assert at L125 shows at least one side is nonzero.
// Phase 2: compute each node's move gain (via moveGain; the call line is
// missing) and collect (gain, node*) pairs.
// Phase 3: partition the pairs into left-bucket and right-bucket ranges
// (L145-L147 is the predicate; the llvm::partition call header is missing)
// and order each range by descending gain with LargerGain (the sort calls
// are missing lines — hedge: presumably llvm::sort over both subranges).
// Phase 4: walk the two ranges pairwise (zip) and swap node pairs while the
// combined gain is positive; the statement after the `<= 0.f` test is a
// missing line (presumably break, since pairs are gain-sorted — confirm).
208unsigned BalancedPartitioning::runIteration(
const FunctionNodeRange Nodes,
210 unsigned RightBucket,
212 std::mt19937 &RNG)
const {
215 if (Signature.CachedGainIsValid)
217 unsigned L = Signature.LeftCount;
218 unsigned R = Signature.RightCount;
219 assert((L > 0 || R > 0) &&
"incorrect signature");
220 float Cost = logCost(L, R);
221 Signature.CachedGainLR = 0.f;
222 Signature.CachedGainRL = 0.f;
224 Signature.CachedGainLR =
Cost - logCost(L - 1, R + 1);
226 Signature.CachedGainRL =
Cost - logCost(L + 1, R - 1);
227 Signature.CachedGainIsValid =
true;
231 typedef std::pair<float, BPFunctionNode *> GainPair;
232 std::vector<GainPair> Gains;
233 for (
auto &
N : Nodes) {
234 bool FromLeftToRight = (
N.Bucket == LeftBucket);
236 Gains.push_back(std::make_pair(Gain, &
N));
241 Gains, [&](
const auto &GP) {
return GP.second->Bucket == LeftBucket; });
246 auto LargerGain = [](
const auto &
L,
const auto &
R) {
247 return L.first >
R.first;
252 unsigned NumMovedDataVertices = 0;
253 for (
auto [LeftPair, RightPair] :
llvm::zip(LeftRange, RightRange)) {
254 auto &[LeftGain, LeftNode] = LeftPair;
255 auto &[RightGain, RightNode] = RightPair;
257 if (LeftGain + RightGain <= 0.f)
260 if (moveFunctionNode(*LeftNode, LeftBucket, RightBucket,
Signatures, RNG))
261 ++NumMovedDataVertices;
262 if (moveFunctionNode(*RightNode, LeftBucket, RightBucket,
Signatures, RNG))
263 ++NumMovedDataVertices;
265 return NumMovedDataVertices;
// moveFunctionNode fragment (the signature's first parameters and return
// type are missing lines; callers at L162/L165 pass (Node&, LeftBucket,
// RightBucket, Signatures, RNG) and use the result as a bool "moved").
// A uniform [0,1) draw against a threshold (the right-hand side of the
// comparison is a missing line — presumably Config.SkipProbability per the
// index entry) lets the node randomly skip the move, which helps escape
// local optima. Otherwise the node's Bucket is flipped to the opposite
// side, and every signature it touches has its Left/Right counts adjusted
// and its cached gain invalidated for recomputation next iteration.
270 unsigned RightBucket,
272 std::mt19937 &RNG)
const {
274 if (std::uniform_real_distribution<float>(0.f, 1.f)(RNG) <=
278 bool FromLeftToRight = (
N.Bucket == LeftBucket);
280 N.Bucket = (FromLeftToRight ? RightBucket : LeftBucket);
283 if (FromLeftToRight) {
284 for (
auto &UN :
N.UtilityNodes) {
286 Signature.LeftCount--;
287 Signature.RightCount++;
288 Signature.CachedGainIsValid =
false;
291 for (
auto &UN :
N.UtilityNodes) {
293 Signature.LeftCount++;
294 Signature.RightCount--;
295 Signature.CachedGainIsValid =
false;
// Split a node range into two halves and tag them with consecutive buckets.
// nth_element places the median-by-InputOrderIndex at NodesMid (ceil of
// NumNodes/2), with smaller indices before it — a deterministic, O(n)
// initial split. The two range-for headers that assign StartBucket to the
// first half and StartBucket+1 to the second are missing lines in this
// extract (only the assignment bodies at L201/L202 are visible) — they
// presumably iterate make_range(begin, NodesMid) and make_range(NodesMid,
// end); confirm against the full file.
301void BalancedPartitioning::split(
const FunctionNodeRange Nodes,
302 unsigned StartBucket)
const {
303 unsigned NumNodes = std::distance(Nodes.begin(), Nodes.end());
304 auto NodesMid = Nodes.begin() + (NumNodes + 1) / 2;
306 std::nth_element(Nodes.begin(), NodesMid, Nodes.end(), [](
auto &L,
auto &R) {
307 return L.InputOrderIndex < R.InputOrderIndex;
311 N.Bucket = StartBucket;
313 N.Bucket = StartBucket + 1;
// moveGain fragment (per the index: static float moveGain(const
// BPFunctionNode &N, bool FromLeftToRight, const SignaturesT &Signatures)).
// Accumulates the cached per-utility-node gain of moving N across the split:
// CachedGainLR when moving left-to-right; the opposite branch of the
// ternary (presumably Signatures[UN].CachedGainRL) is a missing line here.
// Gain's declaration/initialization and the return are also not visible.
317 bool FromLeftToRight,
320 for (
auto &UN :
N.UtilityNodes)
321 Gain += (FromLeftToRight ?
Signatures[UN].CachedGainLR
326float BalancedPartitioning::logCost(
unsigned X,
unsigned Y)
const {
327 return -(
X * log2Cached(
X + 1) +
Y * log2Cached(
Y + 1));
330float BalancedPartitioning::log2Cached(
unsigned i)
const {
331 return (i < LOG_CACHE_SIZE) ? Log2Cache[i] : std::log2(i);
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static const FuncProtoTy Signatures[]
A function with a set of utility nodes where it is beneficial to order two functions close together i...
IDT Id
The ID of this node.
SmallVector< UtilityNodeT, 4 > UtilityNodes
The list of utility nodes associated with this node.
std::optional< unsigned > Bucket
The bucket assigned by balanced partitioning.
void dump(raw_ostream &OS) const
static float moveGain(const BPFunctionNode &N, bool FromLeftToRight, const SignaturesT &Signatures)
Compute the move gain for uniform log-gap cost.
void run(std::vector< BPFunctionNode > &Nodes) const
Run recursive graph partitioning that optimizes a given objective.
BalancedPartitioning(const BalancedPartitioningConfig &Config)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
A non-threaded implementation.
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class implements an extremely fast bulk output stream that can only output to a stream.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
This is an optimization pass for GlobalISel generic memory operations.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
void stable_sort(R &&Range)
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
auto formatv(const char *Fmt, Ts &&...Vals) -> formatv_object< decltype(std::make_tuple(support::detail::build_format_adapter(std::forward< Ts >(Vals))...))>
void sort(IteratorTy Start, IteratorTy End)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
auto partition(R &&Range, UnaryPredicate P)
Provide wrappers to std::partition which take ranges instead of having to pass begin/end explicitly.
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Algorithm parameters; default values are tuned on real-world binaries.
unsigned SplitDepth
The depth of the recursive bisection.
float SkipProbability
The probability for a vertex to skip a move from its current bucket to another bucket; it often helps...
unsigned TaskSplitDepth
Recursive subtasks up to the given depth are added to the queue and distributed among threads by Thre...
unsigned IterationsPerSplit
The maximum number of bp iterations per split.