LLVM 20.0.0git
BalancedPartitioning.h
Go to the documentation of this file.
1//===- BalancedPartitioning.h ---------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements BalancedPartitioning, a recursive balanced graph
10// partitioning algorithm.
11//
12// The algorithm is used to find an ordering of FunctionNodes while optimizing
13// a specified objective. The algorithm uses recursive bisection; it starts
14// with a collection of unordered FunctionNodes and tries to split them into
15// two sets (buckets) of equal cardinality. Each bisection step is comprised of
16// iterations that greedily swap the FunctionNodes between the two buckets while
17// there is an improvement of the objective. Once the process converges, the
18// problem is divided into two sub-problems of half the size, which are
19// recursively applied for the two buckets. The final ordering of the
20// FunctionNodes is obtained by concatenating the two (recursively computed)
21// orderings.
22//
23// In order to speed up the computation, we limit the depth of the recursive
24// tree by a specified constant (SplitDepth) and apply at most a constant
25// number of greedy iterations per split (IterationsPerSplit). The worst-case
26// time complexity of the implementation is bounded by O(M*log^2 N), where
27// N is the number of FunctionNodes and M is the number of
28// FunctionNode-UtilityNode edges; (assuming that any collection of D
29// FunctionNodes contains O(D) UtilityNodes). Notice that the two different
30// recursive sub-problems are independent and thus can be efficiently processed
31// in parallel.
32//
33// Reference:
34// * Optimizing Function Layout for Mobile Applications,
35// https://arxiv.org/abs/2211.09285
36//
37//===----------------------------------------------------------------------===//
38
39#ifndef LLVM_SUPPORT_BALANCED_PARTITIONING_H
40#define LLVM_SUPPORT_BALANCED_PARTITIONING_H
41
42#include "raw_ostream.h"
43#include "llvm/ADT/ArrayRef.h"
44
45#include <atomic>
46#include <condition_variable>
47#include <mutex>
48#include <random>
49#include <vector>
50
51namespace llvm {
52
53class ThreadPoolInterface;
54/// A function with a set of utility nodes where it is beneficial to order two
55/// functions close together if they have similar utility nodes
58
59public:
60 using IDT = uint64_t;
62
63 /// \param UtilityNodes the set of utility nodes (must be unique'd)
66
67 /// The ID of this node
69
70 void dump(raw_ostream &OS) const;
71
72protected:
73 /// The list of utility nodes associated with this node
75 /// The bucket assigned by balanced partitioning
76 std::optional<unsigned> Bucket;
77 /// The index of the input order of the FunctionNodes
79
83};
84
85/// Algorithm parameters; default values are tuned on real-world binaries
87 /// The depth of the recursive bisection
88 unsigned SplitDepth = 18;
89 /// The maximum number of bp iterations per split
90 unsigned IterationsPerSplit = 40;
91 /// The probability for a vertex to skip a move from its current bucket to
92 /// another bucket; it often helps to escape from a local optima
93 float SkipProbability = 0.1f;
94 /// Recursive subtasks up to the given depth are added to the queue and
95 /// distributed among threads by ThreadPool; all subsequent calls are executed
96 /// on the same thread
97 unsigned TaskSplitDepth = 9;
98};
99
101public:
103
104 /// Run recursive graph partitioning that optimizes a given objective.
105 void run(std::vector<BPFunctionNode> &Nodes) const;
106
107private:
108 struct UtilitySignature;
110 using FunctionNodeRange =
112
113 /// A special ThreadPool that allows for spawning new tasks after blocking on
114 /// wait(). BalancedPartitioning recursively spawns new threads inside other
115 /// threads, so we need to track how many active threads that could spawn more
116 /// threads.
117 struct BPThreadPool {
118 ThreadPoolInterface &TheThreadPool;
119 std::mutex mtx;
120 std::condition_variable cv;
121 /// The number of threads that could spawn more threads
122 std::atomic<int> NumActiveThreads = 0;
123 /// Only true when all threads are down spawning new threads
124 bool IsFinishedSpawning = false;
125 /// Asynchronous submission of the task to the pool
126 template <typename Func> void async(Func &&F);
127 /// Blocking wait for all threads to complete. Unlike ThreadPool, it is
128 /// acceptable for other threads to add more tasks while blocking on this
129 /// call.
130 void wait();
131 BPThreadPool(ThreadPoolInterface &TheThreadPool)
132 : TheThreadPool(TheThreadPool) {}
133 };
134
135 /// Run a recursive bisection of a given list of FunctionNodes
136 /// \param RecDepth the current depth of recursion
137 /// \param RootBucket the initial bucket of the dataVertices
138 /// \param Offset the assigned buckets are the range [Offset, Offset +
139 /// Nodes.size()]
140 void bisect(const FunctionNodeRange Nodes, unsigned RecDepth,
141 unsigned RootBucket, unsigned Offset,
142 std::optional<BPThreadPool> &TP) const;
143
144 /// Run bisection iterations
145 void runIterations(const FunctionNodeRange Nodes, unsigned LeftBucket,
146 unsigned RightBucket, std::mt19937 &RNG) const;
147
148 /// Run a bisection iteration to improve the optimization goal
149 /// \returns the total number of moved FunctionNodes
150 unsigned runIteration(const FunctionNodeRange Nodes, unsigned LeftBucket,
151 unsigned RightBucket, SignaturesT &Signatures,
152 std::mt19937 &RNG) const;
153
154 /// Try to move \p N from one bucket to another
155 /// \returns true iff \p N is moved
156 bool moveFunctionNode(BPFunctionNode &N, unsigned LeftBucket,
157 unsigned RightBucket, SignaturesT &Signatures,
158 std::mt19937 &RNG) const;
159
160 /// Split all the FunctionNodes into 2 buckets, StartBucket and StartBucket +
161 /// 1 The method is used for an initial assignment before a bisection step
162 void split(const FunctionNodeRange Nodes, unsigned StartBucket) const;
163
164 /// The cost of the uniform log-gap cost, assuming a utility node has \p X
165 /// FunctionNodes in the left bucket and \p Y FunctionNodes in the right one.
166 float logCost(unsigned X, unsigned Y) const;
167
168 float log2Cached(unsigned i) const;
169
171
172 /// Precomputed values of log2(x). Table size is small enough to fit in cache.
173 static constexpr unsigned LOG_CACHE_SIZE = 16384;
174 float Log2Cache[LOG_CACHE_SIZE];
175
176 /// The signature of a particular utility node used for the bisection step,
177 /// i.e., the number of \p FunctionNodes in each of the two buckets
178 struct UtilitySignature {
179 /// The number of \p FunctionNodes in the left bucket
180 unsigned LeftCount = 0;
181 /// The number of \p FunctionNodes in the right bucket
182 unsigned RightCount = 0;
183 /// The cached gain of moving a \p FunctionNode from the left bucket to the
184 /// right bucket
185 float CachedGainLR;
186 /// The cached gain of moving a \p FunctionNode from the right bucket to the
187 /// left bucket
188 float CachedGainRL;
189 /// Whether \p CachedGainLR and \p CachedGainRL are valid
190 bool CachedGainIsValid = false;
191 };
192
193protected:
194 /// Compute the move gain for uniform log-gap cost
195 static float moveGain(const BPFunctionNode &N, bool FromLeftToRight,
196 const SignaturesT &Signatures);
198};
199
200} // end namespace llvm
201
202#endif // LLVM_SUPPORT_BALANCED_PARTITIONING_H
RelaxConfig Config
Definition: ELF_riscv.cpp:506
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define F(x, y, z)
Definition: MD5.cpp:55
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
raw_pwrite_stream & OS
static const FuncProtoTy Signatures[]
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
A function with a set of utility nodes where it is beneficial to order two functions close together i...
IDT Id
The ID of this node.
friend class BPFunctionNodeTest_Basic_Test
BPFunctionNode(IDT Id, ArrayRef< UtilityNodeT > UtilityNodes)
friend class BalancedPartitioningTest_Large_Test
SmallVector< UtilityNodeT, 4 > UtilityNodes
The list of utility nodes associated with this node.
uint64_t InputOrderIndex
The index of the input order of the FunctionNodes.
friend class BalancedPartitioningTest_Basic_Test
std::optional< unsigned > Bucket
The bucket assigned by balanced partitioning.
void dump(raw_ostream &OS) const
static float moveGain(const BPFunctionNode &N, bool FromLeftToRight, const SignaturesT &Signatures)
Compute the move gain for uniform log-gap cost.
void run(std::vector< BPFunctionNode > &Nodes) const
Run recursive graph partitioning that optimizes a given objective.
friend class BalancedPartitioningTest_MoveGain_Test
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1210
This defines the abstract base interface for a ThreadPool allowing asynchronous parallel execution on...
Definition: ThreadPool.h:49
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:480
#define N
Algorithm parameters; default values are tuned on real-world binaries.
unsigned SplitDepth
The depth of the recursive bisection.
float SkipProbability
The probability for a vertex to skip a move from its current bucket to another bucket; it often helps...
unsigned TaskSplitDepth
Recursive subtasks up to the given depth are added to the queue and distributed among threads by Thre...
unsigned IterationsPerSplit
The maximum number of bp iterations per split.