LLVM 22.0.0git
BalancedPartitioning.h
Go to the documentation of this file.
1//===- BalancedPartitioning.h ---------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements BalancedPartitioning, a recursive balanced graph
10// partitioning algorithm.
11//
12// The algorithm is used to find an ordering of FunctionNodes while optimizing
13// a specified objective. The algorithm uses recursive bisection; it starts
14// with a collection of unordered FunctionNodes and tries to split them into
15// two sets (buckets) of equal cardinality. Each bisection step is comprised of
16// iterations that greedily swap the FunctionNodes between the two buckets while
17// there is an improvement of the objective. Once the process converges, the
18// problem is divided into two sub-problems of half the size, which are
19// recursively applied for the two buckets. The final ordering of the
20// FunctionNodes is obtained by concatenating the two (recursively computed)
21// orderings.
22//
23// In order to speed up the computation, we limit the depth of the recursive
24// tree by a specified constant (SplitDepth) and apply at most a constant
25// number of greedy iterations per split (IterationsPerSplit). The worst-case
26// time complexity of the implementation is bounded by O(M*log^2 N), where
27// N is the number of FunctionNodes and M is the number of
28// FunctionNode-UtilityNode edges; (assuming that any collection of D
29// FunctionNodes contains O(D) UtilityNodes). Notice that the two different
30// recursive sub-problems are independent and thus can be efficiently processed
31// in parallel.
32//
33// Reference:
34// * Optimizing Function Layout for Mobile Applications,
35// https://arxiv.org/abs/2211.09285
36//
37//===----------------------------------------------------------------------===//
38
39#ifndef LLVM_SUPPORT_BALANCED_PARTITIONING_H
40#define LLVM_SUPPORT_BALANCED_PARTITIONING_H
41
42#include "raw_ostream.h"
43#include "llvm/ADT/ArrayRef.h"
45
46#include <atomic>
47#include <condition_variable>
48#include <mutex>
49#include <random>
50#include <vector>
51
52namespace llvm {
53
54class ThreadPoolInterface;
55/// A function with a set of utility nodes where it is beneficial to order two
56/// functions close together if they have similar utility nodes
59
60public:
61 using IDT = uint64_t;
63
64 /// \param UtilityNodes the set of utility nodes (must be unique'd)
67
68 /// The ID of this node
70
71 LLVM_ABI void dump(raw_ostream &OS) const;
72
73protected:
74 /// The list of utility nodes associated with this node
76 /// The bucket assigned by balanced partitioning
77 std::optional<unsigned> Bucket;
78 /// The index of the input order of the FunctionNodes
80
84};
85
86/// Algorithm parameters; default values are tuned on real-world binaries
88 /// The depth of the recursive bisection
89 unsigned SplitDepth = 18;
90 /// The maximum number of bp iterations per split
91 unsigned IterationsPerSplit = 40;
92 /// The probability for a vertex to skip a move from its current bucket to
93 /// another bucket; it often helps to escape from a local optima
94 float SkipProbability = 0.1f;
95 /// Recursive subtasks up to the given depth are added to the queue and
96 /// distributed among threads by ThreadPool; all subsequent calls are executed
97 /// on the same thread
98 unsigned TaskSplitDepth = 9;
99};
100
102public:
104
105 /// Run recursive graph partitioning that optimizes a given objective.
106 LLVM_ABI void run(std::vector<BPFunctionNode> &Nodes) const;
107
108private:
109 struct UtilitySignature;
111 using FunctionNodeRange =
113
114 /// A special ThreadPool that allows for spawning new tasks after blocking on
115 /// wait(). BalancedPartitioning recursively spawns new threads inside other
116 /// threads, so we need to track how many active threads that could spawn more
117 /// threads.
118 struct BPThreadPool {
119 ThreadPoolInterface &TheThreadPool;
120 std::mutex mtx;
121 std::condition_variable cv;
122 /// The number of threads that could spawn more threads
123 std::atomic<int> NumActiveThreads = 0;
124 /// Only true when all threads are down spawning new threads
125 bool IsFinishedSpawning = false;
126 /// Asynchronous submission of the task to the pool
127 template <typename Func> void async(Func &&F);
128 /// Blocking wait for all threads to complete. Unlike ThreadPool, it is
129 /// acceptable for other threads to add more tasks while blocking on this
130 /// call.
131 LLVM_ABI void wait();
132 BPThreadPool(ThreadPoolInterface &TheThreadPool)
133 : TheThreadPool(TheThreadPool) {}
134 };
135
136 /// Run a recursive bisection of a given list of FunctionNodes
137 /// \param RecDepth the current depth of recursion
138 /// \param RootBucket the initial bucket of the dataVertices
139 /// \param Offset the assigned buckets are the range [Offset, Offset +
140 /// Nodes.size()]
141 void bisect(const FunctionNodeRange Nodes, unsigned RecDepth,
142 unsigned RootBucket, unsigned Offset,
143 std::optional<BPThreadPool> &TP) const;
144
145 /// Run bisection iterations
146 void runIterations(const FunctionNodeRange Nodes, unsigned LeftBucket,
147 unsigned RightBucket, std::mt19937 &RNG) const;
148
149 /// Run a bisection iteration to improve the optimization goal
150 /// \returns the total number of moved FunctionNodes
151 unsigned runIteration(const FunctionNodeRange Nodes, unsigned LeftBucket,
152 unsigned RightBucket, SignaturesT &Signatures,
153 std::mt19937 &RNG) const;
154
155 /// Try to move \p N from one bucket to another
156 /// \returns true iff \p N is moved
157 bool moveFunctionNode(BPFunctionNode &N, unsigned LeftBucket,
158 unsigned RightBucket, SignaturesT &Signatures,
159 std::mt19937 &RNG) const;
160
161 /// Split all the FunctionNodes into 2 buckets, StartBucket and StartBucket +
162 /// 1 The method is used for an initial assignment before a bisection step
163 void split(const FunctionNodeRange Nodes, unsigned StartBucket) const;
164
165 /// The cost of the uniform log-gap cost, assuming a utility node has \p X
166 /// FunctionNodes in the left bucket and \p Y FunctionNodes in the right one.
167 float logCost(unsigned X, unsigned Y) const;
168
169 float log2Cached(unsigned i) const;
170
172
173 /// Precomputed values of log2(x). Table size is small enough to fit in cache.
174 static constexpr unsigned LOG_CACHE_SIZE = 16384;
175 float Log2Cache[LOG_CACHE_SIZE];
176
177 /// The signature of a particular utility node used for the bisection step,
178 /// i.e., the number of \p FunctionNodes in each of the two buckets
179 struct UtilitySignature {
180 /// The number of \p FunctionNodes in the left bucket
181 unsigned LeftCount = 0;
182 /// The number of \p FunctionNodes in the right bucket
183 unsigned RightCount = 0;
184 /// The cached gain of moving a \p FunctionNode from the left bucket to the
185 /// right bucket
186 float CachedGainLR;
187 /// The cached gain of moving a \p FunctionNode from the right bucket to the
188 /// left bucket
189 float CachedGainRL;
190 /// Whether \p CachedGainLR and \p CachedGainRL are valid
191 bool CachedGainIsValid = false;
192 };
193
194protected:
195 /// Compute the move gain for uniform log-gap cost
196 LLVM_ABI static float moveGain(const BPFunctionNode &N, bool FromLeftToRight,
197 const SignaturesT &Signatures);
199};
200
201} // end namespace llvm
202
203#endif // LLVM_SUPPORT_BALANCED_PARTITIONING_H
#define LLVM_ABI
Definition: Compiler.h:213
RelaxConfig Config
Definition: ELF_riscv.cpp:506
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define F(x, y, z)
Definition: MD5.cpp:55
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
raw_pwrite_stream & OS
static const FuncProtoTy Signatures[]
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
A function with a set of utility nodes where it is beneficial to order two functions close together i...
IDT Id
The ID of this node.
friend class BPFunctionNodeTest_Basic_Test
BPFunctionNode(IDT Id, ArrayRef< UtilityNodeT > UtilityNodes)
friend class BalancedPartitioningTest_Large_Test
SmallVector< UtilityNodeT, 4 > UtilityNodes
The list of utility nodes associated with this node.
uint64_t InputOrderIndex
The index of the input order of the FunctionNodes.
friend class BalancedPartitioningTest_Basic_Test
std::optional< unsigned > Bucket
The bucket assigned by balanced partitioning.
LLVM_ABI void dump(raw_ostream &OS) const
static LLVM_ABI float moveGain(const BPFunctionNode &N, bool FromLeftToRight, const SignaturesT &Signatures)
Compute the move gain for uniform log-gap cost.
LLVM_ABI void run(std::vector< BPFunctionNode > &Nodes) const
Run recursive graph partitioning that optimizes a given objective.
friend class BalancedPartitioningTest_MoveGain_Test
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
This defines the abstract base interface for a ThreadPool allowing asynchronous parallel execution on...
Definition: ThreadPool.h:50
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:53
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:477
#define N
Algorithm parameters; default values are tuned on real-world binaries.
unsigned SplitDepth
The depth of the recursive bisection.
float SkipProbability
The probability for a vertex to skip a move from its current bucket to another bucket; it often helps...
unsigned TaskSplitDepth
Recursive subtasks up to the given depth are added to the queue and distributed among threads by Thre...
unsigned IterationsPerSplit
The maximum number of bp iterations per split.