15#include "llvm/Config/llvm-config.h"
22#define DEBUG_TYPE "balanced-partitioning"
25 OS <<
formatv(
"{{ID={0} Utilities={{{1:$[,]}} Bucket={2}}",
Id,
29template <
typename Func>
30void BalancedPartitioning::BPThreadPool::async(Func &&
F) {
31#if LLVM_ENABLE_THREADS
34 TheThreadPool.async([
this,
F]() {
39 if (--NumActiveThreads == 0) {
42 std::unique_lock<std::mutex> lock(mtx);
43 assert(!IsFinishedSpawning);
44 IsFinishedSpawning =
true;
54void BalancedPartitioning::BPThreadPool::wait() {
55#if LLVM_ENABLE_THREADS
59 std::unique_lock<std::mutex> lock(mtx);
60 cv.wait(lock, [&]() {
return IsFinishedSpawning; });
61 assert(IsFinishedSpawning && NumActiveThreads == 0);
75 for (
unsigned I = 1;
I < LOG_CACHE_SIZE;
I++)
76 Log2Cache[
I] = std::log2(
I);
82 "Partitioning %d nodes using depth %d and %d iterations per split\n",
84 std::optional<BPThreadPool> TP;
85#if LLVM_ENABLE_THREADS
88 TP.emplace(TheThreadPool);
92 for (
unsigned I = 0;
I < Nodes.size();
I++)
93 Nodes[
I].InputOrderIndex =
I;
96 auto BisectTask = [
this, NodesRange, &TP]() {
97 bisect(NodesRange, 0, 1, 0, TP);
100 TP->async(std::move(BisectTask));
107 return L.Bucket < R.Bucket;
113void BalancedPartitioning::bisect(
const FunctionNodeRange Nodes,
114 unsigned RecDepth,
unsigned RootBucket,
116 std::optional<BPThreadPool> &TP)
const {
117 unsigned NumNodes = std::distance(Nodes.begin(), Nodes.end());
118 if (NumNodes <= 1 || RecDepth >= Config.
SplitDepth) {
121 llvm::sort(Nodes, [](
const auto &L,
const auto &R) {
122 return L.InputOrderIndex < R.InputOrderIndex;
124 for (
auto &
N : Nodes)
130 NumNodes, RootBucket));
132 std::mt19937 RNG(RootBucket);
134 unsigned LeftBucket = 2 * RootBucket;
135 unsigned RightBucket = 2 * RootBucket + 1;
138 split(Nodes, LeftBucket);
140 runIterations(Nodes, LeftBucket, RightBucket, RNG);
145 unsigned MidOffset =
Offset + std::distance(Nodes.begin(), NodesMid);
150 auto LeftRecTask = [
this, LeftNodes, RecDepth, LeftBucket,
Offset, &TP]() {
151 bisect(LeftNodes, RecDepth + 1, LeftBucket,
Offset, TP);
153 auto RightRecTask = [
this, RightNodes, RecDepth, RightBucket, MidOffset,
155 bisect(RightNodes, RecDepth + 1, RightBucket, MidOffset, TP);
158 if (TP && RecDepth < Config.TaskSplitDepth && NumNodes >= 4) {
159 TP->async(std::move(LeftRecTask));
160 TP->async(std::move(RightRecTask));
167void BalancedPartitioning::runIterations(
const FunctionNodeRange Nodes,
169 unsigned RightBucket,
170 std::mt19937 &RNG)
const {
171 unsigned NumNodes = std::distance(Nodes.begin(), Nodes.end());
173 for (
auto &
N : Nodes)
174 for (
auto &UN :
N.UtilityNodes)
175 ++UtilityNodeIndex[UN];
178 for (
auto &
N : Nodes)
180 return UtilityNodeIndex[UN] == 1 || UtilityNodeIndex[UN] == NumNodes;
184 UtilityNodeIndex.
clear();
185 for (
auto &
N : Nodes)
186 for (
auto &UN :
N.UtilityNodes)
187 UN = UtilityNodeIndex.
insert({UN, UtilityNodeIndex.
size()}).first->second;
191 for (
auto &
N : Nodes) {
192 for (
auto &UN :
N.UtilityNodes) {
194 if (
N.Bucket == LeftBucket) {
203 unsigned NumMovedNodes =
204 runIteration(Nodes, LeftBucket, RightBucket,
Signatures, RNG);
205 if (NumMovedNodes == 0)
210unsigned BalancedPartitioning::runIteration(
const FunctionNodeRange Nodes,
212 unsigned RightBucket,
214 std::mt19937 &RNG)
const {
217 if (Signature.CachedGainIsValid)
219 unsigned L = Signature.LeftCount;
220 unsigned R = Signature.RightCount;
221 assert((L > 0 || R > 0) &&
"incorrect signature");
222 float Cost = logCost(L, R);
223 Signature.CachedGainLR = 0.f;
224 Signature.CachedGainRL = 0.f;
226 Signature.CachedGainLR =
Cost - logCost(L - 1, R + 1);
228 Signature.CachedGainRL =
Cost - logCost(L + 1, R - 1);
229 Signature.CachedGainIsValid =
true;
233 typedef std::pair<float, BPFunctionNode *> GainPair;
234 std::vector<GainPair> Gains;
235 for (
auto &
N : Nodes) {
236 bool FromLeftToRight = (
N.Bucket == LeftBucket);
238 Gains.push_back(std::make_pair(Gain, &
N));
243 Gains, [&](
const auto &GP) {
return GP.second->Bucket == LeftBucket; });
248 auto LargerGain = [](
const auto &
L,
const auto &
R) {
249 return L.first >
R.first;
254 unsigned NumMovedDataVertices = 0;
255 for (
auto [LeftPair, RightPair] :
llvm::zip(LeftRange, RightRange)) {
256 auto &[LeftGain, LeftNode] = LeftPair;
257 auto &[RightGain, RightNode] = RightPair;
259 if (LeftGain + RightGain <= 0.f)
262 if (moveFunctionNode(*LeftNode, LeftBucket, RightBucket,
Signatures, RNG))
263 ++NumMovedDataVertices;
264 if (moveFunctionNode(*RightNode, LeftBucket, RightBucket,
Signatures, RNG))
265 ++NumMovedDataVertices;
267 return NumMovedDataVertices;
272 unsigned RightBucket,
274 std::mt19937 &RNG)
const {
276 if (std::uniform_real_distribution<float>(0.f, 1.f)(RNG) <=
280 bool FromLeftToRight = (
N.Bucket == LeftBucket);
282 N.Bucket = (FromLeftToRight ? RightBucket : LeftBucket);
285 if (FromLeftToRight) {
286 for (
auto &UN :
N.UtilityNodes) {
288 Signature.LeftCount--;
289 Signature.RightCount++;
290 Signature.CachedGainIsValid =
false;
293 for (
auto &UN :
N.UtilityNodes) {
295 Signature.LeftCount++;
296 Signature.RightCount--;
297 Signature.CachedGainIsValid =
false;
303void BalancedPartitioning::split(
const FunctionNodeRange Nodes,
304 unsigned StartBucket)
const {
305 unsigned NumNodes = std::distance(Nodes.begin(), Nodes.end());
306 auto NodesMid = Nodes.begin() + (NumNodes + 1) / 2;
308 std::nth_element(Nodes.begin(), NodesMid, Nodes.end(), [](
auto &L,
auto &R) {
309 return L.InputOrderIndex < R.InputOrderIndex;
313 N.Bucket = StartBucket;
315 N.Bucket = StartBucket + 1;
319 bool FromLeftToRight,
322 for (
auto &UN :
N.UtilityNodes)
323 Gain += (FromLeftToRight ?
Signatures[UN].CachedGainLR
328float BalancedPartitioning::logCost(
unsigned X,
unsigned Y)
const {
329 return -(
X * log2Cached(
X + 1) +
Y * log2Cached(
Y + 1));
332float BalancedPartitioning::log2Cached(
unsigned i)
const {
333 return (i < LOG_CACHE_SIZE) ? Log2Cache[i] : std::log2(i);
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static const FuncProtoTy Signatures[]
A function with a set of utility nodes where it is beneficial to order two functions close together i...
IDT Id
The ID of this node.
SmallVector< UtilityNodeT, 4 > UtilityNodes
The list of utility nodes associated with this node.
std::optional< unsigned > Bucket
The bucket assigned by balanced partitioning.
void dump(raw_ostream &OS) const
static float moveGain(const BPFunctionNode &N, bool FromLeftToRight, const SignaturesT &Signatures)
Compute the move gain for uniform log-gap cost.
void run(std::vector< BPFunctionNode > &Nodes) const
Run recursive graph partitioning that optimizes a given objective.
BalancedPartitioning(const BalancedPartitioningConfig &Config)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
A non-threaded implementation.
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class implements an extremely fast bulk output stream that can only output to a stream.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
This is an optimization pass for GlobalISel generic memory operations.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
void stable_sort(R &&Range)
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
auto formatv(bool Validate, const char *Fmt, Ts &&...Vals)
void sort(IteratorTy Start, IteratorTy End)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
auto partition(R &&Range, UnaryPredicate P)
Provide wrappers to std::partition which take ranges instead of having to pass begin/end explicitly.
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Algorithm parameters; default values are tuned on real-world binaries.
unsigned SplitDepth
The depth of the recursive bisection.
float SkipProbability
The probability for a vertex to skip a move from its current bucket to another bucket; it often helps...
unsigned TaskSplitDepth
Recursive subtasks up to the given depth are added to the queue and distributed among threads by Thre...
unsigned IterationsPerSplit
The maximum number of bp iterations per split.