docs/doxygen/BalancedPartitioning_8h_source.html

//===- BalancedPartitioning.h ---------------------------------------------===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

//

// This file implements BalancedPartitioning, a recursive balanced graph

// partitioning algorithm.

//

// The algorithm is used to find an ordering of FunctionNodes while optimizing

// a specified objective. The algorithm uses recursive bisection; it starts

// with a collection of unordered FunctionNodes and tries to split them into

// two sets (buckets) of equal cardinality. Each bisection step is comprised of

// iterations that greedily swap the FunctionNodes between the two buckets while

// there is an improvement of the objective. Once the process converges, the

// problem is divided into two sub-problems of half the size, which are

// recursively applied for the two buckets. The final ordering of the

// FunctionNodes is obtained by concatenating the two (recursively computed)

// orderings.

//

// In order to speed up the computation, we limit the depth of the recursive

// tree by a specified constant (SplitDepth) and apply at most a constant

// number of greedy iterations per split (IterationsPerSplit). The worst-case

// time complexity of the implementation is bounded by O(M*log^2 N), where

// N is the number of FunctionNodes and M is the number of

// FunctionNode-UtilityNode edges (assuming that any collection of D

// FunctionNodes contains O(D) UtilityNodes). Notice that the two different

// recursive sub-problems are independent and thus can be efficiently processed

// in parallel.

//

// Reference:

//   * Optimizing Function Layout for Mobile Applications,

//     https://arxiv.org/abs/2211.09285

//

//===----------------------------------------------------------------------===//


#ifndef LLVM_SUPPORT_BALANCED_PARTITIONING_H

#define LLVM_SUPPORT_BALANCED_PARTITIONING_H


#include "raw_ostream.h"

#include "llvm/ADT/ArrayRef.h"

#include "llvm/Support/Compiler.h"


#include <atomic>

#include <condition_variable>

#include <mutex>

#include <random>

#include <vector>


namespace llvm {


// Forward declaration: this header only holds a ThreadPoolInterface by
// reference (see BalancedPartitioning::BPThreadPool), so the full definition
// is not required here.
class ThreadPoolInterface;

/// A function described by the set of utility nodes it is associated with.
/// It is beneficial to place two functions close together in the final order
/// when their utility node sets are similar.
class BPFunctionNode {
  // The partitioner and the unit tests listed here are granted access to the
  // protected members.
  friend class BalancedPartitioning;
  friend class BPFunctionNodeTest_Basic_Test;
  friend class BalancedPartitioningTest_Basic_Test;
  friend class BalancedPartitioningTest_Large_Test;

public:
  using IDT = uint64_t;
  using UtilityNodeT = uint32_t;

  /// Construct a node; the elements of \p UtilityNodes are copied into the
  /// node's own storage.
  /// \param Id the ID of this node
  /// \param UtilityNodes the set of utility nodes (must be unique'd)
  BPFunctionNode(IDT Id, ArrayRef<UtilityNodeT> UtilityNodes)
      : Id(Id), UtilityNodes(UtilityNodes.begin(), UtilityNodes.end()) {}

  /// The ID of this node
  IDT Id;

  /// Dump this node to \p OS (defined out of line).
  LLVM_ABI void dump(raw_ostream &OS) const;

protected:
  /// The list of utility nodes associated with this node
  SmallVector<UtilityNodeT, 4> UtilityNodes;

  /// The bucket assigned by balanced partitioning; holds no value until one
  /// has been assigned
  std::optional<unsigned> Bucket;

  /// The index of the input order of the FunctionNodes
  uint64_t InputOrderIndex = 0;
};


/// Tunable parameters of the balanced partitioning algorithm. The defaults
/// are tuned on real-world binaries.
struct BalancedPartitioningConfig {
  /// How deep the recursive bisection goes
  unsigned SplitDepth{18};

  /// Upper bound on the number of bp iterations performed for each split
  unsigned IterationsPerSplit{40};

  /// Probability with which a vertex skips a move from its current bucket to
  /// the other one; skipping often helps to escape from a local optimum
  float SkipProbability{0.1f};

  /// Recursive subtasks up to this depth are queued and distributed among
  /// threads by ThreadPool; every deeper call runs on the same thread
  unsigned TaskSplitDepth{9};
};


/// Recursive balanced graph partitioning over a list of BPFunctionNodes; see
/// the comment at the top of this file for a description of the algorithm.
class BalancedPartitioning {
public:
  /// \param Config the algorithm parameters. The \c Config member below is a
  /// reference, so the passed object presumably has to outlive this instance
  /// (the constructor is defined out of line; confirm there).
  LLVM_ABI BalancedPartitioning(const BalancedPartitioningConfig &Config);

  /// Run recursive graph partitioning that optimizes a given objective.
  /// \param Nodes the FunctionNodes to partition, passed by mutable reference
  LLVM_ABI void run(std::vector<BPFunctionNode> &Nodes) const;

private:
  struct UtilitySignature;
  /// One UtilitySignature per utility node
  using SignaturesT = SmallVector<UtilitySignature, 4>;
  /// A sub-range of the vector of FunctionNodes handed to run()
  using FunctionNodeRange =
      iterator_range<std::vector<BPFunctionNode>::iterator>;

  /// A special ThreadPool that allows for spawning new tasks after blocking on
  /// wait(). BalancedPartitioning recursively spawns new threads inside other
  /// threads, so we need to track how many active threads that could spawn more
  /// threads.
  struct BPThreadPool {
    /// The underlying pool; held by reference, not owned
    ThreadPoolInterface &TheThreadPool;
    /// Mutex and condition variable; presumably used by wait() to block until
    /// spawning has finished (both are used out of line; confirm there)
    std::mutex mtx;
    std::condition_variable cv;
    /// The number of threads that could spawn more threads
    std::atomic<int> NumActiveThreads = 0;
    /// Only true when all threads are done spawning new threads
    bool IsFinishedSpawning = false;
    /// Asynchronous submission of the task to the pool
    template <typename Func> void async(Func &&F);
    /// Blocking wait for all threads to complete. Unlike ThreadPool, it is
    /// acceptable for other threads to add more tasks while blocking on this
    /// call.
    LLVM_ABI void wait();
    BPThreadPool(ThreadPoolInterface &TheThreadPool)
        : TheThreadPool(TheThreadPool) {}
  };

  /// Run a recursive bisection of a given list of FunctionNodes
  /// \param Nodes the FunctionNodes to bisect
  /// \param RecDepth the current depth of recursion
  /// \param RootBucket the initial bucket of the dataVertices
  /// \param Offset the assigned buckets are the range [Offset, Offset +
  /// Nodes.size()]
  /// \param TP the optional thread pool wrapper; presumably empty when the
  /// bisection runs without a pool (confirm in the definition)
  void bisect(const FunctionNodeRange Nodes, unsigned RecDepth,
              unsigned RootBucket, unsigned Offset,
              std::optional<BPThreadPool> &TP) const;

  /// Run bisection iterations
  /// \param Nodes the FunctionNodes being split
  /// \param LeftBucket the bucket on the left side of the split
  /// \param RightBucket the bucket on the right side of the split
  /// \param RNG the random number generator passed on to the iterations
  void runIterations(const FunctionNodeRange Nodes, unsigned LeftBucket,
                     unsigned RightBucket, std::mt19937 &RNG) const;

  /// Run a bisection iteration to improve the optimization goal
  /// \returns the total number of moved FunctionNodes
  unsigned runIteration(const FunctionNodeRange Nodes, unsigned LeftBucket,
                        unsigned RightBucket, SignaturesT &Signatures,
                        std::mt19937 &RNG) const;

  /// Try to move \p N from one bucket to another
  /// \returns true iff \p N is moved
  bool moveFunctionNode(BPFunctionNode &N, unsigned LeftBucket,
                        unsigned RightBucket, SignaturesT &Signatures,
                        std::mt19937 &RNG) const;

  /// Split all the FunctionNodes into 2 buckets, StartBucket and StartBucket +
  /// 1. The method is used for an initial assignment before a bisection step.
  void split(const FunctionNodeRange Nodes, unsigned StartBucket) const;

  /// The cost of the uniform log-gap cost, assuming a utility node has \p X
  /// FunctionNodes in the left bucket and \p Y FunctionNodes in the right one.
  float logCost(unsigned X, unsigned Y) const;

  /// log2 of \p i; presumably answered from \c Log2Cache when \p i falls
  /// inside the table (defined out of line; confirm there).
  float log2Cached(unsigned i) const;

  /// The algorithm parameters. This is a reference, not a copy, so it is only
  /// valid while the referenced config object is alive.
  const BalancedPartitioningConfig &Config;

  /// Precomputed values of log2(x). Table size is small enough to fit in cache.
  static constexpr unsigned LOG_CACHE_SIZE = 16384;
  float Log2Cache[LOG_CACHE_SIZE];

  /// The signature of a particular utility node used for the bisection step,
  /// i.e., the number of \p FunctionNodes in each of the two buckets
  struct UtilitySignature {
    /// The number of \p FunctionNodes in the left bucket
    unsigned LeftCount = 0;
    /// The number of \p FunctionNodes in the right bucket
    unsigned RightCount = 0;
    /// The cached gain of moving a \p FunctionNode from the left bucket to the
    /// right bucket. Zero-initialized so that a default-constructed signature
    /// never carries an indeterminate value; only meaningful while
    /// \p CachedGainIsValid is true.
    float CachedGainLR = 0.0f;
    /// The cached gain of moving a \p FunctionNode from the right bucket to the
    /// left bucket. Zero-initialized for the same reason as \p CachedGainLR.
    float CachedGainRL = 0.0f;
    /// Whether \p CachedGainLR and \p CachedGainRL are valid
    bool CachedGainIsValid = false;
  };

protected:
  /// Compute the move gain for uniform log-gap cost
  /// \param N the FunctionNode considered for a move
  /// \param FromLeftToRight the direction of the move: left bucket to right
  /// bucket when true, right bucket to left bucket otherwise
  /// \param Signatures the signatures of the utility nodes
  LLVM_ABI static float moveGain(const BPFunctionNode &N, bool FromLeftToRight,
                                 const SignaturesT &Signatures);
  friend class BalancedPartitioningTest_MoveGain_Test;
};


} // end namespace llvm


#endif // LLVM_SUPPORT_BALANCED_PARTITIONING_H

ArrayRef.h

Compiler.h

LLVM_ABI
#define LLVM_ABI
Definition Compiler.h:213

F
#define F(x, y, z)
Definition MD5.cpp:54

Y
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")

X
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")

llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40

llvm::BPFunctionNode
A function with a set of utility nodes where it is beneficial to order two functions close together i...
Definition BalancedPartitioning.h:57

llvm::BPFunctionNode::BalancedPartitioning
friend class BalancedPartitioning
Definition BalancedPartitioning.h:58

llvm::BPFunctionNode::IDT
uint64_t IDT
Definition BalancedPartitioning.h:61

llvm::BPFunctionNode::Id
IDT Id
The ID of this node.
Definition BalancedPartitioning.h:69

llvm::BPFunctionNode::BPFunctionNodeTest_Basic_Test
friend class BPFunctionNodeTest_Basic_Test
Definition BalancedPartitioning.h:81

llvm::BPFunctionNode::BPFunctionNode
BPFunctionNode(IDT Id, ArrayRef< UtilityNodeT > UtilityNodes)
Definition BalancedPartitioning.h:65

llvm::BPFunctionNode::BalancedPartitioningTest_Large_Test
friend class BalancedPartitioningTest_Large_Test
Definition BalancedPartitioning.h:83

llvm::BPFunctionNode::UtilityNodeT
uint32_t UtilityNodeT
Definition BalancedPartitioning.h:62

llvm::BPFunctionNode::UtilityNodes
SmallVector< UtilityNodeT, 4 > UtilityNodes
The list of utility nodes associated with this node.
Definition BalancedPartitioning.h:75

llvm::BPFunctionNode::InputOrderIndex
uint64_t InputOrderIndex
The index of the input order of the FunctionNodes.
Definition BalancedPartitioning.h:79

llvm::BPFunctionNode::BalancedPartitioningTest_Basic_Test
friend class BalancedPartitioningTest_Basic_Test
Definition BalancedPartitioning.h:82

llvm::BPFunctionNode::Bucket
std::optional< unsigned > Bucket
The bucket assigned by balanced partitioning.
Definition BalancedPartitioning.h:77

llvm::BPFunctionNode::dump
LLVM_ABI void dump(raw_ostream &OS) const
Definition BalancedPartitioning.cpp:24

llvm::BalancedPartitioning::moveGain
static LLVM_ABI float moveGain(const BPFunctionNode &N, bool FromLeftToRight, const SignaturesT &Signatures)
Compute the move gain for uniform log-gap cost.
Definition BalancedPartitioning.cpp:319

llvm::BalancedPartitioning::run
LLVM_ABI void run(std::vector< BPFunctionNode > &Nodes) const
Run recursive graph partitioning that optimizes a given objective.
Definition BalancedPartitioning.cpp:79

llvm::BalancedPartitioning::BalancedPartitioningTest_MoveGain_Test
friend class BalancedPartitioningTest_MoveGain_Test
Definition BalancedPartitioning.h:198

llvm::BalancedPartitioning::BalancedPartitioning
LLVM_ABI BalancedPartitioning(const BalancedPartitioningConfig &Config)
Definition BalancedPartitioning.cpp:70

llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition SmallVector.h:1203

llvm::ThreadPoolInterface
This defines the abstract base interface for a ThreadPool allowing asynchronous parallel execution on...
Definition ThreadPool.h:51

llvm::iterator_range
A range adaptor for a pair of iterators.
Definition iterator_range.h:32

llvm::raw_ostream
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53

uint32_t

uint64_t

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition AddressRanges.h:18

llvm::Offset
@ Offset
Definition DWP.cpp:532

raw_ostream.h

N
#define N

llvm::BalancedPartitioningConfig
Algorithm parameters; default values are tuned on real-world binaries.
Definition BalancedPartitioning.h:87

llvm::BalancedPartitioningConfig::SplitDepth
unsigned SplitDepth
The depth of the recursive bisection.
Definition BalancedPartitioning.h:89

llvm::BalancedPartitioningConfig::SkipProbability
float SkipProbability
The probability for a vertex to skip a move from its current bucket to another bucket; it often helps...
Definition BalancedPartitioning.h:94

llvm::BalancedPartitioningConfig::TaskSplitDepth
unsigned TaskSplitDepth
Recursive subtasks up to the given depth are added to the queue and distributed among threads by Thre...
Definition BalancedPartitioning.h:98

llvm::BalancedPartitioningConfig::IterationsPerSplit
unsigned IterationsPerSplit
The maximum number of bp iterations per split.
Definition BalancedPartitioning.h:91