18#include "llvm/Config/llvm-config.h"
41#define DEBUG_TYPE "block-freq"
45 "check-bfi-unknown-block-queries",
47 cl::desc(
"Check if block frequency is queried for an unknown block "
48 "for debugging missed BFI updates"));
52 cl::desc(
"Apply an iterative post-processing to infer correct BFI counts"));
56 cl::desc(
"Iterative inference: maximum number of update iterations "
61 cl::desc(
"Iterative inference: delta convergence precision; smaller values "
62 "typically lead to better results at the cost of worsen runtime"));
71#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
83 for (
int Digits = 0; Digits < 16; ++Digits)
116struct DitheringDistributer {
120 DitheringDistributer(Distribution &Dist,
const BlockMass &Mass);
127DitheringDistributer::DitheringDistributer(Distribution &Dist,
130 RemWeight = Dist.Total;
135 assert(Weight &&
"invalid weight");
136 assert(Weight <= RemWeight);
145void Distribution::add(
const BlockNode &
Node,
uint64_t Amount,
146 Weight::DistType
Type) {
147 assert(Amount &&
"invalid weight of 0");
151 bool IsOverflow = NewTotal <
Total;
152 assert(!(DidOverflow && IsOverflow) &&
"unexpected repeated overflow");
153 DidOverflow |= IsOverflow;
159 Weights.push_back(Weight(
Type,
Node, Amount));
163 assert(OtherW.TargetNode.isValid());
168 assert(W.Type == OtherW.Type);
169 assert(W.TargetNode == OtherW.TargetNode);
170 assert(OtherW.Amount &&
"Expected non-zero weight");
171 if (W.Amount > W.Amount + OtherW.Amount)
175 W.Amount += OtherW.Amount;
180 llvm::sort(Weights, [](
const Weight &L,
const Weight &R) {
181 return L.TargetNode < R.TargetNode;
185 WeightList::iterator O = Weights.begin();
186 for (WeightList::const_iterator
I = O, L = O, E = Weights.end();
I != E;
191 for (++L; L != E &&
I->TargetNode == L->TargetNode; ++L)
196 Weights.erase(O, Weights.end());
204 for (
const Weight &W : Weights)
208 if (Weights.size() == Combined.size())
213 Weights.reserve(Combined.size());
214 for (
const auto &
I : Combined)
215 Weights.push_back(
I.second);
220 if (Weights.size() > 128) {
233 return (
N >> Shift) + (UINT64_C(1) &
N >> (Shift - 1));
236void Distribution::normalize() {
242 if (Weights.size() > 1)
246 if (Weights.size() == 1) {
248 Weights.front().Amount = 1;
259 else if (
Total > UINT32_MAX)
266 assert(
Total == std::accumulate(Weights.begin(), Weights.end(), UINT64_C(0),
268 return Sum + W.Amount;
270 "Expected total to be correct");
279 for (
Weight &W : Weights) {
282 assert(W.TargetNode.isValid());
284 assert(W.Amount <= UINT32_MAX);
295 std::vector<FrequencyData>().swap(Freqs);
296 IsIrrLoopHeader.clear();
297 std::vector<WorkingData>().swap(Working);
305 std::vector<FrequencyData> SavedFreqs(std::move(BFI.Freqs));
308 BFI.Freqs = std::move(SavedFreqs);
309 BFI.IsIrrLoopHeader = std::move(SavedIsIrrLoopHeader);
320 auto isLoopHeader = [&OuterLoop](
const BlockNode &Node) {
321 return OuterLoop && OuterLoop->
isHeader(Node);
327 auto debugSuccessor = [&](
const char *
Type) {
330 if (!isLoopHeader(Resolved))
332 if (Resolved != Succ)
336 (void)debugSuccessor;
339 if (isLoopHeader(Resolved)) {
345 if (Working[Resolved.Index].getContainingLoop() != OuterLoop) {
351 if (Resolved < Pred) {
352 if (!isLoopHeader(Pred)) {
355 "unhandled irreducible control flow");
366 "unhandled irreducible control flow");
377 for (
const auto &
I :
Loop.Exits)
400 const Scaled64 InfiniteLoopScale(1, 12);
405 for (
auto &Mass :
Loop.BackedgeMass)
406 TotalBackedgeMass += Mass;
418 <<
" - scale = " <<
Loop.Scale <<
"\n");
427 if (
auto *
Loop = Working[M.Index].getPackagedLoop())
431 Loop.IsPackaged =
true;
436 const DitheringDistributer &
D,
const BlockNode &
T,
438 dbgs() <<
" => assign " << M <<
" (" <<
D.RemMass <<
")";
442 dbgs() <<
" to " << BFI.getBlockName(
T);
454 DitheringDistributer
D(Dist, Mass);
459 if (W.Type == Weight::Local) {
460 Working[W.TargetNode.Index].getMass() += Taken;
466 assert(OuterLoop &&
"backedge or exit outside of loop");
469 if (W.Type == Weight::Backedge) {
476 assert(W.Type == Weight::Exit);
483 const Scaled64 &Min,
const Scaled64 &Max) {
491 const unsigned MaxBits =
sizeof(Scaled64::DigitsType) * CHAR_BIT;
495 const unsigned Slack = 10;
496 Scaled64 ScalingFactor = Scaled64(1, MaxBits - Slack) / Max;
499 LLVM_DEBUG(
dbgs() <<
"float-to-int: min = " << Min <<
", max = " << Max
500 <<
", factor = " << ScalingFactor <<
"\n");
502 for (
size_t Index = 0; Index < BFI.Freqs.size(); ++Index) {
503 Scaled64
Scaled = BFI.Freqs[Index].Scaled * ScalingFactor;
504 BFI.Freqs[Index].Integer = std::max(UINT64_C(1),
Scaled.toInt<
uint64_t>());
505 LLVM_DEBUG(
dbgs() <<
" - " << BFI.getBlockName(Index) <<
": float = "
506 << BFI.Freqs[Index].Scaled <<
", scaled = " <<
Scaled
507 <<
", int = " << BFI.Freqs[Index].Integer <<
"\n");
517 <<
": mass = " <<
Loop.Mass <<
", scale = " <<
Loop.Scale
520 Loop.IsPackaged =
false;
526 for (
const BlockNode &
N :
Loop.Nodes) {
527 const auto &Working = BFI.Working[
N.Index];
528 Scaled64 &
F = Working.isAPackage() ? Working.getPackagedLoop()->Scale
529 : BFI.Freqs[
N.Index].Scaled;
530 Scaled64 New =
Loop.Scale *
F;
539 for (
size_t Index = 0; Index < Working.size(); ++Index)
549 auto Min = Scaled64::getLargest();
550 auto Max = Scaled64::getZero();
551 for (
size_t Index = 0; Index < Working.size(); ++Index) {
553 Min = std::min(Min, Freqs[Index].
Scaled);
554 Max = std::max(Max, Freqs[Index].
Scaled);
569 if (!Node.isValid()) {
574 OS <<
"*** Detected BFI query for unknown block " <<
getBlockName(Node);
583std::optional<uint64_t>
586 bool AllowSynthetic)
const {
587 return getProfileCountFromFreq(
F, getBlockFreq(Node), AllowSynthetic);
592 auto EntryCount =
F.getEntryCount(AllowSynthetic);
596 APInt BlockCount(128, EntryCount->getCount());
598 APInt EntryFreq(128, getEntryFreq().getFrequency());
599 BlockCount *= BlockFreq;
602 BlockCount = (BlockCount + EntryFreq.
lshr(1)).udiv(EntryFreq);
610 return IsIrrLoopHeader.test(Node.Index);
616 return Scaled64::getZero();
617 return Freqs[Node.Index].Scaled;
622 assert(Node.isValid() &&
"Expected valid node");
623 assert(Node.Index < Freqs.size() &&
"Expected legal index");
640 for (
auto N : OuterLoop.
Nodes)
660 if (OuterLoop && OuterLoop->
isHeader(Succ))
666 Irr.
Edges.push_back(&SuccIrr);
667 SuccIrr.
Edges.push_front(&Irr);
692 const std::vector<const IrreducibleGraph::IrrNode *> &SCC,
693 LoopData::NodeList &Headers, LoopData::NodeList &Others) {
698 for (
const auto *
I : SCC)
701 for (
auto I = InSCC.
begin(), E = InSCC.
end();
I != E; ++
I) {
702 auto &Irr = *
I->first;
703 for (
const auto *
P :
make_range(Irr.pred_begin(), Irr.pred_end())) {
709 Headers.push_back(Irr.Node);
715 assert(Headers.size() >= 2 &&
716 "Expected irreducible CFG; -loop-info is likely invalid");
717 if (Headers.size() == InSCC.
size()) {
724 for (
const auto &
I : InSCC) {
729 auto &Irr = *
I.first;
730 for (
const auto *
P :
make_range(Irr.pred_begin(), Irr.pred_end())) {
732 if (
P->Node < Irr.Node)
741 Headers.push_back(Irr.Node);
746 if (Headers.back() == Irr.Node)
751 Others.push_back(Irr.Node);
752 LLVM_DEBUG(
dbgs() <<
" => other = " << BFI.getBlockName(Irr.Node) <<
"\n");
760 LoopData *OuterLoop, std::list<LoopData>::iterator Insert,
761 const std::vector<const IrreducibleGraph::IrrNode *> &SCC) {
765 LoopData::NodeList Headers;
766 LoopData::NodeList Others;
769 auto Loop = BFI.Loops.emplace(Insert, OuterLoop, Headers.begin(),
770 Headers.end(), Others.begin(), Others.end());
773 for (
const auto &
N :
Loop->Nodes)
774 if (BFI.Working[
N.Index].isLoopHeader())
775 BFI.Working[
N.Index].Loop->Parent = &*
Loop;
777 BFI.Working[
N.Index].Loop = &*
Loop;
783 std::list<LoopData>::iterator Insert) {
784 assert((OuterLoop ==
nullptr) == (Insert ==
Loops.begin()));
785 auto Prev = OuterLoop ? std::prev(Insert) :
Loops.end();
806 for (
auto I = O, E = OuterLoop.
Nodes.
end();
I != E; ++
I)
807 if (!Working[
I->Index].isPackaged())
813 assert(
Loop.isIrreducible() &&
"this only makes sense on irreducible loops");
826 auto &HeaderNode =
Loop.Nodes[
H];
827 auto &BackedgeMass =
Loop.BackedgeMass[
Loop.getHeaderIndex(HeaderNode)];
831 if (BackedgeMass.getMass() > 0)
832 Dist.
addLocal(HeaderNode, BackedgeMass.getMass());
837 DitheringDistributer
D(Dist, LoopMass);
840 <<
" to headers using above weights\n");
843 assert(W.Type == Weight::Local &&
"all weights should be local");
844 Working[W.TargetNode.Index].getMass() = Taken;
851 DitheringDistributer
D(Dist, LoopMass);
854 assert(W.Type == Weight::Local &&
"all weights should be local");
855 Working[W.TargetNode.Index].getMass() = Taken;
This file implements a class to represent arbitrary precision integral constant values and operations...
static void combineWeightsBySorting(WeightList &Weights)
static void cleanup(BlockFrequencyInfoImplBase &BFI)
Clear all memory not needed downstream.
static void combineWeightsByHashing(WeightList &Weights)
static void unwrapLoop(BlockFrequencyInfoImplBase &BFI, LoopData &Loop)
Unwrap a loop package.
static void combineWeight(Weight &W, const Weight &OtherW)
static void debugAssign(const BlockFrequencyInfoImplBase &BFI, const DitheringDistributer &D, const BlockNode &T, const BlockMass &M, const char *Desc)
static void combineWeights(WeightList &Weights)
static char getHexDigit(int N)
static void findIrreducibleHeaders(const BlockFrequencyInfoImplBase &BFI, const IrreducibleGraph &G, const std::vector< const IrreducibleGraph::IrrNode * > &SCC, LoopData::NodeList &Headers, LoopData::NodeList &Others)
Find extra irreducible headers.
static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI, const Scaled64 &Min, const Scaled64 &Max)
static void createIrreducibleLoop(BlockFrequencyInfoImplBase &BFI, const IrreducibleGraph &G, LoopData *OuterLoop, std::list< LoopData >::iterator Insert, const std::vector< const IrreducibleGraph::IrrNode * > &SCC)
static uint64_t shiftRightAndRound(uint64_t N, int Shift)
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file defines the DenseMap class.
This builds on the llvm/ADT/GraphTraits.h file to find the strongly connected components (SCCs) of a ...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the SmallString class.
Class for arbitrary precision integers.
uint64_t getLimitedValue(uint64_t Limit=UINT64_MAX) const
If this value is smaller than the specified limit, return it, otherwise return the limit value.
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Base class for BlockFrequencyInfoImpl.
std::vector< WorkingData > Working
Loop data: see initializeLoops().
bool addLoopSuccessorsToDist(const LoopData *OuterLoop, LoopData &Loop, Distribution &Dist)
Add all edges out of a packaged loop to the distribution.
ScaledNumber< uint64_t > Scaled64
std::string getLoopName(const LoopData &Loop) const
bool isIrrLoopHeader(const BlockNode &Node)
void computeLoopScale(LoopData &Loop)
Compute the loop scale for a loop.
void packageLoop(LoopData &Loop)
Package up a loop.
virtual std::string getBlockName(const BlockNode &Node) const
void finalizeMetrics()
Finalize frequency metrics.
void setBlockFreq(const BlockNode &Node, BlockFrequency Freq)
void updateLoopWithIrreducible(LoopData &OuterLoop)
Update a loop after packaging irreducible SCCs inside of it.
void clear()
Clear all memory.
std::optional< uint64_t > getBlockProfileCount(const Function &F, const BlockNode &Node, bool AllowSynthetic=false) const
BlockFrequency getBlockFreq(const BlockNode &Node) const
void distributeIrrLoopHeaderMass(Distribution &Dist)
iterator_range< std::list< LoopData >::iterator > analyzeIrreducible(const bfi_detail::IrreducibleGraph &G, LoopData *OuterLoop, std::list< LoopData >::iterator Insert)
Analyze irreducible SCCs.
bool addToDist(Distribution &Dist, const LoopData *OuterLoop, const BlockNode &Pred, const BlockNode &Succ, uint64_t Weight)
Add an edge to the distribution.
void unwrapLoops()
Unwrap loops.
std::optional< uint64_t > getProfileCountFromFreq(const Function &F, BlockFrequency Freq, bool AllowSynthetic=false) const
Scaled64 getFloatingBlockFreq(const BlockNode &Node) const
void distributeMass(const BlockNode &Source, LoopData *OuterLoop, Distribution &Dist)
Distribute mass according to a distribution.
void adjustLoopHeaderMass(LoopData &Loop)
Adjust the mass of all headers in an irreducible loop.
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
BlockT * getHeader() const
Represents a single loop in the control flow graph.
Simple representation of a scaled number.
ScaledNumber inverse() const
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
iterator erase(const_iterator CI)
void push_back(const T &Elt)
The instances of the Type class are immutable: once they are created, they are never changed.
raw_ostream & print(raw_ostream &OS) const
static BlockMass getEmpty()
static BlockMass getFull()
ScaledNumber< uint64_t > toScaled() const
Convert to scaled number.
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
A raw_ostream that writes to an SmallVector or SmallString.
std::string getBlockName(const BlockT *BB)
Get the name of a MachineBasicBlock.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
scc_iterator< T > scc_begin(const T &G)
Construct the begin iterator for a deduced graph type T.
llvm::cl::opt< unsigned > IterativeBFIMaxIterationsPerBlock
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
void sort(IteratorTy Start, IteratorTy End)
llvm::cl::opt< bool > UseIterativeBFIInference
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
llvm::cl::opt< bool > CheckBFIUnknownBlockQueries
llvm::cl::opt< double > IterativeBFIPrecision
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Representative of a block.
Distribution of unscaled probability weight.
void addBackedge(const BlockNode &Node, uint64_t Amount)
SmallVector< Weight, 4 > WeightList
WeightList Weights
Individual successor weights.
void addExit(const BlockNode &Node, uint64_t Amount)
void addLocal(const BlockNode &Node, uint64_t Amount)
Stats about a block itself.
bool isHeader(const BlockNode &Node) const
ExitMap Exits
Successor edges (and weights).
bool isIrreducible() const
BlockNode getHeader() const
NodeList Nodes
Header and the members of the loop.
HeaderMassList BackedgeMass
Mass returned to each loop header.
HeaderMassList::difference_type getHeaderIndex(const BlockNode &B)
Unscaled probability weight.
Description of the encoding of one expression Op.
static ChildIteratorType child_begin(NodeRef N)
static ChildIteratorType child_end(NodeRef N)
GraphT::IrrNode::iterator ChildIteratorType
static NodeRef getEntryNode(const GraphT &G)
std::deque< const IrrNode * > Edges
Graph of irreducible control flow.
void addNodesInFunction()
void addEdge(IrrNode &Irr, const BlockNode &Succ, const BFIBase::LoopData *OuterLoop)
std::vector< IrrNode > Nodes
SmallDenseMap< uint32_t, IrrNode *, 4 > Lookup
void addNode(const BlockNode &Node)
void addNodesInLoop(const BFIBase::LoopData &OuterLoop)