37#define INSTR_PROF_VALUE_PROF_MEMOP_API
52#define DEBUG_TYPE "pgo-memop-opt"
54STATISTIC(NumOfPGOMemOPOpt,
"Number of memop intrinsics optimized.");
55STATISTIC(NumOfPGOMemOPAnnotate,
"Number of memop intrinsics annotated.");
60 cl::desc(
"The minimum count to optimize memory "
72 cl::desc(
"The percentage threshold for the "
73 "memory intrinsic calls optimization"));
78 cl::desc(
"The max version for the optimized memory "
84 cl::desc(
"Scale the memop size counts using the basic "
85 " block count value"));
90 cl::desc(
"Size-specialize memcmp and bcmp calls"));
94 cl::desc(
"Optimize the memop size <= this value"));
99 switch (
MI->getIntrinsicID()) {
100 case Intrinsic::memcpy:
102 case Intrinsic::memmove:
104 case Intrinsic::memset:
117 CallInst *asCI() {
return cast<CallInst>(
I); }
119 if (
auto MI = asMI())
120 return MemOp(cast<MemIntrinsic>(
MI->clone()));
121 return MemOp(cast<CallInst>(asCI()->clone()));
124 if (
auto MI = asMI())
125 return MI->getLength();
126 return asCI()->getArgOperand(2);
129 if (
auto MI = asMI())
131 asCI()->setArgOperand(2,
Length);
134 if (
auto MI = asMI())
135 return MI->getCalledFunction()->getName();
136 return asCI()->getCalledFunction()->getName();
139 if (
auto MI = asMI())
140 if (
MI->getIntrinsicID() == Intrinsic::memmove)
146 if (asMI() ==
nullptr && TLI.
getLibFunc(*asCI(), Func) &&
147 Func == LibFunc_memcmp) {
154 if (asMI() ==
nullptr && TLI.
getLibFunc(*asCI(), Func) &&
155 Func == LibFunc_bcmp) {
161 if (
auto MI = asMI())
162 return getMIName(
MI);
165 if (Func == LibFunc_memcmp)
167 if (Func == LibFunc_bcmp)
175class MemOPSizeOpt :
public InstVisitor<MemOPSizeOpt> {
181 bool isChanged()
const {
return Changed; }
186 for (
auto &MO : WorkList) {
187 ++NumOfPGOMemOPAnnotate;
192 <<
"is Transformed.\n");
200 if (isa<ConstantInt>(
Length))
202 WorkList.push_back(
MemOp(&
MI));
208 (Func == LibFunc_memcmp || Func == LibFunc_bcmp) &&
210 WorkList.push_back(
MemOp(&CI));
221 std::vector<MemOp> WorkList;
222 bool perform(
MemOp MO);
226 assert(Count <= TotalCount);
240 return ScaleCount / Denom;
243bool MemOPSizeOpt::perform(
MemOp MO) {
250 uint32_t MaxNumVals = INSTR_PROF_NUM_BUCKETS;
258 uint64_t SavedTotalCount = TotalCount;
260 auto BBEdgeCount =
BFI.getBlockProfileCount(MO.I->getParent());
263 ActualCount = *BBEdgeCount;
266 LLVM_DEBUG(
dbgs() <<
"Read one memory intrinsic profile with count "
267 << ActualCount <<
"\n");
270 : VDs) {
dbgs() <<
" (" << VD.Value <<
"," << VD.Count <<
")\n"; });
279 TotalCount = ActualCount;
282 <<
" denominator = " << SavedTotalCount <<
"\n");
286 uint64_t SavedRemainCount = SavedTotalCount;
295 for (
auto I = VDs.begin(), E = VDs.end();
I != E; ++
I) {
297 int64_t
V = VD.Value;
300 C = getScaledCount(
C, ActualCount, SavedTotalCount);
309 if (!isProfitable(
C, RemainCount)) {
310 RemainingVDs.
insert(RemainingVDs.
end(),
I, E);
314 if (!SeenSizeId.
insert(V).second) {
315 errs() <<
"warning: Invalid Profile Data in Function " <<
Func.getName()
316 <<
": Two identical values in MemOp value counts.\n";
327 assert(SavedRemainCount >= VD.Count);
328 SavedRemainCount -= VD.Count;
331 RemainingVDs.
insert(RemainingVDs.
end(),
I + 1, E);
339 CaseCounts[0] = RemainCount;
340 if (RemainCount > MaxCount)
341 MaxCount = RemainCount;
343 uint64_t SumForOpt = TotalCount - RemainCount;
345 LLVM_DEBUG(
dbgs() <<
"Optimize one memory intrinsic call to " << Version
346 <<
" Versions (covering " << SumForOpt <<
" out of "
347 << TotalCount <<
")\n");
368 auto OrigBBFreq =
BFI.getBlockFreq(BB);
375 MergeBB->
setName(
"MemOP.Merge");
376 BFI.setBlockFreq(MergeBB, OrigBBFreq);
377 DefaultBB->
setName(
"MemOP.Default");
380 auto &Ctx =
Func.getContext();
383 Value *SizeVar = MO.getLength();
385 Type *MemOpTy = MO.I->getType();
390 PHI = IRBM.CreatePHI(MemOpTy, SizeIds.
size() + 1,
"MemOP.RVMerge");
391 MO.I->replaceAllUsesWith(
PHI);
392 PHI->addIncoming(MO.I, DefaultBB);
396 MO.I->setMetadata(LLVMContext::MD_prof,
nullptr);
398 if (SavedRemainCount > 0 || Version != VDs.size()) {
401 IPVK_MemOPSize, VDs.
size());
406 std::vector<DominatorTree::UpdateType> Updates;
408 Updates.reserve(2 * SizeIds.
size());
412 Ctx,
Twine(
"MemOP.Case.") +
Twine(SizeId), &Func, DefaultBB);
413 MemOp NewMO = MO.clone();
415 auto *SizeType = dyn_cast<IntegerType>(NewMO.getLength()->getType());
416 assert(SizeType &&
"Expected integer type size argument.");
417 ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId);
418 NewMO.setLength(CaseSizeId);
419 NewMO.I->insertInto(CaseBB, CaseBB->
end());
421 IRBCase.CreateBr(MergeBB);
422 SI->addCase(CaseSizeId, CaseBB);
424 PHI->addIncoming(NewMO.I, CaseBB);
426 Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB});
427 Updates.push_back({DominatorTree::Insert, BB, CaseBB});
431 DTU.applyUpdates(Updates);
444 <<
"optimized " <<
NV(
"Memop", MO.getName(TLI)) <<
" with count "
445 <<
NV(
"Count", SumForOpt) <<
" out of " <<
NV(
"Total", TotalCount)
446 <<
" for " <<
NV(
"Versions", Version) <<
" versions";
459 if (
F.hasFnAttribute(Attribute::OptimizeForSize))
461 MemOPSizeOpt MemOPSizeOpt(
F, BFI, ORE, DT, TLI);
462 MemOPSizeOpt.perform();
463 return MemOPSizeOpt.isChanged();
This file provides the interface for IR based instrumentation passes (profile-gen, and profile-use).
static cl::opt< unsigned > MemOpMaxOptSize("memop-value-prof-max-opt-size", cl::Hidden, cl::init(128), cl::desc("Optimize the memop size <= this value"))
static cl::opt< unsigned > MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40), cl::Hidden, cl::desc("The percentage threshold for the " "memory intrinsic calls optimization"))
static cl::opt< bool > DisableMemOPOPT("disable-memop-opt", cl::init(false), cl::Hidden, cl::desc("Disable optimize"))
static cl::opt< unsigned > MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden, cl::desc("The max version for the optimized memory " " intrinsic calls"))
static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI, OptimizationRemarkEmitter &ORE, DominatorTree *DT, TargetLibraryInfo &TLI)
static cl::opt< bool > MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden, cl::desc("Scale the memop size counts using the basic " " block count value"))
cl::opt< bool > MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(true), cl::Hidden, cl::desc("Size-specialize memcmp and bcmp calls"))
static cl::opt< unsigned > MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::init(1000), cl::desc("The minimum count to optimize memory " "intrinsic calls"))
FunctionAnalysisManager FAM
This header defines various interfaces for pass management in LLVM.
static StringRef getName(Value *V)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metrics from passes.
#define STATISTIC(VARNAME, DESC)
A container for analyses that lazily runs them and caches their results.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
LLVM Basic Block Representation.
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
InstListType::iterator iterator
Instruction iterators...
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well formed.
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequencies.
Value * getArgOperand(unsigned i) const
This class represents a function call, abstracting a target machine's calling convention.
This is the shared class of boolean and integer constants.
Analysis pass which computes a DominatorTree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
Base class for instruction visitors.
RetTy visitMemIntrinsic(MemIntrinsic &I)
void visit(Iterator Start, Iterator End)
RetTy visitCallInst(CallInst &I)
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
This is the common base class for memset/memcpy/memmove.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &MAM)
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Implements a dense probed hash-table based set with some number of buckets stored inline.
iterator insert(iterator I, T &&Elt)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
bool getLibFunc(StringRef funcName, LibFunc &F) const
Searches for a particular function name.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
The instances of the Type class are immutable: once they are created, they are never changed.
bool isVoidTy() const
Return true if this is 'void'.
LLVM Value Representation.
void setName(const Twine &Name)
Change the name of the value.
std::pair< iterator, bool > insert(const ValueT &V)
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< FuncNode * > Func
This is an optimization pass for GlobalISel generic memory operations.
void annotateValueSite(Module &M, Instruction &Inst, const InstrProfRecord &InstrProfR, InstrProfValueKind ValueKind, uint32_t SiteIndx, uint32_t MaxMDCount=3)
Get the value profile data for value site SiteIdx from InstrProfR and annotate the instruction Inst with the value profile meta data.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
SmallVector< InstrProfValueData, 4 > getValueProfDataFromInst(const Instruction &Inst, InstrProfValueKind ValueKind, uint32_t MaxNumValueData, uint64_t &TotalC, bool GetNoICPValue=false)
Extracts the value profile data from Inst and returns them if Inst is annotated with value profile data.
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
std::enable_if_t< std::is_unsigned_v< T >, T > SaturatingMultiply(T X, T Y, bool *ResultOverflowed=nullptr)
Multiply two unsigned integers, X and Y, of type T.
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
void setProfMetadata(Module *M, Instruction *TI, ArrayRef< uint64_t > EdgeCounts, uint64_t MaxCount)