37#define INSTR_PROF_VALUE_PROF_MEMOP_API
52#define DEBUG_TYPE "pgo-memop-opt"
54STATISTIC(NumOfPGOMemOPOpt,
"Number of memop intrinsics optimized.");
55STATISTIC(NumOfPGOMemOPAnnotate,
"Number of memop intrinsics annotated.");
60 cl::desc(
"The minimum count to optimize memory "
72 cl::desc(
"The percentage threshold for the "
73 "memory intrinsic calls optimization"));
78 cl::desc(
"The max version for the optimized memory "
84 cl::desc(
"Scale the memop size counts using the basic "
85 " block count value"));
90 cl::desc(
"Size-specialize memcmp and bcmp calls"));
94 cl::desc(
"Optimize the memop size <= this value"));
99 switch (
MI->getIntrinsicID()) {
100 case Intrinsic::memcpy:
102 case Intrinsic::memmove:
104 case Intrinsic::memset:
117 CallInst *asCI() {
return cast<CallInst>(
I); }
119 if (
auto MI = asMI())
120 return MemOp(cast<MemIntrinsic>(
MI->clone()));
121 return MemOp(cast<CallInst>(asCI()->clone()));
124 if (
auto MI = asMI())
125 return MI->getLength();
126 return asCI()->getArgOperand(2);
129 if (
auto MI = asMI())
131 asCI()->setArgOperand(2,
Length);
134 if (
auto MI = asMI())
135 return MI->getCalledFunction()->getName();
136 return asCI()->getCalledFunction()->getName();
139 if (
auto MI = asMI())
140 if (
MI->getIntrinsicID() == Intrinsic::memmove)
146 if (asMI() ==
nullptr && TLI.
getLibFunc(*asCI(), Func) &&
147 Func == LibFunc_memcmp) {
154 if (asMI() ==
nullptr && TLI.
getLibFunc(*asCI(), Func) &&
155 Func == LibFunc_bcmp) {
161 if (
auto MI = asMI())
162 return getMIName(
MI);
165 if (Func == LibFunc_memcmp)
167 if (Func == LibFunc_bcmp)
175class MemOPSizeOpt :
public InstVisitor<MemOPSizeOpt> {
181 bool isChanged()
const {
return Changed; }
186 for (
auto &MO : WorkList) {
187 ++NumOfPGOMemOPAnnotate;
192 <<
"is Transformed.\n");
200 if (isa<ConstantInt>(
Length))
202 WorkList.push_back(
MemOp(&
MI));
208 (Func == LibFunc_memcmp || Func == LibFunc_bcmp) &&
210 WorkList.push_back(
MemOp(&CI));
221 std::vector<MemOp> WorkList;
222 bool perform(
MemOp MO);
226 assert(Count <= TotalCount);
240 return ScaleCount / Denom;
243bool MemOPSizeOpt::perform(
MemOp MO) {
250 uint32_t NumVals = INSTR_PROF_NUM_BUCKETS;
251 uint32_t MaxNumVals = INSTR_PROF_NUM_BUCKETS;
254 *MO.I, IPVK_MemOPSize, MaxNumVals, NumVals, TotalCount);
259 uint64_t SavedTotalCount = TotalCount;
261 auto BBEdgeCount =
BFI.getBlockProfileCount(MO.I->getParent());
264 ActualCount = *BBEdgeCount;
268 LLVM_DEBUG(
dbgs() <<
"Read one memory intrinsic profile with count "
269 << ActualCount <<
"\n");
272 : VDs) {
dbgs() <<
" (" << VD.Value <<
"," << VD.Count <<
")\n"; });
281 TotalCount = ActualCount;
284 <<
" denominator = " << SavedTotalCount <<
"\n");
288 uint64_t SavedRemainCount = SavedTotalCount;
297 for (
auto I = VDs.begin(), E = VDs.end();
I != E; ++
I) {
299 int64_t
V = VD.Value;
302 C = getScaledCount(
C, ActualCount, SavedTotalCount);
311 if (!isProfitable(
C, RemainCount)) {
312 RemainingVDs.
insert(RemainingVDs.
end(),
I, E);
316 if (!SeenSizeId.
insert(V).second) {
317 errs() <<
"warning: Invalid Profile Data in Function " <<
Func.getName()
318 <<
": Two identical values in MemOp value counts.\n";
329 assert(SavedRemainCount >= VD.Count);
330 SavedRemainCount -= VD.Count;
333 RemainingVDs.
insert(RemainingVDs.
end(),
I + 1, E);
341 CaseCounts[0] = RemainCount;
342 if (RemainCount > MaxCount)
343 MaxCount = RemainCount;
345 uint64_t SumForOpt = TotalCount - RemainCount;
347 LLVM_DEBUG(
dbgs() <<
"Optimize one memory intrinsic call to " << Version
348 <<
" Versions (covering " << SumForOpt <<
" out of "
349 << TotalCount <<
")\n");
370 auto OrigBBFreq =
BFI.getBlockFreq(BB);
377 MergeBB->
setName(
"MemOP.Merge");
378 BFI.setBlockFreq(MergeBB, OrigBBFreq);
379 DefaultBB->
setName(
"MemOP.Default");
382 auto &Ctx =
Func.getContext();
385 Value *SizeVar = MO.getLength();
387 Type *MemOpTy = MO.I->getType();
392 PHI = IRBM.CreatePHI(MemOpTy, SizeIds.
size() + 1,
"MemOP.RVMerge");
393 MO.I->replaceAllUsesWith(
PHI);
394 PHI->addIncoming(MO.I, DefaultBB);
398 MO.I->setMetadata(LLVMContext::MD_prof,
nullptr);
400 if (SavedRemainCount > 0 || Version != NumVals) {
403 IPVK_MemOPSize, NumVals);
408 std::vector<DominatorTree::UpdateType> Updates;
410 Updates.reserve(2 * SizeIds.
size());
414 Ctx,
Twine(
"MemOP.Case.") +
Twine(SizeId), &Func, DefaultBB);
415 MemOp NewMO = MO.clone();
417 auto *SizeType = dyn_cast<IntegerType>(NewMO.getLength()->getType());
418 assert(SizeType &&
"Expected integer type size argument.");
419 ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId);
420 NewMO.setLength(CaseSizeId);
421 NewMO.I->insertInto(CaseBB, CaseBB->
end());
423 IRBCase.CreateBr(MergeBB);
424 SI->addCase(CaseSizeId, CaseBB);
426 PHI->addIncoming(NewMO.I, CaseBB);
428 Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB});
429 Updates.push_back({DominatorTree::Insert, BB, CaseBB});
433 DTU.applyUpdates(Updates);
446 <<
"optimized " <<
NV(
"Memop", MO.getName(TLI)) <<
" with count "
447 <<
NV(
"Count", SumForOpt) <<
" out of " <<
NV(
"Total", TotalCount)
448 <<
" for " <<
NV(
"Versions", Version) <<
" versions";
461 if (
F.hasFnAttribute(Attribute::OptimizeForSize))
463 MemOPSizeOpt MemOPSizeOpt(
F, BFI, ORE, DT, TLI);
464 MemOPSizeOpt.perform();
465 return MemOPSizeOpt.isChanged();
This file provides the interface for IR based instrumentation passes (profile-gen,...
static cl::opt< unsigned > MemOpMaxOptSize("memop-value-prof-max-opt-size", cl::Hidden, cl::init(128), cl::desc("Optimize the memop size <= this value"))
static cl::opt< unsigned > MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40), cl::Hidden, cl::desc("The percentage threshold for the " "memory intrinsic calls optimization"))
static cl::opt< bool > DisableMemOPOPT("disable-memop-opt", cl::init(false), cl::Hidden, cl::desc("Disable optimize"))
static cl::opt< unsigned > MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden, cl::desc("The max version for the optimized memory " " intrinsic calls"))
static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI, OptimizationRemarkEmitter &ORE, DominatorTree *DT, TargetLibraryInfo &TLI)
static cl::opt< bool > MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden, cl::desc("Scale the memop size counts using the basic " " block count value"))
cl::opt< bool > MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(true), cl::Hidden, cl::desc("Size-specialize memcmp and bcmp calls"))
static cl::opt< unsigned > MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::init(1000), cl::desc("The minimum count to optimize memory " "intrinsic calls"))
FunctionAnalysisManager FAM
This header defines various interfaces for pass management in LLVM.
static StringRef getName(Value *V)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metrics from passes.
#define STATISTIC(VARNAME, DESC)
A container for analyses that lazily runs them and caches their results.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
LLVM Basic Block Representation.
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
InstListType::iterator iterator
Instruction iterators...
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed, or null if the block is not well formed.
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequencies.
Value * getArgOperand(unsigned i) const
This class represents a function call, abstracting a target machine's calling convention.
This is the shared class of boolean and integer constants.
Analysis pass which computes a DominatorTree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
Base class for instruction visitors.
RetTy visitMemIntrinsic(MemIntrinsic &I)
void visit(Iterator Start, Iterator End)
RetTy visitCallInst(CallInst &I)
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
This is the common base class for memset/memcpy/memmove.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &MAM)
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Implements a dense probed hash-table based set with some number of buckets stored inline.
iterator insert(iterator I, T &&Elt)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
bool getLibFunc(StringRef funcName, LibFunc &F) const
Searches for a particular function name.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
The instances of the Type class are immutable: once they are created, they are never changed.
bool isVoidTy() const
Return true if this is 'void'.
LLVM Value Representation.
void setName(const Twine &Name)
Change the name of the value.
std::pair< iterator, bool > insert(const ValueT &V)
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< FuncNode * > Func
This is an optimization pass for GlobalISel generic memory operations.
std::unique_ptr< InstrProfValueData[]> getValueProfDataFromInst(const Instruction &Inst, InstrProfValueKind ValueKind, uint32_t MaxNumValueData, uint32_t &ActualNumValueData, uint64_t &TotalC, bool GetNoICPValue=false)
Extract the value profile data from Inst and return them if Inst is annotated with value profile data.
void annotateValueSite(Module &M, Instruction &Inst, const InstrProfRecord &InstrProfR, InstrProfValueKind ValueKind, uint32_t SiteIndx, uint32_t MaxMDCount=3)
Get the value profile data for value site SiteIndx from InstrProfR and annotate the instruction Inst with the value profile metadata.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
std::enable_if_t< std::is_unsigned_v< T >, T > SaturatingMultiply(T X, T Y, bool *ResultOverflowed=nullptr)
Multiply two unsigned integers, X and Y, of type T.
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
void setProfMetadata(Module *M, Instruction *TI, ArrayRef< uint64_t > EdgeCounts, uint64_t MaxCount)