MemProfContextDisambiguation.cpp
1//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements support for context disambiguation of allocation
10// calls for profile guided heap optimization. Specifically, it uses Memprof
11// profiles which indicate context specific allocation behavior (currently
12// distinguishing cold vs hot memory allocations). Cloning is performed to
13// expose the cold allocation call contexts, and the allocation calls are
14// subsequently annotated with an attribute for later transformation.
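// For example (illustrative only; the metadata ids below are hypothetical), an
// allocation such as
//   %call = call ptr @_Znam(i64 8), !memprof !0, !callsite !1
// that is reached from both cold and not-cold contexts may be cloned so that
// the clone reached only from cold contexts can be annotated as cold.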
15//
16// The transformations can be performed either directly on IR (regular LTO), or
17// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18// Both types of LTO operate on the same base graph representation, which
19// uses CRTP to support either IR or Index formats.
20//
21//===----------------------------------------------------------------------===//
22
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/ADT/DenseSet.h"
26#include "llvm/ADT/MapVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
36#include "llvm/IR/Constants.h"
38#include "llvm/IR/Module.h"
40#include "llvm/Pass.h"
45#include "llvm/Transforms/IPO.h"
47#include <sstream>
48#include <unordered_map>
49#include <vector>
50using namespace llvm;
51using namespace llvm::memprof;
52
53#define DEBUG_TYPE "memprof-context-disambiguation"
54
55STATISTIC(FunctionClonesAnalysis,
56 "Number of function clones created during whole program analysis");
57STATISTIC(FunctionClonesThinBackend,
58 "Number of function clones created during ThinLTO backend");
59STATISTIC(FunctionsClonedThinBackend,
60 "Number of functions that had clones created during ThinLTO backend");
61STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
62 "cloned) during whole program analysis");
63STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
64 "during whole program analysis");
65STATISTIC(AllocTypeNotColdThinBackend,
66 "Number of not cold static allocations (possibly cloned) during "
67 "ThinLTO backend");
68STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
69 "(possibly cloned) during ThinLTO backend");
70STATISTIC(OrigAllocsThinBackend,
71 "Number of original (not cloned) allocations with memprof profiles "
72 "during ThinLTO backend");
73STATISTIC(
74 AllocVersionsThinBackend,
75 "Number of allocation versions (including clones) during ThinLTO backend");
76STATISTIC(MaxAllocVersionsThinBackend,
77 "Maximum number of allocation versions created for an original "
78 "allocation during ThinLTO backend");
79STATISTIC(UnclonableAllocsThinBackend,
80 "Number of unclonable ambigous allocations during ThinLTO backend");
81STATISTIC(RemovedEdgesWithMismatchedCallees,
82 "Number of edges removed due to mismatched callees (profiled vs IR)");
83STATISTIC(FoundProfiledCalleeCount,
84 "Number of profiled callees found via tail calls");
85STATISTIC(FoundProfiledCalleeDepth,
86 "Aggregate depth of profiled callees found via tail calls");
87STATISTIC(FoundProfiledCalleeMaxDepth,
88 "Maximum depth of profiled callees found via tail calls");
89STATISTIC(FoundProfiledCalleeNonUniquelyCount,
90 "Number of profiled callees found via multiple tail call chains");
91
93 "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
94 cl::value_desc("filename"),
95 cl::desc("Specify the path prefix of the MemProf dot files."));
96
97static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
98 cl::Hidden,
99 cl::desc("Export graph to dot files."));
100
101static cl::opt<bool>
102 DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
103 cl::desc("Dump CallingContextGraph to stdout after each stage."));
104
105static cl::opt<bool>
106 VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
107 cl::desc("Perform verification checks on CallingContextGraph."));
108
109static cl::opt<bool>
110 VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
111 cl::desc("Perform frequent verification checks on nodes."));
112
114 "memprof-import-summary",
115 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
116 cl::Hidden);
117
118static cl::opt<unsigned>
119 TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
120 cl::Hidden,
121 cl::desc("Max depth to recursively search for missing "
122 "frames through tail calls."));
123
124namespace llvm {
126 "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
127 cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
128
129// Indicate we are linking with an allocator that supports hot/cold operator
130// new interfaces.
132 "supports-hot-cold-new", cl::init(false), cl::Hidden,
133 cl::desc("Linking with hot/cold operator new interfaces"));
134} // namespace llvm
135
136namespace {
137/// CRTP base for graphs built from either IR or ThinLTO summary index.
138///
139/// The graph represents the call contexts in all memprof metadata on allocation
140/// calls, with nodes for the allocations themselves, as well as for the calls
141/// in each context. The graph is initially built from the allocation memprof
142/// metadata (or summary) MIBs. It is then updated to match calls with callsite
143/// metadata onto the nodes, updating it to reflect any inlining performed on
144/// those calls.
145///
146/// Each MIB (representing an allocation's call context with allocation
147/// behavior) is assigned a unique context id during the graph build. The edges
148/// and nodes in the graph are decorated with the context ids they carry. This
149/// is used to correctly update the graph when cloning is performed so that we
150/// can uniquify the context for a single (possibly cloned) allocation.
151template <typename DerivedCCG, typename FuncTy, typename CallTy>
152class CallsiteContextGraph {
153public:
154 CallsiteContextGraph() = default;
155 CallsiteContextGraph(const CallsiteContextGraph &) = default;
156 CallsiteContextGraph(CallsiteContextGraph &&) = default;
157
158 /// Main entry point to perform analysis and transformations on graph.
159 bool process();
160
161 /// Perform cloning on the graph necessary to uniquely identify the allocation
162 /// behavior of an allocation based on its context.
163 void identifyClones();
164
165 /// Assign callsite clones to functions, cloning functions as needed to
166 /// accommodate the combinations of their callsite clones reached by callers.
167 /// For regular LTO this clones functions and callsites in the IR, but for
168 /// ThinLTO the cloning decisions are noted in the summaries and later applied
169 /// in applyImport.
170 bool assignFunctions();
171
172 void dump() const;
173 void print(raw_ostream &OS) const;
174
175 friend raw_ostream &operator<<(raw_ostream &OS,
176 const CallsiteContextGraph &CCG) {
177 CCG.print(OS);
178 return OS;
179 }
180
181 friend struct GraphTraits<
182 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
183 friend struct DOTGraphTraits<
184 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
185
186 void exportToDot(std::string Label) const;
187
188 /// Represents a function clone via FuncTy pointer and clone number pair.
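 /// A clone number of 0 refers to the original (uncloned) version.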
189 struct FuncInfo final
190 : public std::pair<FuncTy *, unsigned /*Clone number*/> {
191 using Base = std::pair<FuncTy *, unsigned>;
192 FuncInfo(const Base &B) : Base(B) {}
193 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
194 explicit operator bool() const { return this->first != nullptr; }
195 FuncTy *func() const { return this->first; }
196 unsigned cloneNo() const { return this->second; }
197 };
198
199 /// Represents a callsite clone via CallTy and clone number pair.
200 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
201 using Base = std::pair<CallTy, unsigned>;
202 CallInfo(const Base &B) : Base(B) {}
203 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
204 : Base(Call, CloneNo) {}
205 explicit operator bool() const { return (bool)this->first; }
206 CallTy call() const { return this->first; }
207 unsigned cloneNo() const { return this->second; }
208 void setCloneNo(unsigned N) { this->second = N; }
209 void print(raw_ostream &OS) const {
210 if (!operator bool()) {
211 assert(!cloneNo());
212 OS << "null Call";
213 return;
214 }
215 call()->print(OS);
216 OS << "\t(clone " << cloneNo() << ")";
217 }
218 void dump() const {
219 print(dbgs());
220 dbgs() << "\n";
221 }
222 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
223 Call.print(OS);
224 return OS;
225 }
226 };
227
228 struct ContextEdge;
229
230 /// Node in the Callsite Context Graph
231 struct ContextNode {
232 // Keep this for now since in the IR case, where we have an Instruction*, it
233 // is not as immediately discoverable. Used for printing richer information
234 // when dumping graph.
235 bool IsAllocation;
236
237 // Keeps track of when the Call was reset to null because there was
238 // recursion.
239 bool Recursive = false;
240
241 // The corresponding allocation or interior call.
242 CallInfo Call;
243
244 // For alloc nodes this is a unique id assigned when constructed, and for
245 // callsite stack nodes it is the original stack id when the node is
246 // constructed from the memprof MIB metadata on the alloc nodes. Note that
247 // this is only used when matching callsite metadata onto the stack nodes
248 // created when processing the allocation memprof MIBs, and for labeling
249 // nodes in the dot graph. Therefore we don't bother to assign a value for
250 // clones.
251 uint64_t OrigStackOrAllocId = 0;
252
253 // This will be formed by ORing together the AllocationType enum values
254 // for contexts including this node.
255 uint8_t AllocTypes = 0;
256
257 // Edges to all callees in the profiled call stacks.
258 // TODO: Should this be a map (from Callee node) for more efficient lookup?
259 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
260
261 // Edges to all callers in the profiled call stacks.
262 // TODO: Should this be a map (from Caller node) for more efficient lookup?
263 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
264
265 // The set of IDs for contexts including this node.
266 DenseSet<uint32_t> ContextIds;
267
268 // List of clones of this ContextNode, initially empty.
269 std::vector<ContextNode *> Clones;
270
271 // If a clone, points to the original uncloned node.
272 ContextNode *CloneOf = nullptr;
273
274 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
275
276 ContextNode(bool IsAllocation, CallInfo C)
277 : IsAllocation(IsAllocation), Call(C) {}
278
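 // Clones are chained to the original node: the original keeps the list of
 // its clones, and each clone's CloneOf points back at the original.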
279 void addClone(ContextNode *Clone) {
280 if (CloneOf) {
281 CloneOf->Clones.push_back(Clone);
282 Clone->CloneOf = CloneOf;
283 } else {
284 Clones.push_back(Clone);
285 assert(!Clone->CloneOf);
286 Clone->CloneOf = this;
287 }
288 }
289
290 ContextNode *getOrigNode() {
291 if (!CloneOf)
292 return this;
293 return CloneOf;
294 }
295
296 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
297 unsigned int ContextId);
298
299 ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
300 ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
301 void eraseCalleeEdge(const ContextEdge *Edge);
302 void eraseCallerEdge(const ContextEdge *Edge);
303
304 void setCall(CallInfo C) { Call = C; }
305
306 bool hasCall() const { return (bool)Call.call(); }
307
308 void printCall(raw_ostream &OS) const { Call.print(OS); }
309
310 // True if this node was effectively removed from the graph, in which case
311 // its context id set, caller edges, and callee edges should all be empty.
312 bool isRemoved() const {
313 // Note that we can have non-empty context ids with empty caller and
314 // callee edges if the graph ends up with a single node.
315 if (ContextIds.empty())
316 assert(CalleeEdges.empty() && CallerEdges.empty() &&
317 "Context ids empty but at least one of callee and caller edges "
318 "were not!");
319 return ContextIds.empty();
320 }
321
322 void dump() const;
323 void print(raw_ostream &OS) const;
324
325 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
326 Node.print(OS);
327 return OS;
328 }
329 };
330
331 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
332 /// callee.
333 struct ContextEdge {
334 ContextNode *Callee;
335 ContextNode *Caller;
336
337 // This will be formed by ORing together the AllocationType enum values
338 // for contexts including this edge.
339 uint8_t AllocTypes = 0;
340
341 // The set of IDs for contexts including this edge.
342 DenseSet<uint32_t> ContextIds;
343
344 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
345 DenseSet<uint32_t> ContextIds)
346 : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
347 ContextIds(ContextIds) {}
348
349 DenseSet<uint32_t> &getContextIds() { return ContextIds; }
350
351 void dump() const;
352 void print(raw_ostream &OS) const;
353
354 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
355 Edge.print(OS);
356 return OS;
357 }
358 };
359
360 /// Helpers to remove callee edges that have allocation type None (due to not
361 /// carrying any context ids) after transformations.
362 void removeNoneTypeCalleeEdges(ContextNode *Node);
363 void
364 recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
365 DenseSet<const ContextNode *> &Visited);
366
367protected:
368 /// Get a list of nodes corresponding to the stack ids in the given callsite
369 /// context.
370 template <class NodeT, class IteratorT>
371 std::vector<uint64_t>
372 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
373
374 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
375 /// metadata (or summary).
376 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
377
378 /// Adds nodes for the given MIB stack ids.
379 template <class NodeT, class IteratorT>
380 void addStackNodesForMIB(ContextNode *AllocNode,
381 CallStack<NodeT, IteratorT> &StackContext,
382 CallStack<NodeT, IteratorT> &CallsiteContext,
383 AllocationType AllocType);
384
385 /// Matches all callsite metadata (or summary) to the nodes created for
386 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
387 /// inlining performed on those callsite instructions.
388 void updateStackNodes();
389
390 /// Update graph to conservatively handle any callsite stack nodes that target
391 /// multiple different callee target functions.
392 void handleCallsitesWithMultipleTargets();
393
394 /// Save lists of calls with MemProf metadata in each function, for faster
395 /// iteration.
396 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
397
398 /// Map from callsite node to the enclosing caller function.
399 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
400
401private:
402 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
403
404 using CallContextInfo = std::tuple<CallTy, std::vector<uint64_t>,
405 const FuncTy *, DenseSet<uint32_t>>;
406
407 /// Assigns the given Node to calls at or inlined into the location with
408 /// the Node's stack id, after post order traversing and processing its
409 /// caller nodes. Uses the call information recorded in the given
410 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
411 /// as needed. Called by updateStackNodes which sets up the given
412 /// StackIdToMatchingCalls map.
413 void assignStackNodesPostOrder(
414 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
415 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls);
416
417 /// Duplicates the given set of context ids, updating the provided
418 /// map from each original id with the newly generated context ids,
419 /// and returning the new duplicated id set.
420 DenseSet<uint32_t> duplicateContextIds(
421 const DenseSet<uint32_t> &StackSequenceContextIds,
422 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
423
424 /// Propagates all duplicated context ids across the graph.
425 void propagateDuplicateContextIds(
426 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
427
428 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
429 /// else to its callers. Also updates OrigNode's edges to remove any context
430 /// ids moved to the newly created edge.
431 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
432 bool TowardsCallee);
433
434 /// Get the stack id corresponding to the given Id or Index (for IR this will
435 /// return itself, for a summary index this will return the id recorded in the
436 /// index for that stack id index value).
437 uint64_t getStackId(uint64_t IdOrIndex) const {
438 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
439 }
440
441 /// Returns true if the given call targets the callee of the given edge, or if
442 /// we were able to identify the call chain through intermediate tail calls.
443 /// In the latter case new context nodes are added to the graph for the
444 /// identified tail calls, and their synthesized nodes are added to
445 /// TailCallToContextNodeMap. The EdgeIter is updated in either case to the
446 /// next element after the input position (either incremented or updated after
447 /// removing the old edge).
448 bool
449 calleesMatch(CallTy Call, EdgeIter &EI,
450 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
451
452 /// Returns true if the given call targets the given function, or if we were
453 /// able to identify the call chain through intermediate tail calls (in which
454 /// case FoundCalleeChain will be populated).
455 bool calleeMatchesFunc(
456 CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
457 std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
458 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
459 Call, Func, CallerFunc, FoundCalleeChain);
460 }
461
462 /// Get a list of nodes corresponding to the stack ids in the given
463 /// callsite's context.
464 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
465 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
466 Call);
467 }
468
469 /// Get the last stack id in the context for the given callsite.
470 uint64_t getLastStackId(CallTy Call) {
471 return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
472 }
473
474 /// Update the allocation call to record type of allocated memory.
475 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
476 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
477 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
478 }
479
480 /// Update non-allocation call to invoke (possibly cloned) function
481 /// CalleeFunc.
482 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
483 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
484 }
485
486 /// Clone the given function for the given callsite, recording the mapping of
487 /// all of the function's tracked calls to their new versions in the CallMap.
488 /// Assigns new clones to clone number CloneNo.
489 FuncInfo cloneFunctionForCallsite(
490 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
491 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
492 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
493 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
494 }
495
496 /// Gets a label to use in the dot graph for the given call clone in the given
497 /// function.
498 std::string getLabel(const FuncTy *Func, const CallTy Call,
499 unsigned CloneNo) const {
500 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
501 }
502
503 /// Helpers to find the node corresponding to the given call or stackid.
504 ContextNode *getNodeForInst(const CallInfo &C);
505 ContextNode *getNodeForAlloc(const CallInfo &C);
506 ContextNode *getNodeForStackId(uint64_t StackId);
507
508 /// Removes the node information recorded for the given call.
509 void unsetNodeForInst(const CallInfo &C);
510
511 /// Computes the alloc type corresponding to the given context ids, by
512 /// unioning their recorded alloc types.
513 uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds);
514
515 /// Returns the allocation type of the intersection of the contexts of two
516 /// nodes (based on their provided context id sets), optimized for the case
517 /// when Node1Ids is smaller than Node2Ids.
518 uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
519 const DenseSet<uint32_t> &Node2Ids);
520
521 /// Returns the allocation type of the intersection of the contexts of two
522 /// nodes (based on their provided context id sets).
523 uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
524 const DenseSet<uint32_t> &Node2Ids);
525
526 /// Create a clone of Edge's callee and move Edge to that new callee node,
527 /// performing the necessary context id and allocation type updates.
528 /// If callee's caller edge iterator is supplied, it is updated when removing
529 /// the edge from that list.
530 ContextNode *
531 moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
532 EdgeIter *CallerEdgeI = nullptr);
533
534 /// Change the callee of Edge to existing callee clone NewCallee, performing
535 /// the necessary context id and allocation type updates.
536 /// If callee's caller edge iterator is supplied, it is updated when removing
537 /// the edge from that list.
538 void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
539 ContextNode *NewCallee,
540 EdgeIter *CallerEdgeI = nullptr,
541 bool NewClone = false);
542
543 /// Recursively perform cloning on the graph for the given Node and its
544 /// callers, in order to uniquely identify the allocation behavior of an
545 /// allocation given its context.
546 void identifyClones(ContextNode *Node,
547 DenseSet<const ContextNode *> &Visited);
548
549 /// Map from each context ID to the AllocationType assigned to that context.
550 std::map<uint32_t, AllocationType> ContextIdToAllocationType;
551
552 /// Identifies the context node created for a stack id when adding the MIB
553 /// contexts to the graph. This is used to locate the context nodes when
554 /// trying to assign the corresponding callsites with those stack ids to these
555 /// nodes.
556 std::map<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
557
558 /// Maps to track the calls to their corresponding nodes in the graph.
559 MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
560 MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
561
562 /// Owner of all ContextNode unique_ptrs.
563 std::vector<std::unique_ptr<ContextNode>> NodeOwner;
564
565 /// Perform sanity checks on graph when requested.
566 void check() const;
567
568 /// Keeps track of the last unique context id assigned.
569 unsigned int LastContextId = 0;
570};
571
572template <typename DerivedCCG, typename FuncTy, typename CallTy>
573using ContextNode =
574 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
575template <typename DerivedCCG, typename FuncTy, typename CallTy>
576using ContextEdge =
577 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
578template <typename DerivedCCG, typename FuncTy, typename CallTy>
579using FuncInfo =
580 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
581template <typename DerivedCCG, typename FuncTy, typename CallTy>
582using CallInfo =
583 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
584
585/// CRTP derived class for graphs built from IR (regular LTO).
586class ModuleCallsiteContextGraph
587 : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
588 Instruction *> {
589public:
590 ModuleCallsiteContextGraph(
591 Module &M,
592 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
593
594private:
595 friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
596 Instruction *>;
597
598 uint64_t getStackId(uint64_t IdOrIndex) const;
599 bool calleeMatchesFunc(
600 Instruction *Call, const Function *Func, const Function *CallerFunc,
601 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
602 bool findProfiledCalleeThroughTailCalls(
603 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
604 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
605 bool &FoundMultipleCalleeChains);
606 uint64_t getLastStackId(Instruction *Call);
607 std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
608 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
609 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
610 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
611 Instruction *>::FuncInfo
612 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
613 std::map<CallInfo, CallInfo> &CallMap,
614 std::vector<CallInfo> &CallsWithMetadataInFunc,
615 unsigned CloneNo);
616 std::string getLabel(const Function *Func, const Instruction *Call,
617 unsigned CloneNo) const;
618
619 const Module &Mod;
620 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
621};
622
623/// Represents a call in the summary index graph, which can either be an
624/// allocation or an interior callsite node in an allocation's context.
625/// Holds a pointer to the corresponding data structure in the index.
626struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
627 IndexCall() : PointerUnion() {}
628 IndexCall(std::nullptr_t) : IndexCall() {}
629 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
630 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
631 IndexCall(PointerUnion PT) : PointerUnion(PT) {}
632
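 // Allow the wrapper to be used like a pointer (e.g. Call->print(OS)) by
 // returning the wrapper itself from operator->.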
633 IndexCall *operator->() { return this; }
634
635 PointerUnion<CallsiteInfo *, AllocInfo *> getBase() const { return *this; }
636
637 void print(raw_ostream &OS) const {
638 if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(getBase())) {
639 OS << *AI;
640 } else {
641 auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(getBase());
642 assert(CI);
643 OS << *CI;
644 }
645 }
646};
647
648/// CRTP derived class for graphs built from summary index (ThinLTO).
649class IndexCallsiteContextGraph
650 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
651 IndexCall> {
652public:
653 IndexCallsiteContextGraph(
654 ModuleSummaryIndex &Index,
655 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
656 isPrevailing);
657
658 ~IndexCallsiteContextGraph() {
659 // Now that we are done with the graph it is safe to add the new
660 // CallsiteInfo structs to the function summary vectors. The graph nodes
661 // point into locations within these vectors, so we don't want to add them
662 // any earlier.
663 for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
664 auto *FS = I.first;
665 for (auto &Callsite : I.second)
666 FS->addCallsite(*Callsite.second);
667 }
668 }
669
670private:
671 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
672 IndexCall>;
673
674 uint64_t getStackId(uint64_t IdOrIndex) const;
675 bool calleeMatchesFunc(
676 IndexCall &Call, const FunctionSummary *Func,
677 const FunctionSummary *CallerFunc,
678 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
679 bool findProfiledCalleeThroughTailCalls(
680 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
681 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
682 bool &FoundMultipleCalleeChains);
683 uint64_t getLastStackId(IndexCall &Call);
684 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
685 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
686 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
687 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
688 IndexCall>::FuncInfo
689 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
690 std::map<CallInfo, CallInfo> &CallMap,
691 std::vector<CallInfo> &CallsWithMetadataInFunc,
692 unsigned CloneNo);
693 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
694 unsigned CloneNo) const;
695
696 // Saves mapping from function summaries containing memprof records back to
697 // their VI, for use in checking and debugging.
698 std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
699
700 const ModuleSummaryIndex &Index;
701 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
702 isPrevailing;
703
704 // Saves/owns the callsite info structures synthesized for missing tail call
705 // frames that we discover while building the graph.
706 // It maps from the summary of the function making the tail call, to a map
707 // of callee ValueInfo to corresponding synthesized callsite info.
708 std::unordered_map<FunctionSummary *,
709 std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
710 FunctionCalleesToSynthesizedCallsiteInfos;
711};
712} // namespace
713
714namespace llvm {
715template <>
716struct DenseMapInfo<typename CallsiteContextGraph<
717 ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
718 : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
719template <>
720struct DenseMapInfo<typename CallsiteContextGraph<
721 IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
722 : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
723template <>
724struct DenseMapInfo<IndexCall>
725 : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
726} // end namespace llvm
727
728namespace {
729
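// Helper to print a separator between fields: the separator is skipped before
// the first field streamed and printed before each subsequent one.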
730struct FieldSeparator {
731 bool Skip = true;
732 const char *Sep;
733
734 FieldSeparator(const char *Sep = ", ") : Sep(Sep) {}
735};
736
737raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) {
738 if (FS.Skip) {
739 FS.Skip = false;
740 return OS;
741 }
742 return OS << FS.Sep;
743}
744
745// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
746// type we should actually use on the corresponding allocation.
747// If we can't clone a node that has NotCold+Cold alloc type, we will fall
748// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
749// from NotCold.
750AllocationType allocTypeToUse(uint8_t AllocTypes) {
751 assert(AllocTypes != (uint8_t)AllocationType::None);
752 if (AllocTypes ==
753 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
754 return AllocationType::NotCold;
755 else
756 return (AllocationType)AllocTypes;
757}
758
759// Helper to check if the alloc types for all edges recorded in the
760// InAllocTypes vector match the alloc types for all edges in the Edges
761// vector.
762template <typename DerivedCCG, typename FuncTy, typename CallTy>
763bool allocTypesMatch(
764 const std::vector<uint8_t> &InAllocTypes,
765 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
766 &Edges) {
767 return std::equal(
768 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(),
769 [](const uint8_t &l,
770 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
771 // Can share if one of the edges is None type - don't
772 // care about the type along that edge as it doesn't
773 // exist for those context ids.
774 if (l == (uint8_t)AllocationType::None ||
775 r->AllocTypes == (uint8_t)AllocationType::None)
776 return true;
777 return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
778 });
779}
780
781} // end anonymous namespace
782
783template <typename DerivedCCG, typename FuncTy, typename CallTy>
784typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
785CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
786 const CallInfo &C) {
787 ContextNode *Node = getNodeForAlloc(C);
788 if (Node)
789 return Node;
790
791 return NonAllocationCallToContextNodeMap.lookup(C);
792}
793
794template <typename DerivedCCG, typename FuncTy, typename CallTy>
795typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
796CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
797 const CallInfo &C) {
798 return AllocationCallToContextNodeMap.lookup(C);
799}
800
801template <typename DerivedCCG, typename FuncTy, typename CallTy>
802typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
803CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
804 uint64_t StackId) {
805 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
806 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
807 return StackEntryNode->second;
808 return nullptr;
809}
810
811template <typename DerivedCCG, typename FuncTy, typename CallTy>
812void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::unsetNodeForInst(
813 const CallInfo &C) {
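 // A call is tracked in at most one of the two maps, so the short-circuit ||
 // only attempts the second erase if the call was not found in the first map.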
814 AllocationCallToContextNodeMap.erase(C) ||
815 NonAllocationCallToContextNodeMap.erase(C);
816 assert(!AllocationCallToContextNodeMap.count(C) &&
817 !NonAllocationCallToContextNodeMap.count(C));
818}
819
820template <typename DerivedCCG, typename FuncTy, typename CallTy>
821void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
822 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
823 unsigned int ContextId) {
824 for (auto &Edge : CallerEdges) {
825 if (Edge->Caller == Caller) {
826 Edge->AllocTypes |= (uint8_t)AllocType;
827 Edge->getContextIds().insert(ContextId);
828 return;
829 }
830 }
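 // No existing edge to Caller was found; create a new edge carrying just this
 // context id and register it on both this node and the caller.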
831 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
832 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
833 CallerEdges.push_back(Edge);
834 Caller->CalleeEdges.push_back(Edge);
835}
836
837template <typename DerivedCCG, typename FuncTy, typename CallTy>
838void CallsiteContextGraph<
839 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
840 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
841 auto Edge = *EI;
842 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
843 assert(Edge->ContextIds.empty());
844 Edge->Callee->eraseCallerEdge(Edge.get());
845 EI = Node->CalleeEdges.erase(EI);
846 } else
847 ++EI;
848 }
849}
850
851template <typename DerivedCCG, typename FuncTy, typename CallTy>
852typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
853CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
854 findEdgeFromCallee(const ContextNode *Callee) {
855 for (const auto &Edge : CalleeEdges)
856 if (Edge->Callee == Callee)
857 return Edge.get();
858 return nullptr;
859}
860
861template <typename DerivedCCG, typename FuncTy, typename CallTy>
862typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
863CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
864 findEdgeFromCaller(const ContextNode *Caller) {
865 for (const auto &Edge : CallerEdges)
866 if (Edge->Caller == Caller)
867 return Edge.get();
868 return nullptr;
869}
870
871template <typename DerivedCCG, typename FuncTy, typename CallTy>
872void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
873 eraseCalleeEdge(const ContextEdge *Edge) {
874 auto EI = llvm::find_if(
875 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
876 return CalleeEdge.get() == Edge;
877 });
878 assert(EI != CalleeEdges.end());
879 CalleeEdges.erase(EI);
880}
881
882template <typename DerivedCCG, typename FuncTy, typename CallTy>
883void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
884 eraseCallerEdge(const ContextEdge *Edge) {
885 auto EI = llvm::find_if(
886 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
887 return CallerEdge.get() == Edge;
888 });
889 assert(EI != CallerEdges.end());
890 CallerEdges.erase(EI);
891}
892
893template <typename DerivedCCG, typename FuncTy, typename CallTy>
894uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
895 DenseSet<uint32_t> &ContextIds) {
896 uint8_t BothTypes =
897 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
898 uint8_t AllocType = (uint8_t)AllocationType::None;
899 for (auto Id : ContextIds) {
900 AllocType |= (uint8_t)ContextIdToAllocationType[Id];
901 // Bail early if alloc type reached both, no further refinement.
902 if (AllocType == BothTypes)
903 return AllocType;
904 }
905 return AllocType;
906}
907
908template <typename DerivedCCG, typename FuncTy, typename CallTy>
909uint8_t
910CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
911 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
912 uint8_t BothTypes =
913 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
914 uint8_t AllocType = (uint8_t)AllocationType::None;
915 for (auto Id : Node1Ids) {
916 if (!Node2Ids.count(Id))
917 continue;
918 AllocType |= (uint8_t)ContextIdToAllocationType[Id];
919 // Bail early if alloc type reached both, no further refinement.
920 if (AllocType == BothTypes)
921 return AllocType;
922 }
923 return AllocType;
924}
925
926template <typename DerivedCCG, typename FuncTy, typename CallTy>
927uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
928 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
929 if (Node1Ids.size() < Node2Ids.size())
930 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
931 else
932 return intersectAllocTypesImpl(Node2Ids, Node1Ids);
933}
934
935template <typename DerivedCCG, typename FuncTy, typename CallTy>
936typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
937CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
938 CallInfo Call, const FuncTy *F) {
939 assert(!getNodeForAlloc(Call));
940 NodeOwner.push_back(
941 std::make_unique<ContextNode>(/*IsAllocation=*/true, Call));
942 ContextNode *AllocNode = NodeOwner.back().get();
943 AllocationCallToContextNodeMap[Call] = AllocNode;
944 NodeToCallingFunc[AllocNode] = F;
946 // Use LastContextId as a unique id for MIB allocation nodes.
946 AllocNode->OrigStackOrAllocId = LastContextId;
947 // Alloc type should be updated as we add in the MIBs. We should assert
948 // afterwards that it is not still None.
949 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
950
951 return AllocNode;
952}
953
954template <typename DerivedCCG, typename FuncTy, typename CallTy>
955template <class NodeT, class IteratorT>
956void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
957 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
958 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType) {
959 // Treating the hot alloc type as NotCold before the disambiguation for "hot"
960 // is done.
961 if (AllocType == AllocationType::Hot)
962 AllocType = AllocationType::NotCold;
963
964 ContextIdToAllocationType[++LastContextId] = AllocType;
965
966 // Update alloc type and context ids for this MIB.
967 AllocNode->AllocTypes |= (uint8_t)AllocType;
968 AllocNode->ContextIds.insert(LastContextId);
969
970 // Now add or update nodes for each stack id in alloc's context.
971 // Later when processing the stack ids on non-alloc callsites we will adjust
972 // for any inlining in the context.
973 ContextNode *PrevNode = AllocNode;
974 // Look for recursion (direct recursion should have been collapsed by
975 // module summary analysis, here we should just be detecting mutual
976 // recursion). Mark these nodes so we don't try to clone.
977 SmallSet<uint64_t, 8> StackIdSet;
978 // Skip any stack ids already on the allocation call itself, i.e. the prefix
979 // shared with its callsite metadata (due to inlining at the allocation).
979 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
980 ContextIter != StackContext.end(); ++ContextIter) {
981 auto StackId = getStackId(*ContextIter);
982 ContextNode *StackNode = getNodeForStackId(StackId);
983 if (!StackNode) {
984 NodeOwner.push_back(
985 std::make_unique<ContextNode>(/*IsAllocation=*/false));
986 StackNode = NodeOwner.back().get();
987 StackEntryIdToContextNodeMap[StackId] = StackNode;
988 StackNode->OrigStackOrAllocId = StackId;
989 }
990 auto Ins = StackIdSet.insert(StackId);
991 if (!Ins.second)
992 StackNode->Recursive = true;
993 StackNode->ContextIds.insert(LastContextId);
994 StackNode->AllocTypes |= (uint8_t)AllocType;
995 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
996 PrevNode = StackNode;
997 }
998}
999
1000template <typename DerivedCCG, typename FuncTy, typename CallTy>
1002CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1003 const DenseSet<uint32_t> &StackSequenceContextIds,
1004 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1005 DenseSet<uint32_t> NewContextIds;
1006 for (auto OldId : StackSequenceContextIds) {
1007 NewContextIds.insert(++LastContextId);
1008 OldToNewContextIds[OldId].insert(LastContextId);
1009 assert(ContextIdToAllocationType.count(OldId));
1010 // The new context has the same allocation type as original.
1011 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1012 }
1013 return NewContextIds;
1014}
1015
1016template <typename DerivedCCG, typename FuncTy, typename CallTy>
1017void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1018 propagateDuplicateContextIds(
1019 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1020 // Build a set of duplicated context ids corresponding to the input id set.
1021 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1022 DenseSet<uint32_t> NewIds;
1023 for (auto Id : ContextIds)
1024 if (auto NewId = OldToNewContextIds.find(Id);
1025 NewId != OldToNewContextIds.end())
1026 NewIds.insert(NewId->second.begin(), NewId->second.end());
1027 return NewIds;
1028 };
1029
1030 // Recursively update context ids sets along caller edges.
1031 auto UpdateCallers = [&](ContextNode *Node,
1032 DenseSet<const ContextEdge *> &Visited,
1033 auto &&UpdateCallers) -> void {
1034 for (const auto &Edge : Node->CallerEdges) {
1035 auto Inserted = Visited.insert(Edge.get());
1036 if (!Inserted.second)
1037 continue;
1038 ContextNode *NextNode = Edge->Caller;
1039 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1040 // Only need to recursively iterate to NextNode via this caller edge if
1041 // it resulted in any added ids to NextNode.
1042 if (!NewIdsToAdd.empty()) {
1043 Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
1044 NextNode->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
1045 UpdateCallers(NextNode, Visited, UpdateCallers);
1046 }
1047 }
1048 };
1049
1050 DenseSet<const ContextEdge *> Visited;
1051 for (auto &Entry : AllocationCallToContextNodeMap) {
1052 auto *Node = Entry.second;
1053 // Update ids on the allocation nodes before calling the recursive
1054 // update along caller edges, since this simplifies the logic during
1055 // that traversal.
1056 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Node->ContextIds);
1057 Node->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
1058 UpdateCallers(Node, Visited, UpdateCallers);
1059 }
1060}
1061
1062template <typename DerivedCCG, typename FuncTy, typename CallTy>
1063void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1064 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee) {
1065 // Make a copy of the context ids, since this will be adjusted below as they
1066 // are moved.
1067 DenseSet<uint32_t> RemainingContextIds = NewNode->ContextIds;
1068 auto &OrigEdges =
1069 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1070 // Increment iterator in loop so that we can remove edges as needed.
1071 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1072 auto Edge = *EI;
1073 // Remove any matching context ids from Edge and return the set that were
1074 // found and removed; these become the new edge's context ids. Also update
1075 // the remaining (not found) ids.
1076 DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds;
1077 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1078 NotFoundContextIds);
1079 RemainingContextIds.swap(NotFoundContextIds);
1080 // If no matching context ids for this edge, skip it.
1081 if (NewEdgeContextIds.empty()) {
1082 ++EI;
1083 continue;
1084 }
1085 if (TowardsCallee) {
1086 auto NewEdge = std::make_shared<ContextEdge>(
1087 Edge->Callee, NewNode, computeAllocType(NewEdgeContextIds),
1088 NewEdgeContextIds);
1089 NewNode->CalleeEdges.push_back(NewEdge);
1090 NewEdge->Callee->CallerEdges.push_back(NewEdge);
1091 } else {
1092 auto NewEdge = std::make_shared<ContextEdge>(
1093 NewNode, Edge->Caller, computeAllocType(NewEdgeContextIds),
1094 NewEdgeContextIds);
1095 NewNode->CallerEdges.push_back(NewEdge);
1096 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1097 }
1098 // Remove old edge if context ids empty.
1099 if (Edge->getContextIds().empty()) {
1100 if (TowardsCallee) {
1101 Edge->Callee->eraseCallerEdge(Edge.get());
1102 EI = OrigNode->CalleeEdges.erase(EI);
1103 } else {
1104 Edge->Caller->eraseCalleeEdge(Edge.get());
1105 EI = OrigNode->CallerEdges.erase(EI);
1106 }
1107 continue;
1108 }
1109 ++EI;
1110 }
1111}
1112
1113template <typename DerivedCCG, typename FuncTy, typename CallTy>
1114void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1115 assignStackNodesPostOrder(ContextNode *Node,
1116 DenseSet<const ContextNode *> &Visited,
1117 DenseMap<uint64_t, std::vector<CallContextInfo>>
1118 &StackIdToMatchingCalls) {
1119 auto Inserted = Visited.insert(Node);
1120 if (!Inserted.second)
1121 return;
1122 // Post order traversal. Iterate over a copy since we may add nodes and
1123 // therefore new callers during the recursive call, invalidating any
1124 // iterator over the original edge vector. We don't need to process these
1125 // new nodes as they were already processed on creation.
1126 auto CallerEdges = Node->CallerEdges;
1127 for (auto &Edge : CallerEdges) {
1128 // Skip any that have been removed during the recursion.
1129 if (!Edge)
1130 continue;
1131 assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls);
1132 }
1133
1134 // If this node's stack id is in the map, update the graph to contain new
1135 // nodes representing any inlining at interior callsites. Note we move the
1136 // associated context ids over to the new nodes.
1137
1138 // Ignore this node if it is for an allocation or we didn't record any
1139 // stack id lists ending at it.
1140 if (Node->IsAllocation ||
1141 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1142 return;
1143
1144 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1145 // Handle the simple case first. A single call with a single stack id.
1146 // In this case there is no need to create any new context nodes, simply
1147 // assign the context node for stack id to this Call.
1148 if (Calls.size() == 1) {
1149 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1150 if (Ids.size() == 1) {
1151 assert(SavedContextIds.empty());
1152 // It should be this Node
1153 assert(Node == getNodeForStackId(Ids[0]));
1154 if (Node->Recursive)
1155 return;
1156 Node->setCall(Call);
1157 NonAllocationCallToContextNodeMap[Call] = Node;
1158 NodeToCallingFunc[Node] = Func;
1159 return;
1160 }
1161 }
1162
1163 // Find the node for the last stack id, which should be the same
1164 // across all calls recorded for this id, and is this node's id.
1165 uint64_t LastId = Node->OrigStackOrAllocId;
1166 ContextNode *LastNode = getNodeForStackId(LastId);
1167 // We should only have kept stack ids that had nodes.
1168 assert(LastNode);
1169
1170 for (unsigned I = 0; I < Calls.size(); I++) {
1171 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1172 // Skip any for which we didn't assign any ids, these don't get a node in
1173 // the graph.
1174 if (SavedContextIds.empty())
1175 continue;
1176
1177 assert(LastId == Ids.back());
1178
1179 ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1180 assert(FirstNode);
1181
1182 // Recompute the context ids for this stack id sequence (the
1183 // intersection of the context ids of the corresponding nodes).
1184 // Start with the ids we saved in the map for this call, which could be
1185 // duplicated context ids. We have to recompute as we might have overlap
1186 // between the saved context ids for different last nodes, and may have
1187 // already removed them during the post order traversal.
1188 set_intersect(SavedContextIds, FirstNode->ContextIds);
1189 ContextNode *PrevNode = nullptr;
1190 for (auto Id : Ids) {
1191 ContextNode *CurNode = getNodeForStackId(Id);
1192 // We should only have kept stack ids that had nodes and weren't
1193 // recursive.
1194 assert(CurNode);
1195 assert(!CurNode->Recursive);
1196 if (!PrevNode) {
1197 PrevNode = CurNode;
1198 continue;
1199 }
1200 auto *Edge = CurNode->findEdgeFromCallee(PrevNode);
1201 if (!Edge) {
1202 SavedContextIds.clear();
1203 break;
1204 }
1205 PrevNode = CurNode;
1206 set_intersect(SavedContextIds, Edge->getContextIds());
1207
1208 // If we now have no context ids for clone, skip this call.
1209 if (SavedContextIds.empty())
1210 break;
1211 }
1212 if (SavedContextIds.empty())
1213 continue;
1214
1215 // Create new context node.
1216 NodeOwner.push_back(
1217 std::make_unique<ContextNode>(/*IsAllocation=*/false, Call));
1218 ContextNode *NewNode = NodeOwner.back().get();
1219 NodeToCallingFunc[NewNode] = Func;
1220 NonAllocationCallToContextNodeMap[Call] = NewNode;
1221 NewNode->ContextIds = SavedContextIds;
1222 NewNode->AllocTypes = computeAllocType(NewNode->ContextIds);
1223
1224 // Connect to callees of innermost stack frame in inlined call chain.
1225 // This updates context ids for FirstNode's callees to reflect those
1226 // moved to NewNode.
1227 connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true);
1228
1229 // Connect to callers of outermost stack frame in inlined call chain.
1230 // This updates context ids for LastNode's callers to reflect those
1231 // moved to NewNode.
1232 connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false);
1233
1234 // Now we need to remove context ids from edges/nodes between First and
1235 // Last Node.
1236 PrevNode = nullptr;
1237 for (auto Id : Ids) {
1238 ContextNode *CurNode = getNodeForStackId(Id);
1239 // We should only have kept stack ids that had nodes.
1240 assert(CurNode);
1241
1242 // Remove the context ids moved to NewNode from CurNode, and the
1243 // edge from the prior node.
1244 set_subtract(CurNode->ContextIds, NewNode->ContextIds);
1245 if (PrevNode) {
1246 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1247 assert(PrevEdge);
1248 set_subtract(PrevEdge->getContextIds(), NewNode->ContextIds);
1249 if (PrevEdge->getContextIds().empty()) {
1250 PrevNode->eraseCallerEdge(PrevEdge);
1251 CurNode->eraseCalleeEdge(PrevEdge);
1252 }
1253 }
1254 PrevNode = CurNode;
1255 }
1256 }
1257}
1258
1259template <typename DerivedCCG, typename FuncTy, typename CallTy>
1260void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
1261 // Map of stack id to all calls with that as the last (outermost caller)
1262 // callsite id that has a context node (some might not due to pruning
1263 // performed during matching of the allocation profile contexts).
1264 // The CallContextInfo contains the Call and a list of its stack ids with
1265 // ContextNodes, the function containing Call, and the set of context ids
1266 // the analysis will eventually identify for use in any new node created
1267 // for that callsite.
1268 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
1269 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1270 for (auto &Call : CallsWithMetadata) {
1271 // Ignore allocations, already handled.
1272 if (AllocationCallToContextNodeMap.count(Call))
1273 continue;
1274 auto StackIdsWithContextNodes =
1275 getStackIdsWithContextNodesForCall(Call.call());
1276 // If there were no nodes created for MIBs on allocs (maybe this was in
1277 // the unambiguous part of the MIB stack that was pruned), ignore.
1278 if (StackIdsWithContextNodes.empty())
1279 continue;
1280 // Otherwise, record this Call along with the list of ids for the last
1281 // (outermost caller) stack id with a node.
1282 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1283 {Call.call(), StackIdsWithContextNodes, Func, {}});
1284 }
1285 }
1286
1287 // First make a pass through all stack ids that correspond to a call,
1288 // as identified in the above loop. Compute the context ids corresponding to
1289 // each of these calls when they correspond to multiple stack ids due to
1290 // inlining. Perform any duplication of context ids required when
1291 // there is more than one call with the same stack ids. Their (possibly newly
1292 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1293 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1294 for (auto &It : StackIdToMatchingCalls) {
1295 auto &Calls = It.getSecond();
1296 // Skip single calls with a single stack id. These don't need a new node.
1297 if (Calls.size() == 1) {
1298 auto &Ids = std::get<1>(Calls[0]);
1299 if (Ids.size() == 1)
1300 continue;
1301 }
1302 // In order to do the best and maximal matching of inlined calls to context
1303 // node sequences we will sort the vectors of stack ids in descending order
1304 // of length, and within each length, lexicographically by stack id. The
1305 // latter is so that we can specially handle calls that have identical stack
1306 // id sequences (either due to cloning or artificially because of the MIB
1307 // context pruning).
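 // E.g. (illustrative) the stack id sequences would be ordered:
 //   [1, 2, 3], [1, 2], [1, 3], [2], [3]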
1308 std::stable_sort(Calls.begin(), Calls.end(),
1309 [](const CallContextInfo &A, const CallContextInfo &B) {
1310 auto &IdsA = std::get<1>(A);
1311 auto &IdsB = std::get<1>(B);
1312 return IdsA.size() > IdsB.size() ||
1313 (IdsA.size() == IdsB.size() && IdsA < IdsB);
1314 });
1315
1316 // Find the node for the last stack id, which should be the same
1317 // across all calls recorded for this id, and is the id for this
1318 // entry in the StackIdToMatchingCalls map.
1319 uint64_t LastId = It.getFirst();
1320 ContextNode *LastNode = getNodeForStackId(LastId);
1321 // We should only have kept stack ids that had nodes.
1322 assert(LastNode);
1323
1324 if (LastNode->Recursive)
1325 continue;
1326
1327 // Initialize the context ids with the last node's. We will subsequently
1328 // refine the context ids by computing the intersection along all edges.
1329 DenseSet<uint32_t> LastNodeContextIds = LastNode->ContextIds;
1330 assert(!LastNodeContextIds.empty());
1331
1332 for (unsigned I = 0; I < Calls.size(); I++) {
1333 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1334 assert(SavedContextIds.empty());
1335 assert(LastId == Ids.back());
1336
1337 // First compute the context ids for this stack id sequence (the
1338 // intersection of the context ids of the corresponding nodes).
1339 // Start with the remaining saved ids for the last node.
1340 assert(!LastNodeContextIds.empty());
1341 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
1342
1343 ContextNode *PrevNode = LastNode;
1344 ContextNode *CurNode = LastNode;
1345 bool Skip = false;
1346
1347 // Iterate backwards through the stack Ids, starting after the last Id
1348 // in the list, which was handled once outside for all Calls.
1349 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1350 auto Id = *IdIter;
1351 CurNode = getNodeForStackId(Id);
1352 // We should only have kept stack ids that had nodes.
1353 assert(CurNode);
1354
1355 if (CurNode->Recursive) {
1356 Skip = true;
1357 break;
1358 }
1359
1360 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1361 // If there is no edge then the nodes belong to different MIB contexts,
1362 // and we should skip this inlined context sequence. For example, this
1363 // particular inlined context may include stack ids A->B, and we may
1364 // indeed have nodes for both A and B, but it is possible that they were
1365 // never profiled in sequence in a single MIB for any allocation (i.e.
1366 // we might have profiled an allocation that involves the callsite A,
1367 // but through a different one of its callee callsites, and we might
1368 // have profiled an allocation that involves callsite B, but reached
1369 // from a different caller callsite).
1370 if (!Edge) {
1371 Skip = true;
1372 break;
1373 }
1374 PrevNode = CurNode;
1375
1376 // Update the context ids, which is the intersection of the ids along
1377 // all edges in the sequence.
1378 set_intersect(StackSequenceContextIds, Edge->getContextIds());
1379
1380 // If we now have no context ids for clone, skip this call.
1381 if (StackSequenceContextIds.empty()) {
1382 Skip = true;
1383 break;
1384 }
1385 }
1386 if (Skip)
1387 continue;
1388
1389 // If some of this call's stack ids did not have corresponding nodes (due
1390 // to pruning), don't include any context ids for contexts that extend
1391 // beyond these nodes. Otherwise we would be matching part of unrelated /
1392 // not fully matching stack contexts. To do this, subtract any context ids
1393 // found in caller nodes of the last node found above.
1394 if (Ids.back() != getLastStackId(Call)) {
1395 for (const auto &PE : LastNode->CallerEdges) {
1396 set_subtract(StackSequenceContextIds, PE->getContextIds());
1397 if (StackSequenceContextIds.empty())
1398 break;
1399 }
1400 // If we now have no context ids for the clone, skip this call.
1401 if (StackSequenceContextIds.empty())
1402 continue;
1403 }
1404
1405 // Check if the next set of stack ids is the same (since the Calls vector
1406 // of tuples is sorted by the stack ids we can just look at the next one).
1407 bool DuplicateContextIds = false;
1408 if (I + 1 < Calls.size()) {
1409 auto NextIds = std::get<1>(Calls[I + 1]);
1410 DuplicateContextIds = Ids == NextIds;
1411 }
1412
1413 // If we don't have duplicate context ids, then we can assign all the
1414 // context ids computed for the original node sequence to this call.
1415 // If there are duplicate calls with the same stack ids then we synthesize
1416 // new context ids that are duplicates of the originals. These are
1417 // assigned to SavedContextIds, which is a reference into the map entry
1418 // for this call, allowing us to access these ids later on.
1419 OldToNewContextIds.reserve(OldToNewContextIds.size() +
1420 StackSequenceContextIds.size());
1421 SavedContextIds =
1422 DuplicateContextIds
1423 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
1424 : StackSequenceContextIds;
1425 assert(!SavedContextIds.empty());
1426
1427 if (!DuplicateContextIds) {
1428 // Update saved last node's context ids to remove those that are
1429 // assigned to other calls, so that it is ready for the next call at
1430 // this stack id.
1431 set_subtract(LastNodeContextIds, StackSequenceContextIds);
1432 if (LastNodeContextIds.empty())
1433 break;
1434 }
1435 }
1436 }
1437
1438 // Propagate the duplicate context ids over the graph.
1439 propagateDuplicateContextIds(OldToNewContextIds);
1440
1441 if (VerifyCCG)
1442 check();
1443
1444 // Now perform a post-order traversal over the graph, starting with the
1445 // allocation nodes, essentially processing nodes from callers to callees.
1446 // For any that contains an id in the map, update the graph to contain new
1447 // nodes representing any inlining at interior callsites. Note we move the
1448 // associated context ids over to the new nodes.
1449 DenseSet<const ContextNode *> Visited;
1450 for (auto &Entry : AllocationCallToContextNodeMap)
1451 assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls);
1452}
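// Illustrative note (editorial, not part of the upstream source): for a
// callsite whose inlined callsite sequence is, say, [A, B] (hypothetical
// stack ids), the StackIdToMatchingCalls entry is keyed by the last id B, and
// the loop above narrows the saved context ids to the intersection of the
// context ids along the chain of stack-node edges for that sequence. Only
// those contexts are moved to the node synthesized in
// assignStackNodesPostOrder.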
1453
1454uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
1455 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1456 Call->getMetadata(LLVMContext::MD_callsite));
1457 return CallsiteContext.back();
1458}
1459
1460uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
1461 assert(isa<CallsiteInfo *>(Call.getBase()));
1462 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1463 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1464 // Need to convert index into stack id.
1465 return Index.getStackIdAtIndex(CallsiteContext.back());
1466}
1467
1468static const std::string MemProfCloneSuffix = ".memprof.";
1469
1470static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
1471 // We use CloneNo == 0 to refer to the original version, which doesn't get
1472 // renamed with a suffix.
1473 if (!CloneNo)
1474 return Base.str();
1475 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
1476}
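// For illustration: getMemProfFuncName("foo", 0) yields "foo" (clone 0 is the
// original and is never suffixed), while getMemProfFuncName("foo", 2) yields
// "foo.memprof.2". The function name "foo" here is hypothetical.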
1477
1478std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
1479 const Instruction *Call,
1480 unsigned CloneNo) const {
1481 return (Twine(Call->getFunction()->getName()) + " -> " +
1482 cast<CallBase>(Call)->getCalledFunction()->getName())
1483 .str();
1484}
1485
1486std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
1487 const IndexCall &Call,
1488 unsigned CloneNo) const {
1489 auto VI = FSToVIMap.find(Func);
1490 assert(VI != FSToVIMap.end());
1491 if (isa<AllocInfo *>(Call.getBase()))
1492 return (VI->second.name() + " -> alloc").str();
1493 else {
1494 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call.getBase());
1495 return (VI->second.name() + " -> " +
1496 getMemProfFuncName(Callsite->Callee.name(),
1497 Callsite->Clones[CloneNo]))
1498 .str();
1499 }
1500}
1501
1502std::vector<uint64_t>
1503ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
1504 Instruction *Call) {
1505 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1506 Call->getMetadata(LLVMContext::MD_callsite));
1507 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
1508 CallsiteContext);
1509}
1510
1511std::vector<uint64_t>
1512IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
1513 assert(isa<CallsiteInfo *>(Call.getBase()));
1514 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1515 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1516 return getStackIdsWithContextNodes<CallsiteInfo,
1517 SmallVector<unsigned>::const_iterator>(
1518 CallsiteContext);
1519}
1520
1521template <typename DerivedCCG, typename FuncTy, typename CallTy>
1522template <class NodeT, class IteratorT>
1523std::vector<uint64_t>
1524CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
1525 CallStack<NodeT, IteratorT> &CallsiteContext) {
1526 std::vector<uint64_t> StackIds;
1527 for (auto IdOrIndex : CallsiteContext) {
1528 auto StackId = getStackId(IdOrIndex);
1529 ContextNode *Node = getNodeForStackId(StackId);
1530 if (!Node)
1531 break;
1532 StackIds.push_back(StackId);
1533 }
1534 return StackIds;
1535}
1536
1537ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
1538 Module &M,
1539 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
1540 : Mod(M), OREGetter(OREGetter) {
1541 for (auto &F : M) {
1542 std::vector<CallInfo> CallsWithMetadata;
1543 for (auto &BB : F) {
1544 for (auto &I : BB) {
1545 if (!isa<CallBase>(I))
1546 continue;
1547 if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
1548 CallsWithMetadata.push_back(&I);
1549 auto *AllocNode = addAllocNode(&I, &F);
1550 auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
1551 assert(CallsiteMD);
1552 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
1553 // Add all of the MIBs and their stack nodes.
1554 for (auto &MDOp : MemProfMD->operands()) {
1555 auto *MIBMD = cast<const MDNode>(MDOp);
1556 MDNode *StackNode = getMIBStackNode(MIBMD);
1557 assert(StackNode);
1558 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
1559 addStackNodesForMIB<MDNode, MDNode::op_iterator>(
1560 AllocNode, StackContext, CallsiteContext,
1561 getMIBAllocType(MIBMD));
1562 }
1563 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
1564 // Memprof and callsite metadata on memory allocations are no longer
1565 // needed.
1566 I.setMetadata(LLVMContext::MD_memprof, nullptr);
1567 I.setMetadata(LLVMContext::MD_callsite, nullptr);
1568 }
1569 // For callsite metadata, add to list for this function for later use.
1570 else if (I.getMetadata(LLVMContext::MD_callsite))
1571 CallsWithMetadata.push_back(&I);
1572 }
1573 }
1574 if (!CallsWithMetadata.empty())
1575 FuncToCallsWithMetadata[&F] = CallsWithMetadata;
1576 }
1577
1578 if (DumpCCG) {
1579 dbgs() << "CCG before updating call stack chains:\n";
1580 dbgs() << *this;
1581 }
1582
1583 if (ExportToDot)
1584 exportToDot("prestackupdate");
1585
1586 updateStackNodes();
1587
1588 handleCallsitesWithMultipleTargets();
1589
1590 // Strip off remaining callsite metadata, no longer needed.
1591 for (auto &FuncEntry : FuncToCallsWithMetadata)
1592 for (auto &Call : FuncEntry.second)
1593 Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
1594}
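// Illustrative sketch (hypothetical metadata ids, not from a real profile) of
// the IR shape this constructor walks before stripping the metadata:
//   %p = call ptr @malloc(i64 %sz), !memprof !0, !callsite !3
//   !0 = !{!1}                 ; list of MIB nodes
//   !1 = !{!2, !"cold"}        ; one MIB: stack id list plus allocation type
//   !2 = !{i64 1234, i64 5678} ; profiled allocation context
//   !3 = !{i64 1234}           ; inlined callsite ids on the allocation call
// Each MIB adds a chain of stack nodes to the graph via addStackNodesForMIB.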
1595
1596IndexCallsiteContextGraph::IndexCallsiteContextGraph(
1597 ModuleSummaryIndex &Index,
1598 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1599 isPrevailing)
1600 : Index(Index), isPrevailing(isPrevailing) {
1601 for (auto &I : Index) {
1602 auto VI = Index.getValueInfo(I);
1603 for (auto &S : VI.getSummaryList()) {
1604 // We should only add the prevailing nodes. Otherwise we may try to clone
1605 // in a weak copy that won't be linked (and may be different than the
1606 // prevailing version).
1607 // We only keep the memprof summary on the prevailing copy now when
1608 // building the combined index, as a space optimization; however, don't
1609 // rely on this optimization. The linker doesn't resolve local linkage
1610 // values so don't check whether those are prevailing.
1611 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
1612 !isPrevailing(VI.getGUID(), S.get()))
1613 continue;
1614 auto *FS = dyn_cast<FunctionSummary>(S.get());
1615 if (!FS)
1616 continue;
1617 std::vector<CallInfo> CallsWithMetadata;
1618 if (!FS->allocs().empty()) {
1619 for (auto &AN : FS->mutableAllocs()) {
1620 // This can happen because of recursion elimination handling that
1621 // currently exists in ModuleSummaryAnalysis. Skip these for now.
1622 // We still added them to the summary because we need to be able to
1623 // correlate properly in applyImport in the backends.
1624 if (AN.MIBs.empty())
1625 continue;
1626 CallsWithMetadata.push_back({&AN});
1627 auto *AllocNode = addAllocNode({&AN}, FS);
1628 // Pass an empty CallStack to the CallsiteContext (second)
1629 // parameter, since for ThinLTO we already collapsed out the inlined
1630 // stack ids on the allocation call during ModuleSummaryAnalysis.
1631 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
1632 EmptyContext;
1633 // Now add all of the MIBs and their stack nodes.
1634 for (auto &MIB : AN.MIBs) {
1635 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
1636 StackContext(&MIB);
1637 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
1638 AllocNode, StackContext, EmptyContext, MIB.AllocType);
1639 }
1640 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
1641 // Initialize version 0 on the summary alloc node to the current alloc
1642 // type, unless it has both types, in which case make it default, so
1643 // that if we aren't able to clone, the original version always ends up
1644 // with the default allocation behavior.
1645 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
1646 }
1647 }
1648 // For callsite metadata, add to list for this function for later use.
1649 if (!FS->callsites().empty())
1650 for (auto &SN : FS->mutableCallsites())
1651 CallsWithMetadata.push_back({&SN});
1652
1653 if (!CallsWithMetadata.empty())
1654 FuncToCallsWithMetadata[FS] = CallsWithMetadata;
1655
1656 if (!FS->allocs().empty() || !FS->callsites().empty())
1657 FSToVIMap[FS] = VI;
1658 }
1659 }
1660
1661 if (DumpCCG) {
1662 dbgs() << "CCG before updating call stack chains:\n";
1663 dbgs() << *this;
1664 }
1665
1666 if (ExportToDot)
1667 exportToDot("prestackupdate");
1668
1669 updateStackNodes();
1670
1671 handleCallsitesWithMultipleTargets();
1672}
1673
1674template <typename DerivedCCG, typename FuncTy, typename CallTy>
1675void CallsiteContextGraph<DerivedCCG, FuncTy,
1676 CallTy>::handleCallsitesWithMultipleTargets() {
1677 // Look for and workaround callsites that call multiple functions.
1678 // This can happen for indirect calls, which need better handling, and in
1679 // more rare cases (e.g. macro expansion).
1680 // TODO: To fix this for indirect calls we will want to perform speculative
1681 // devirtualization using either the normal PGO info with ICP, or using the
1682 // information in the profiled MemProf contexts. We can do this prior to
1683 // this transformation for regular LTO, and for ThinLTO we can simulate that
1684 // effect in the summary and perform the actual speculative devirtualization
1685 // while cloning in the ThinLTO backend.
1686
1687 // Keep track of the new nodes synthesized for discovered tail calls missing
1688 // from the profiled contexts.
1689 MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
1690
1691 for (auto Entry = NonAllocationCallToContextNodeMap.begin();
1692 Entry != NonAllocationCallToContextNodeMap.end();) {
1693 auto *Node = Entry->second;
1694 assert(Node->Clones.empty());
1695 // Check all node callees and see if in the same function.
1696 bool Removed = false;
1697 auto Call = Node->Call.call();
1698 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1699 auto Edge = *EI;
1700 if (!Edge->Callee->hasCall()) {
1701 ++EI;
1702 continue;
1703 }
1704 assert(NodeToCallingFunc.count(Edge->Callee));
1705 // Check if the called function matches that of the callee node.
1706 if (calleesMatch(Call, EI, TailCallToContextNodeMap))
1707 continue;
1708 RemovedEdgesWithMismatchedCallees++;
1709 // Work around by setting Node to have a null call, so it gets
1710 // skipped during cloning. Otherwise assignFunctions will assert
1711 // because its data structures are not designed to handle this case.
1712 Entry = NonAllocationCallToContextNodeMap.erase(Entry);
1713 Node->setCall(CallInfo());
1714 Removed = true;
1715 break;
1716 }
1717 if (!Removed)
1718 Entry++;
1719 }
1720
1721 // Add the new nodes after the above loop so that the iteration is not
1722 // invalidated.
1723 for (auto &[Call, Node] : TailCallToContextNodeMap)
1724 NonAllocationCallToContextNodeMap[Call] = Node;
1725}
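// Illustrative example (hypothetical functions): the profile may record a
// context A -> C, while in the IR A calls B and B tail-calls C. calleesMatch
// then searches for a unique tail-call chain through B, synthesizes a node
// for B's call to C, and the map handling above splices that node in between
// A's and C's nodes.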
1726
1727uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1728 // In the Module (IR) case this is already the Id.
1729 return IdOrIndex;
1730}
1731
1732uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1733 // In the Index case this is an index into the stack id list in the summary
1734 // index, convert it to an Id.
1735 return Index.getStackIdAtIndex(IdOrIndex);
1736}
1737
1738template <typename DerivedCCG, typename FuncTy, typename CallTy>
1739bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
1740 CallTy Call, EdgeIter &EI,
1741 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
1742 auto Edge = *EI;
1743 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
1744 const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
1745 // Will be populated in order of callee to caller if we find a chain of tail
1746 // calls between the profiled caller and callee.
1747 std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
1748 if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
1749 FoundCalleeChain)) {
1750 ++EI;
1751 return false;
1752 }
1753
1754 // The usual case where the profiled callee matches that of the IR/summary.
1755 if (FoundCalleeChain.empty()) {
1756 ++EI;
1757 return true;
1758 }
1759
1760 auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
1761 auto *CurEdge = Callee->findEdgeFromCaller(Caller);
1762 // If there is already an edge between these nodes, simply update it and
1763 // return.
1764 if (CurEdge) {
1765 CurEdge->ContextIds.insert(Edge->ContextIds.begin(),
1766 Edge->ContextIds.end());
1767 CurEdge->AllocTypes |= Edge->AllocTypes;
1768 return;
1769 }
1770 // Otherwise, create a new edge and insert it into the caller and callee
1771 // lists.
1772 auto NewEdge = std::make_shared<ContextEdge>(
1773 Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
1774 Callee->CallerEdges.push_back(NewEdge);
1775 if (Caller == Edge->Caller) {
1776 // If we are inserting the new edge into the current edge's caller, insert
1777 // the new edge before the current iterator position, and then increment
1778 // back to the current edge.
1779 EI = Caller->CalleeEdges.insert(EI, NewEdge);
1780 ++EI;
1781 assert(*EI == Edge &&
1782 "Iterator position not restored after insert and increment");
1783 } else
1784 Caller->CalleeEdges.push_back(NewEdge);
1785 };
1786
1787 // Create new nodes for each found callee and connect in between the profiled
1788 // caller and callee.
1789 auto *CurCalleeNode = Edge->Callee;
1790 for (auto &[NewCall, Func] : FoundCalleeChain) {
1791 ContextNode *NewNode = nullptr;
1792 // First check if we have already synthesized a node for this tail call.
1793 if (TailCallToContextNodeMap.count(NewCall)) {
1794 NewNode = TailCallToContextNodeMap[NewCall];
1795 NewNode->ContextIds.insert(Edge->ContextIds.begin(),
1796 Edge->ContextIds.end());
1797 NewNode->AllocTypes |= Edge->AllocTypes;
1798 } else {
1799 FuncToCallsWithMetadata[Func].push_back({NewCall});
1800 // Create Node and record node info.
1801 NodeOwner.push_back(
1802 std::make_unique<ContextNode>(/*IsAllocation=*/false, NewCall));
1803 NewNode = NodeOwner.back().get();
1804 NodeToCallingFunc[NewNode] = Func;
1805 TailCallToContextNodeMap[NewCall] = NewNode;
1806 NewNode->ContextIds = Edge->ContextIds;
1807 NewNode->AllocTypes = Edge->AllocTypes;
1808 }
1809
1810 // Hook up node to its callee node
1811 AddEdge(NewNode, CurCalleeNode);
1812
1813 CurCalleeNode = NewNode;
1814 }
1815
1816 // Hook up edge's original caller to new callee node.
1817 AddEdge(Edge->Caller, CurCalleeNode);
1818
1819 // Remove old edge
1820 Edge->Callee->eraseCallerEdge(Edge.get());
1821 EI = Edge->Caller->CalleeEdges.erase(EI);
1822
1823 return true;
1824}
1825
1826bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
1827 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
1828 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
1829 bool &FoundMultipleCalleeChains) {
1830 // Stop recursive search if we have already explored the maximum specified
1831 // depth.
1832 if (Depth > TailCallSearchDepth)
1833 return false;
1834
1835 auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
1836 FoundCalleeChain.push_back({Callsite, F});
1837 };
1838
1839 auto *CalleeFunc = dyn_cast<Function>(CurCallee);
1840 if (!CalleeFunc) {
1841 auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
1842 assert(Alias);
1843 CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
1844 assert(CalleeFunc);
1845 }
1846
1847 // Look for tail calls in this function, and check if they either call the
1848 // profiled callee directly, or indirectly (via a recursive search).
1849 // Only succeed if there is a single unique tail call chain found between the
1850 // profiled caller and callee, otherwise we could perform incorrect cloning.
1851 bool FoundSingleCalleeChain = false;
1852 for (auto &BB : *CalleeFunc) {
1853 for (auto &I : BB) {
1854 auto *CB = dyn_cast<CallBase>(&I);
1855 if (!CB || !CB->isTailCall())
1856 continue;
1857 auto *CalledValue = CB->getCalledOperand();
1858 auto *CalledFunction = CB->getCalledFunction();
1859 if (CalledValue && !CalledFunction) {
1860 CalledValue = CalledValue->stripPointerCasts();
1861 // Stripping pointer casts can reveal a called function.
1862 CalledFunction = dyn_cast<Function>(CalledValue);
1863 }
1864 // Check if this is an alias to a function. If so, get the
1865 // called aliasee for the checks below.
1866 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
1867 assert(!CalledFunction &&
1868 "Expected null called function in callsite for alias");
1869 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
1870 }
1871 if (!CalledFunction)
1872 continue;
1873 if (CalledFunction == ProfiledCallee) {
1874 if (FoundSingleCalleeChain) {
1875 FoundMultipleCalleeChains = true;
1876 return false;
1877 }
1878 FoundSingleCalleeChain = true;
1879 FoundProfiledCalleeCount++;
1880 FoundProfiledCalleeDepth += Depth;
1881 if (Depth > FoundProfiledCalleeMaxDepth)
1882 FoundProfiledCalleeMaxDepth = Depth;
1883 SaveCallsiteInfo(&I, CalleeFunc);
1884 } else if (findProfiledCalleeThroughTailCalls(
1885 ProfiledCallee, CalledFunction, Depth + 1,
1886 FoundCalleeChain, FoundMultipleCalleeChains)) {
1887 if (FoundMultipleCalleeChains)
1888 return false;
1889 if (FoundSingleCalleeChain) {
1890 FoundMultipleCalleeChains = true;
1891 return false;
1892 }
1893 FoundSingleCalleeChain = true;
1894 SaveCallsiteInfo(&I, CalleeFunc);
1895 }
1896 }
1897 }
1898
1899 return FoundSingleCalleeChain;
1900}
1901
1902bool ModuleCallsiteContextGraph::calleeMatchesFunc(
1903 Instruction *Call, const Function *Func, const Function *CallerFunc,
1904 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
1905 auto *CB = dyn_cast<CallBase>(Call);
1906 if (!CB->getCalledOperand())
1907 return false;
1908 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
1909 auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
1910 if (CalleeFunc == Func)
1911 return true;
1912 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
1913 if (Alias && Alias->getAliasee() == Func)
1914 return true;
1915
1916 // Recursively search for the profiled callee through tail calls starting with
1917 // the actual Callee. The discovered tail call chain is saved in
1918 // FoundCalleeChain, and we will fixup the graph to include these callsites
1919 // after returning.
1920 // FIXME: We will currently redo the same recursive walk if we find the same
1921 // mismatched callee from another callsite. We can improve this with more
1922 // bookkeeping of the created chain of new nodes for each mismatch.
1923 unsigned Depth = 1;
1924 bool FoundMultipleCalleeChains = false;
1925 if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
1926 FoundCalleeChain,
1927 FoundMultipleCalleeChains)) {
1928 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
1929 << Func->getName() << " from " << CallerFunc->getName()
1930 << " that actually called " << CalleeVal->getName()
1931 << (FoundMultipleCalleeChains
1932 ? " (found multiple possible chains)"
1933 : "")
1934 << "\n");
1935 if (FoundMultipleCalleeChains)
1936 FoundProfiledCalleeNonUniquelyCount++;
1937 return false;
1938 }
1939
1940 return true;
1941}
1942
1943bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
1944 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
1945 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
1946 bool &FoundMultipleCalleeChains) {
1947 // Stop recursive search if we have already explored the maximum specified
1948 // depth.
1949 if (Depth > TailCallSearchDepth)
1950 return false;
1951
1952 auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
1953 // Make a CallsiteInfo for each discovered callee, if one hasn't already
1954 // been synthesized.
1955 if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
1956 !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
1957 // StackIds is empty (we don't have debug info available in the index for
1958 // these callsites)
1959 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
1960 std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
1961 CallsiteInfo *NewCallsiteInfo =
1962 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
1963 FoundCalleeChain.push_back({NewCallsiteInfo, FS});
1964 };
1965
1966 // Look for tail calls in this function, and check if they either call the
1967 // profiled callee directly, or indirectly (via a recursive search).
1968 // Only succeed if there is a single unique tail call chain found between the
1969 // profiled caller and callee, otherwise we could perform incorrect cloning.
1970 bool FoundSingleCalleeChain = false;
1971 for (auto &S : CurCallee.getSummaryList()) {
1972 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
1973 !isPrevailing(CurCallee.getGUID(), S.get()))
1974 continue;
1975 auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
1976 if (!FS)
1977 continue;
1978 auto FSVI = CurCallee;
1979 auto *AS = dyn_cast<AliasSummary>(S.get());
1980 if (AS)
1981 FSVI = AS->getAliaseeVI();
1982 for (auto &CallEdge : FS->calls()) {
1983 if (!CallEdge.second.hasTailCall())
1984 continue;
1985 if (CallEdge.first == ProfiledCallee) {
1986 if (FoundSingleCalleeChain) {
1987 FoundMultipleCalleeChains = true;
1988 return false;
1989 }
1990 FoundSingleCalleeChain = true;
1991 FoundProfiledCalleeCount++;
1992 FoundProfiledCalleeDepth += Depth;
1993 if (Depth > FoundProfiledCalleeMaxDepth)
1994 FoundProfiledCalleeMaxDepth = Depth;
1995 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
1996 // Add FS to FSToVIMap in case it isn't already there.
1997 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
1998 FSToVIMap[FS] = FSVI;
1999 } else if (findProfiledCalleeThroughTailCalls(
2000 ProfiledCallee, CallEdge.first, Depth + 1,
2001 FoundCalleeChain, FoundMultipleCalleeChains)) {
2002 if (FoundMultipleCalleeChains)
2003 return false;
2004 if (FoundSingleCalleeChain) {
2005 FoundMultipleCalleeChains = true;
2006 return false;
2007 }
2008 FoundSingleCalleeChain = true;
2009 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2010 // Add FS to FSToVIMap in case it isn't already there.
2011 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2012 FSToVIMap[FS] = FSVI;
2013 }
2014 }
2015 }
2016
2017 return FoundSingleCalleeChain;
2018}
2019
2020bool IndexCallsiteContextGraph::calleeMatchesFunc(
2021 IndexCall &Call, const FunctionSummary *Func,
2022 const FunctionSummary *CallerFunc,
2023 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
2024 ValueInfo Callee =
2025 dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee;
2026 // If there is no summary list then this is a call to an externally defined
2027 // symbol.
2028 AliasSummary *Alias =
2029 Callee.getSummaryList().empty()
2030 ? nullptr
2031 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
2032 assert(FSToVIMap.count(Func));
2033 auto FuncVI = FSToVIMap[Func];
2034 if (Callee == FuncVI ||
2035 // If callee is an alias, check the aliasee, since only function
2036 // summary base objects will contain the stack node summaries and thus
2037 // get a context node.
2038 (Alias && Alias->getAliaseeVI() == FuncVI))
2039 return true;
2040
2041 // Recursively search for the profiled callee through tail calls starting with
2042 // the actual Callee. The discovered tail call chain is saved in
2043 // FoundCalleeChain, and we will fixup the graph to include these callsites
2044 // after returning.
2045 // FIXME: We will currently redo the same recursive walk if we find the same
2046 // mismatched callee from another callsite. We can improve this with more
2047 // bookkeeping of the created chain of new nodes for each mismatch.
2048 unsigned Depth = 1;
2049 bool FoundMultipleCalleeChains = false;
2050 if (!findProfiledCalleeThroughTailCalls(
2051 FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
2052 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
2053 << " from " << FSToVIMap[CallerFunc]
2054 << " that actually called " << Callee
2055 << (FoundMultipleCalleeChains
2056 ? " (found multiple possible chains)"
2057 : "")
2058 << "\n");
2059 if (FoundMultipleCalleeChains)
2060 FoundProfiledCalleeNonUniquelyCount++;
2061 return false;
2062 }
2063
2064 return true;
2065}
2066
2067static std::string getAllocTypeString(uint8_t AllocTypes) {
2068 if (!AllocTypes)
2069 return "None";
2070 std::string Str;
2071 if (AllocTypes & (uint8_t)AllocationType::NotCold)
2072 Str += "NotCold";
2073 if (AllocTypes & (uint8_t)AllocationType::Cold)
2074 Str += "Cold";
2075 return Str;
2076}
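// For illustration: AllocTypes of (NotCold | Cold) prints as "NotColdCold",
// Cold alone prints as "Cold", and a value of 0 prints as "None".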
2077
2078template <typename DerivedCCG, typename FuncTy, typename CallTy>
2079void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
2080 const {
2081 print(dbgs());
2082 dbgs() << "\n";
2083}
2084
2085template <typename DerivedCCG, typename FuncTy, typename CallTy>
2086void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
2087 raw_ostream &OS) const {
2088 OS << "Node " << this << "\n";
2089 OS << "\t";
2090 printCall(OS);
2091 if (Recursive)
2092 OS << " (recursive)";
2093 OS << "\n";
2094 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
2095 OS << "\tContextIds:";
2096 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2097 std::sort(SortedIds.begin(), SortedIds.end());
2098 for (auto Id : SortedIds)
2099 OS << " " << Id;
2100 OS << "\n";
2101 OS << "\tCalleeEdges:\n";
2102 for (auto &Edge : CalleeEdges)
2103 OS << "\t\t" << *Edge << "\n";
2104 OS << "\tCallerEdges:\n";
2105 for (auto &Edge : CallerEdges)
2106 OS << "\t\t" << *Edge << "\n";
2107 if (!Clones.empty()) {
2108 OS << "\tClones: ";
2109 FieldSeparator FS;
2110 for (auto *Clone : Clones)
2111 OS << FS << Clone;
2112 OS << "\n";
2113 } else if (CloneOf) {
2114 OS << "\tClone of " << CloneOf << "\n";
2115 }
2116}
2117
2118template <typename DerivedCCG, typename FuncTy, typename CallTy>
2119void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
2120 const {
2121 print(dbgs());
2122 dbgs() << "\n";
2123}
2124
2125template <typename DerivedCCG, typename FuncTy, typename CallTy>
2126void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
2127 raw_ostream &OS) const {
2128 OS << "Edge from Callee " << Callee << " to Caller: " << Caller
2129 << " AllocTypes: " << getAllocTypeString(AllocTypes);
2130 OS << " ContextIds:";
2131 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2132 std::sort(SortedIds.begin(), SortedIds.end());
2133 for (auto Id : SortedIds)
2134 OS << " " << Id;
2135}
2136
2137template <typename DerivedCCG, typename FuncTy, typename CallTy>
2138void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
2139 print(dbgs());
2140}
2141
2142template <typename DerivedCCG, typename FuncTy, typename CallTy>
2143void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
2144 raw_ostream &OS) const {
2145 OS << "Callsite Context Graph:\n";
2146 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2147 for (const auto Node : nodes<GraphType>(this)) {
2148 if (Node->isRemoved())
2149 continue;
2150 Node->print(OS);
2151 OS << "\n";
2152 }
2153}
2154
2155template <typename DerivedCCG, typename FuncTy, typename CallTy>
2156static void checkEdge(
2157 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
2158 // Confirm that alloc type is not None and that we have at least one context
2159 // id.
2160 assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
2161 assert(!Edge->ContextIds.empty());
2162}
2163
2164template <typename DerivedCCG, typename FuncTy, typename CallTy>
2165static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
2166 bool CheckEdges = true) {
2167 if (Node->isRemoved())
2168 return;
2169 // Node's context ids should be the union of both its callee and caller edge
2170 // context ids.
2171 if (Node->CallerEdges.size()) {
2172 auto EI = Node->CallerEdges.begin();
2173 auto &FirstEdge = *EI;
2174 EI++;
2175 DenseSet<uint32_t> CallerEdgeContextIds(FirstEdge->ContextIds);
2176 for (; EI != Node->CallerEdges.end(); EI++) {
2177 const auto &Edge = *EI;
2178 if (CheckEdges)
2179 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
2180 set_union(CallerEdgeContextIds, Edge->ContextIds);
2181 }
2182 // Node can have more context ids than callers if some contexts terminate at
2183 // node and some are longer.
2184 assert(Node->ContextIds == CallerEdgeContextIds ||
2185 set_is_subset(CallerEdgeContextIds, Node->ContextIds));
2186 }
2187 if (Node->CalleeEdges.size()) {
2188 auto EI = Node->CalleeEdges.begin();
2189 auto &FirstEdge = *EI;
2190 EI++;
2191 DenseSet<uint32_t> CalleeEdgeContextIds(FirstEdge->ContextIds);
2192 for (; EI != Node->CalleeEdges.end(); EI++) {
2193 const auto &Edge = *EI;
2194 if (CheckEdges)
2195 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
2196 set_union(CalleeEdgeContextIds, Edge->ContextIds);
2197 }
2198 assert(Node->ContextIds == CalleeEdgeContextIds);
2199 }
2200}
2201
2202template <typename DerivedCCG, typename FuncTy, typename CallTy>
2203void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
2204 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2205 for (const auto Node : nodes<GraphType>(this)) {
2206 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2207 for (auto &Edge : Node->CallerEdges)
2208 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
2209 }
2210}
2211
2212template <typename DerivedCCG, typename FuncTy, typename CallTy>
2213struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
2214 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2215 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
2216
2217 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
2218 static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
2219
2220 using nodes_iterator =
2221 mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
2222 decltype(&getNode)>;
2223
2224 static nodes_iterator nodes_begin(GraphType G) {
2225 return nodes_iterator(G->NodeOwner.begin(), &getNode);
2226 }
2227
2228 static nodes_iterator nodes_end(GraphType G) {
2229 return nodes_iterator(G->NodeOwner.end(), &getNode);
2230 }
2231
2232 static NodeRef getEntryNode(GraphType G) {
2233 return G->NodeOwner.begin()->get();
2234 }
2235
2236 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
2237 static const ContextNode<DerivedCCG, FuncTy, CallTy> *
2238 GetCallee(const EdgePtrTy &P) {
2239 return P->Callee;
2240 }
2241
2242 using ChildIteratorType =
2243 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
2244 DerivedCCG, FuncTy, CallTy>>>::const_iterator,
2245 decltype(&GetCallee)>;
2246
2247 static ChildIteratorType child_begin(NodeRef N) {
2248 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
2249 }
2250
2251 static ChildIteratorType child_end(NodeRef N) {
2252 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
2253 }
2254};
2255
2256template <typename DerivedCCG, typename FuncTy, typename CallTy>
2257struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
2258 : public DefaultDOTGraphTraits {
2259 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
2260
2261 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2262 using GTraits = GraphTraits<GraphType>;
2263 using NodeRef = typename GTraits::NodeRef;
2264 using ChildIteratorType = typename GTraits::ChildIteratorType;
2265
2266 static std::string getNodeLabel(NodeRef Node, GraphType G) {
2267 std::string LabelString =
2268 (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
2269 Twine(Node->OrigStackOrAllocId))
2270 .str();
2271 LabelString += "\n";
2272 if (Node->hasCall()) {
2273 auto Func = G->NodeToCallingFunc.find(Node);
2274 assert(Func != G->NodeToCallingFunc.end());
2275 LabelString +=
2276 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
2277 } else {
2278 LabelString += "null call";
2279 if (Node->Recursive)
2280 LabelString += " (recursive)";
2281 else
2282 LabelString += " (external)";
2283 }
2284 return LabelString;
2285 }
2286
2287 static std::string getNodeAttributes(NodeRef Node, GraphType) {
2288 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
2289 getContextIds(Node->ContextIds) + "\"")
2290 .str();
2292 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str();
2293 AttributeString += ",style=\"filled\"";
2294 if (Node->CloneOf) {
2295 AttributeString += ",color=\"blue\"";
2296 AttributeString += ",style=\"filled,bold,dashed\"";
2297 } else
2298 AttributeString += ",style=\"filled\"";
2299 return AttributeString;
2300 }
2301
2302 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
2303 GraphType) {
2304 auto &Edge = *(ChildIter.getCurrent());
2305 return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
2306 Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"")
2307 .str();
2308 }
2309
2310 // Since the NodeOwner list includes nodes that are no longer connected to
2311 // the graph, skip them here.
2312 static bool isNodeHidden(NodeRef Node, GraphType) {
2313 return Node->isRemoved();
2314 }
2315
2316private:
2317 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
2318 std::string IdString = "ContextIds:";
2319 if (ContextIds.size() < 100) {
2320 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2321 std::sort(SortedIds.begin(), SortedIds.end());
2322 for (auto Id : SortedIds)
2323 IdString += (" " + Twine(Id)).str();
2324 } else {
2325 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
2326 }
2327 return IdString;
2328 }
2329
2330 static std::string getColor(uint8_t AllocTypes) {
2331 if (AllocTypes == (uint8_t)AllocationType::NotCold)
2332 // Color "brown1" actually looks like a lighter red.
2333 return "brown1";
2334 if (AllocTypes == (uint8_t)AllocationType::Cold)
2335 return "cyan";
2336 if (AllocTypes ==
2337 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
2338 // Lighter purple.
2339 return "mediumorchid1";
2340 return "gray";
2341 }
2342
2343 static std::string getNodeId(NodeRef Node) {
2344 std::stringstream SStream;
2345 SStream << std::hex << "N0x" << (unsigned long long)Node;
2346 std::string Result = SStream.str();
2347 return Result;
2348 }
2349};
2350
2351template <typename DerivedCCG, typename FuncTy, typename CallTy>
2352void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
2353 std::string Label) const {
2354 WriteGraph(this, "", false, Label,
2355 DotFilePathPrefix + "ccg." + Label + ".dot");
2356}
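// For illustration (assuming the DotFilePathPrefix option is set to
// "/tmp/ccgs/", a hypothetical value): exportToDot("cloned") writes the graph
// to /tmp/ccgs/ccg.cloned.dot.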
2357
2358template <typename DerivedCCG, typename FuncTy, typename CallTy>
2359typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
2360CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
2361 const std::shared_ptr<ContextEdge> &Edge, EdgeIter *CallerEdgeI) {
2362 ContextNode *Node = Edge->Callee;
2363 NodeOwner.push_back(
2364 std::make_unique<ContextNode>(Node->IsAllocation, Node->Call));
2365 ContextNode *Clone = NodeOwner.back().get();
2366 Node->addClone(Clone);
2367 assert(NodeToCallingFunc.count(Node));
2368 NodeToCallingFunc[Clone] = NodeToCallingFunc[Node];
2369 moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true);
2370 return Clone;
2371}
2372
2373template <typename DerivedCCG, typename FuncTy, typename CallTy>
2374void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
2375 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
2376 ContextNode *NewCallee, EdgeIter *CallerEdgeI,
2377 bool NewClone) {
2378 // NewCallee and Edge's current callee must be clones of the same original
2379 // node (Edge's current callee may be the original node too).
2380 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
2381 auto &EdgeContextIds = Edge->getContextIds();
2382 ContextNode *OldCallee = Edge->Callee;
2383 if (CallerEdgeI)
2384 *CallerEdgeI = OldCallee->CallerEdges.erase(*CallerEdgeI);
2385 else
2386 OldCallee->eraseCallerEdge(Edge.get());
2387 Edge->Callee = NewCallee;
2388 NewCallee->CallerEdges.push_back(Edge);
2389 // Don't need to update Edge's context ids since we are simply reconnecting
2390 // it.
2391 set_subtract(OldCallee->ContextIds, EdgeContextIds);
2392 NewCallee->ContextIds.insert(EdgeContextIds.begin(), EdgeContextIds.end());
2393 NewCallee->AllocTypes |= Edge->AllocTypes;
2394 OldCallee->AllocTypes = computeAllocType(OldCallee->ContextIds);
2395 // OldCallee alloc type should be None iff its context id set is now empty.
2396 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
2397 OldCallee->ContextIds.empty());
2398 // Now walk the old callee node's callee edges and move Edge's context ids
2399 // over to the corresponding edge into the clone (which is created here if
2400 // this is a newly created clone).
2401 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
2402 // The context ids moving to the new callee are the subset of this edge's
2403 // context ids and the context ids on the caller edge being moved.
2404 DenseSet<uint32_t> EdgeContextIdsToMove =
2405 set_intersection(OldCalleeEdge->getContextIds(), EdgeContextIds);
2406 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
2407 OldCalleeEdge->AllocTypes =
2408 computeAllocType(OldCalleeEdge->getContextIds());
2409 if (!NewClone) {
2410 // Update context ids / alloc type on corresponding edge to NewCallee.
2411 // There is a chance this may not exist if we are reusing an existing
2412 // clone, specifically during function assignment, where we would have
2413 // removed none type edges after creating the clone. If we can't find
2414 // a corresponding edge there, fall through to the cloning below.
2415 if (auto *NewCalleeEdge =
2416 NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) {
2417 NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
2418 EdgeContextIdsToMove.end());
2419 NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
2420 continue;
2421 }
2422 }
2423 auto NewEdge = std::make_shared<ContextEdge>(
2424 OldCalleeEdge->Callee, NewCallee,
2425 computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
2426 NewCallee->CalleeEdges.push_back(NewEdge);
2427 NewEdge->Callee->CallerEdges.push_back(NewEdge);
2428 }
2429 if (VerifyCCG) {
2430 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
2431 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
2432 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
2433 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
2434 /*CheckEdges=*/false);
2435 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
2436 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
2437 /*CheckEdges=*/false);
2438 }
2439}
2440
2441template <typename DerivedCCG, typename FuncTy, typename CallTy>
2442void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
2443 recursivelyRemoveNoneTypeCalleeEdges(
2444 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
2445 auto Inserted = Visited.insert(Node);
2446 if (!Inserted.second)
2447 return;
2448
2449 removeNoneTypeCalleeEdges(Node);
2450
2451 for (auto *Clone : Node->Clones)
2452 recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
2453
2454 // The recursive call may remove some of this Node's caller edges.
2455 // Iterate over a copy and skip any that were removed.
2456 auto CallerEdges = Node->CallerEdges;
2457 for (auto &Edge : CallerEdges) {
2458 // Skip any that have been removed by an earlier recursive call.
2459 if (Edge->Callee == nullptr && Edge->Caller == nullptr) {
2460 assert(!std::count(Node->CallerEdges.begin(), Node->CallerEdges.end(),
2461 Edge));
2462 continue;
2463 }
2464 recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
2465 }
2466}
2467
2468template <typename DerivedCCG, typename FuncTy, typename CallTy>
2469void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
2470 DenseSet<const ContextNode *> Visited;
2471 for (auto &Entry : AllocationCallToContextNodeMap)
2472 identifyClones(Entry.second, Visited);
2473 Visited.clear();
2474 for (auto &Entry : AllocationCallToContextNodeMap)
2475 recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
2476 if (VerifyCCG)
2477 check();
2478}
2479
2480// Helper function to check whether an AllocType is cold, notcold, or both.
2481bool checkColdOrNotCold(uint8_t AllocType) {
2482 return (AllocType == (uint8_t)AllocationType::Cold) ||
2483 (AllocType == (uint8_t)AllocationType::NotCold) ||
2484 (AllocType ==
2485 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
2486}
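// For illustration: AllocationType behaves as a bitmask here, so a value of
// ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold)
// satisfies this check, while (uint8_t)AllocationType::None (zero) does not.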
2487
2488template <typename DerivedCCG, typename FuncTy, typename CallTy>
2489void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
2490 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
2491 if (VerifyNodes)
2492 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2493 assert(!Node->CloneOf);
2494
2495 // If Node has a null call, then either it wasn't found in the module (regular
2496 // LTO) or summary index (ThinLTO), or there were other conditions blocking
2497 // cloning (e.g. recursion, calls multiple targets, etc).
2498 // Do this here so that we don't try to recursively clone callers below, which
2499 // isn't useful at least for this node.
2500 if (!Node->hasCall())
2501 return;
2502
2503#ifndef NDEBUG
2504 auto Insert =
2505#endif
2506 Visited.insert(Node);
2507 // We should not have visited this node yet.
2508 assert(Insert.second);
2509 // The recursive call to identifyClones may delete the current edge from the
2510 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
2511 // in an iterator and having the recursive call erase from it. Other edges may
2512 // also get removed during the recursion, which will have null Callee and
2513 // Caller pointers (and are deleted later), so we skip those below.
2514 {
2515 auto CallerEdges = Node->CallerEdges;
2516 for (auto &Edge : CallerEdges) {
2517 // Skip any that have been removed by an earlier recursive call.
2518 if (Edge->Callee == nullptr && Edge->Caller == nullptr) {
2519 assert(!llvm::count(Node->CallerEdges, Edge));
2520 continue;
2521 }
2522 // Ignore any caller we previously visited via another edge.
2523 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
2524 identifyClones(Edge->Caller, Visited);
2525 }
2526 }
2527 }
2528
2529 // Check if we reached an unambiguous call or have only a single caller.
2530 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2531 return;
2532
2533 // We need to clone.
2534
2535 // Try to keep the original version as alloc type NotCold. This will make
2536 // cases with indirect calls or any other situation with an unknown call to
2537 // the original function get the default behavior. We do this by sorting the
2538 // CallerEdges of the Node we will clone by alloc type.
2539 //
2540 // Give NotCold edge the lowest sort priority so those edges are at the end of
2541 // the caller edges vector, and stay on the original version (since the below
2542 // code clones greedily until it finds all remaining edges have the same type
2543 // and leaves the remaining ones on the original Node).
2544 //
2545 // We shouldn't actually have any None type edges, so the sorting priority for
2546 // that is arbitrary, and we assert in that case below.
2547 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
2548 /*Cold*/ 1,
2549 /*NotColdCold*/ 2};
2550 std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(),
2551 [&](const std::shared_ptr<ContextEdge> &A,
2552 const std::shared_ptr<ContextEdge> &B) {
2553 // Nodes with non-empty context ids should be sorted before
2554 // those with empty context ids.
2555 if (A->ContextIds.empty())
2556 // Either B ContextIds are non-empty (in which case we
2557 // should return false because B < A), or B ContextIds
2558 // are empty, in which case they are equal, and we should
2559 // maintain the original relative ordering.
2560 return false;
2561 if (B->ContextIds.empty())
2562 return true;
2563
2564 if (A->AllocTypes == B->AllocTypes)
2565 // Use the first context id for each edge as a
2566 // tie-breaker.
2567 return *A->ContextIds.begin() < *B->ContextIds.begin();
2568 return AllocTypeCloningPriority[A->AllocTypes] <
2569 AllocTypeCloningPriority[B->AllocTypes];
2570 });
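// For illustration: with AllocTypeCloningPriority above, caller edges sort as
// Cold (1), then NotCold|Cold (2), then None (3, not expected), then NotCold
// (4). NotCold callers therefore end up at the back and tend to stay on the
// original node, which keeps the default (not cold) behavior.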
2571
2572 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2573
2574 // Iterate until we find no more opportunities for disambiguating the alloc
2575 // types via cloning. In most cases this loop will terminate once the Node
2576 // has a single allocation type, in which case no more cloning is needed.
2577 // We need to be able to remove Edge from CallerEdges, so we need to
2578 // adjust the iterator inside the loop.
2579 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
2580 auto CallerEdge = *EI;
2581
2582 // See if cloning the prior caller edge left this node with a single alloc
2583 // type or a single caller. In that case no more cloning of Node is needed.
2584 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2585 break;
2586
2587 // Compute the node callee edge alloc types corresponding to the context ids
2588 // for this caller edge.
2589 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
2590 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
2591 for (auto &CalleeEdge : Node->CalleeEdges)
2592 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
2593 CalleeEdge->getContextIds(), CallerEdge->getContextIds()));
2594
2595 // Don't clone if doing so will not disambiguate any alloc types amongst
2596 // caller edges (including the callee edges that would be cloned).
2597 // Otherwise we will simply move all edges to the clone.
2598 //
2599 // First check if by cloning we will disambiguate the caller allocation
2600 // type from node's allocation type. Query allocTypeToUse so that we don't
2601 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
2602 // neither of these should be None type.
2603 //
2604 // Then check if by cloning node at least one of the callee edges will be
2605 // disambiguated by splitting out different context ids.
2606 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
2607 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2608 if (allocTypeToUse(CallerEdge->AllocTypes) ==
2609 allocTypeToUse(Node->AllocTypes) &&
2610 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
2611 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
2612 ++EI;
2613 continue;
2614 }
2615
2616 // First see if we can use an existing clone. Check each clone and its
2617 // callee edges for matching alloc types.
2618 ContextNode *Clone = nullptr;
2619 for (auto *CurClone : Node->Clones) {
2620 if (allocTypeToUse(CurClone->AllocTypes) !=
2621 allocTypeToUse(CallerEdge->AllocTypes))
2622 continue;
2623
2624 if (!allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
2625 CalleeEdgeAllocTypesForCallerEdge, CurClone->CalleeEdges))
2626 continue;
2627 Clone = CurClone;
2628 break;
2629 }
2630
2631 // The edge iterator is adjusted when we move the CallerEdge to the clone.
2632 if (Clone)
2633 moveEdgeToExistingCalleeClone(CallerEdge, Clone, &EI);
2634 else
2635 Clone = moveEdgeToNewCalleeClone(CallerEdge, &EI);
2636
2637 assert(EI == Node->CallerEdges.end() ||
2638 Node->AllocTypes != (uint8_t)AllocationType::None);
2639 // Sanity check that no alloc types on clone or its edges are None.
2640 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
2641 }
2642
2643 // We should still have some context ids on the original Node.
2644 assert(!Node->ContextIds.empty());
2645
2646 // Sanity check that no alloc types on node or edges are None.
2647 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2648
2649 if (VerifyNodes)
2650 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2651}
2652
2653void ModuleCallsiteContextGraph::updateAllocationCall(
2654 CallInfo &Call, AllocationType AllocType) {
2655 std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
2656 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
2657 "memprof", AllocTypeString);
2658 cast<CallBase>(Call.call())->addFnAttr(A);
2659 OREGetter(Call.call()->getFunction())
2660 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
2661 << ore::NV("AllocationCall", Call.call()) << " in clone "
2662 << ore::NV("Caller", Call.call()->getFunction())
2663 << " marked with memprof allocation attribute "
2664 << ore::NV("Attribute", AllocTypeString));
2665}
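// Illustrative result (hypothetical IR): after this update the (possibly
// cloned) allocation call carries a "memprof" string function attribute, e.g.
//   %p = call ptr @malloc(i64 %sz) #1
//   attributes #1 = { "memprof"="cold" }
// which later stages can consume.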
2666
2667void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
2668 AllocationType AllocType) {
2669 auto *AI = Call.call().dyn_cast<AllocInfo *>();
2670 assert(AI);
2671 assert(AI->Versions.size() > Call.cloneNo());
2672 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
2673}
2674
2675void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
2676 FuncInfo CalleeFunc) {
2677 if (CalleeFunc.cloneNo() > 0)
2678 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
2679 OREGetter(CallerCall.call()->getFunction())
2680 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
2681 << ore::NV("Call", CallerCall.call()) << " in clone "
2682 << ore::NV("Caller", CallerCall.call()->getFunction())
2683 << " assigned to call function clone "
2684 << ore::NV("Callee", CalleeFunc.func()));
2685}
2686
2687void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
2688 FuncInfo CalleeFunc) {
2689 auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>();
2690 assert(CI &&
2691 "Caller cannot be an allocation which should not have profiled calls");
2692 assert(CI->Clones.size() > CallerCall.cloneNo());
2693 CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
2694}
2695
2696CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
2697 Instruction *>::FuncInfo
2698ModuleCallsiteContextGraph::cloneFunctionForCallsite(
2699 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
2700 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
2701 // Use existing LLVM facilities for cloning and obtaining Call in clone
2702 ValueToValueMapTy VMap;
2703 auto *NewFunc = CloneFunction(Func.func(), VMap);
2704 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
2705 assert(!Func.func()->getParent()->getFunction(Name));
2706 NewFunc->setName(Name);
2707 for (auto &Inst : CallsWithMetadataInFunc) {
2708 // This map always has the initial version in it.
2709 assert(Inst.cloneNo() == 0);
2710 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
2711 }
2712 OREGetter(Func.func())
2713 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
2714 << "created clone " << ore::NV("NewFunction", NewFunc));
2715 return {NewFunc, CloneNo};
2716}
2717
2718CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
2719 IndexCall>::FuncInfo
2720IndexCallsiteContextGraph::cloneFunctionForCallsite(
2721 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
2722 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
2723 // Check how many clones we have of Call (and therefore function).
2724 // The next clone number is the current size of versions array.
2725 // Confirm this matches the CloneNo provided by the caller, which is based on
2726 // the number of function clones we have.
2727 assert(CloneNo ==
2728 (Call.call().is<AllocInfo *>()
2729 ? Call.call().dyn_cast<AllocInfo *>()->Versions.size()
2730 : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size()));
2731 // Walk all the instructions in this function. Create a new version for
2732 // each (by adding an entry to the Versions/Clones summary array), and copy
2733 // over the version being called for the function clone being cloned here.
2734 // Additionally, add an entry to the CallMap for the new function clone,
2735 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
2736 // to the new call clone.
2737 for (auto &Inst : CallsWithMetadataInFunc) {
2738 // This map always has the initial version in it.
2739 assert(Inst.cloneNo() == 0);
2740 if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) {
2741 assert(AI->Versions.size() == CloneNo);
2742 // We assign the allocation type later (in updateAllocationCall), just add
2743 // an entry for it here.
2744 AI->Versions.push_back(0);
2745 } else {
2746 auto *CI = Inst.call().dyn_cast<CallsiteInfo *>();
2747 assert(CI && CI->Clones.size() == CloneNo);
2748 // We assign the clone number later (in updateCall), just add an entry for
2749 // it here.
2750 CI->Clones.push_back(0);
2751 }
2752 CallMap[Inst] = {Inst.call(), CloneNo};
2753 }
2754 return {Func.func(), CloneNo};
2755}
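// For illustration (hypothetical summary state): an AllocInfo whose Versions
// array is [NotCold] before cloning becomes [NotCold, 0] when clone 1 is
// created here; the placeholder 0 for the new version is filled in later by
// updateAllocationCall. CallsiteInfo::Clones grows the same way and is
// resolved by updateCall.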
2756
2757// This method assigns cloned callsites to functions, cloning the functions as
2758// needed. The assignment is greedy and proceeds roughly as follows:
2759//
2760// For each function Func:
2761// For each call with graph Node having clones:
2762// Initialize ClonesWorklist to Node and its clones
2763// Initialize NodeCloneCount to 0
2764// While ClonesWorklist is not empty:
2765// Clone = pop front ClonesWorklist
2766// NodeCloneCount++
2767// If Func has been cloned less than NodeCloneCount times:
2768// If NodeCloneCount is 1:
2769// Assign Clone to original Func
2770// Continue
2771// Create a new function clone
2772// If other callers not assigned to call a function clone yet:
2773// Assign them to call new function clone
2774// Continue
2775// Assign any other caller calling the cloned version to new clone
2776//
2777// For each caller of Clone:
2778// If caller is assigned to call a specific function clone:
2779// If we cannot assign Clone to that function clone:
2780// Create new callsite Clone NewClone
2781// Add NewClone to ClonesWorklist
2782// Continue
2783// Assign Clone to existing caller's called function clone
2784// Else:
2785// If Clone not already assigned to a function clone:
2786// Assign to first function clone without assignment
2787// Assign caller to selected function clone
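// Illustrative walk-through (hypothetical): suppose function Foo has one
// callsite whose Node gained a single clone. The worklist is {Node, Clone}.
// Node (NodeCloneCount == 1) is assigned to the original Foo; Clone
// (NodeCloneCount == 2) then requires a new function clone (e.g.
// Foo.memprof.1 in the IR case) created via cloneFunctionForCallsite, and
// callers already committed to a particular function clone keep or update
// their assignments per the rules above.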
2788template <typename DerivedCCG, typename FuncTy, typename CallTy>
2789bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
2790 bool Changed = false;
2791
2792 // Keep track of the assignment of nodes (callsites) to function clones they
2793 // call.
2794 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
2795
2796 // Update caller node to call function version CalleeFunc, by recording the
2797 // assignment in CallsiteToCalleeFuncCloneMap.
2798 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
2799 const FuncInfo &CalleeFunc) {
2800 assert(Caller->hasCall());
2801 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
2802 };
2803
2804 // Walk all functions for which we saw calls with memprof metadata, and handle
2805 // cloning for each of their calls.
2806 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
2807 FuncInfo OrigFunc(Func);
2808 // Map from each clone of OrigFunc to a map of remappings of each call of
2809 // interest (from original uncloned call to the corresponding cloned call in
2810 // that function clone).
2811 std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
2812 for (auto &Call : CallsWithMetadata) {
2813 ContextNode *Node = getNodeForInst(Call);
2814 // Skip call if we do not have a node for it (all uses of its stack ids
2815 // were either on inlined chains or pruned from the MIBs), or if we did
2816 // not create any clones for it.
2817 if (!Node || Node->Clones.empty())
2818 continue;
2819 assert(Node->hasCall() &&
2820 "Not having a call should have prevented cloning");
2821
2822 // Track the assignment of function clones to clones of the current
2823 // callsite Node being handled.
2824 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
2825
2826 // Assign callsite version CallsiteClone to function version FuncClone,
2827 // and also assign (possibly cloned) Call to CallsiteClone.
2828 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
2829 CallInfo &Call,
2830 ContextNode *CallsiteClone,
2831 bool IsAlloc) {
2832 // Record the clone of callsite node assigned to this function clone.
2833 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
2834
2835 assert(FuncClonesToCallMap.count(FuncClone));
2836 std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
2837 CallInfo CallClone(Call);
2838 if (CallMap.count(Call))
2839 CallClone = CallMap[Call];
2840 CallsiteClone->setCall(CallClone);
2841 };
2842
2843 // Keep track of the clones of callsite Node that need to be assigned to
2844 // function clones. This list may be expanded in the loop body below if we
2845 // find additional cloning is required.
2846 std::deque<ContextNode *> ClonesWorklist;
2847 // Ignore original Node if we moved all of its contexts to clones.
2848 if (!Node->ContextIds.empty())
2849 ClonesWorklist.push_back(Node);
2850 ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(),
2851 Node->Clones.end());
2852
2853 // Now walk through all of the clones of this callsite Node that we need,
2854 // and determine the assignment to a corresponding clone of the current
2855 // function (creating new function clones as needed).
2856 unsigned NodeCloneCount = 0;
2857 while (!ClonesWorklist.empty()) {
2858 ContextNode *Clone = ClonesWorklist.front();
2859 ClonesWorklist.pop_front();
2860 NodeCloneCount++;
2861 if (VerifyNodes)
2862 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
2863
2864 // Need to create a new function clone if we have more callsite clones
2865 // than existing function clones, which would have been assigned to an
2866 // earlier clone in the list (we assign callsite clones to function
2867 // clones greedily).
2868 if (FuncClonesToCallMap.size() < NodeCloneCount) {
2869 // If this is the first callsite copy, assign to original function.
2870 if (NodeCloneCount == 1) {
2871 // Since FuncClonesToCallMap is empty in this case, no clones have
2872 // been created for this function yet, and no callers should have
2873 // been assigned a function clone for this callee node yet.
2874 assert(llvm::none_of(
2875 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
2876 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
2877 }));
2878 // Initialize with empty call map, assign Clone to original function
2879 // and its callers, and skip to the next clone.
2880 FuncClonesToCallMap[OrigFunc] = {};
2881 AssignCallsiteCloneToFuncClone(
2882 OrigFunc, Call, Clone,
2883 AllocationCallToContextNodeMap.count(Call));
2884 for (auto &CE : Clone->CallerEdges) {
2885 // Ignore any caller that does not have a recorded callsite Call.
2886 if (!CE->Caller->hasCall())
2887 continue;
2888 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
2889 }
2890 continue;
2891 }
2892
2893 // First locate which copy of OrigFunc to clone again. If a caller
2894 // of this callsite clone was already assigned to call a particular
2895 // function clone, we need to redirect all of those callers to the
2896 // new function clone, and update their other callees within this
2897 // function.
2898 FuncInfo PreviousAssignedFuncClone;
2899 auto EI = llvm::find_if(
2900 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
2901 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
2902 });
2903 bool CallerAssignedToCloneOfFunc = false;
2904 if (EI != Clone->CallerEdges.end()) {
2905 const std::shared_ptr<ContextEdge> &Edge = *EI;
2906 PreviousAssignedFuncClone =
2907 CallsiteToCalleeFuncCloneMap[Edge->Caller];
2908 CallerAssignedToCloneOfFunc = true;
2909 }
2910
2911 // Clone function and save it along with the CallInfo map created
2912 // during cloning in the FuncClonesToCallMap.
2913 std::map<CallInfo, CallInfo> NewCallMap;
2914 unsigned CloneNo = FuncClonesToCallMap.size();
2915 assert(CloneNo > 0 && "Clone 0 is the original function, which "
2916 "should already exist in the map");
2917 FuncInfo NewFuncClone = cloneFunctionForCallsite(
2918 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
2919 FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
2920 FunctionClonesAnalysis++;
2921 Changed = true;
2922
2923 // If no caller callsites were already assigned to a clone of this
2924 // function, we can simply assign this clone to the new func clone
2925 // and update all callers to it, then skip to the next clone.
2926 if (!CallerAssignedToCloneOfFunc) {
2927 AssignCallsiteCloneToFuncClone(
2928 NewFuncClone, Call, Clone,
2929 AllocationCallToContextNodeMap.count(Call));
2930 for (auto &CE : Clone->CallerEdges) {
2931 // Ignore any caller that does not have a recorded callsite Call.
2932 if (!CE->Caller->hasCall())
2933 continue;
2934 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
2935 }
2936 continue;
2937 }
2938
2939 // We may need to do additional node cloning in this case.
2940 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
2941 // that were previously assigned to call PreviousAssignedFuncClone,
2942 // to record that they now call NewFuncClone.
2943 for (auto CE : Clone->CallerEdges) {
2944 // Skip any that have been removed on an earlier iteration.
2945 if (!CE)
2946 continue;
2947 // Ignore any caller that does not have a recorded callsite Call.
2948 if (!CE->Caller->hasCall())
2949 continue;
2950
2951 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
2952 // We subsequently fall through to later handling that
2953 // will perform any additional cloning required for
2954 // callers that were calling other function clones.
2955 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
2956 PreviousAssignedFuncClone)
2957 continue;
2958
2959 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
2960
2961 // If we are cloning a function that was already assigned to some
2962 // callers, then essentially we are creating new callsite clones
2963 // of the other callsites in that function that are reached by those
2964 // callers. Clone the other callees of the current callsite's caller
2965 // that were already assigned to PreviousAssignedFuncClone
2966 // accordingly. This is important since we subsequently update the
2967 // calls from the nodes in the graph and their assignments to callee
2968 // functions recorded in CallsiteToCalleeFuncCloneMap.
2969 for (auto CalleeEdge : CE->Caller->CalleeEdges) {
2970 // Skip any that have been removed on an earlier iteration when
2971 // cleaning up newly None type callee edges.
2972 if (!CalleeEdge)
2973 continue;
2974 ContextNode *Callee = CalleeEdge->Callee;
2975 // Skip the current callsite; we are looking for other
2976 // callsites Caller calls, as well as any that do not have a
2977 // recorded callsite Call.
2978 if (Callee == Clone || !Callee->hasCall())
2979 continue;
2980 ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
2981 removeNoneTypeCalleeEdges(NewClone);
2982 // Moving the edge may have resulted in some none type
2983 // callee edges on the original Callee.
2984 removeNoneTypeCalleeEdges(Callee);
2985 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
2986 // If the Callee node was already assigned to call a specific
2987 // function version, make sure its new clone is assigned to call
2988 // that same function clone.
2989 if (CallsiteToCalleeFuncCloneMap.count(Callee))
2990 RecordCalleeFuncOfCallsite(
2991 NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
2992 // Update NewClone with the new Call clone of this callsite's Call
2993 // created for the new function clone created earlier.
2994 // Recall that we have already ensured when building the graph
2995 // that each caller can only call callsites within the same
2996 // function, so we are guaranteed that Callee Call is in the
2997 // current OrigFunc.
2998 // CallMap is set up as indexed by original Call at clone 0.
2999 CallInfo OrigCall(Callee->getOrigNode()->Call);
3000 OrigCall.setCloneNo(0);
3001 std::map<CallInfo, CallInfo> &CallMap =
3002 FuncClonesToCallMap[NewFuncClone];
3003 assert(CallMap.count(OrigCall));
3004 CallInfo NewCall(CallMap[OrigCall]);
3005 assert(NewCall);
3006 NewClone->setCall(NewCall);
3007 }
3008 }
3009 // Fall through to handling below to perform the recording of the
3010 // function for this callsite clone. This enables handling of cases
3011 // where the callers were assigned to different clones of a function.
3012 }
3013
3014 // See if we can use an existing function clone. Walk through
3015 // all caller edges to see if any have already been assigned to
3016 // a clone of this callsite's function. If we can use it, do so. If not,
3017 // because that function clone is already assigned to a different clone
3018 // of this callsite, then we need to clone again.
3019 // Basically, this checking is needed to handle the case where different
3020 // caller functions/callsites may need versions of this function
3021 // containing different mixes of callsite clones across the different
3022 // callsites within the function. If that happens, we need to create
3023 // additional function clones to handle the various combinations.
3024 //
3025 // Keep track of any new clones of this callsite created by the
3026 // following loop, as well as any existing clone that we decided to
3027 // assign this clone to.
3028 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
3029 FuncInfo FuncCloneAssignedToCurCallsiteClone;
3030 // We need to be able to remove Edge from CallerEdges, so need to adjust
3031 // iterator in the loop.
3032 for (auto EI = Clone->CallerEdges.begin();
3033 EI != Clone->CallerEdges.end();) {
3034 auto Edge = *EI;
3035 // Ignore any caller that does not have a recorded callsite Call.
3036 if (!Edge->Caller->hasCall()) {
3037 EI++;
3038 continue;
3039 }
3040 // If this caller already assigned to call a version of OrigFunc, need
3041 // to ensure we can assign this callsite clone to that function clone.
3042 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
3043 FuncInfo FuncCloneCalledByCaller =
3044 CallsiteToCalleeFuncCloneMap[Edge->Caller];
3045 // First we need to confirm that this function clone is available
3046 // for use by this callsite node clone.
3047 //
3048 // While FuncCloneToCurNodeCloneMap is built only for this Node and
3049 // its callsite clones, one of those callsite clones X could have
3050 // been assigned to the same function clone called by Edge's caller
3051 // - if Edge's caller calls another callsite within Node's original
3052 // function, and that callsite has another caller reaching clone X.
3053 // We need to clone Node again in this case.
3054 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
3055 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
3056 Clone) ||
3057 // Detect when we have multiple callers of this callsite that
3058 // have already been assigned to specific, and different, clones
3059 // of OrigFunc (due to other unrelated callsites in Func they
3060 // reach via call contexts). Is this Clone of callsite Node
3061 // assigned to a different clone of OrigFunc? If so, clone Node
3062 // again.
3063 (FuncCloneAssignedToCurCallsiteClone &&
3064 FuncCloneAssignedToCurCallsiteClone !=
3065 FuncCloneCalledByCaller)) {
3066 // We need to use a different newly created callsite clone, in
3067 // order to assign it to another new function clone on a
3068 // subsequent iteration over the Clones array (adjusted below).
3069 // Note we specifically do not reset the
3070 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
3071 // when this new clone is processed later we know which version of
3072 // the function to copy (so that other callsite clones we have
3073 // assigned to that function clone are properly cloned over). See
3074 // comments in the function cloning handling earlier.
3075
3076 // Check if we already have cloned this callsite again while
3077 // walking through caller edges, for a caller calling the same
3078 // function clone. If so, we can move this edge to that new clone
3079 // rather than creating yet another new clone.
3080 if (FuncCloneToNewCallsiteCloneMap.count(
3081 FuncCloneCalledByCaller)) {
3082 ContextNode *NewClone =
3083 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
3084 moveEdgeToExistingCalleeClone(Edge, NewClone, &EI);
3085 // Cleanup any none type edges cloned over.
3086 removeNoneTypeCalleeEdges(NewClone);
3087 } else {
3088 // Create a new callsite clone.
3089 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI);
3090 removeNoneTypeCalleeEdges(NewClone);
3091 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
3092 NewClone;
3093 // Add to list of clones and process later.
3094 ClonesWorklist.push_back(NewClone);
3095 assert(EI == Clone->CallerEdges.end() ||
3096 Clone->AllocTypes != (uint8_t)AllocationType::None);
3097 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
3098 }
3099 // Moving the caller edge may have resulted in some none type
3100 // callee edges.
3101 removeNoneTypeCalleeEdges(Clone);
3102 // We will handle the newly created callsite clone in a subsequent
3103 // iteration over this Node's Clones. Continue here since we
3104 // already adjusted iterator EI while moving the edge.
3105 continue;
3106 }
3107
3108 // Otherwise, we can use the function clone already assigned to this
3109 // caller.
3110 if (!FuncCloneAssignedToCurCallsiteClone) {
3111 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
3112 // Assign Clone to FuncCloneCalledByCaller
3113 AssignCallsiteCloneToFuncClone(
3114 FuncCloneCalledByCaller, Call, Clone,
3115 AllocationCallToContextNodeMap.count(Call));
3116 } else
3117 // Don't need to do anything - callsite is already calling this
3118 // function clone.
3119 assert(FuncCloneAssignedToCurCallsiteClone ==
3120 FuncCloneCalledByCaller);
3121
3122 } else {
3123 // We have not already assigned this caller to a version of
3124 // OrigFunc. Do the assignment now.
3125
3126 // First check if we have already assigned this callsite clone to a
3127 // clone of OrigFunc for another caller during this iteration over
3128 // its caller edges.
3129 if (!FuncCloneAssignedToCurCallsiteClone) {
3130 // Find first function in FuncClonesToCallMap without an assigned
3131 // clone of this callsite Node. We should always have one
3132 // available at this point due to the earlier cloning when the
3133 // FuncClonesToCallMap size was smaller than the clone number.
3134 for (auto &CF : FuncClonesToCallMap) {
3135 if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
3136 FuncCloneAssignedToCurCallsiteClone = CF.first;
3137 break;
3138 }
3139 }
3140 assert(FuncCloneAssignedToCurCallsiteClone);
3141 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
3142 AssignCallsiteCloneToFuncClone(
3143 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
3144 AllocationCallToContextNodeMap.count(Call));
3145 } else
3146 assert(FuncCloneToCurNodeCloneMap
3147 [FuncCloneAssignedToCurCallsiteClone] == Clone);
3148 // Update callers to record function version called.
3149 RecordCalleeFuncOfCallsite(Edge->Caller,
3150 FuncCloneAssignedToCurCallsiteClone);
3151 }
3152
3153 EI++;
3154 }
3155 }
3156 if (VerifyCCG) {
3157 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
3158 for (const auto &PE : Node->CalleeEdges)
3159 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
3160 for (const auto &CE : Node->CallerEdges)
3161 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
3162 for (auto *Clone : Node->Clones) {
3163 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
3164 for (const auto &PE : Clone->CalleeEdges)
3165 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
3166 for (const auto &CE : Clone->CallerEdges)
3167 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
3168 }
3169 }
3170 }
3171 }
3172
3173 auto UpdateCalls = [&](ContextNode *Node,
3174 DenseSet<const ContextNode *> &Visited,
3175 auto &&UpdateCalls) {
3176 auto Inserted = Visited.insert(Node);
3177 if (!Inserted.second)
3178 return;
3179
3180 for (auto *Clone : Node->Clones)
3181 UpdateCalls(Clone, Visited, UpdateCalls);
3182
3183 for (auto &Edge : Node->CallerEdges)
3184 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
3185
3186 // Skip if either no call to update, or if we ended up with no context ids
3187 // (we moved all edges onto other clones).
3188 if (!Node->hasCall() || Node->ContextIds.empty())
3189 return;
3190
3191 if (Node->IsAllocation) {
3192 updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
3193 return;
3194 }
3195
3196 if (!CallsiteToCalleeFuncCloneMap.count(Node))
3197 return;
3198
3199 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
3200 updateCall(Node->Call, CalleeFunc);
3201 };
3202
3203 // Performs DFS traversal starting from allocation nodes to update calls to
3204 // reflect cloning decisions recorded earlier. For regular LTO this will
3205 // update the actual calls in the IR to call the appropriate function clone
3206 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
3207 // are recorded in the summary entries.
3208 DenseSet<const ContextNode *> Visited;
3209 for (auto &Entry : AllocationCallToContextNodeMap)
3210 UpdateCalls(Entry.second, Visited, UpdateCalls);
3211
3212 return Changed;
3213}
3214
3215 static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
3216 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
3217 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
3218 &FuncToAliasMap) {
3219 // The first "clone" is the original copy, we should only call this if we
3220 // needed to create new clones.
3221 assert(NumClones > 1);
3222 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
3223 VMaps.reserve(NumClones - 1);
3224 FunctionsClonedThinBackend++;
3225 for (unsigned I = 1; I < NumClones; I++) {
3226 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
3227 auto *NewF = CloneFunction(&F, *VMaps.back());
3228 FunctionClonesThinBackend++;
3229 // Strip memprof and callsite metadata from clone as they are no longer
3230 // needed.
3231 for (auto &BB : *NewF) {
3232 for (auto &Inst : BB) {
3233 Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
3234 Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
3235 }
3236 }
3237 std::string Name = getMemProfFuncName(F.getName(), I);
3238 auto *PrevF = M.getFunction(Name);
3239 if (PrevF) {
3240 // We might have created this when adjusting callsite in another
3241 // function. It should be a declaration.
3242 assert(PrevF->isDeclaration());
3243 NewF->takeName(PrevF);
3244 PrevF->replaceAllUsesWith(NewF);
3245 PrevF->eraseFromParent();
3246 } else
3247 NewF->setName(Name);
3248 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
3249 << "created clone " << ore::NV("NewFunction", NewF));
3250
3251 // Now handle aliases to this function, and clone those as well.
3252 if (!FuncToAliasMap.count(&F))
3253 continue;
3254 for (auto *A : FuncToAliasMap[&F]) {
3255 std::string Name = getMemProfFuncName(A->getName(), I);
3256 auto *PrevA = M.getNamedAlias(Name);
3257 auto *NewA = GlobalAlias::create(A->getValueType(),
3258 A->getType()->getPointerAddressSpace(),
3259 A->getLinkage(), Name, NewF);
3260 NewA->copyAttributesFrom(A);
3261 if (PrevA) {
3262 // We might have created this when adjusting callsite in another
3263 // function. It should be a declaration.
3264 assert(PrevA->isDeclaration());
3265 NewA->takeName(PrevA);
3266 PrevA->replaceAllUsesWith(NewA);
3267 PrevA->eraseFromParent();
3268 }
3269 }
3270 }
3271 return VMaps;
3272}
3273
3274// Locate the summary for F. This is complicated by the fact that it might
3275// have been internalized or promoted.
3276 static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
3277 const ModuleSummaryIndex *ImportSummary) {
3278 // FIXME: Ideally we would retain the original GUID in some fashion on the
3279 // function (e.g. as metadata), but for now do our best to locate the
3280 // summary without that information.
3281 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
3282 if (!TheFnVI)
3283 // See if theFn was internalized, by checking index directly with
3284 // original name (this avoids the name adjustment done by getGUID() for
3285 // internal symbols).
3286 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName()));
3287 if (TheFnVI)
3288 return TheFnVI;
3289 // Now query with the original name before any promotion was performed.
3290 StringRef OrigName =
3291 ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
3292 std::string OrigId = GlobalValue::getGlobalIdentifier(
3293 OrigName, GlobalValue::InternalLinkage, M.getSourceFileName());
3294 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
3295 if (TheFnVI)
3296 return TheFnVI;
3297 // Could be a promoted local imported from another module. We need to pass
3298 // down more info here to find the original module id. For now, try with
3299 // the OrigName which might have been stored in the OidGuidMap in the
3300 // index. This would not work if there were same-named locals in multiple
3301 // modules, however.
3302 auto OrigGUID =
3303 ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName));
3304 if (OrigGUID)
3305 TheFnVI = ImportSummary->getValueInfo(OrigGUID);
3306 return TheFnVI;
3307}
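//
// Summary of the lookup order above (descriptive restatement): first try the
// GUID of the function's current name, then the unadjusted GUID of the plain
// name (in case the function was internalized after the summary was built),
// then the GUID of the original pre-promotion local identifier rebuilt from
// this module's source file name, and finally the OidGuidMap fallback for a
// promoted local that was imported from another module.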
3308
3309bool MemProfContextDisambiguation::applyImport(Module &M) {
3310 assert(ImportSummary);
3311 bool Changed = false;
3312
3313 auto IsMemProfClone = [](const Function &F) {
3314 return F.getName().contains(MemProfCloneSuffix);
3315 };
3316
3317 // We also need to clone any aliases that reference cloned functions, because
3318 // the modified callsites may invoke via the alias. Keep track of the aliases
3319 // for each function.
3320 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
3321 FuncToAliasMap;
3322 for (auto &A : M.aliases()) {
3323 auto *Aliasee = A.getAliaseeObject();
3324 if (auto *F = dyn_cast<Function>(Aliasee))
3325 FuncToAliasMap[F].insert(&A);
3326 }
3327
3328 for (auto &F : M) {
3329 if (F.isDeclaration() || IsMemProfClone(F))
3330 continue;
3331
3332 OptimizationRemarkEmitter ORE(&F);
3333
3334 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
3335 bool ClonesCreated = false;
3336 unsigned NumClonesCreated = 0;
3337 auto CloneFuncIfNeeded = [&](unsigned NumClones) {
3338 // We should at least have version 0 which is the original copy.
3339 assert(NumClones > 0);
3340 // If only one copy needed use original.
3341 if (NumClones == 1)
3342 return;
3343 // If we already performed cloning of this function, confirm that the
3344 // requested number of clones matches (the thin link should ensure the
3345 // number of clones for each constituent callsite is consistent within
3346 // each function), before returning.
3347 if (ClonesCreated) {
3348 assert(NumClonesCreated == NumClones);
3349 return;
3350 }
3351 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
3352 // The first "clone" is the original copy, which doesn't have a VMap.
3353 assert(VMaps.size() == NumClones - 1);
3354 Changed = true;
3355 ClonesCreated = true;
3356 NumClonesCreated = NumClones;
3357 };
3358
3359 auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
3360 Function *CalledFunction) {
3361 // Perform cloning if not yet done.
3362 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
3363
3364 // Should have skipped indirect calls via mayHaveMemprofSummary.
3365 assert(CalledFunction);
3366 assert(!IsMemProfClone(*CalledFunction));
3367
3368 // Update the calls per the summary info.
3369 // Save orig name since it gets updated in the first iteration
3370 // below.
3371 auto CalleeOrigName = CalledFunction->getName();
3372 for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
3373 // Do nothing if this version calls the original version of its
3374 // callee.
3375 if (!StackNode.Clones[J])
3376 continue;
3377 auto NewF = M.getOrInsertFunction(
3378 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
3379 CalledFunction->getFunctionType());
3380 CallBase *CBClone;
3381 // Copy 0 is the original function.
3382 if (!J)
3383 CBClone = CB;
3384 else
3385 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
3386 CBClone->setCalledFunction(NewF);
3387 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
3388 << ore::NV("Call", CBClone) << " in clone "
3389 << ore::NV("Caller", CBClone->getFunction())
3390 << " assigned to call function clone "
3391 << ore::NV("Callee", NewF.getCallee()));
3392 }
3393 };
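//
// Illustrative example (hypothetical values) of the summary-driven update in
// CloneCallsite above: if StackNode.Clones == {0, 2, 1}, then copy 0 of this
// function keeps calling the original callee, copy 1 is redirected to callee
// clone 2, and copy 2 to callee clone 1. The redirect targets are obtained
// via getOrInsertFunction using the name getMemProfFuncName derives from the
// original callee name and clone number, so they may initially be
// declarations that are defined once the callee itself is cloned.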
3394
3395 // Locate the summary for F.
3396 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
3397 // If not found, this could be an imported local (see comment in
3398 // findValueInfoForFunc). Skip for now as it will be cloned in its original
3399 // module (where it would have been promoted to global scope so should
3400 // satisfy any reference in this module).
3401 if (!TheFnVI)
3402 continue;
3403
3404 auto *GVSummary =
3405 ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
3406 if (!GVSummary) {
3407 // Must have been imported, use the summary which matches the definition
3408 // (might be multiple if this was a linkonce_odr).
3409 auto SrcModuleMD = F.getMetadata("thinlto_src_module");
3410 assert(SrcModuleMD &&
3411 "enable-import-metadata is needed to emit thinlto_src_module");
3412 StringRef SrcModule =
3413 dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
3414 for (auto &GVS : TheFnVI.getSummaryList()) {
3415 if (GVS->modulePath() == SrcModule) {
3416 GVSummary = GVS.get();
3417 break;
3418 }
3419 }
3420 assert(GVSummary && GVSummary->modulePath() == SrcModule);
3421 }
3422
3423 // If this was an imported alias skip it as we won't have the function
3424 // summary, and it should be cloned in the original module.
3425 if (isa<AliasSummary>(GVSummary))
3426 continue;
3427
3428 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
3429
3430 if (FS->allocs().empty() && FS->callsites().empty())
3431 continue;
3432
3433 auto SI = FS->callsites().begin();
3434 auto AI = FS->allocs().begin();
3435
3436 // To handle callsite infos synthesized for tail calls which have missing
3437 // frames in the profiled context, map callee VI to the synthesized callsite
3438 // info.
3439 DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
3440 // Iterate the callsites for this function in reverse, since we place all
3441 // those synthesized for tail calls at the end.
3442 for (auto CallsiteIt = FS->callsites().rbegin();
3443 CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
3444 auto &Callsite = *CallsiteIt;
3445 // Stop as soon as we see a non-synthesized callsite info (see comment
3446 // above loop). All the entries added for discovered tail calls have empty
3447 // stack ids.
3448 if (!Callsite.StackIdIndices.empty())
3449 break;
3450 MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
3451 }
3452
3453 // Assume for now that the instructions are in the exact same order
3454 // as when the summary was created, but confirm this is correct by
3455 // matching the stack ids.
3456 for (auto &BB : F) {
3457 for (auto &I : BB) {
3458 auto *CB = dyn_cast<CallBase>(&I);
3459 // Same handling as when creating module summary.
3460 if (!mayHaveMemprofSummary(CB))
3461 continue;
3462
3463 auto *CalledValue = CB->getCalledOperand();
3464 auto *CalledFunction = CB->getCalledFunction();
3465 if (CalledValue && !CalledFunction) {
3466 CalledValue = CalledValue->stripPointerCasts();
3467 // Stripping pointer casts can reveal a called function.
3468 CalledFunction = dyn_cast<Function>(CalledValue);
3469 }
3470 // Check if this is an alias to a function. If so, get the
3471 // called aliasee for the checks below.
3472 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
3473 assert(!CalledFunction &&
3474 "Expected null called function in callsite for alias");
3475 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
3476 }
3477
3478 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
3479 I.getMetadata(LLVMContext::MD_callsite));
3480 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
3481
3482 // Include allocs that were already assigned a memprof function
3483 // attribute in the statistics.
3484 if (CB->getAttributes().hasFnAttr("memprof")) {
3485 assert(!MemProfMD);
3486 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
3487 ? AllocTypeColdThinBackend++
3488 : AllocTypeNotColdThinBackend++;
3489 OrigAllocsThinBackend++;
3490 AllocVersionsThinBackend++;
3491 if (!MaxAllocVersionsThinBackend)
3492 MaxAllocVersionsThinBackend = 1;
3493 // Remove any remaining callsite metadata and we can skip the rest of
3494 // the handling for this instruction, since no cloning is needed.
3495 I.setMetadata(LLVMContext::MD_callsite, nullptr);
3496 continue;
3497 }
3498
3499 if (MemProfMD) {
3500 // Consult the next alloc node.
3501 assert(AI != FS->allocs().end());
3502 auto &AllocNode = *(AI++);
3503
3504 // Sanity check that the MIB stack ids match between the summary and
3505 // instruction metadata.
3506 auto MIBIter = AllocNode.MIBs.begin();
3507 for (auto &MDOp : MemProfMD->operands()) {
3508 assert(MIBIter != AllocNode.MIBs.end());
3509 LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter =
3510 MIBIter->StackIdIndices.begin();
3511 auto *MIBMD = cast<const MDNode>(MDOp);
3512 MDNode *StackMDNode = getMIBStackNode(MIBMD);
3513 assert(StackMDNode);
3514 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
3515 auto ContextIterBegin =
3516 StackContext.beginAfterSharedPrefix(CallsiteContext);
3517 // Skip the checking on the first iteration.
3518 uint64_t LastStackContextId =
3519 (ContextIterBegin != StackContext.end() &&
3520 *ContextIterBegin == 0)
3521 ? 1
3522 : 0;
3523 for (auto ContextIter = ContextIterBegin;
3524 ContextIter != StackContext.end(); ++ContextIter) {
3525 // If this is a direct recursion, simply skip the duplicate
3526 // entries, to be consistent with how the summary ids were
3527 // generated during ModuleSummaryAnalysis.
3528 if (LastStackContextId == *ContextIter)
3529 continue;
3530 LastStackContextId = *ContextIter;
3531 assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
3532 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
3533 *ContextIter);
3534 StackIdIndexIter++;
3535 }
3536 MIBIter++;
3537 }
3538
3539 // Perform cloning if not yet done.
3540 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());
3541
3542 OrigAllocsThinBackend++;
3543 AllocVersionsThinBackend += AllocNode.Versions.size();
3544 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
3545 MaxAllocVersionsThinBackend = AllocNode.Versions.size();
3546
3547 // If there is only one version that means we didn't end up
3548 // considering this function for cloning, and in that case the alloc
3549 // will still be none type or should have gotten the default NotCold.
3550 // Skip that after calling clone helper since that does some sanity
3551 // checks that confirm we haven't decided yet that we need cloning.
3552 if (AllocNode.Versions.size() == 1) {
3553 assert((AllocationType)AllocNode.Versions[0] ==
3554 AllocationType::NotCold ||
3555 (AllocationType)AllocNode.Versions[0] ==
3556 AllocationType::None);
3557 UnclonableAllocsThinBackend++;
3558 continue;
3559 }
3560
3561 // All versions should have a singular allocation type.
3562 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
3563 return Type == ((uint8_t)AllocationType::NotCold |
3564 (uint8_t)AllocationType::Cold);
3565 }));
3566
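// Illustrative example (hypothetical values) of the per-version update done
// by the loop below: if AllocNode.Versions == {NotCold, Cold}, then the
// allocation call in the original copy of the function gets the "memprof"
// attribute with the not-cold string and the call in function clone 1 gets
// the "cold" string; a version left as None receives no attribute.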
3567 // Update the allocation types per the summary info.
3568 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
3569 // Ignore any that didn't get an assigned allocation type.
3570 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
3571 continue;
3572 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
3573 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
3574 : AllocTypeNotColdThinBackend++;
3575 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
3576 auto A = llvm::Attribute::get(F.getContext(), "memprof",
3577 AllocTypeString);
3578 CallBase *CBClone;
3579 // Copy 0 is the original function.
3580 if (!J)
3581 CBClone = CB;
3582 else
3583 // Since VMaps are only created for new clones, we index with
3584 // clone J-1 (J==0 is the original clone and does not have a VMaps
3585 // entry).
3586 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
3587 CBClone->addFnAttr(A);
3588 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
3589 << ore::NV("AllocationCall", CBClone) << " in clone "
3590 << ore::NV("Caller", CBClone->getFunction())
3591 << " marked with memprof allocation attribute "
3592 << ore::NV("Attribute", AllocTypeString));
3593 }
3594 } else if (!CallsiteContext.empty()) {
3595 // Consult the next callsite node.
3596 assert(SI != FS->callsites().end());
3597 auto &StackNode = *(SI++);
3598
3599#ifndef NDEBUG
3600 // Sanity check that the stack ids match between the summary and
3601 // instruction metadata.
3602 auto StackIdIndexIter = StackNode.StackIdIndices.begin();
3603 for (auto StackId : CallsiteContext) {
3604 assert(StackIdIndexIter != StackNode.StackIdIndices.end());
3605 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
3606 StackId);
3607 StackIdIndexIter++;
3608 }
3609#endif
3610
3611 CloneCallsite(StackNode, CB, CalledFunction);
3612 } else if (CB->isTailCall()) {
3613 // Locate the synthesized callsite info for the callee VI, if any was
3614 // created, and use that for cloning.
3615 ValueInfo CalleeVI =
3616 findValueInfoForFunc(*CalledFunction, M, ImportSummary);
3617 if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
3618 auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
3619 assert(Callsite != MapTailCallCalleeVIToCallsite.end());
3620 CloneCallsite(Callsite->second, CB, CalledFunction);
3621 }
3622 }
3623 // Memprof and callsite metadata on memory allocations no longer needed.
3624 I.setMetadata(LLVMContext::MD_memprof, nullptr);
3625 I.setMetadata(LLVMContext::MD_callsite, nullptr);
3626 }
3627 }
3628 }
3629
3630 return Changed;
3631}
3632
3633template <typename DerivedCCG, typename FuncTy, typename CallTy>
3634bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
3635 if (DumpCCG) {
3636 dbgs() << "CCG before cloning:\n";
3637 dbgs() << *this;
3638 }
3639 if (ExportToDot)
3640 exportToDot("postbuild");
3641
3642 if (VerifyCCG) {
3643 check();
3644 }
3645
3646 identifyClones();
3647
3648 if (VerifyCCG) {
3649 check();
3650 }
3651
3652 if (DumpCCG) {
3653 dbgs() << "CCG after cloning:\n";
3654 dbgs() << *this;
3655 }
3656 if (ExportToDot)
3657 exportToDot("cloned");
3658
3659 bool Changed = assignFunctions();
3660
3661 if (DumpCCG) {
3662 dbgs() << "CCG after assigning function clones:\n";
3663 dbgs() << *this;
3664 }
3665 if (ExportToDot)
3666 exportToDot("clonefuncassign");
3667
3668 return Changed;
3669}
3670
3671bool MemProfContextDisambiguation::processModule(
3672 Module &M,
3673 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
3674
3675 // If we have an import summary, then the cloning decisions were made during
3676 // the thin link on the index. Apply them and return.
3677 if (ImportSummary)
3678 return applyImport(M);
3679
3680 // TODO: If/when other types of memprof cloning are enabled beyond just for
3681 // hot and cold, we will need to change this to individually control the
3682 // AllocationType passed to addStackNodesForMIB during CCG construction.
3683 // Note that we specifically check this after applying imports above, so that
3684 // the option isn't needed to be passed to distributed ThinLTO backend
3685 // clang processes, which won't necessarily have visibility into the linker
3686 // dependences. Instead the information is communicated from the LTO link to
3687 // the backends via the combined summary index.
3688 if (!SupportsHotColdNew)
3689 return false;
3690
3691 ModuleCallsiteContextGraph CCG(M, OREGetter);
3692 return CCG.process();
3693}
3694
3695 MemProfContextDisambiguation::MemProfContextDisambiguation(
3696 const ModuleSummaryIndex *Summary)
3697 : ImportSummary(Summary) {
3698 if (ImportSummary) {
3699 // The MemProfImportSummary should only be used for testing ThinLTO
3700 // distributed backend handling via opt, in which case we don't have a
3701 // summary from the pass pipeline.
3702 assert(MemProfImportSummary.empty());
3703 return;
3704 }
3705 if (MemProfImportSummary.empty())
3706 return;
3707
3708 auto ReadSummaryFile =
3709 errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
3710 if (!ReadSummaryFile) {
3711 logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
3712 "Error loading file '" + MemProfImportSummary +
3713 "': ");
3714 return;
3715 }
3716 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
3717 if (!ImportSummaryForTestingOrErr) {
3718 logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
3719 "Error parsing file '" + MemProfImportSummary +
3720 "': ");
3721 return;
3722 }
3723 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
3724 ImportSummary = ImportSummaryForTesting.get();
3725}
3726
3727 PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
3728 ModuleAnalysisManager &AM) {
3729 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
3730 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
3731 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
3732 };
3733 if (!processModule(M, OREGetter))
3734 return PreservedAnalyses::all();
3735 return PreservedAnalyses::none();
3736}
3737
3738 void MemProfContextDisambiguation::run(
3739 ModuleSummaryIndex &Index,
3740 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
3741 isPrevailing) {
3742 // TODO: If/when other types of memprof cloning are enabled beyond just for
3743 // hot and cold, we will need to change this to individually control the
3744 // AllocationType passed to addStackNodesForMIB during CCG construction.
3745 // The index was set from the option, so these should be in sync.
3746 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
3747 if (!SupportsHotColdNew)
3748 return;
3749
3750 IndexCallsiteContextGraph CCG(Index, isPrevailing);
3751 CCG.process();
3752}