MemProfContextDisambiguation.cpp
1//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements support for context disambiguation of allocation
10// calls for profile guided heap optimization. Specifically, it uses Memprof
11// profiles which indicate context specific allocation behavior (currently
12// distinguishing cold vs hot memory allocations). Cloning is performed to
13// expose the cold allocation call contexts, and the allocation calls are
14// subsequently annotated with an attribute for later transformation.
15//
16// The transformations can be performed either directly on IR (regular LTO), or
17// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18// Both types of LTO operate on the same base graph representation, which
19// uses CRTP to support either IR or Index formats.
20//
21//===----------------------------------------------------------------------===//
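//
// Illustrative sketch (not part of the pass; stack ids are made up): an
// allocation reached from both a cold and a not-cold calling context might
// carry memprof and callsite metadata along these lines:
//
//   %call = call ptr @_Znam(i64 10), !memprof !0, !callsite !5
//   !0 = !{!1, !3}
//   !1 = !{!2, !"notcold"}
//   !2 = !{i64 8632435727821051414, i64 -3421689549917153178}
//   !3 = !{!4, !"cold"}
//   !4 = !{i64 8632435727821051414, i64 -8211961776429864925}
//   !5 = !{i64 8632435727821051414}
//
// After cloning exposes the cold context, the allocation call reached only
// from that context is annotated with the allocation type (e.g. a
// "memprof"="cold" attribute) for the later transformation to a cold
// allocation interface.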
22
23#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/ADT/DenseSet.h"
26#include "llvm/ADT/MapVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
36#include "llvm/IR/Constants.h"
38#include "llvm/IR/Module.h"
40#include "llvm/Pass.h"
45#include "llvm/Transforms/IPO.h"
47#include <deque>
48#include <sstream>
49#include <unordered_map>
50#include <vector>
51using namespace llvm;
52using namespace llvm::memprof;
53
54#define DEBUG_TYPE "memprof-context-disambiguation"
55
56STATISTIC(FunctionClonesAnalysis,
57 "Number of function clones created during whole program analysis");
58STATISTIC(FunctionClonesThinBackend,
59 "Number of function clones created during ThinLTO backend");
60STATISTIC(FunctionsClonedThinBackend,
61 "Number of functions that had clones created during ThinLTO backend");
62STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
63 "cloned) during whole program analysis");
64STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
65 "during whole program analysis");
66STATISTIC(AllocTypeNotColdThinBackend,
67 "Number of not cold static allocations (possibly cloned) during "
68 "ThinLTO backend");
69STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
70 "(possibly cloned) during ThinLTO backend");
71STATISTIC(OrigAllocsThinBackend,
72 "Number of original (not cloned) allocations with memprof profiles "
73 "during ThinLTO backend");
74STATISTIC(
75 AllocVersionsThinBackend,
76 "Number of allocation versions (including clones) during ThinLTO backend");
77STATISTIC(MaxAllocVersionsThinBackend,
78 "Maximum number of allocation versions created for an original "
79 "allocation during ThinLTO backend");
80STATISTIC(UnclonableAllocsThinBackend,
81 "Number of unclonable ambiguous allocations during ThinLTO backend");
82STATISTIC(RemovedEdgesWithMismatchedCallees,
83 "Number of edges removed due to mismatched callees (profiled vs IR)");
84STATISTIC(FoundProfiledCalleeCount,
85 "Number of profiled callees found via tail calls");
86STATISTIC(FoundProfiledCalleeDepth,
87 "Aggregate depth of profiled callees found via tail calls");
88STATISTIC(FoundProfiledCalleeMaxDepth,
89 "Maximum depth of profiled callees found via tail calls");
90STATISTIC(FoundProfiledCalleeNonUniquelyCount,
91 "Number of profiled callees found via multiple tail call chains");
92
93static cl::opt<std::string> DotFilePathPrefix(
94 "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
95 cl::value_desc("filename"),
96 cl::desc("Specify the path prefix of the MemProf dot files."));
97
98static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
99 cl::Hidden,
100 cl::desc("Export graph to dot files."));
101
102static cl::opt<bool>
103 DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
104 cl::desc("Dump CallingContextGraph to stdout after each stage."));
105
106static cl::opt<bool>
107 VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
108 cl::desc("Perform verification checks on CallingContextGraph."));
109
110static cl::opt<bool>
111 VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
112 cl::desc("Perform frequent verification checks on nodes."));
113
114static cl::opt<std::string> MemProfImportSummary(
115 "memprof-import-summary",
116 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
117 cl::Hidden);
118
119static cl::opt<unsigned>
120 TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
121 cl::Hidden,
122 cl::desc("Max depth to recursively search for missing "
123 "frames through tail calls."));
124
125namespace llvm {
126cl::opt<bool> EnableMemProfContextDisambiguation(
127 "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
128 cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
129
130// Indicate we are linking with an allocator that supports hot/cold operator
131// new interfaces.
132cl::opt<bool> SupportsHotColdNew(
133 "supports-hot-cold-new", cl::init(false), cl::Hidden,
134 cl::desc("Linking with hot/cold operator new interfaces"));
135} // namespace llvm
136
137extern cl::opt<bool> MemProfReportHintedSizes;
138
139namespace {
140/// CRTP base for graphs built from either IR or ThinLTO summary index.
141///
142/// The graph represents the call contexts in all memprof metadata on allocation
143/// calls, with nodes for the allocations themselves, as well as for the calls
144/// in each context. The graph is initially built from the allocation memprof
145/// metadata (or summary) MIBs. It is then updated to match calls with callsite
146/// metadata onto the nodes, updating it to reflect any inlining performed on
147/// those calls.
148///
149/// Each MIB (representing an allocation's call context with allocation
150/// behavior) is assigned a unique context id during the graph build. The edges
151/// and nodes in the graph are decorated with the context ids they carry. This
152/// is used to correctly update the graph when cloning is performed so that we
153/// can uniquify the context for a single (possibly cloned) allocation.
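///
/// As an illustrative sketch: if an allocation is reached via the contexts
/// main -> A -> B -> alloc (cold) and main -> C -> B -> alloc (not cold),
/// the two contexts are assigned distinct ids (say 1 and 2), the node for B
/// carries both ids, and the edges from B to its callers A and C carry ids
/// {1} and {2} respectively. Cloning B for the cold context then only needs
/// to move id 1, and the edges carrying it, onto the clone.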
154template <typename DerivedCCG, typename FuncTy, typename CallTy>
155class CallsiteContextGraph {
156public:
157 CallsiteContextGraph() = default;
158 CallsiteContextGraph(const CallsiteContextGraph &) = default;
159 CallsiteContextGraph(CallsiteContextGraph &&) = default;
160
161 /// Main entry point to perform analysis and transformations on graph.
162 bool process();
163
164 /// Perform cloning on the graph necessary to uniquely identify the allocation
165 /// behavior of an allocation based on its context.
166 void identifyClones();
167
168 /// Assign callsite clones to functions, cloning functions as needed to
169 /// accommodate the combinations of their callsite clones reached by callers.
170 /// For regular LTO this clones functions and callsites in the IR, but for
171 /// ThinLTO the cloning decisions are noted in the summaries and later applied
172 /// in applyImport.
173 bool assignFunctions();
174
175 void dump() const;
176 void print(raw_ostream &OS) const;
177 void printTotalSizes(raw_ostream &OS) const;
178
179 friend raw_ostream &operator<<(raw_ostream &OS,
180 const CallsiteContextGraph &CCG) {
181 CCG.print(OS);
182 return OS;
183 }
184
185 friend struct GraphTraits<
186 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
187 friend struct DOTGraphTraits<
188 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
189
190 void exportToDot(std::string Label) const;
191
192 /// Represents a function clone via FuncTy pointer and clone number pair.
193 struct FuncInfo final
194 : public std::pair<FuncTy *, unsigned /*Clone number*/> {
195 using Base = std::pair<FuncTy *, unsigned>;
196 FuncInfo(const Base &B) : Base(B) {}
197 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
198 explicit operator bool() const { return this->first != nullptr; }
199 FuncTy *func() const { return this->first; }
200 unsigned cloneNo() const { return this->second; }
201 };
202
203 /// Represents a callsite clone via CallTy and clone number pair.
204 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
205 using Base = std::pair<CallTy, unsigned>;
206 CallInfo(const Base &B) : Base(B) {}
207 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
208 : Base(Call, CloneNo) {}
209 explicit operator bool() const { return (bool)this->first; }
210 CallTy call() const { return this->first; }
211 unsigned cloneNo() const { return this->second; }
212 void setCloneNo(unsigned N) { this->second = N; }
213 void print(raw_ostream &OS) const {
214 if (!operator bool()) {
215 assert(!cloneNo());
216 OS << "null Call";
217 return;
218 }
219 call()->print(OS);
220 OS << "\t(clone " << cloneNo() << ")";
221 }
222 void dump() const {
223 print(dbgs());
224 dbgs() << "\n";
225 }
226 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
227 Call.print(OS);
228 return OS;
229 }
230 };
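  // Illustrative note: clone number 0 always denotes the original function or
  // callsite, so for a function foo cloned once, (&foo, 0) is the original
  // FuncInfo and (<clone of foo>, 1) the clone; CallInfo likewise pairs a call
  // with the number of the clone it belongs to.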
231
232 struct ContextEdge;
233
234 /// Node in the Callsite Context Graph
235 struct ContextNode {
236 // Keep this for now since in the IR case where we have an Instruction* it
237 // is not as immediately discoverable. Used for printing richer information
238 // when dumping graph.
239 bool IsAllocation;
240
241 // Keeps track of when the Call was reset to null because there was
242 // recursion.
243 bool Recursive = false;
244
245 // The corresponding allocation or interior call.
246 CallInfo Call;
247
248 // For alloc nodes this is a unique id assigned when constructed, and for
249 // callsite stack nodes it is the original stack id when the node is
250 // constructed from the memprof MIB metadata on the alloc nodes. Note that
251 // this is only used when matching callsite metadata onto the stack nodes
252 // created when processing the allocation memprof MIBs, and for labeling
253 // nodes in the dot graph. Therefore we don't bother to assign a value for
254 // clones.
255 uint64_t OrigStackOrAllocId = 0;
256
257 // This will be formed by ORing together the AllocationType enum values
258 // for contexts including this node.
259 uint8_t AllocTypes = 0;
260
261 // Edges to all callees in the profiled call stacks.
262 // TODO: Should this be a map (from Callee node) for more efficient lookup?
263 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
264
265 // Edges to all callers in the profiled call stacks.
266 // TODO: Should this be a map (from Caller node) for more efficient lookup?
267 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
268
269 // Get the list of edges from which we can compute allocation information
270 // such as the context ids and allocation type of this node.
271 const std::vector<std::shared_ptr<ContextEdge>> *
272 getEdgesWithAllocInfo() const {
273 // If node has any callees, compute from those, otherwise compute from
274 // callers (i.e. if this is the leaf allocation node).
275 if (!CalleeEdges.empty())
276 return &CalleeEdges;
277 if (!CallerEdges.empty()) {
278 // A node with caller edges but no callee edges must be the allocation
279 // node.
280 assert(IsAllocation);
281 return &CallerEdges;
282 }
283 return nullptr;
284 }
285
286 // Compute the context ids for this node from the union of its edge context
287 // ids.
288 DenseSet<uint32_t> getContextIds() const {
289 DenseSet<uint32_t> ContextIds;
290 auto *Edges = getEdgesWithAllocInfo();
291 if (!Edges)
292 return {};
293 unsigned Count = 0;
294 for (auto &Edge : *Edges)
295 Count += Edge->getContextIds().size();
296 ContextIds.reserve(Count);
297 for (auto &Edge : *Edges)
298 ContextIds.insert(Edge->getContextIds().begin(),
299 Edge->getContextIds().end());
300 return ContextIds;
301 }
302
303 // Compute the allocation type for this node from the OR of its edge
304 // allocation types.
305 uint8_t computeAllocType() const {
306 auto *Edges = getEdgesWithAllocInfo();
307 if (!Edges)
308 return (uint8_t)AllocationType::None;
309 uint8_t BothTypes =
310 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
311 uint8_t AllocType = (uint8_t)AllocationType::None;
312 for (auto &Edge : *Edges) {
313 AllocType |= Edge->AllocTypes;
314 // Bail early if alloc type reached both, no further refinement.
315 if (AllocType == BothTypes)
316 return AllocType;
317 }
318 return AllocType;
319 }
320
321 // The context ids set for this node is empty if its edge context ids are
322 // also all empty.
323 bool emptyContextIds() const {
324 auto *Edges = getEdgesWithAllocInfo();
325 if (!Edges)
326 return true;
327 for (auto &Edge : *Edges) {
328 if (!Edge->getContextIds().empty())
329 return false;
330 }
331 return true;
332 }
333
334 // List of clones of this ContextNode, initially empty.
335 std::vector<ContextNode *> Clones;
336
337 // If a clone, points to the original uncloned node.
338 ContextNode *CloneOf = nullptr;
339
340 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
341
342 ContextNode(bool IsAllocation, CallInfo C)
343 : IsAllocation(IsAllocation), Call(C) {}
344
345 void addClone(ContextNode *Clone) {
346 if (CloneOf) {
347 CloneOf->Clones.push_back(Clone);
348 Clone->CloneOf = CloneOf;
349 } else {
350 Clones.push_back(Clone);
351 assert(!Clone->CloneOf);
352 Clone->CloneOf = this;
353 }
354 }
355
356 ContextNode *getOrigNode() {
357 if (!CloneOf)
358 return this;
359 return CloneOf;
360 }
361
362 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
363 unsigned int ContextId);
364
365 ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
366 ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
367 void eraseCalleeEdge(const ContextEdge *Edge);
368 void eraseCallerEdge(const ContextEdge *Edge);
369
370 void setCall(CallInfo C) { Call = C; }
371
372 bool hasCall() const { return (bool)Call.call(); }
373
374 void printCall(raw_ostream &OS) const { Call.print(OS); }
375
376 // True if this node was effectively removed from the graph, in which case
377 // it should have an allocation type of None and empty context ids.
378 bool isRemoved() const {
379 assert((AllocTypes == (uint8_t)AllocationType::None) ==
380 emptyContextIds());
381 return AllocTypes == (uint8_t)AllocationType::None;
382 }
383
384 void dump() const;
385 void print(raw_ostream &OS) const;
386
387 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
388 Node.print(OS);
389 return OS;
390 }
391 };
392
393 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
394 /// callee.
395 struct ContextEdge {
396 ContextNode *Callee;
397 ContextNode *Caller;
398
399 // This will be formed by ORing together the AllocationType enum values
400 // for contexts including this edge.
401 uint8_t AllocTypes = 0;
402
403 // The set of IDs for contexts including this edge.
404 DenseSet<uint32_t> ContextIds;
405
406 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
407 DenseSet<uint32_t> ContextIds)
408 : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
409 ContextIds(std::move(ContextIds)) {}
410
411 DenseSet<uint32_t> &getContextIds() { return ContextIds; }
412
413 void dump() const;
414 void print(raw_ostream &OS) const;
415
416 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
417 Edge.print(OS);
418 return OS;
419 }
420 };
421
422 /// Helpers to remove callee edges that have allocation type None (due to not
423 /// carrying any context ids) after transformations.
424 void removeNoneTypeCalleeEdges(ContextNode *Node);
425 void
426 recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
427 DenseSet<const ContextNode *> &Visited);
428
429protected:
430 /// Get a list of nodes corresponding to the stack ids in the given callsite
431 /// context.
432 template <class NodeT, class IteratorT>
433 std::vector<uint64_t>
434 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
435
436 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
437 /// metadata (or summary).
438 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
439
440 /// Adds nodes for the given MIB stack ids.
441 template <class NodeT, class IteratorT>
442 void addStackNodesForMIB(ContextNode *AllocNode,
443 CallStack<NodeT, IteratorT> &StackContext,
444 CallStack<NodeT, IteratorT> &CallsiteContext,
445 AllocationType AllocType, uint64_t TotalSize);
446
447 /// Matches all callsite metadata (or summary) to the nodes created for
448 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
449 /// inlining performed on those callsite instructions.
450 void updateStackNodes();
451
452 /// Update graph to conservatively handle any callsite stack nodes that target
453 /// multiple different callee target functions.
454 void handleCallsitesWithMultipleTargets();
455
456 /// Save lists of calls with MemProf metadata in each function, for faster
457 /// iteration.
458 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
459
460 /// Map from callsite node to the enclosing caller function.
461 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
462
463private:
464 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
465
466 using CallContextInfo = std::tuple<CallTy, std::vector<uint64_t>,
467 const FuncTy *, DenseSet<uint32_t>>;
468
469 /// Assigns the given Node to calls at or inlined into the location with
470 /// the Node's stack id, after post order traversing and processing its
471 /// caller nodes. Uses the call information recorded in the given
472 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
473 /// as needed. Called by updateStackNodes which sets up the given
474 /// StackIdToMatchingCalls map.
475 void assignStackNodesPostOrder(
476 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
477 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls);
478
479 /// Duplicates the given set of context ids, updating the provided
480 /// map from each original id with the newly generated context ids,
481 /// and returning the new duplicated id set.
482 DenseSet<uint32_t> duplicateContextIds(
483 const DenseSet<uint32_t> &StackSequenceContextIds,
484 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
485
486 /// Propagates all duplicated context ids across the graph.
487 void propagateDuplicateContextIds(
488 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
489
490 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
491 /// else to its callers. Also updates OrigNode's edges to remove any context
492 /// ids moved to the newly created edge.
493 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
494 bool TowardsCallee,
495 DenseSet<uint32_t> RemainingContextIds);
496
497 /// Get the stack id corresponding to the given Id or Index (for IR this will
498 /// return itself, for a summary index this will return the id recorded in the
499 /// index for that stack id index value).
500 uint64_t getStackId(uint64_t IdOrIndex) const {
501 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
502 }
503
504 /// Returns true if the given call targets the callee of the given edge, or if
505 /// we were able to identify the call chain through intermediate tail calls.
506 /// In the latter case new context nodes are added to the graph for the
507 /// identified tail calls, and their synthesized nodes are added to
508 /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
509 /// the updated edges and to prepare it for an increment in the caller.
510 bool
511 calleesMatch(CallTy Call, EdgeIter &EI,
512 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
513
514 /// Returns true if the given call targets the given function, or if we were
515 /// able to identify the call chain through intermediate tail calls (in which
516 /// case FoundCalleeChain will be populated).
517 bool calleeMatchesFunc(
518 CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
519 std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
520 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
521 Call, Func, CallerFunc, FoundCalleeChain);
522 }
523
524 /// Get a list of nodes corresponding to the stack ids in the given
525 /// callsite's context.
526 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
527 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
528 Call);
529 }
530
531 /// Get the last stack id in the context for the callsite.
532 uint64_t getLastStackId(CallTy Call) {
533 return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
534 }
535
536 /// Update the allocation call to record type of allocated memory.
537 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
538 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
539 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
540 }
541
542 /// Update non-allocation call to invoke (possibly cloned) function
543 /// CalleeFunc.
544 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
545 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
546 }
547
548 /// Clone the given function for the given callsite, recording the mapping of
549 /// all of the function's tracked calls to their new versions in the CallMap.
550 /// Assigns new clones to clone number CloneNo.
551 FuncInfo cloneFunctionForCallsite(
552 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
553 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
554 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
555 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
556 }
557
558 /// Gets a label to use in the dot graph for the given call clone in the given
559 /// function.
560 std::string getLabel(const FuncTy *Func, const CallTy Call,
561 unsigned CloneNo) const {
562 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
563 }
564
565 /// Helpers to find the node corresponding to the given call or stackid.
566 ContextNode *getNodeForInst(const CallInfo &C);
567 ContextNode *getNodeForAlloc(const CallInfo &C);
568 ContextNode *getNodeForStackId(uint64_t StackId);
569
570 /// Computes the alloc type corresponding to the given context ids, by
571 /// unioning their recorded alloc types.
572 uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds);
573
574 /// Returns the allocation type of the intersection of the contexts of two
575 /// nodes (based on their provided context id sets), optimized for the case
576 /// when Node1Ids is smaller than Node2Ids.
577 uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
578 const DenseSet<uint32_t> &Node2Ids);
579
580 /// Returns the allocation type of the intersection of the contexts of two
581 /// nodes (based on their provided context id sets).
582 uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
583 const DenseSet<uint32_t> &Node2Ids);
584
585 /// Create a clone of Edge's callee and move Edge to that new callee node,
586 /// performing the necessary context id and allocation type updates.
587 /// If callee's caller edge iterator is supplied, it is updated when removing
588 /// the edge from that list. If ContextIdsToMove is non-empty, only that
589 /// subset of Edge's ids are moved to an edge to the new callee.
590 ContextNode *
591 moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
592 EdgeIter *CallerEdgeI = nullptr,
593 DenseSet<uint32_t> ContextIdsToMove = {});
594
595 /// Change the callee of Edge to existing callee clone NewCallee, performing
596 /// the necessary context id and allocation type updates.
597 /// If callee's caller edge iterator is supplied, it is updated when removing
598 /// the edge from that list. If ContextIdsToMove is non-empty, only that
599 /// subset of Edge's ids are moved to an edge to the new callee.
600 void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
601 ContextNode *NewCallee,
602 EdgeIter *CallerEdgeI = nullptr,
603 bool NewClone = false,
604 DenseSet<uint32_t> ContextIdsToMove = {});
605
606 /// Recursively perform cloning on the graph for the given Node and its
607 /// callers, in order to uniquely identify the allocation behavior of an
608 /// allocation given its context. The context ids of the allocation being
609 /// processed are given in AllocContextIds.
610 void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
611 const DenseSet<uint32_t> &AllocContextIds);
612
613 /// Map from each context ID to the AllocationType assigned to that context.
614 DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
615
616 /// Map from each contextID to the profiled aggregate allocation size,
617 /// optionally populated when requested (via MemProfReportHintedSizes).
618 DenseMap<uint32_t, uint64_t> ContextIdToTotalSize;
619
620 /// Identifies the context node created for a stack id when adding the MIB
621 /// contexts to the graph. This is used to locate the context nodes when
622 /// trying to assign the corresponding callsites with those stack ids to these
623 /// nodes.
624 DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
625
626 /// Maps to track the calls to their corresponding nodes in the graph.
627 MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
628 MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
629
630 /// Owner of all ContextNode unique_ptrs.
631 std::vector<std::unique_ptr<ContextNode>> NodeOwner;
632
633 /// Perform sanity checks on graph when requested.
634 void check() const;
635
636 /// Keeps track of the last unique context id assigned.
637 unsigned int LastContextId = 0;
638};
639
640template <typename DerivedCCG, typename FuncTy, typename CallTy>
641using ContextNode =
642 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
643template <typename DerivedCCG, typename FuncTy, typename CallTy>
644using ContextEdge =
645 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
646template <typename DerivedCCG, typename FuncTy, typename CallTy>
647using FuncInfo =
648 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
649template <typename DerivedCCG, typename FuncTy, typename CallTy>
650using CallInfo =
651 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
652
653/// CRTP derived class for graphs built from IR (regular LTO).
654class ModuleCallsiteContextGraph
655 : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
656 Instruction *> {
657public:
658 ModuleCallsiteContextGraph(
659 Module &M,
661
662private:
663 friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
664 Instruction *>;
665
666 uint64_t getStackId(uint64_t IdOrIndex) const;
667 bool calleeMatchesFunc(
668 Instruction *Call, const Function *Func, const Function *CallerFunc,
669 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
670 bool findProfiledCalleeThroughTailCalls(
671 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
672 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
673 bool &FoundMultipleCalleeChains);
674 uint64_t getLastStackId(Instruction *Call);
675 std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
676 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
677 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
678 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
679 Instruction *>::FuncInfo
680 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
681 std::map<CallInfo, CallInfo> &CallMap,
682 std::vector<CallInfo> &CallsWithMetadataInFunc,
683 unsigned CloneNo);
684 std::string getLabel(const Function *Func, const Instruction *Call,
685 unsigned CloneNo) const;
686
687 const Module &Mod;
688 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
689};
690
691/// Represents a call in the summary index graph, which can either be an
692/// allocation or an interior callsite node in an allocation's context.
693/// Holds a pointer to the corresponding data structure in the index.
694struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
695 IndexCall() : PointerUnion() {}
696 IndexCall(std::nullptr_t) : IndexCall() {}
697 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
698 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
699 IndexCall(PointerUnion PT) : PointerUnion(PT) {}
700
701 IndexCall *operator->() { return this; }
702
703 PointerUnion<CallsiteInfo *, AllocInfo *> getBase() const { return *this; }
704
705 void print(raw_ostream &OS) const {
706 if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(getBase())) {
707 OS << *AI;
708 } else {
709 auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(getBase());
710 assert(CI);
711 OS << *CI;
712 }
713 }
714};
715
716/// CRTP derived class for graphs built from summary index (ThinLTO).
717class IndexCallsiteContextGraph
718 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
719 IndexCall> {
720public:
721 IndexCallsiteContextGraph(
722 ModuleSummaryIndex &Index,
723 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
724 isPrevailing);
725
726 ~IndexCallsiteContextGraph() {
727 // Now that we are done with the graph it is safe to add the new
728 // CallsiteInfo structs to the function summary vectors. The graph nodes
729 // point into locations within these vectors, so we don't want to add them
730 // any earlier.
731 for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
732 auto *FS = I.first;
733 for (auto &Callsite : I.second)
734 FS->addCallsite(*Callsite.second);
735 }
736 }
737
738private:
739 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
740 IndexCall>;
741
742 uint64_t getStackId(uint64_t IdOrIndex) const;
743 bool calleeMatchesFunc(
744 IndexCall &Call, const FunctionSummary *Func,
745 const FunctionSummary *CallerFunc,
746 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
747 bool findProfiledCalleeThroughTailCalls(
748 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
749 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
750 bool &FoundMultipleCalleeChains);
751 uint64_t getLastStackId(IndexCall &Call);
752 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
753 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
754 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
755 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
756 IndexCall>::FuncInfo
757 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
758 std::map<CallInfo, CallInfo> &CallMap,
759 std::vector<CallInfo> &CallsWithMetadataInFunc,
760 unsigned CloneNo);
761 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
762 unsigned CloneNo) const;
763
764 // Saves the mapping from each function summary containing memprof records
765 // back to its VI, for use in checking and debugging.
766 std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
767
768 const ModuleSummaryIndex &Index;
769 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
770 isPrevailing;
771
772 // Saves/owns the callsite info structures synthesized for missing tail call
773 // frames that we discover while building the graph.
774 // It maps from the summary of the function making the tail call, to a map
775 // of callee ValueInfo to corresponding synthesized callsite info.
776 std::unordered_map<FunctionSummary *,
777 std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
778 FunctionCalleesToSynthesizedCallsiteInfos;
779};
780} // namespace
781
782namespace llvm {
783template <>
784struct DenseMapInfo<typename CallsiteContextGraph<
785 ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
786 : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
787template <>
788struct DenseMapInfo<typename CallsiteContextGraph<
789 IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
790 : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
791template <>
792struct DenseMapInfo<IndexCall>
793 : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
794} // end namespace llvm
795
796namespace {
797
798struct FieldSeparator {
799 bool Skip = true;
800 const char *Sep;
801
802 FieldSeparator(const char *Sep = ", ") : Sep(Sep) {}
803};
804
805raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) {
806 if (FS.Skip) {
807 FS.Skip = false;
808 return OS;
809 }
810 return OS << FS.Sep;
811}
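// Illustrative use of FieldSeparator: the first insertion prints nothing and
// later ones print the separator, e.g.
//   FieldSeparator FS;
//   for (auto &Item : Items)
//     OS << FS << Item; // yields "a, b, c" rather than ", a, b, c"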
812
813// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
814// type we should actually use on the corresponding allocation.
815// If we can't clone a node that has NotCold+Cold alloc type, we will fall
816// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
817// from NotCold.
818AllocationType allocTypeToUse(uint8_t AllocTypes) {
819 assert(AllocTypes != (uint8_t)AllocationType::None);
820 if (AllocTypes ==
821 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
822 return AllocationType::NotCold;
823 else
824 return (AllocationType)AllocTypes;
825}
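// For example, allocTypeToUse((uint8_t)AllocationType::NotCold |
// (uint8_t)AllocationType::Cold) yields AllocationType::NotCold, while a pure
// Cold or NotCold input is returned unchanged.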
826
827// Helper to check if the alloc types for all edges recorded in the
828// InAllocTypes vector match the alloc types for all edges in the Edges
829// vector.
830template <typename DerivedCCG, typename FuncTy, typename CallTy>
831bool allocTypesMatch(
832 const std::vector<uint8_t> &InAllocTypes,
833 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
834 &Edges) {
835 return std::equal(
836 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(),
837 [](const uint8_t &l,
838 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
839 // Can share if one of the edges is None type - don't
840 // care about the type along that edge as it doesn't
841 // exist for those context ids.
842 if (l == (uint8_t)AllocationType::None ||
843 r->AllocTypes == (uint8_t)AllocationType::None)
844 return true;
845 return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
846 });
847}
848
849} // end anonymous namespace
850
851template <typename DerivedCCG, typename FuncTy, typename CallTy>
852typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
853CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
854 const CallInfo &C) {
855 ContextNode *Node = getNodeForAlloc(C);
856 if (Node)
857 return Node;
858
859 return NonAllocationCallToContextNodeMap.lookup(C);
860}
861
862template <typename DerivedCCG, typename FuncTy, typename CallTy>
863typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
864CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
865 const CallInfo &C) {
866 return AllocationCallToContextNodeMap.lookup(C);
867}
868
869template <typename DerivedCCG, typename FuncTy, typename CallTy>
870typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
871CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
872 uint64_t StackId) {
873 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
874 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
875 return StackEntryNode->second;
876 return nullptr;
877}
878
879template <typename DerivedCCG, typename FuncTy, typename CallTy>
880void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
881 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
882 unsigned int ContextId) {
883 for (auto &Edge : CallerEdges) {
884 if (Edge->Caller == Caller) {
885 Edge->AllocTypes |= (uint8_t)AllocType;
886 Edge->getContextIds().insert(ContextId);
887 return;
888 }
889 }
890 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
891 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
892 CallerEdges.push_back(Edge);
893 Caller->CalleeEdges.push_back(Edge);
894}
895
896template <typename DerivedCCG, typename FuncTy, typename CallTy>
897void CallsiteContextGraph<
898 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
899 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
900 auto Edge = *EI;
901 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
902 assert(Edge->ContextIds.empty());
903 Edge->Callee->eraseCallerEdge(Edge.get());
904 EI = Node->CalleeEdges.erase(EI);
905 } else
906 ++EI;
907 }
908}
909
910template <typename DerivedCCG, typename FuncTy, typename CallTy>
911typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
912CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
913 findEdgeFromCallee(const ContextNode *Callee) {
914 for (const auto &Edge : CalleeEdges)
915 if (Edge->Callee == Callee)
916 return Edge.get();
917 return nullptr;
918}
919
920template <typename DerivedCCG, typename FuncTy, typename CallTy>
921typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
922CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
923 findEdgeFromCaller(const ContextNode *Caller) {
924 for (const auto &Edge : CallerEdges)
925 if (Edge->Caller == Caller)
926 return Edge.get();
927 return nullptr;
928}
929
930template <typename DerivedCCG, typename FuncTy, typename CallTy>
931void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
932 eraseCalleeEdge(const ContextEdge *Edge) {
933 auto EI = llvm::find_if(
934 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
935 return CalleeEdge.get() == Edge;
936 });
937 assert(EI != CalleeEdges.end());
938 CalleeEdges.erase(EI);
939}
940
941template <typename DerivedCCG, typename FuncTy, typename CallTy>
942void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
943 eraseCallerEdge(const ContextEdge *Edge) {
944 auto EI = llvm::find_if(
945 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
946 return CallerEdge.get() == Edge;
947 });
948 assert(EI != CallerEdges.end());
949 CallerEdges.erase(EI);
950}
951
952template <typename DerivedCCG, typename FuncTy, typename CallTy>
953uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
954 DenseSet<uint32_t> &ContextIds) {
955 uint8_t BothTypes =
956 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
957 uint8_t AllocType = (uint8_t)AllocationType::None;
958 for (auto Id : ContextIds) {
959 AllocType |= (uint8_t)ContextIdToAllocationType[Id];
960 // Bail early if alloc type reached both, no further refinement.
961 if (AllocType == BothTypes)
962 return AllocType;
963 }
964 return AllocType;
965}
966
967template <typename DerivedCCG, typename FuncTy, typename CallTy>
968uint8_t
969CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
970 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
971 uint8_t BothTypes =
972 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
973 uint8_t AllocType = (uint8_t)AllocationType::None;
974 for (auto Id : Node1Ids) {
975 if (!Node2Ids.count(Id))
976 continue;
977 AllocType |= (uint8_t)ContextIdToAllocationType[Id];
978 // Bail early if alloc type reached both, no further refinement.
979 if (AllocType == BothTypes)
980 return AllocType;
981 }
982 return AllocType;
983}
984
985template <typename DerivedCCG, typename FuncTy, typename CallTy>
986uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
987 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
988 if (Node1Ids.size() < Node2Ids.size())
989 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
990 else
991 return intersectAllocTypesImpl(Node2Ids, Node1Ids);
992}
993
994template <typename DerivedCCG, typename FuncTy, typename CallTy>
995typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
996CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
997 CallInfo Call, const FuncTy *F) {
998 assert(!getNodeForAlloc(Call));
999 NodeOwner.push_back(
1000 std::make_unique<ContextNode>(/*IsAllocation=*/true, Call));
1001 ContextNode *AllocNode = NodeOwner.back().get();
1002 AllocationCallToContextNodeMap[Call] = AllocNode;
1003 NodeToCallingFunc[AllocNode] = F;
1004 // Use LastContextId as a unique id for MIB allocation nodes.
1005 AllocNode->OrigStackOrAllocId = LastContextId;
1006 // Alloc type should be updated as we add in the MIBs. We should assert
1007 // afterwards that it is not still None.
1008 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1009
1010 return AllocNode;
1011}
1012
1013static std::string getAllocTypeString(uint8_t AllocTypes) {
1014 if (!AllocTypes)
1015 return "None";
1016 std::string Str;
1017 if (AllocTypes & (uint8_t)AllocationType::NotCold)
1018 Str += "NotCold";
1019 if (AllocTypes & (uint8_t)AllocationType::Cold)
1020 Str += "Cold";
1021 return Str;
1022}
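// E.g. getAllocTypeString((uint8_t)AllocationType::NotCold |
// (uint8_t)AllocationType::Cold) returns "NotColdCold".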
1023
1024template <typename DerivedCCG, typename FuncTy, typename CallTy>
1025template <class NodeT, class IteratorT>
1026void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
1027 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
1028 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
1029 uint64_t TotalSize) {
1030 assert(!MemProfReportHintedSizes || TotalSize > 0);
1031 // Treat the hot alloc type as NotCold until the disambiguation for "hot"
1032 // is done.
1033 if (AllocType == AllocationType::Hot)
1034 AllocType = AllocationType::NotCold;
1035
1036 ContextIdToAllocationType[++LastContextId] = AllocType;
1037
1038 if (MemProfReportHintedSizes) {
1039 assert(TotalSize);
1040 ContextIdToTotalSize[LastContextId] = TotalSize;
1041 }
1042
1043 // Update alloc type and context ids for this MIB.
1044 AllocNode->AllocTypes |= (uint8_t)AllocType;
1045
1046 // Now add or update nodes for each stack id in alloc's context.
1047 // Later when processing the stack ids on non-alloc callsites we will adjust
1048 // for any inlining in the context.
1049 ContextNode *PrevNode = AllocNode;
1050 // Look for recursion (direct recursion should have been collapsed by
1051 // module summary analysis; here we should just be detecting mutual
1052 // recursion). Mark these nodes so we don't try to clone.
1053 SmallSet<uint64_t, 8> StackIdSet;
1054 // Skip any on the allocation call (inlining).
1055 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
1056 ContextIter != StackContext.end(); ++ContextIter) {
1057 auto StackId = getStackId(*ContextIter);
1058 ContextNode *StackNode = getNodeForStackId(StackId);
1059 if (!StackNode) {
1060 NodeOwner.push_back(
1061 std::make_unique<ContextNode>(/*IsAllocation=*/false));
1062 StackNode = NodeOwner.back().get();
1063 StackEntryIdToContextNodeMap[StackId] = StackNode;
1064 StackNode->OrigStackOrAllocId = StackId;
1065 }
1066 auto Ins = StackIdSet.insert(StackId);
1067 if (!Ins.second)
1068 StackNode->Recursive = true;
1069 StackNode->AllocTypes |= (uint8_t)AllocType;
1070 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1071 PrevNode = StackNode;
1072 }
1073}
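// Illustrative sketch of the above: for an MIB whose context (beyond any
// frames shared with the allocation's callsite metadata) is the stack id
// sequence [S1, S2], with S1 closest to the allocation, a new context id is
// assigned, stack nodes are created (or reused) for S1 and S2, and caller
// edges AllocNode -> S1 and S1 -> S2 are recorded, each decorated with that
// context id and the MIB's allocation type.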
1074
1075template <typename DerivedCCG, typename FuncTy, typename CallTy>
1076DenseSet<uint32_t>
1077CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1078 const DenseSet<uint32_t> &StackSequenceContextIds,
1079 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1080 DenseSet<uint32_t> NewContextIds;
1081 for (auto OldId : StackSequenceContextIds) {
1082 NewContextIds.insert(++LastContextId);
1083 OldToNewContextIds[OldId].insert(LastContextId);
1084 assert(ContextIdToAllocationType.count(OldId));
1085 // The new context has the same allocation type as the original.
1086 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1087 // For now set this to 0 so we don't duplicate sizes. Not clear how to divvy
1088 // up the size. Assume that if we are able to duplicate context ids, we
1089 // will be able to disambiguate all copies.
1090 ContextIdToTotalSize[LastContextId] = 0;
1091 }
1092 return NewContextIds;
1093}
1094
1095template <typename DerivedCCG, typename FuncTy, typename CallTy>
1096void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1097 propagateDuplicateContextIds(
1098 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1099 // Build a set of duplicated context ids corresponding to the input id set.
1100 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1101 DenseSet<uint32_t> NewIds;
1102 for (auto Id : ContextIds)
1103 if (auto NewId = OldToNewContextIds.find(Id);
1104 NewId != OldToNewContextIds.end())
1105 NewIds.insert(NewId->second.begin(), NewId->second.end());
1106 return NewIds;
1107 };
1108
1109 // Recursively update context ids sets along caller edges.
1110 auto UpdateCallers = [&](ContextNode *Node,
1111 DenseSet<const ContextEdge *> &Visited,
1112 auto &&UpdateCallers) -> void {
1113 for (const auto &Edge : Node->CallerEdges) {
1114 auto Inserted = Visited.insert(Edge.get());
1115 if (!Inserted.second)
1116 continue;
1117 ContextNode *NextNode = Edge->Caller;
1118 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1119 // Only need to recursively iterate to NextNode via this caller edge if
1120 // it resulted in any added ids to NextNode.
1121 if (!NewIdsToAdd.empty()) {
1122 Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
1123 UpdateCallers(NextNode, Visited, UpdateCallers);
1124 }
1125 }
1126 };
1127
1128 DenseSet<const ContextEdge *> Visited;
1129 for (auto &Entry : AllocationCallToContextNodeMap) {
1130 auto *Node = Entry.second;
1131 UpdateCallers(Node, Visited, UpdateCallers);
1132 }
1133}
1134
1135template <typename DerivedCCG, typename FuncTy, typename CallTy>
1136void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1137 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
1138 // This must be passed by value to make a copy since it will be adjusted
1139 // as ids are moved.
1140 DenseSet<uint32_t> RemainingContextIds) {
1141 auto &OrigEdges =
1142 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1143 // Increment iterator in loop so that we can remove edges as needed.
1144 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1145 auto Edge = *EI;
1146 // Remove any matching context ids from Edge and return the set that were
1147 // found and removed; these become the new edge's context ids. Also update
1148 // the remaining (not found) ids.
1149 DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds;
1150 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1151 NotFoundContextIds);
1152 RemainingContextIds.swap(NotFoundContextIds);
1153 // If no matching context ids for this edge, skip it.
1154 if (NewEdgeContextIds.empty()) {
1155 ++EI;
1156 continue;
1157 }
1158 if (TowardsCallee) {
1159 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1160 auto NewEdge = std::make_shared<ContextEdge>(
1161 Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
1162 NewNode->CalleeEdges.push_back(NewEdge);
1163 NewEdge->Callee->CallerEdges.push_back(NewEdge);
1164 } else {
1165 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1166 auto NewEdge = std::make_shared<ContextEdge>(
1167 NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
1168 NewNode->CallerEdges.push_back(NewEdge);
1169 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1170 }
1171 // Remove old edge if context ids empty.
1172 if (Edge->getContextIds().empty()) {
1173 if (TowardsCallee) {
1174 Edge->Callee->eraseCallerEdge(Edge.get());
1175 EI = OrigNode->CalleeEdges.erase(EI);
1176 } else {
1177 Edge->Caller->eraseCalleeEdge(Edge.get());
1178 EI = OrigNode->CallerEdges.erase(EI);
1179 }
1180 continue;
1181 }
1182 ++EI;
1183 }
1184}
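// Illustrative sketch of connectNewNode: if OrigNode has a callee edge
// carrying context ids {1, 2, 3} and RemainingContextIds is {2}, a new edge
// from NewNode to that callee is created carrying {2}, ids {1, 3} stay on
// OrigNode's edge, and the old edge is erased entirely if its id set would
// become empty.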
1185
1186template <typename DerivedCCG, typename FuncTy, typename CallTy>
1187static void checkEdge(
1188 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1189 // Confirm that alloc type is not None and that we have at least one context
1190 // id.
1191 assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1192 assert(!Edge->ContextIds.empty());
1193}
1194
1195template <typename DerivedCCG, typename FuncTy, typename CallTy>
1196static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
1197 bool CheckEdges = true) {
1198 if (Node->isRemoved())
1199 return;
1200#ifndef NDEBUG
1201 // Compute node's context ids once for use in asserts.
1202 auto NodeContextIds = Node->getContextIds();
1203#endif
1204 // Node's context ids should be the union of both its callee and caller edge
1205 // context ids.
1206 if (Node->CallerEdges.size()) {
1207 DenseSet<uint32_t> CallerEdgeContextIds(
1208 Node->CallerEdges.front()->ContextIds);
1209 for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
1210 if (CheckEdges)
1211 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1212 set_union(CallerEdgeContextIds, Edge->ContextIds);
1213 }
1214 // Node can have more context ids than callers if some contexts terminate at
1215 // node and some are longer.
1216 assert(NodeContextIds == CallerEdgeContextIds ||
1217 set_is_subset(CallerEdgeContextIds, NodeContextIds));
1218 }
1219 if (Node->CalleeEdges.size()) {
1220 DenseSet<uint32_t> CalleeEdgeContextIds(
1221 Node->CalleeEdges.front()->ContextIds);
1222 for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
1223 if (CheckEdges)
1224 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1225 set_union(CalleeEdgeContextIds, Edge->getContextIds());
1226 }
1227 assert(NodeContextIds == CalleeEdgeContextIds);
1228 }
1229}
1230
1231template <typename DerivedCCG, typename FuncTy, typename CallTy>
1232void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1233 assignStackNodesPostOrder(ContextNode *Node,
1234 DenseSet<const ContextNode *> &Visited,
1235 DenseMap<uint64_t, std::vector<CallContextInfo>>
1236 &StackIdToMatchingCalls) {
1237 auto Inserted = Visited.insert(Node);
1238 if (!Inserted.second)
1239 return;
1240 // Post order traversal. Iterate over a copy since we may add nodes and
1241 // therefore new callers during the recursive call, invalidating any
1242 // iterator over the original edge vector. We don't need to process these
1243 // new nodes as they were already processed on creation.
1244 auto CallerEdges = Node->CallerEdges;
1245 for (auto &Edge : CallerEdges) {
1246 // Skip any that have been removed during the recursion.
1247 if (!Edge)
1248 continue;
1249 assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls);
1250 }
1251
1252 // If this node's stack id is in the map, update the graph to contain new
1253 // nodes representing any inlining at interior callsites. Note we move the
1254 // associated context ids over to the new nodes.
1255
1256 // Ignore this node if it is for an allocation or we didn't record any
1257 // stack id lists ending at it.
1258 if (Node->IsAllocation ||
1259 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1260 return;
1261
1262 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1263 // Handle the simple case first. A single call with a single stack id.
1264 // In this case there is no need to create any new context nodes, simply
1265 // assign the context node for stack id to this Call.
1266 if (Calls.size() == 1) {
1267 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1268 if (Ids.size() == 1) {
1269 assert(SavedContextIds.empty());
1270 // It should be this Node
1271 assert(Node == getNodeForStackId(Ids[0]));
1272 if (Node->Recursive)
1273 return;
1274 Node->setCall(Call);
1275 NonAllocationCallToContextNodeMap[Call] = Node;
1276 NodeToCallingFunc[Node] = Func;
1277 return;
1278 }
1279 }
1280
1281 // Find the node for the last stack id, which should be the same
1282 // across all calls recorded for this id, and is this node's id.
1283 uint64_t LastId = Node->OrigStackOrAllocId;
1284 ContextNode *LastNode = getNodeForStackId(LastId);
1285 // We should only have kept stack ids that had nodes.
1286 assert(LastNode);
1287
1288 for (unsigned I = 0; I < Calls.size(); I++) {
1289 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1290 // Skip any for which we didn't assign any ids, these don't get a node in
1291 // the graph.
1292 if (SavedContextIds.empty())
1293 continue;
1294
1295 assert(LastId == Ids.back());
1296
1297 ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1298 assert(FirstNode);
1299
1300 // Recompute the context ids for this stack id sequence (the
1301 // intersection of the context ids of the corresponding nodes).
1302 // Start with the ids we saved in the map for this call, which could be
1303 // duplicated context ids. We have to recompute as we might have overlap
1304 // between the saved context ids for different last nodes, and
1305 // removed them already during the post order traversal.
1306 set_intersect(SavedContextIds, FirstNode->getContextIds());
1307 ContextNode *PrevNode = nullptr;
1308 for (auto Id : Ids) {
1309 ContextNode *CurNode = getNodeForStackId(Id);
1310 // We should only have kept stack ids that had nodes and weren't
1311 // recursive.
1312 assert(CurNode);
1313 assert(!CurNode->Recursive);
1314 if (!PrevNode) {
1315 PrevNode = CurNode;
1316 continue;
1317 }
1318 auto *Edge = CurNode->findEdgeFromCallee(PrevNode);
1319 if (!Edge) {
1320 SavedContextIds.clear();
1321 break;
1322 }
1323 PrevNode = CurNode;
1324 set_intersect(SavedContextIds, Edge->getContextIds());
1325
1326 // If we now have no context ids for clone, skip this call.
1327 if (SavedContextIds.empty())
1328 break;
1329 }
1330 if (SavedContextIds.empty())
1331 continue;
1332
1333 // Create new context node.
1334 NodeOwner.push_back(
1335 std::make_unique<ContextNode>(/*IsAllocation=*/false, Call));
1336 ContextNode *NewNode = NodeOwner.back().get();
1337 NodeToCallingFunc[NewNode] = Func;
1338 NonAllocationCallToContextNodeMap[Call] = NewNode;
1339 NewNode->AllocTypes = computeAllocType(SavedContextIds);
1340
1341 // Connect to callees of innermost stack frame in inlined call chain.
1342 // This updates context ids for FirstNode's callees to reflect those
1343 // moved to NewNode.
1344 connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);
1345
1346 // Connect to callers of outermost stack frame in inlined call chain.
1347 // This updates context ids for LastNode's callers to reflect those
1348 // moved to NewNode.
1349 connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
1350
1351 // Now we need to remove context ids from edges/nodes between First and
1352 // Last Node.
1353 PrevNode = nullptr;
1354 for (auto Id : Ids) {
1355 ContextNode *CurNode = getNodeForStackId(Id);
1356 // We should only have kept stack ids that had nodes.
1357 assert(CurNode);
1358
1359 // Remove the context ids moved to NewNode from CurNode, and the
1360 // edge from the prior node.
1361 if (PrevNode) {
1362 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1363 assert(PrevEdge);
1364 set_subtract(PrevEdge->getContextIds(), SavedContextIds);
1365 if (PrevEdge->getContextIds().empty()) {
1366 PrevNode->eraseCallerEdge(PrevEdge);
1367 CurNode->eraseCalleeEdge(PrevEdge);
1368 }
1369 }
1370 // Since we update the edges from leaf to tail, only look at the callee
1371 // edges. This isn't an alloc node, so if there are no callee edges, the
1372 // alloc type is None.
1373 CurNode->AllocTypes = CurNode->CalleeEdges.empty()
1374 ? (uint8_t)AllocationType::None
1375 : CurNode->computeAllocType();
1376 PrevNode = CurNode;
1377 }
1378 if (VerifyNodes) {
1379 checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
1380 for (auto Id : Ids) {
1381 ContextNode *CurNode = getNodeForStackId(Id);
1382 // We should only have kept stack ids that had nodes.
1383 assert(CurNode);
1384 checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
1385 }
1386 }
1387 }
1388}
1389
1390template <typename DerivedCCG, typename FuncTy, typename CallTy>
1391void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
1392 // Map of stack id to all calls with that as the last (outermost caller)
1393 // callsite id that has a context node (some might not due to pruning
1394 // performed during matching of the allocation profile contexts).
1395 // The CallContextInfo contains the Call and a list of its stack ids with
1396 // ContextNodes, the function containing Call, and the set of context ids
1397 // the analysis will eventually identify for use in any new node created
1399 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
1400 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1400 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1401 for (auto &Call : CallsWithMetadata) {
1402 // Ignore allocations, already handled.
1403 if (AllocationCallToContextNodeMap.count(Call))
1404 continue;
1405 auto StackIdsWithContextNodes =
1406 getStackIdsWithContextNodesForCall(Call.call());
1407 // If there were no nodes created for MIBs on allocs (maybe this was in
1408 // the unambiguous part of the MIB stack that was pruned), ignore.
1409 if (StackIdsWithContextNodes.empty())
1410 continue;
1411 // Otherwise, record this Call along with the list of ids for the last
1412 // (outermost caller) stack id with a node.
1413 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1414 {Call.call(), StackIdsWithContextNodes, Func, {}});
1415 }
1416 }
1417
1418 // First make a pass through all stack ids that correspond to a call,
1419 // as identified in the above loop. Compute the context ids corresponding to
1420 // each of these calls when they correspond to multiple stack ids due to
1421 // inlining. Perform any duplication of context ids required when
1422 // there is more than one call with the same stack ids. Their (possibly newly
1423 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1424 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1425 for (auto &It : StackIdToMatchingCalls) {
1426 auto &Calls = It.getSecond();
1427 // Skip single calls with a single stack id. These don't need a new node.
1428 if (Calls.size() == 1) {
1429 auto &Ids = std::get<1>(Calls[0]);
1430 if (Ids.size() == 1)
1431 continue;
1432 }
1433 // In order to do the best and maximal matching of inlined calls to context
1434 // node sequences we will sort the vectors of stack ids in descending order
1435 // of length, and within each length, lexicographically by stack id. The
1436 // latter is so that we can specially handle calls that have identical stack
1437 // id sequences (either due to cloning or artificially because of the MIB
1438 // context pruning).
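// For example, under this ordering the stack id sequences {3,5,7}, {3,5} and
// {5,7} are visited as {3,5,7}, then {3,5}, then {5,7}: longer (more deeply
// inlined) sequences are matched first, and identical sequences end up
// adjacent so the duplicate handling below can detect them by comparing
// each entry with the next one.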
1439 std::stable_sort(Calls.begin(), Calls.end(),
1440 [](const CallContextInfo &A, const CallContextInfo &B) {
1441 auto &IdsA = std::get<1>(A);
1442 auto &IdsB = std::get<1>(B);
1443 return IdsA.size() > IdsB.size() ||
1444 (IdsA.size() == IdsB.size() && IdsA < IdsB);
1445 });
1446
1447 // Find the node for the last stack id, which should be the same
1448 // across all calls recorded for this id, and is the id for this
1449 // entry in the StackIdToMatchingCalls map.
1450 uint64_t LastId = It.getFirst();
1451 ContextNode *LastNode = getNodeForStackId(LastId);
1452 // We should only have kept stack ids that had nodes.
1453 assert(LastNode);
1454
1455 if (LastNode->Recursive)
1456 continue;
1457
1458 // Initialize the context ids with the last node's. We will subsequently
1459 // refine the context ids by computing the intersection along all edges.
1460 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1461 assert(!LastNodeContextIds.empty());
1462
1463 for (unsigned I = 0; I < Calls.size(); I++) {
1464 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1465 assert(SavedContextIds.empty());
1466 assert(LastId == Ids.back());
1467
1468 // First compute the context ids for this stack id sequence (the
1469 // intersection of the context ids of the corresponding nodes).
1470 // Start with the remaining saved ids for the last node.
1471 assert(!LastNodeContextIds.empty());
1472 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
1473
1474 ContextNode *PrevNode = LastNode;
1475 ContextNode *CurNode = LastNode;
1476 bool Skip = false;
1477
1478 // Iterate backwards through the stack Ids, starting after the last Id
1479 // in the list, which was handled once outside for all Calls.
1480 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1481 auto Id = *IdIter;
1482 CurNode = getNodeForStackId(Id);
1483 // We should only have kept stack ids that had nodes.
1484 assert(CurNode);
1485
1486 if (CurNode->Recursive) {
1487 Skip = true;
1488 break;
1489 }
1490
1491 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1492 // If there is no edge then the nodes belong to different MIB contexts,
1493 // and we should skip this inlined context sequence. For example, this
1494 // particular inlined context may include stack ids A->B, and we may
1495 // indeed have nodes for both A and B, but it is possible that they were
1496 // never profiled in sequence in a single MIB for any allocation (i.e.
1497 // we might have profiled an allocation that involves the callsite A,
1498 // but through a different one of its callee callsites, and we might
1499 // have profiled an allocation that involves callsite B, but reached
1500 // from a different caller callsite).
1501 if (!Edge) {
1502 Skip = true;
1503 break;
1504 }
1505 PrevNode = CurNode;
1506
1507 // Update the context ids, which is the intersection of the ids along
1508 // all edges in the sequence.
1509 set_intersect(StackSequenceContextIds, Edge->getContextIds());
1510
1511 // If we now have no context ids for clone, skip this call.
1512 if (StackSequenceContextIds.empty()) {
1513 Skip = true;
1514 break;
1515 }
1516 }
1517 if (Skip)
1518 continue;
1519
1520 // If some of this call's stack ids did not have corresponding nodes (due
1521 // to pruning), don't include any context ids for contexts that extend
1522 // beyond these nodes. Otherwise we would be matching part of unrelated /
1523 // not fully matching stack contexts. To do this, subtract any context ids
1524 // found in caller nodes of the last node found above.
1525 if (Ids.back() != getLastStackId(Call)) {
1526 for (const auto &PE : LastNode->CallerEdges) {
1527 set_subtract(StackSequenceContextIds, PE->getContextIds());
1528 if (StackSequenceContextIds.empty())
1529 break;
1530 }
1531 // If we now have no context ids for clone, skip this call.
1532 if (StackSequenceContextIds.empty())
1533 continue;
1534 }
1535
1536 // Check if the next set of stack ids is the same (since the Calls vector
1537 // of tuples is sorted by the stack ids we can just look at the next one).
1538 bool DuplicateContextIds = false;
1539 if (I + 1 < Calls.size()) {
1540 auto NextIds = std::get<1>(Calls[I + 1]);
1541 DuplicateContextIds = Ids == NextIds;
1542 }
1543
1544 // If we don't have duplicate context ids, then we can assign all the
1545 // context ids computed for the original node sequence to this call.
1546 // If there are duplicate calls with the same stack ids then we synthesize
1547 // new context ids that are duplicates of the originals. These are
1548 // assigned to SavedContextIds, which is a reference into the map entry
1549 // for this call, allowing us to access these ids later on.
1550 OldToNewContextIds.reserve(OldToNewContextIds.size() +
1551 StackSequenceContextIds.size());
1552 SavedContextIds =
1553 DuplicateContextIds
1554 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
1555 : StackSequenceContextIds;
1556 assert(!SavedContextIds.empty());
1557
1558 if (!DuplicateContextIds) {
1559 // Update saved last node's context ids to remove those that are
1560 // assigned to other calls, so that it is ready for the next call at
1561 // this stack id.
1562 set_subtract(LastNodeContextIds, StackSequenceContextIds);
1563 if (LastNodeContextIds.empty())
1564 break;
1565 }
1566 }
1567 }
1568
1569 // Propagate the duplicate context ids over the graph.
1570 propagateDuplicateContextIds(OldToNewContextIds);
1571
1572 if (VerifyCCG)
1573 check();
1574
1575 // Now perform a post-order traversal over the graph, starting with the
1576 // allocation nodes, essentially processing nodes from callers to callees.
1577 // For any that contains an id in the map, update the graph to contain new
1578 // nodes representing any inlining at interior callsites. Note we move the
1579 // associated context ids over to the new nodes.
1580 DenseSet<const ContextNode *> Visited;
1581 for (auto &Entry : AllocationCallToContextNodeMap)
1582 assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls);
1583 if (VerifyCCG)
1584 check();
1585}
1586
1587uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
1588 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1589 Call->getMetadata(LLVMContext::MD_callsite));
1590 return CallsiteContext.back();
1591}
1592
1593uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
1594 assert(isa<CallsiteInfo *>(Call.getBase()));
1595 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1596 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1597 // Need to convert index into stack id.
1598 return Index.getStackIdAtIndex(CallsiteContext.back());
1599}
1600
1601static const std::string MemProfCloneSuffix = ".memprof.";
1602
1603static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
1604 // We use CloneNo == 0 to refer to the original version, which doesn't get
1605 // renamed with a suffix.
1606 if (!CloneNo)
1607 return Base.str();
1608 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
1609}
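// For example, getMemProfFuncName("foo", 2) produces "foo.memprof.2", while
// clone number 0 simply returns "foo" unchanged.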
1610
1611std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
1612 const Instruction *Call,
1613 unsigned CloneNo) const {
1614 return (Twine(Call->getFunction()->getName()) + " -> " +
1615 cast<CallBase>(Call)->getCalledFunction()->getName())
1616 .str();
1617}
1618
1619std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
1620 const IndexCall &Call,
1621 unsigned CloneNo) const {
1622 auto VI = FSToVIMap.find(Func);
1623 assert(VI != FSToVIMap.end());
1624 if (isa<AllocInfo *>(Call.getBase()))
1625 return (VI->second.name() + " -> alloc").str();
1626 else {
1627 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call.getBase());
1628 return (VI->second.name() + " -> " +
1629 getMemProfFuncName(Callsite->Callee.name(),
1630 Callsite->Clones[CloneNo]))
1631 .str();
1632 }
1633}
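// e.g. the resulting label has the form "caller -> callee.memprof.1" (using
// the callee clone number assigned to this callsite clone), or
// "caller -> alloc" for allocation nodes.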
1634
1635std::vector<uint64_t>
1636ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
1637 Instruction *Call) {
1638 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1639 Call->getMetadata(LLVMContext::MD_callsite));
1640 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
1641 CallsiteContext);
1642}
1643
1644std::vector<uint64_t>
1645IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
1646 assert(isa<CallsiteInfo *>(Call.getBase()));
1647 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1648 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1649 return getStackIdsWithContextNodes<CallsiteInfo,
1650 SmallVector<unsigned>::const_iterator>(
1651 CallsiteContext);
1652}
1653
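// Shared helper: walks the callsite's stack context from the leaf call
// outwards and returns the prefix of stack ids that have context nodes,
// stopping at the first id with no node (contexts beyond that point were
// pruned when the allocation MIB contexts were matched).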
1654template <typename DerivedCCG, typename FuncTy, typename CallTy>
1655template <class NodeT, class IteratorT>
1656std::vector<uint64_t>
1657CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
1658 CallStack<NodeT, IteratorT> &CallsiteContext) {
1659 std::vector<uint64_t> StackIds;
1660 for (auto IdOrIndex : CallsiteContext) {
1661 auto StackId = getStackId(IdOrIndex);
1662 ContextNode *Node = getNodeForStackId(StackId);
1663 if (!Node)
1664 break;
1665 StackIds.push_back(StackId);
1666 }
1667 return StackIds;
1668}
1669
1670ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
1671 Module &M,
1672 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
1673 : Mod(M), OREGetter(OREGetter) {
1674 for (auto &F : M) {
1675 std::vector<CallInfo> CallsWithMetadata;
1676 for (auto &BB : F) {
1677 for (auto &I : BB) {
1678 if (!isa<CallBase>(I))
1679 continue;
1680 if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
1681 CallsWithMetadata.push_back(&I);
1682 auto *AllocNode = addAllocNode(&I, &F);
1683 auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
1684 assert(CallsiteMD);
1685 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
1686 // Add all of the MIBs and their stack nodes.
1687 for (auto &MDOp : MemProfMD->operands()) {
1688 auto *MIBMD = cast<const MDNode>(MDOp);
1689 MDNode *StackNode = getMIBStackNode(MIBMD);
1690 assert(StackNode);
1691 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
1692 addStackNodesForMIB<MDNode, MDNode::op_iterator>(
1693 AllocNode, StackContext, CallsiteContext,
1694 getMIBAllocType(MIBMD), getMIBTotalSize(MIBMD));
1695 }
1696 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
1697 // Memprof and callsite metadata on memory allocations no longer
1698 // needed.
1699 I.setMetadata(LLVMContext::MD_memprof, nullptr);
1700 I.setMetadata(LLVMContext::MD_callsite, nullptr);
1701 }
1702 // For callsite metadata, add to list for this function for later use.
1703 else if (I.getMetadata(LLVMContext::MD_callsite))
1704 CallsWithMetadata.push_back(&I);
1705 }
1706 }
1707 if (!CallsWithMetadata.empty())
1708 FuncToCallsWithMetadata[&F] = CallsWithMetadata;
1709 }
1710
1711 if (DumpCCG) {
1712 dbgs() << "CCG before updating call stack chains:\n";
1713 dbgs() << *this;
1714 }
1715
1716 if (ExportToDot)
1717 exportToDot("prestackupdate");
1718
1719 updateStackNodes();
1720
1721 handleCallsitesWithMultipleTargets();
1722
1723 // Strip off remaining callsite metadata, no longer needed.
1724 for (auto &FuncEntry : FuncToCallsWithMetadata)
1725 for (auto &Call : FuncEntry.second)
1726 Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
1727}
1728
1729IndexCallsiteContextGraph::IndexCallsiteContextGraph(
1730 ModuleSummaryIndex &Index,
1731 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1732 isPrevailing)
1733 : Index(Index), isPrevailing(isPrevailing) {
1734 for (auto &I : Index) {
1735 auto VI = Index.getValueInfo(I);
1736 for (auto &S : VI.getSummaryList()) {
1737 // We should only add the prevailing nodes. Otherwise we may try to clone
1738 // in a weak copy that won't be linked (and may be different than the
1739 // prevailing version).
1740 // We only keep the memprof summary on the prevailing copy now when
1741 // building the combined index, as a space optimization, however don't
1742 // rely on this optimization. The linker doesn't resolve local linkage
1743 // values so don't check whether those are prevailing.
1744 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
1745 !isPrevailing(VI.getGUID(), S.get()))
1746 continue;
1747 auto *FS = dyn_cast<FunctionSummary>(S.get());
1748 if (!FS)
1749 continue;
1750 std::vector<CallInfo> CallsWithMetadata;
1751 if (!FS->allocs().empty()) {
1752 for (auto &AN : FS->mutableAllocs()) {
1753 // This can happen because of recursion elimination handling that
1754 // currently exists in ModuleSummaryAnalysis. Skip these for now.
1755 // We still added them to the summary because we need to be able to
1756 // correlate properly in applyImport in the backends.
1757 if (AN.MIBs.empty())
1758 continue;
1759 CallsWithMetadata.push_back({&AN});
1760 auto *AllocNode = addAllocNode({&AN}, FS);
1761 // Pass an empty CallStack to the CallsiteContext (second)
1762 // parameter, since for ThinLTO we already collapsed out the inlined
1763 // stack ids on the allocation call during ModuleSummaryAnalysis.
1764 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
1765 EmptyContext;
1766 unsigned I = 0;
1767 assert(!MemProfReportHintedSizes ||
1768 AN.TotalSizes.size() == AN.MIBs.size());
1769 // Now add all of the MIBs and their stack nodes.
1770 for (auto &MIB : AN.MIBs) {
1771 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
1772 StackContext(&MIB);
1773 uint64_t TotalSize = 0;
1774 if (MemProfReportHintedSizes)
1775 TotalSize = AN.TotalSizes[I];
1776 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
1777 AllocNode, StackContext, EmptyContext, MIB.AllocType,
1778 TotalSize);
1779 I++;
1780 }
1781 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
1782 // Initialize version 0 on the summary alloc node to the current alloc
1783 // type, unless it has both types in which case make it default, so
1784 // that in the case where we aren't able to clone the original version
1785 // always ends up with the default allocation behavior.
1786 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
1787 }
1788 }
1789 // For callsite metadata, add to list for this function for later use.
1790 if (!FS->callsites().empty())
1791 for (auto &SN : FS->mutableCallsites())
1792 CallsWithMetadata.push_back({&SN});
1793
1794 if (!CallsWithMetadata.empty())
1795 FuncToCallsWithMetadata[FS] = CallsWithMetadata;
1796
1797 if (!FS->allocs().empty() || !FS->callsites().empty())
1798 FSToVIMap[FS] = VI;
1799 }
1800 }
1801
1802 if (DumpCCG) {
1803 dbgs() << "CCG before updating call stack chains:\n";
1804 dbgs() << *this;
1805 }
1806
1807 if (ExportToDot)
1808 exportToDot("prestackupdate");
1809
1810 updateStackNodes();
1811
1812 handleCallsitesWithMultipleTargets();
1813}
1814
1815template <typename DerivedCCG, typename FuncTy, typename CallTy>
1816void CallsiteContextGraph<DerivedCCG, FuncTy,
1817 CallTy>::handleCallsitesWithMultipleTargets() {
1818 // Look for and workaround callsites that call multiple functions.
1819 // This can happen for indirect calls, which need better handling, and in
1820 // more rare cases (e.g. macro expansion).
1821 // TODO: To fix this for indirect calls we will want to perform speculative
1822 // devirtualization using either the normal PGO info with ICP, or using the
1823 // information in the profiled MemProf contexts. We can do this prior to
1824 // this transformation for regular LTO, and for ThinLTO we can simulate that
1825 // effect in the summary and perform the actual speculative devirtualization
1826 // while cloning in the ThinLTO backend.
1827
1828 // Keep track of the new nodes synthesized for discovered tail calls missing
1829 // from the profiled contexts.
1830 MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
1831
1832 for (auto &Entry : NonAllocationCallToContextNodeMap) {
1833 auto *Node = Entry.second;
1834 assert(Node->Clones.empty());
1835 // Check all node callees and see if in the same function.
1836 auto Call = Node->Call.call();
1837 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
1838 ++EI) {
1839 auto Edge = *EI;
1840 if (!Edge->Callee->hasCall())
1841 continue;
1842 assert(NodeToCallingFunc.count(Edge->Callee));
1843 // Check if the called function matches that of the callee node.
1844 if (calleesMatch(Call, EI, TailCallToContextNodeMap))
1845 continue;
1846 RemovedEdgesWithMismatchedCallees++;
1847 // Work around by setting Node to have a null call, so it gets
1848 // skipped during cloning. Otherwise assignFunctions will assert
1849 // because its data structures are not designed to handle this case.
1850 Node->setCall(CallInfo());
1851 break;
1852 }
1853 }
1854
1855 // Remove all mismatched nodes identified in the above loop from the node map
1856 // (checking whether they have a null call which is set above). For a
1857 // MapVector like NonAllocationCallToContextNodeMap it is much more efficient
1858 // to do the removal via remove_if than by individually erasing entries above.
1859 NonAllocationCallToContextNodeMap.remove_if(
1860 [](const auto &it) { return !it.second->hasCall(); });
1861
1862 // Add the new nodes after the above loop so that the iteration is not
1863 // invalidated.
1864 for (auto &[Call, Node] : TailCallToContextNodeMap)
1865 NonAllocationCallToContextNodeMap[Call] = Node;
1866}
1867
1868uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1869 // In the Module (IR) case this is already the Id.
1870 return IdOrIndex;
1871}
1872
1873uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1874 // In the Index case this is an index into the stack id list in the summary
1875 // index, convert it to an Id.
1876 return Index.getStackIdAtIndex(IdOrIndex);
1877}
1878
1879template <typename DerivedCCG, typename FuncTy, typename CallTy>
1880bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
1881 CallTy Call, EdgeIter &EI,
1882 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
1883 auto Edge = *EI;
1884 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
1885 const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
1886 // Will be populated in order of callee to caller if we find a chain of tail
1887 // calls between the profiled caller and callee.
1888 std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
1889 if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
1890 FoundCalleeChain))
1891 return false;
1892
1893 // The usual case where the profiled callee matches that of the IR/summary.
1894 if (FoundCalleeChain.empty())
1895 return true;
1896
1897 auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
1898 auto *CurEdge = Callee->findEdgeFromCaller(Caller);
1899 // If there is already an edge between these nodes, simply update it and
1900 // return.
1901 if (CurEdge) {
1902 CurEdge->ContextIds.insert(Edge->ContextIds.begin(),
1903 Edge->ContextIds.end());
1904 CurEdge->AllocTypes |= Edge->AllocTypes;
1905 return;
1906 }
1907 // Otherwise, create a new edge and insert it into the caller and callee
1908 // lists.
1909 auto NewEdge = std::make_shared<ContextEdge>(
1910 Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
1911 Callee->CallerEdges.push_back(NewEdge);
1912 if (Caller == Edge->Caller) {
1913 // If we are inserting the new edge into the current edge's caller, insert
1914 // the new edge before the current iterator position, and then increment
1915 // back to the current edge.
1916 EI = Caller->CalleeEdges.insert(EI, NewEdge);
1917 ++EI;
1918 assert(*EI == Edge &&
1919 "Iterator position not restored after insert and increment");
1920 } else
1921 Caller->CalleeEdges.push_back(NewEdge);
1922 };
1923
1924 // Create new nodes for each found callee and connect in between the profiled
1925 // caller and callee.
1926 auto *CurCalleeNode = Edge->Callee;
1927 for (auto &[NewCall, Func] : FoundCalleeChain) {
1928 ContextNode *NewNode = nullptr;
1929 // First check if we have already synthesized a node for this tail call.
1930 if (TailCallToContextNodeMap.count(NewCall)) {
1931 NewNode = TailCallToContextNodeMap[NewCall];
1932 NewNode->AllocTypes |= Edge->AllocTypes;
1933 } else {
1934 FuncToCallsWithMetadata[Func].push_back({NewCall});
1935 // Create Node and record node info.
1936 NodeOwner.push_back(
1937 std::make_unique<ContextNode>(/*IsAllocation=*/false, NewCall));
1938 NewNode = NodeOwner.back().get();
1939 NodeToCallingFunc[NewNode] = Func;
1940 TailCallToContextNodeMap[NewCall] = NewNode;
1941 NewNode->AllocTypes = Edge->AllocTypes;
1942 }
1943
1944 // Hook up node to its callee node
1945 AddEdge(NewNode, CurCalleeNode);
1946
1947 CurCalleeNode = NewNode;
1948 }
1949
1950 // Hook up edge's original caller to new callee node.
1951 AddEdge(Edge->Caller, CurCalleeNode);
1952
1953 // Remove old edge
1954 Edge->Callee->eraseCallerEdge(Edge.get());
1955 EI = Edge->Caller->CalleeEdges.erase(EI);
1956
1957 // To simplify the increment of EI in the caller, subtract one from EI.
1958 // In the final AddEdge call we would have either added a new callee edge,
1959 // to Edge->Caller, or found an existing one. Either way we are guaranteed
1960 // that there is at least one callee edge.
1961 assert(!Edge->Caller->CalleeEdges.empty());
1962 --EI;
1963
1964 return true;
1965}
1966
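// Example of the tail call handling: if the profiled context recorded a call
// from A directly to the profiled callee C, but in the IR A actually calls B
// which tail calls C, the search below finds the unique tail call chain
// through B, and calleesMatch above synthesizes a node for B's tail call and
// connects it between A's node and C's node.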
1967bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
1968 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
1969 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
1970 bool &FoundMultipleCalleeChains) {
1971 // Stop recursive search if we have already explored the maximum specified
1972 // depth.
1973 if (Depth > TailCallSearchDepth)
1974 return false;
1975
1976 auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
1977 FoundCalleeChain.push_back({Callsite, F});
1978 };
1979
1980 auto *CalleeFunc = dyn_cast<Function>(CurCallee);
1981 if (!CalleeFunc) {
1982 auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
1983 assert(Alias);
1984 CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
1985 assert(CalleeFunc);
1986 }
1987
1988 // Look for tail calls in this function, and check if they either call the
1989 // profiled callee directly, or indirectly (via a recursive search).
1990 // Only succeed if there is a single unique tail call chain found between the
1991 // profiled caller and callee, otherwise we could perform incorrect cloning.
1992 bool FoundSingleCalleeChain = false;
1993 for (auto &BB : *CalleeFunc) {
1994 for (auto &I : BB) {
1995 auto *CB = dyn_cast<CallBase>(&I);
1996 if (!CB || !CB->isTailCall())
1997 continue;
1998 auto *CalledValue = CB->getCalledOperand();
1999 auto *CalledFunction = CB->getCalledFunction();
2000 if (CalledValue && !CalledFunction) {
2001 CalledValue = CalledValue->stripPointerCasts();
2002 // Stripping pointer casts can reveal a called function.
2003 CalledFunction = dyn_cast<Function>(CalledValue);
2004 }
2005 // Check if this is an alias to a function. If so, get the
2006 // called aliasee for the checks below.
2007 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
2008 assert(!CalledFunction &&
2009 "Expected null called function in callsite for alias");
2010 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
2011 }
2012 if (!CalledFunction)
2013 continue;
2014 if (CalledFunction == ProfiledCallee) {
2015 if (FoundSingleCalleeChain) {
2016 FoundMultipleCalleeChains = true;
2017 return false;
2018 }
2019 FoundSingleCalleeChain = true;
2020 FoundProfiledCalleeCount++;
2021 FoundProfiledCalleeDepth += Depth;
2022 if (Depth > FoundProfiledCalleeMaxDepth)
2023 FoundProfiledCalleeMaxDepth = Depth;
2024 SaveCallsiteInfo(&I, CalleeFunc);
2025 } else if (findProfiledCalleeThroughTailCalls(
2026 ProfiledCallee, CalledFunction, Depth + 1,
2027 FoundCalleeChain, FoundMultipleCalleeChains)) {
2028 // findProfiledCalleeThroughTailCalls should not have returned
2029 // true if FoundMultipleCalleeChains.
2030 assert(!FoundMultipleCalleeChains);
2031 if (FoundSingleCalleeChain) {
2032 FoundMultipleCalleeChains = true;
2033 return false;
2034 }
2035 FoundSingleCalleeChain = true;
2036 SaveCallsiteInfo(&I, CalleeFunc);
2037 } else if (FoundMultipleCalleeChains)
2038 return false;
2039 }
2040 }
2041
2042 return FoundSingleCalleeChain;
2043}
2044
2045bool ModuleCallsiteContextGraph::calleeMatchesFunc(
2046 Instruction *Call, const Function *Func, const Function *CallerFunc,
2047 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
2048 auto *CB = dyn_cast<CallBase>(Call);
2049 if (!CB->getCalledOperand())
2050 return false;
2051 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2052 auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
2053 if (CalleeFunc == Func)
2054 return true;
2055 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2056 if (Alias && Alias->getAliasee() == Func)
2057 return true;
2058
2059 // Recursively search for the profiled callee through tail calls starting with
2060 // the actual Callee. The discovered tail call chain is saved in
2061 // FoundCalleeChain, and we will fixup the graph to include these callsites
2062 // after returning.
2063 // FIXME: We will currently redo the same recursive walk if we find the same
2064 // mismatched callee from another callsite. We can improve this with more
2065 // bookkeeping of the created chain of new nodes for each mismatch.
2066 unsigned Depth = 1;
2067 bool FoundMultipleCalleeChains = false;
2068 if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
2069 FoundCalleeChain,
2070 FoundMultipleCalleeChains)) {
2071 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
2072 << Func->getName() << " from " << CallerFunc->getName()
2073 << " that actually called " << CalleeVal->getName()
2074 << (FoundMultipleCalleeChains
2075 ? " (found multiple possible chains)"
2076 : "")
2077 << "\n");
2078 if (FoundMultipleCalleeChains)
2079 FoundProfiledCalleeNonUniquelyCount++;
2080 return false;
2081 }
2082
2083 return true;
2084}
2085
2086bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2087 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
2088 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
2089 bool &FoundMultipleCalleeChains) {
2090 // Stop recursive search if we have already explored the maximum specified
2091 // depth.
2092 if (Depth > TailCallSearchDepth)
2093 return false;
2094
2095 auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
2096 // Make a CallsiteInfo for each discovered callee, if one hasn't already
2097 // been synthesized.
2098 if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
2099 !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
2100 // StackIds is empty (we don't have debug info available in the index for
2101 // these callsites)
2102 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
2103 std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
2104 CallsiteInfo *NewCallsiteInfo =
2105 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
2106 FoundCalleeChain.push_back({NewCallsiteInfo, FS});
2107 };
2108
2109 // Look for tail calls in this function, and check if they either call the
2110 // profiled callee directly, or indirectly (via a recursive search).
2111 // Only succeed if there is a single unique tail call chain found between the
2112 // profiled caller and callee, otherwise we could perform incorrect cloning.
2113 bool FoundSingleCalleeChain = false;
2114 for (auto &S : CurCallee.getSummaryList()) {
2115 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2116 !isPrevailing(CurCallee.getGUID(), S.get()))
2117 continue;
2118 auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
2119 if (!FS)
2120 continue;
2121 auto FSVI = CurCallee;
2122 auto *AS = dyn_cast<AliasSummary>(S.get());
2123 if (AS)
2124 FSVI = AS->getAliaseeVI();
2125 for (auto &CallEdge : FS->calls()) {
2126 if (!CallEdge.second.hasTailCall())
2127 continue;
2128 if (CallEdge.first == ProfiledCallee) {
2129 if (FoundSingleCalleeChain) {
2130 FoundMultipleCalleeChains = true;
2131 return false;
2132 }
2133 FoundSingleCalleeChain = true;
2134 FoundProfiledCalleeCount++;
2135 FoundProfiledCalleeDepth += Depth;
2136 if (Depth > FoundProfiledCalleeMaxDepth)
2137 FoundProfiledCalleeMaxDepth = Depth;
2138 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2139 // Add FS to FSToVIMap in case it isn't already there.
2140 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2141 FSToVIMap[FS] = FSVI;
2142 } else if (findProfiledCalleeThroughTailCalls(
2143 ProfiledCallee, CallEdge.first, Depth + 1,
2144 FoundCalleeChain, FoundMultipleCalleeChains)) {
2145 // findProfiledCalleeThroughTailCalls should not have returned
2146 // true if FoundMultipleCalleeChains.
2147 assert(!FoundMultipleCalleeChains);
2148 if (FoundSingleCalleeChain) {
2149 FoundMultipleCalleeChains = true;
2150 return false;
2151 }
2152 FoundSingleCalleeChain = true;
2153 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2154 // Add FS to FSToVIMap in case it isn't already there.
2155 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2156 FSToVIMap[FS] = FSVI;
2157 } else if (FoundMultipleCalleeChains)
2158 return false;
2159 }
2160 }
2161
2162 return FoundSingleCalleeChain;
2163}
2164
2165bool IndexCallsiteContextGraph::calleeMatchesFunc(
2166 IndexCall &Call, const FunctionSummary *Func,
2167 const FunctionSummary *CallerFunc,
2168 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
2169 ValueInfo Callee =
2170 dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee;
2171 // If there is no summary list then this is a call to an externally defined
2172 // symbol.
2173 AliasSummary *Alias =
2174 Callee.getSummaryList().empty()
2175 ? nullptr
2176 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
2177 assert(FSToVIMap.count(Func));
2178 auto FuncVI = FSToVIMap[Func];
2179 if (Callee == FuncVI ||
2180 // If callee is an alias, check the aliasee, since only function
2181 // summary base objects will contain the stack node summaries and thus
2182 // get a context node.
2183 (Alias && Alias->getAliaseeVI() == FuncVI))
2184 return true;
2185
2186 // Recursively search for the profiled callee through tail calls starting with
2187 // the actual Callee. The discovered tail call chain is saved in
2188 // FoundCalleeChain, and we will fixup the graph to include these callsites
2189 // after returning.
2190 // FIXME: We will currently redo the same recursive walk if we find the same
2191 // mismatched callee from another callsite. We can improve this with more
2192 // bookkeeping of the created chain of new nodes for each mismatch.
2193 unsigned Depth = 1;
2194 bool FoundMultipleCalleeChains = false;
2195 if (!findProfiledCalleeThroughTailCalls(
2196 FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
2197 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
2198 << " from " << FSToVIMap[CallerFunc]
2199 << " that actually called " << Callee
2200 << (FoundMultipleCalleeChains
2201 ? " (found multiple possible chains)"
2202 : "")
2203 << "\n");
2204 if (FoundMultipleCalleeChains)
2205 FoundProfiledCalleeNonUniquelyCount++;
2206 return false;
2207 }
2208
2209 return true;
2210}
2211
2212template <typename DerivedCCG, typename FuncTy, typename CallTy>
2213void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
2214 const {
2215 print(dbgs());
2216 dbgs() << "\n";
2217}
2218
2219template <typename DerivedCCG, typename FuncTy, typename CallTy>
2220void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
2221 raw_ostream &OS) const {
2222 OS << "Node " << this << "\n";
2223 OS << "\t";
2224 printCall(OS);
2225 if (Recursive)
2226 OS << " (recursive)";
2227 OS << "\n";
2228 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
2229 OS << "\tContextIds:";
2230 // Make a copy of the computed context ids that we can sort for stability.
2231 auto ContextIds = getContextIds();
2232 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2233 std::sort(SortedIds.begin(), SortedIds.end());
2234 for (auto Id : SortedIds)
2235 OS << " " << Id;
2236 OS << "\n";
2237 OS << "\tCalleeEdges:\n";
2238 for (auto &Edge : CalleeEdges)
2239 OS << "\t\t" << *Edge << "\n";
2240 OS << "\tCallerEdges:\n";
2241 for (auto &Edge : CallerEdges)
2242 OS << "\t\t" << *Edge << "\n";
2243 if (!Clones.empty()) {
2244 OS << "\tClones: ";
2245 FieldSeparator FS;
2246 for (auto *Clone : Clones)
2247 OS << FS << Clone;
2248 OS << "\n";
2249 } else if (CloneOf) {
2250 OS << "\tClone of " << CloneOf << "\n";
2251 }
2252}
2253
2254template <typename DerivedCCG, typename FuncTy, typename CallTy>
2255void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
2256 const {
2257 print(dbgs());
2258 dbgs() << "\n";
2259}
2260
2261template <typename DerivedCCG, typename FuncTy, typename CallTy>
2262void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
2263 raw_ostream &OS) const {
2264 OS << "Edge from Callee " << Callee << " to Caller: " << Caller
2265 << " AllocTypes: " << getAllocTypeString(AllocTypes);
2266 OS << " ContextIds:";
2267 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2268 std::sort(SortedIds.begin(), SortedIds.end());
2269 for (auto Id : SortedIds)
2270 OS << " " << Id;
2271}
2272
2273template <typename DerivedCCG, typename FuncTy, typename CallTy>
2274void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
2275 print(dbgs());
2276}
2277
2278template <typename DerivedCCG, typename FuncTy, typename CallTy>
2279void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
2280 raw_ostream &OS) const {
2281 OS << "Callsite Context Graph:\n";
2282 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2283 for (const auto Node : nodes<GraphType>(this)) {
2284 if (Node->isRemoved())
2285 continue;
2286 Node->print(OS);
2287 OS << "\n";
2288 }
2289}
2290
2291template <typename DerivedCCG, typename FuncTy, typename CallTy>
2292void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
2293 raw_ostream &OS) const {
2294 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2295 for (const auto Node : nodes<GraphType>(this)) {
2296 if (Node->isRemoved())
2297 continue;
2298 if (!Node->IsAllocation)
2299 continue;
2300 DenseSet<uint32_t> ContextIds = Node->getContextIds();
2301 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2302 std::sort(SortedIds.begin(), SortedIds.end());
2303 for (auto Id : SortedIds) {
2304 auto SizeI = ContextIdToTotalSize.find(Id);
2305 assert(SizeI != ContextIdToTotalSize.end());
2306 auto TypeI = ContextIdToAllocationType.find(Id);
2307 assert(TypeI != ContextIdToAllocationType.end());
2308 OS << getAllocTypeString((uint8_t)TypeI->second) << " context " << Id
2309 << " with total size " << SizeI->second << " is "
2310 << getAllocTypeString(Node->AllocTypes) << " after cloning\n";
2311 }
2312 }
2313}
2314
2315template <typename DerivedCCG, typename FuncTy, typename CallTy>
2316void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
2317 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2318 for (const auto Node : nodes<GraphType>(this)) {
2319 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2320 for (auto &Edge : Node->CallerEdges)
2321 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
2322 }
2323}
2324
2325template <typename DerivedCCG, typename FuncTy, typename CallTy>
2326struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
2327 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2328 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
2329
2330 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
2331 static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
2332
2333 using nodes_iterator =
2334 mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
2335 decltype(&getNode)>;
2336
2337 static nodes_iterator nodes_begin(GraphType G) {
2338 return nodes_iterator(G->NodeOwner.begin(), &getNode);
2339 }
2340
2341 static nodes_iterator nodes_end(GraphType G) {
2342 return nodes_iterator(G->NodeOwner.end(), &getNode);
2343 }
2344
2345 static NodeRef getEntryNode(GraphType G) {
2346 return G->NodeOwner.begin()->get();
2347 }
2348
2349 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
2350 static const ContextNode<DerivedCCG, FuncTy, CallTy> *
2351 GetCallee(const EdgePtrTy &P) {
2352 return P->Callee;
2353 }
2354
2355 using ChildIteratorType =
2356 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
2357 DerivedCCG, FuncTy, CallTy>>>::const_iterator,
2358 decltype(&GetCallee)>;
2359
2360 static ChildIteratorType child_begin(NodeRef N) {
2361 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
2362 }
2363
2364 static ChildIteratorType child_end(NodeRef N) {
2365 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
2366 }
2367};
2368
2369template <typename DerivedCCG, typename FuncTy, typename CallTy>
2370struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
2371 : public DefaultDOTGraphTraits {
2372 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
2373
2374 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2375 using GTraits = GraphTraits<GraphType>;
2376 using NodeRef = typename GTraits::NodeRef;
2377 using ChildIteratorType = typename GTraits::ChildIteratorType;
2378
2379 static std::string getNodeLabel(NodeRef Node, GraphType G) {
2380 std::string LabelString =
2381 (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
2382 Twine(Node->OrigStackOrAllocId))
2383 .str();
2384 LabelString += "\n";
2385 if (Node->hasCall()) {
2386 auto Func = G->NodeToCallingFunc.find(Node);
2387 assert(Func != G->NodeToCallingFunc.end());
2388 LabelString +=
2389 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
2390 } else {
2391 LabelString += "null call";
2392 if (Node->Recursive)
2393 LabelString += " (recursive)";
2394 else
2395 LabelString += " (external)";
2396 }
2397 return LabelString;
2398 }
2399
2400 static std::string getNodeAttributes(NodeRef Node, GraphType) {
2401 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
2402 getContextIds(Node->getContextIds()) + "\"")
2403 .str();
2405 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str();
2406 AttributeString += ",style=\"filled\"";
2407 if (Node->CloneOf) {
2408 AttributeString += ",color=\"blue\"";
2409 AttributeString += ",style=\"filled,bold,dashed\"";
2410 } else
2411 AttributeString += ",style=\"filled\"";
2412 return AttributeString;
2413 }
2414
2415 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
2416 GraphType) {
2417 auto &Edge = *(ChildIter.getCurrent());
2418 return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
2419 Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"")
2420 .str();
2421 }
2422
2423 // Since the NodeOwners list includes nodes that are no longer connected to
2424 // the graph, skip them here.
2425 static bool isNodeHidden(NodeRef Node, GraphType) {
2426 return Node->isRemoved();
2427 }
2428
2429private:
2430 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
2431 std::string IdString = "ContextIds:";
2432 if (ContextIds.size() < 100) {
2433 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2434 std::sort(SortedIds.begin(), SortedIds.end());
2435 for (auto Id : SortedIds)
2436 IdString += (" " + Twine(Id)).str();
2437 } else {
2438 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
2439 }
2440 return IdString;
2441 }
2442
2443 static std::string getColor(uint8_t AllocTypes) {
2444 if (AllocTypes == (uint8_t)AllocationType::NotCold)
2445 // Color "brown1" actually looks like a lighter red.
2446 return "brown1";
2447 if (AllocTypes == (uint8_t)AllocationType::Cold)
2448 return "cyan";
2449 if (AllocTypes ==
2450 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
2451 // Lighter purple.
2452 return "mediumorchid1";
2453 return "gray";
2454 }
2455
2456 static std::string getNodeId(NodeRef Node) {
2457 std::stringstream SStream;
2458 SStream << std::hex << "N0x" << (unsigned long long)Node;
2459 std::string Result = SStream.str();
2460 return Result;
2461 }
2462};
2463
2464template <typename DerivedCCG, typename FuncTy, typename CallTy>
2465void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
2466 std::string Label) const {
2467 WriteGraph(this, "", false, Label,
2468 DotFilePathPrefix + "ccg." + Label + ".dot");
2469}
2470
2471template <typename DerivedCCG, typename FuncTy, typename CallTy>
2472typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
2473CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
2474 const std::shared_ptr<ContextEdge> &Edge, EdgeIter *CallerEdgeI,
2475 DenseSet<uint32_t> ContextIdsToMove) {
2476 ContextNode *Node = Edge->Callee;
2477 NodeOwner.push_back(
2478 std::make_unique<ContextNode>(Node->IsAllocation, Node->Call));
2479 ContextNode *Clone = NodeOwner.back().get();
2480 Node->addClone(Clone);
2481 assert(NodeToCallingFunc.count(Node));
2482 NodeToCallingFunc[Clone] = NodeToCallingFunc[Node];
2483 moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true,
2484 ContextIdsToMove);
2485 return Clone;
2486}
2487
2488template <typename DerivedCCG, typename FuncTy, typename CallTy>
2489void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
2490 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
2491 ContextNode *NewCallee, EdgeIter *CallerEdgeI,
2492 bool NewClone,
2493 DenseSet<uint32_t> ContextIdsToMove) {
2494 // NewCallee and Edge's current callee must be clones of the same original
2495 // node (Edge's current callee may be the original node too).
2496 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
2497
2498 ContextNode *OldCallee = Edge->Callee;
2499
2500 // We might already have an edge to the new callee from earlier cloning for a
2501 // different allocation. If one exists we will reuse it.
2502 auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
2503
2504 // Callers will pass an empty ContextIdsToMove set when they want to move the
2505 // edge. Copy in Edge's ids for simplicity.
2506 if (ContextIdsToMove.empty())
2507 ContextIdsToMove = Edge->getContextIds();
2508
2509 // If we are moving all of Edge's ids, then just move the whole Edge.
2510 // Otherwise only move the specified subset, to a new edge if needed.
2511 if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
2512 // Moving the whole Edge.
2513 if (CallerEdgeI)
2514 *CallerEdgeI = OldCallee->CallerEdges.erase(*CallerEdgeI);
2515 else
2516 OldCallee->eraseCallerEdge(Edge.get());
2517 if (ExistingEdgeToNewCallee) {
2518 // Since we already have an edge to NewCallee, simply move the ids
2519 // onto it, and remove the existing Edge.
2520 ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),
2521 ContextIdsToMove.end());
2522 ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
2523 assert(Edge->ContextIds == ContextIdsToMove);
2524 Edge->ContextIds.clear();
2525 Edge->AllocTypes = (uint8_t)AllocationType::None;
2526 Edge->Caller->eraseCalleeEdge(Edge.get());
2527 } else {
2528 // Otherwise just reconnect Edge to NewCallee.
2529 Edge->Callee = NewCallee;
2530 NewCallee->CallerEdges.push_back(Edge);
2531 // Don't need to update Edge's context ids since we are simply
2532 // reconnecting it.
2533 }
2534 // In either case, need to update the alloc types on New Callee.
2535 NewCallee->AllocTypes |= Edge->AllocTypes;
2536 } else {
2537 // Only moving a subset of Edge's ids.
2538 if (CallerEdgeI)
2539 ++CallerEdgeI;
2540 // Compute the alloc type of the subset of ids being moved.
2541 auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
2542 if (ExistingEdgeToNewCallee) {
2543 // Since we already have an edge to NewCallee, simply move the ids
2544 // onto it.
2545 ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),
2546 ContextIdsToMove.end());
2547 ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
2548 } else {
2549 // Otherwise, create a new edge to NewCallee for the ids being moved.
2550 auto NewEdge = std::make_shared<ContextEdge>(
2551 NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
2552 Edge->Caller->CalleeEdges.push_back(NewEdge);
2553 NewCallee->CallerEdges.push_back(NewEdge);
2554 }
2555 // In either case, need to update the alloc types on NewCallee, and remove
2556 // those ids and update the alloc type on the original Edge.
2557 NewCallee->AllocTypes |= CallerEdgeAllocType;
2558 set_subtract(Edge->ContextIds, ContextIdsToMove);
2559 Edge->AllocTypes = computeAllocType(Edge->ContextIds);
2560 }
2561 // Now walk the old callee node's callee edges and move Edge's context ids
2562 // over to the corresponding edge into the clone (which is created here if
2563 // this is a newly created clone).
2564 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
2565 // The context ids moving to the new callee are the subset of this edge's
2566 // context ids and the context ids on the caller edge being moved.
2567 DenseSet<uint32_t> EdgeContextIdsToMove =
2568 set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
2569 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
2570 OldCalleeEdge->AllocTypes =
2571 computeAllocType(OldCalleeEdge->getContextIds());
2572 if (!NewClone) {
2573 // Update context ids / alloc type on corresponding edge to NewCallee.
2574 // There is a chance this may not exist if we are reusing an existing
2575 // clone, specifically during function assignment, where we would have
2576 // removed none type edges after creating the clone. If we can't find
2577 // a corresponding edge there, fall through to the cloning below.
2578 if (auto *NewCalleeEdge =
2579 NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) {
2580 NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
2581 EdgeContextIdsToMove.end());
2582 NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
2583 continue;
2584 }
2585 }
2586 auto NewEdge = std::make_shared<ContextEdge>(
2587 OldCalleeEdge->Callee, NewCallee,
2588 computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
2589 NewCallee->CalleeEdges.push_back(NewEdge);
2590 NewEdge->Callee->CallerEdges.push_back(NewEdge);
2591 }
2592 // Recompute the node alloc type now that its callee edges have been
2593 // updated (since we will compute from those edges).
2594 OldCallee->AllocTypes = OldCallee->computeAllocType();
2595 // OldCallee alloc type should be None iff its context id set is now empty.
2596 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
2597 OldCallee->emptyContextIds());
2598 if (VerifyCCG) {
2599 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
2600 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
2601 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
2602 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
2603 /*CheckEdges=*/false);
2604 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
2605 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
2606 /*CheckEdges=*/false);
2607 }
2608}
2609
2610template <typename DerivedCCG, typename FuncTy, typename CallTy>
2611void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
2612 recursivelyRemoveNoneTypeCalleeEdges(
2613 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
2614 auto Inserted = Visited.insert(Node);
2615 if (!Inserted.second)
2616 return;
2617
2618 removeNoneTypeCalleeEdges(Node);
2619
2620 for (auto *Clone : Node->Clones)
2621 recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
2622
2623 // The recursive call may remove some of this Node's caller edges.
2624 // Iterate over a copy and skip any that were removed.
2625 auto CallerEdges = Node->CallerEdges;
2626 for (auto &Edge : CallerEdges) {
2627 // Skip any that have been removed by an earlier recursive call.
2628 if (Edge->Callee == nullptr && Edge->Caller == nullptr) {
2629 assert(!is_contained(Node->CallerEdges, Edge));
2630 continue;
2631 }
2632 recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
2633 }
2634}
2635
2636template <typename DerivedCCG, typename FuncTy, typename CallTy>
2637void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
2638 DenseSet<const ContextNode *> Visited;
2639 for (auto &Entry : AllocationCallToContextNodeMap) {
2640 Visited.clear();
2641 identifyClones(Entry.second, Visited, Entry.second->getContextIds());
2642 }
2643 Visited.clear();
2644 for (auto &Entry : AllocationCallToContextNodeMap)
2645 recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
2646 if (VerifyCCG)
2647 check();
2648}
2649
2650 // Helper function to check whether an AllocType is cold, notcold, or both.
2651 bool checkColdOrNotCold(uint8_t AllocType) {
2652 return (AllocType == (uint8_t)AllocationType::Cold) ||
2653 (AllocType == (uint8_t)AllocationType::NotCold) ||
2654 (AllocType ==
2655 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
2656}
2657
2658template <typename DerivedCCG, typename FuncTy, typename CallTy>
2659void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
2660 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
2661 const DenseSet<uint32_t> &AllocContextIds) {
2662 if (VerifyNodes)
2663 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2664 assert(!Node->CloneOf);
2665
2666 // If Node has a null call, then either it wasn't found in the module (regular
2667 // LTO) or summary index (ThinLTO), or there were other conditions blocking
2668 // cloning (e.g. recursion, calls multiple targets, etc).
2669 // Do this here so that we don't try to recursively clone callers below, which
2670 // isn't useful at least for this node.
2671 if (!Node->hasCall())
2672 return;
2673
2674#ifndef NDEBUG
2675 auto Insert =
2676#endif
2677 Visited.insert(Node);
2678 // We should not have visited this node yet.
2679 assert(Insert.second);
2680 // The recursive call to identifyClones may delete the current edge from the
2681 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
2682 // in an iterator and having recursive call erase from it. Other edges may
2683 // also get removed during the recursion, which will have null Callee and
2684 // Caller pointers (and are deleted later), so we skip those below.
2685 {
2686 auto CallerEdges = Node->CallerEdges;
2687 for (auto &Edge : CallerEdges) {
2688 // Skip any that have been removed by an earlier recursive call.
2689 if (Edge->Callee == nullptr && Edge->Caller == nullptr) {
2690 assert(!llvm::count(Node->CallerEdges, Edge));
2691 continue;
2692 }
2693 // Ignore any caller we previously visited via another edge.
2694 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
2695 identifyClones(Edge->Caller, Visited, AllocContextIds);
2696 }
2697 }
2698 }
2699
2700 // Check if we reached an unambiguous call or have only a single caller.
2701 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2702 return;
2703
2704 // We need to clone.
2705
2706 // Try to keep the original version as alloc type NotCold. This will make
2707 // cases with indirect calls or any other situation with an unknown call to
2708 // the original function get the default behavior. We do this by sorting the
2709 // CallerEdges of the Node we will clone by alloc type.
2710 //
2711 // Give NotCold edge the lowest sort priority so those edges are at the end of
2712 // the caller edges vector, and stay on the original version (since the below
2713 // code clones greedily until it finds all remaining edges have the same type
2714 // and leaves the remaining ones on the original Node).
2715 //
2716 // We shouldn't actually have any None type edges, so the sorting priority for
2717 // that is arbitrary, and we assert in that case below.
2718 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
2719 /*Cold*/ 1,
2720 /*NotColdCold*/ 2};
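// For example, caller edges with alloc types Cold, NotColdCold and NotCold
// sort in that order, so the cloning loop below considers the colder caller
// edges first and the NotCold callers tend to remain on the original node,
// matching the intent described above.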
2721 std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(),
2722 [&](const std::shared_ptr<ContextEdge> &A,
2723 const std::shared_ptr<ContextEdge> &B) {
2724 // Nodes with non-empty context ids should be sorted before
2725 // those with empty context ids.
2726 if (A->ContextIds.empty())
2727 // Either B ContextIds are non-empty (in which case we
2728 // should return false because B < A), or B ContextIds
2729 // are empty, in which case they are equal, and we should
2730 // maintain the original relative ordering.
2731 return false;
2732 if (B->ContextIds.empty())
2733 return true;
2734
2735 if (A->AllocTypes == B->AllocTypes)
2736 // Use the first context id for each edge as a
2737 // tie-breaker.
2738 return *A->ContextIds.begin() < *B->ContextIds.begin();
2739 return AllocTypeCloningPriority[A->AllocTypes] <
2740 AllocTypeCloningPriority[B->AllocTypes];
2741 });
2742
2743 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2744
2745 // Iterate until we find no more opportunities for disambiguating the alloc
2746 // types via cloning. In most cases this loop will terminate once the Node
2747 // has a single allocation type, in which case no more cloning is needed.
2748 // We need to be able to remove Edge from CallerEdges, so need to adjust
2749 // iterator inside the loop.
2750 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
2751 auto CallerEdge = *EI;
2752
2753 // See if cloning the prior caller edge left this node with a single alloc
2754 // type or a single caller. In that case no more cloning of Node is needed.
2755 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2756 break;
2757
2758 // Only need to process the ids along this edge pertaining to the given
2759 // allocation.
2760 auto CallerEdgeContextsForAlloc =
2761 set_intersection(CallerEdge->getContextIds(), AllocContextIds);
2762 if (CallerEdgeContextsForAlloc.empty()) {
2763 ++EI;
2764 continue;
2765 }
2766 auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
2767
2768 // Compute the node callee edge alloc types corresponding to the context ids
2769 // for this caller edge.
2770 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
2771 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
2772 for (auto &CalleeEdge : Node->CalleeEdges)
2773 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
2774 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
2775
2776 // Don't clone if doing so will not disambiguate any alloc types amongst
2777 // caller edges (including the callee edges that would be cloned).
2778 // Otherwise we will simply move all edges to the clone.
2779 //
2780 // First check if by cloning we will disambiguate the caller allocation
2781 // type from node's allocation type. Query allocTypeToUse so that we don't
2782 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
2783 // neither of these should be None type.
2784 //
2785 // Then check if by cloning node at least one of the callee edges will be
2786 // disambiguated by splitting out different context ids.
2787 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
2788 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2789 if (allocTypeToUse(CallerAllocTypeForAlloc) ==
2790 allocTypeToUse(Node->AllocTypes) &&
2791 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
2792 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
2793 ++EI;
2794 continue;
2795 }
2796
2797 // First see if we can use an existing clone. Check each clone and its
2798 // callee edges for matching alloc types.
2799 ContextNode *Clone = nullptr;
2800 for (auto *CurClone : Node->Clones) {
2801 if (allocTypeToUse(CurClone->AllocTypes) !=
2802 allocTypeToUse(CallerAllocTypeForAlloc))
2803 continue;
2804
2805 if (!allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
2806 CalleeEdgeAllocTypesForCallerEdge, CurClone->CalleeEdges))
2807 continue;
2808 Clone = CurClone;
2809 break;
2810 }
2811
2812 // The edge iterator is adjusted when we move the CallerEdge to the clone.
2813 if (Clone)
2814 moveEdgeToExistingCalleeClone(CallerEdge, Clone, &EI, /*NewClone=*/false,
2815 CallerEdgeContextsForAlloc);
2816 else
2817 Clone =
2818 moveEdgeToNewCalleeClone(CallerEdge, &EI, CallerEdgeContextsForAlloc);
2819
2820 assert(EI == Node->CallerEdges.end() ||
2821 Node->AllocTypes != (uint8_t)AllocationType::None);
2822 // Sanity check that no alloc types on clone or its edges are None.
2823 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
2824 }
2825
2826 // We should still have some context ids on the original Node.
2827 assert(!Node->emptyContextIds());
2828
2829 // Sanity check that no alloc types on node or edges are None.
2830 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2831
2832 if (VerifyNodes)
2833 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2834}
2835
2836void ModuleCallsiteContextGraph::updateAllocationCall(
2837 CallInfo &Call, AllocationType AllocType) {
2838 std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
2839 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
2840 "memprof", AllocTypeString);
2841 cast<CallBase>(Call.call())->addFnAttr(A);
2842 OREGetter(Call.call()->getFunction())
2843 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
2844 << ore::NV("AllocationCall", Call.call()) << " in clone "
2845 << ore::NV("Caller", Call.call()->getFunction())
2846 << " marked with memprof allocation attribute "
2847 << ore::NV("Attribute", AllocTypeString));
2848}
2849
2850void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
2851 AllocationType AllocType) {
2852 auto *AI = Call.call().dyn_cast<AllocInfo *>();
2853 assert(AI);
2854 assert(AI->Versions.size() > Call.cloneNo());
2855 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
2856}
2857
2858void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
2859 FuncInfo CalleeFunc) {
2860 if (CalleeFunc.cloneNo() > 0)
2861 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
2862 OREGetter(CallerCall.call()->getFunction())
2863 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
2864 << ore::NV("Call", CallerCall.call()) << " in clone "
2865 << ore::NV("Caller", CallerCall.call()->getFunction())
2866 << " assigned to call function clone "
2867 << ore::NV("Callee", CalleeFunc.func()));
2868}
2869
2870void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
2871 FuncInfo CalleeFunc) {
2872 auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>();
2873 assert(CI &&
2874 "Caller cannot be an allocation which should not have profiled calls");
2875 assert(CI->Clones.size() > CallerCall.cloneNo());
2876 CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
2877}
2878
2879CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
2880 Instruction *>::FuncInfo
2881ModuleCallsiteContextGraph::cloneFunctionForCallsite(
2882 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
2883 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
2884 // Use existing LLVM facilities for cloning and obtaining Call in clone
2885 ValueToValueMapTy VMap;
2886 auto *NewFunc = CloneFunction(Func.func(), VMap);
2887 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
2888 assert(!Func.func()->getParent()->getFunction(Name));
2889 NewFunc->setName(Name);
2890 for (auto &Inst : CallsWithMetadataInFunc) {
2891 // This map always has the initial version in it.
2892 assert(Inst.cloneNo() == 0);
2893 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
2894 }
2895 OREGetter(Func.func())
2896 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
2897 << "created clone " << ore::NV("NewFunction", NewFunc));
2898 return {NewFunc, CloneNo};
2899}
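// Note that the IR case above creates a real clone of the function body via
// CloneFunction and renames it with getMemProfFuncName, while the index case
// below only records the new version by growing the summary Versions/Clones
// arrays; the corresponding IR cloning is performed later in the ThinLTO
// backend (see applyImport/createFunctionClones).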
2900
2901CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
2902 IndexCall>::FuncInfo
2903IndexCallsiteContextGraph::cloneFunctionForCallsite(
2904 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
2905 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
2906 // Check how many clones we have of Call (and therefore function).
2907 // The next clone number is the current size of versions array.
2908 // Confirm this matches the CloneNo provided by the caller, which is based on
2909 // the number of function clones we have.
2910 assert(CloneNo ==
2911 (Call.call().is<AllocInfo *>()
2912 ? Call.call().dyn_cast<AllocInfo *>()->Versions.size()
2913 : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size()));
2914 // Walk all the instructions in this function. Create a new version for
2915 // each (by adding an entry to the Versions/Clones summary array), and copy
2916 // over the version being called for the function clone being cloned here.
2917 // Additionally, add an entry to the CallMap for the new function clone,
2918 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
2919 // to the new call clone.
2920 for (auto &Inst : CallsWithMetadataInFunc) {
2921 // This map always has the initial version in it.
2922 assert(Inst.cloneNo() == 0);
2923 if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) {
2924 assert(AI->Versions.size() == CloneNo);
2925 // We assign the allocation type later (in updateAllocationCall), just add
2926 // an entry for it here.
2927 AI->Versions.push_back(0);
2928 } else {
2929 auto *CI = Inst.call().dyn_cast<CallsiteInfo *>();
2930 assert(CI && CI->Clones.size() == CloneNo);
2931 // We assign the clone number later (in updateCall), just add an entry for
2932 // it here.
2933 CI->Clones.push_back(0);
2934 }
2935 CallMap[Inst] = {Inst.call(), CloneNo};
2936 }
2937 return {Func.func(), CloneNo};
2938}
2939
2940// This method assigns cloned callsites to functions, cloning the functions as
2941// needed. The assignment is greedy and proceeds roughly as follows:
2942//
2943// For each function Func:
2944// For each call with graph Node having clones:
2945// Initialize ClonesWorklist to Node and its clones
2946// Initialize NodeCloneCount to 0
2947// While ClonesWorklist is not empty:
2948// Clone = pop front ClonesWorklist
2949// NodeCloneCount++
2950// If Func has been cloned less than NodeCloneCount times:
2951// If NodeCloneCount is 1:
2952// Assign Clone to original Func
2953// Continue
2954// Create a new function clone
2955// If other callers not assigned to call a function clone yet:
2956// Assign them to call new function clone
2957// Continue
2958// Assign any other caller calling the cloned version to new clone
2959//
2960// For each caller of Clone:
2961// If caller is assigned to call a specific function clone:
2962// If we cannot assign Clone to that function clone:
2963// Create new callsite Clone NewClone
2964// Add NewClone to ClonesWorklist
2965// Continue
2966// Assign Clone to existing caller's called function clone
2967// Else:
2968// If Clone not already assigned to a function clone:
2969// Assign to first function clone without assignment
2970// Assign caller to selected function clone
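//
// For example (hypothetical): if callsite node N in function Foo has a single
// clone N' (two versions total), the first version popped from the worklist is
// assigned to the original Foo; when N' is popped, FuncClonesToCallMap still
// has only one entry, so a new clone of Foo is created and N' (and any callers
// not yet assigned a function clone) are assigned to that new function clone.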
2971template <typename DerivedCCG, typename FuncTy, typename CallTy>
2972bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
2973 bool Changed = false;
2974
2975 // Keep track of the assignment of nodes (callsites) to function clones they
2976 // call.
2977 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
2978
2979 // Update caller node to call function version CalleeFunc, by recording the
2980 // assignment in CallsiteToCalleeFuncCloneMap.
2981 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
2982 const FuncInfo &CalleeFunc) {
2983 assert(Caller->hasCall());
2984 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
2985 };
2986
2987 // Walk all functions for which we saw calls with memprof metadata, and handle
2988 // cloning for each of its calls.
2989 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
2990 FuncInfo OrigFunc(Func);
2991 // Map from each clone of OrigFunc to a map of remappings of each call of
2992 // interest (from original uncloned call to the corresponding cloned call in
2993 // that function clone).
2994 std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
2995 for (auto &Call : CallsWithMetadata) {
2996 ContextNode *Node = getNodeForInst(Call);
2997 // Skip call if we do not have a node for it (all uses of its stack ids
2998 // were either on inlined chains or pruned from the MIBs), or if we did
2999 // not create any clones for it.
3000 if (!Node || Node->Clones.empty())
3001 continue;
3002 assert(Node->hasCall() &&
3003 "Not having a call should have prevented cloning");
3004
3005 // Track the assignment of function clones to clones of the current
3006 // callsite Node being handled.
3007 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
3008
3009 // Assign callsite version CallsiteClone to function version FuncClone,
3010 // and also assign (possibly cloned) Call to CallsiteClone.
3011 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
3012 CallInfo &Call,
3013 ContextNode *CallsiteClone,
3014 bool IsAlloc) {
3015 // Record the clone of callsite node assigned to this function clone.
3016 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
3017
3018 assert(FuncClonesToCallMap.count(FuncClone));
3019 std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
3020 CallInfo CallClone(Call);
3021 if (CallMap.count(Call))
3022 CallClone = CallMap[Call];
3023 CallsiteClone->setCall(CallClone);
3024 };
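 // If CallMap has no entry for Call (e.g. for the original function, whose
 // call map is initialized empty below), the callsite clone simply keeps the
 // original Call.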
3025
3026 // Keep track of the clones of callsite Node that need to be assigned to
3027 // function clones. This list may be expanded in the loop body below if we
3028 // find additional cloning is required.
3029 std::deque<ContextNode *> ClonesWorklist;
3030 // Ignore original Node if we moved all of its contexts to clones.
3031 if (!Node->emptyContextIds())
3032 ClonesWorklist.push_back(Node);
3033 ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(),
3034 Node->Clones.end());
3035
3036 // Now walk through all of the clones of this callsite Node that we need,
3037 // and determine the assignment to a corresponding clone of the current
3038 // function (creating new function clones as needed).
3039 unsigned NodeCloneCount = 0;
3040 while (!ClonesWorklist.empty()) {
3041 ContextNode *Clone = ClonesWorklist.front();
3042 ClonesWorklist.pop_front();
3043 NodeCloneCount++;
3044 if (VerifyNodes)
3045 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
3046
3047 // Need to create a new function clone if we have more callsite clones
3048 // than existing function clones, which would have been assigned to an
3049 // earlier clone in the list (we assign callsite clones to function
3050 // clones greedily).
3051 if (FuncClonesToCallMap.size() < NodeCloneCount) {
3052 // If this is the first callsite copy, assign to original function.
3053 if (NodeCloneCount == 1) {
3054 // Since FuncClonesToCallMap is empty in this case, no clones have
3055 // been created for this function yet, and no callers should have
3056 // been assigned a function clone for this callee node yet.
3057 assert(llvm::none_of(
3058 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
3059 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
3060 }));
3061 // Initialize with empty call map, assign Clone to original function
3062 // and its callers, and skip to the next clone.
3063 FuncClonesToCallMap[OrigFunc] = {};
3064 AssignCallsiteCloneToFuncClone(
3065 OrigFunc, Call, Clone,
3066 AllocationCallToContextNodeMap.count(Call));
3067 for (auto &CE : Clone->CallerEdges) {
3068 // Ignore any caller that does not have a recorded callsite Call.
3069 if (!CE->Caller->hasCall())
3070 continue;
3071 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
3072 }
3073 continue;
3074 }
3075
3076 // First locate which copy of OrigFunc to clone again. If a caller
3077 // of this callsite clone was already assigned to call a particular
3078 // function clone, we need to redirect all of those callers to the
3079 // new function clone, and update their other callees within this
3080 // function.
3081 FuncInfo PreviousAssignedFuncClone;
3082 auto EI = llvm::find_if(
3083 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
3084 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
3085 });
3086 bool CallerAssignedToCloneOfFunc = false;
3087 if (EI != Clone->CallerEdges.end()) {
3088 const std::shared_ptr<ContextEdge> &Edge = *EI;
3089 PreviousAssignedFuncClone =
3090 CallsiteToCalleeFuncCloneMap[Edge->Caller];
3091 CallerAssignedToCloneOfFunc = true;
3092 }
3093
3094 // Clone function and save it along with the CallInfo map created
3095 // during cloning in the FuncClonesToCallMap.
3096 std::map<CallInfo, CallInfo> NewCallMap;
3097 unsigned CloneNo = FuncClonesToCallMap.size();
3098 assert(CloneNo > 0 && "Clone 0 is the original function, which "
3099 "should already exist in the map");
3100 FuncInfo NewFuncClone = cloneFunctionForCallsite(
3101 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
3102 FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
3103 FunctionClonesAnalysis++;
3104 Changed = true;
3105
3106 // If no caller callsites were already assigned to a clone of this
3107 // function, we can simply assign this clone to the new func clone
3108 // and update all callers to it, then skip to the next clone.
3109 if (!CallerAssignedToCloneOfFunc) {
3110 AssignCallsiteCloneToFuncClone(
3111 NewFuncClone, Call, Clone,
3112 AllocationCallToContextNodeMap.count(Call));
3113 for (auto &CE : Clone->CallerEdges) {
3114 // Ignore any caller that does not have a recorded callsite Call.
3115 if (!CE->Caller->hasCall())
3116 continue;
3117 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
3118 }
3119 continue;
3120 }
3121
3122 // We may need to do additional node cloning in this case.
3123 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
3124 // that were previously assigned to call PreviousAssignedFuncClone,
3125 // to record that they now call NewFuncClone.
3126 for (auto CE : Clone->CallerEdges) {
3127 // Skip any that have been removed on an earlier iteration.
3128 if (!CE)
3129 continue;
3130 // Ignore any caller that does not have a recorded callsite Call.
3131 if (!CE->Caller->hasCall())
3132 continue;
3133
3134 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
3135 // We subsequently fall through to later handling that
3136 // will perform any additional cloning required for
3137 // callers that were calling other function clones.
3138 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
3139 PreviousAssignedFuncClone)
3140 continue;
3141
3142 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
3143
3144 // If we are cloning a function that was already assigned to some
3145 // callers, then essentially we are creating new callsite clones
3146 // of the other callsites in that function that are reached by those
3147 // callers. Clone the other callees of the current callsite's caller
3148 // that were already assigned to PreviousAssignedFuncClone
3149 // accordingly. This is important since we subsequently update the
3150 // calls from the nodes in the graph and their assignments to callee
3151 // functions recorded in CallsiteToCalleeFuncCloneMap.
3152 for (auto CalleeEdge : CE->Caller->CalleeEdges) {
3153 // Skip any that have been removed on an earlier iteration when
3154 // cleaning up newly None type callee edges.
3155 if (!CalleeEdge)
3156 continue;
3157 ContextNode *Callee = CalleeEdge->Callee;
3158 // Skip the current callsite, we are looking for other
3159 // callsites Caller calls, as well as any that does not have a
3160 // recorded callsite Call.
3161 if (Callee == Clone || !Callee->hasCall())
3162 continue;
3163 ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
3164 removeNoneTypeCalleeEdges(NewClone);
3165 // Moving the edge may have resulted in some none type
3166 // callee edges on the original Callee.
3167 removeNoneTypeCalleeEdges(Callee);
3168 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
3169 // If the Callee node was already assigned to call a specific
3170 // function version, make sure its new clone is assigned to call
3171 // that same function clone.
3172 if (CallsiteToCalleeFuncCloneMap.count(Callee))
3173 RecordCalleeFuncOfCallsite(
3174 NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
3175 // Update NewClone with the new Call clone of this callsite's Call
3176 // created for the new function clone created earlier.
3177 // Recall that we have already ensured when building the graph
3178 // that each caller can only call callsites within the same
3179 // function, so we are guaranteed that Callee Call is in the
3180 // current OrigFunc.
3181 // CallMap is set up as indexed by original Call at clone 0.
3182 CallInfo OrigCall(Callee->getOrigNode()->Call);
3183 OrigCall.setCloneNo(0);
3184 std::map<CallInfo, CallInfo> &CallMap =
3185 FuncClonesToCallMap[NewFuncClone];
3186 assert(CallMap.count(OrigCall));
3187 CallInfo NewCall(CallMap[OrigCall]);
3188 assert(NewCall);
3189 NewClone->setCall(NewCall);
3190 }
3191 }
3192 // Fall through to handling below to perform the recording of the
3193 // function for this callsite clone. This enables handling of cases
3194 // where the callers were assigned to different clones of a function.
3195 }
3196
3197 // See if we can use existing function clone. Walk through
3198 // all caller edges to see if any have already been assigned to
3199 // a clone of this callsite's function. If we can use it, do so. If not,
3200 // because that function clone is already assigned to a different clone
3201 // of this callsite, then we need to clone again.
3202 // Basically, this checking is needed to handle the case where different
3203 // caller functions/callsites may need versions of this function
3204 // containing different mixes of callsite clones across the different
3205 // callsites within the function. If that happens, we need to create
3206 // additional function clones to handle the various combinations.
3207 //
3208 // Keep track of any new clones of this callsite created by the
3209 // following loop, as well as any existing clone that we decided to
3210 // assign this clone to.
3211 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
3212 FuncInfo FuncCloneAssignedToCurCallsiteClone;
3213 // We need to be able to remove Edge from CallerEdges, so need to adjust
3214 // iterator in the loop.
3215 for (auto EI = Clone->CallerEdges.begin();
3216 EI != Clone->CallerEdges.end();) {
3217 auto Edge = *EI;
3218 // Ignore any caller that does not have a recorded callsite Call.
3219 if (!Edge->Caller->hasCall()) {
3220 EI++;
3221 continue;
3222 }
3223 // If this caller already assigned to call a version of OrigFunc, need
3224 // to ensure we can assign this callsite clone to that function clone.
3225 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
3226 FuncInfo FuncCloneCalledByCaller =
3227 CallsiteToCalleeFuncCloneMap[Edge->Caller];
3228 // First we need to confirm that this function clone is available
3229 // for use by this callsite node clone.
3230 //
3231 // While FuncCloneToCurNodeCloneMap is built only for this Node and
3232 // its callsite clones, one of those callsite clones X could have
3233 // been assigned to the same function clone called by Edge's caller
3234 // - if Edge's caller calls another callsite within Node's original
3235 // function, and that callsite has another caller reaching clone X.
3236 // We need to clone Node again in this case.
3237 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
3238 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
3239 Clone) ||
3240 // Detect when we have multiple callers of this callsite that
3241 // have already been assigned to specific, and different, clones
3242 // of OrigFunc (due to other unrelated callsites in Func they
3243 // reach via call contexts). Is this Clone of callsite Node
3244 // assigned to a different clone of OrigFunc? If so, clone Node
3245 // again.
3246 (FuncCloneAssignedToCurCallsiteClone &&
3247 FuncCloneAssignedToCurCallsiteClone !=
3248 FuncCloneCalledByCaller)) {
3249 // We need to use a different newly created callsite clone, in
3250 // order to assign it to another new function clone on a
3251 // subsequent iteration over the Clones array (adjusted below).
3252 // Note we specifically do not reset the
3253 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
3254 // when this new clone is processed later we know which version of
3255 // the function to copy (so that other callsite clones we have
3256 // assigned to that function clone are properly cloned over). See
3257 // comments in the function cloning handling earlier.
3258
3259 // Check if we already have cloned this callsite again while
3260 // walking through caller edges, for a caller calling the same
3261 // function clone. If so, we can move this edge to that new clone
3262 // rather than creating yet another new clone.
3263 if (FuncCloneToNewCallsiteCloneMap.count(
3264 FuncCloneCalledByCaller)) {
3265 ContextNode *NewClone =
3266 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
3267 moveEdgeToExistingCalleeClone(Edge, NewClone, &EI);
3268 // Cleanup any none type edges cloned over.
3269 removeNoneTypeCalleeEdges(NewClone);
3270 } else {
3271 // Create a new callsite clone.
3272 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI);
3273 removeNoneTypeCalleeEdges(NewClone);
3274 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
3275 NewClone;
3276 // Add to list of clones and process later.
3277 ClonesWorklist.push_back(NewClone);
3278 assert(EI == Clone->CallerEdges.end() ||
3279 Clone->AllocTypes != (uint8_t)AllocationType::None);
3280 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
3281 }
3282 // Moving the caller edge may have resulted in some none type
3283 // callee edges.
3284 removeNoneTypeCalleeEdges(Clone);
3285 // We will handle the newly created callsite clone in a subsequent
3286 // iteration over this Node's Clones. Continue here since we
3287 // already adjusted iterator EI while moving the edge.
3288 continue;
3289 }
3290
3291 // Otherwise, we can use the function clone already assigned to this
3292 // caller.
3293 if (!FuncCloneAssignedToCurCallsiteClone) {
3294 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
3295 // Assign Clone to FuncCloneCalledByCaller
3296 AssignCallsiteCloneToFuncClone(
3297 FuncCloneCalledByCaller, Call, Clone,
3298 AllocationCallToContextNodeMap.count(Call));
3299 } else
3300 // Don't need to do anything - callsite is already calling this
3301 // function clone.
3302 assert(FuncCloneAssignedToCurCallsiteClone ==
3303 FuncCloneCalledByCaller);
3304
3305 } else {
3306 // We have not already assigned this caller to a version of
3307 // OrigFunc. Do the assignment now.
3308
3309 // First check if we have already assigned this callsite clone to a
3310 // clone of OrigFunc for another caller during this iteration over
3311 // its caller edges.
3312 if (!FuncCloneAssignedToCurCallsiteClone) {
3313 // Find first function in FuncClonesToCallMap without an assigned
3314 // clone of this callsite Node. We should always have one
3315 // available at this point due to the earlier cloning when the
3316 // FuncClonesToCallMap size was smaller than the clone number.
3317 for (auto &CF : FuncClonesToCallMap) {
3318 if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
3319 FuncCloneAssignedToCurCallsiteClone = CF.first;
3320 break;
3321 }
3322 }
3323 assert(FuncCloneAssignedToCurCallsiteClone);
3324 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
3325 AssignCallsiteCloneToFuncClone(
3326 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
3327 AllocationCallToContextNodeMap.count(Call));
3328 } else
3329 assert(FuncCloneToCurNodeCloneMap
3330 [FuncCloneAssignedToCurCallsiteClone] == Clone);
3331 // Update callers to record function version called.
3332 RecordCalleeFuncOfCallsite(Edge->Caller,
3333 FuncCloneAssignedToCurCallsiteClone);
3334 }
3335
3336 EI++;
3337 }
3338 }
3339 if (VerifyCCG) {
3340 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
3341 for (const auto &PE : Node->CalleeEdges)
3342 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
3343 for (const auto &CE : Node->CallerEdges)
3344 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
3345 for (auto *Clone : Node->Clones) {
3346 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
3347 for (const auto &PE : Clone->CalleeEdges)
3348 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
3349 for (const auto &CE : Clone->CallerEdges)
3350 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
3351 }
3352 }
3353 }
3354 }
3355
3356 auto UpdateCalls = [&](ContextNode *Node,
3357 DenseSet<const ContextNode *> &Visited,
3358 auto &&UpdateCalls) {
3359 auto Inserted = Visited.insert(Node);
3360 if (!Inserted.second)
3361 return;
3362
3363 for (auto *Clone : Node->Clones)
3364 UpdateCalls(Clone, Visited, UpdateCalls);
3365
3366 for (auto &Edge : Node->CallerEdges)
3367 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
3368
3369 // Skip if either no call to update, or if we ended up with no context ids
3370 // (we moved all edges onto other clones).
3371 if (!Node->hasCall() || Node->emptyContextIds())
3372 return;
3373
3374 if (Node->IsAllocation) {
3375 updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
3376 return;
3377 }
3378
3379 if (!CallsiteToCalleeFuncCloneMap.count(Node))
3380 return;
3381
3382 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
3383 updateCall(Node->Call, CalleeFunc);
3384 };
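 // Passing UpdateCalls to itself as the last argument lets the generic lambda
 // recurse (a lambda cannot otherwise refer to itself by name), and Visited
 // ensures each node is processed only once even when it is reachable from
 // multiple allocation nodes.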
3385
3386 // Performs DFS traversal starting from allocation nodes to update calls to
3387 // reflect cloning decisions recorded earlier. For regular LTO this will
3388 // update the actual calls in the IR to call the appropriate function clone
3389 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
3390 // are recorded in the summary entries.
3391 DenseSet<const ContextNode *> Visited;
3392 for (auto &Entry : AllocationCallToContextNodeMap)
3393 UpdateCalls(Entry.second, Visited, UpdateCalls);
3394
3395 return Changed;
3396}
3397
3398static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
3399 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
3400 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
3401 &FuncToAliasMap) {
3402 // The first "clone" is the original copy, we should only call this if we
3403 // needed to create new clones.
3404 assert(NumClones > 1);
3405 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
3406 VMaps.reserve(NumClones - 1);
3407 FunctionsClonedThinBackend++;
3408 for (unsigned I = 1; I < NumClones; I++) {
3409 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
3410 auto *NewF = CloneFunction(&F, *VMaps.back());
3411 FunctionClonesThinBackend++;
3412 // Strip memprof and callsite metadata from clone as they are no longer
3413 // needed.
3414 for (auto &BB : *NewF) {
3415 for (auto &Inst : BB) {
3416 Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
3417 Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
3418 }
3419 }
3420 std::string Name = getMemProfFuncName(F.getName(), I);
3421 auto *PrevF = M.getFunction(Name);
3422 if (PrevF) {
3423 // We might have created this when adjusting callsite in another
3424 // function. It should be a declaration.
3425 assert(PrevF->isDeclaration());
3426 NewF->takeName(PrevF);
3427 PrevF->replaceAllUsesWith(NewF);
3428 PrevF->eraseFromParent();
3429 } else
3430 NewF->setName(Name);
3431 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
3432 << "created clone " << ore::NV("NewFunction", NewF));
3433
3434 // Now handle aliases to this function, and clone those as well.
3435 if (!FuncToAliasMap.count(&F))
3436 continue;
3437 for (auto *A : FuncToAliasMap[&F]) {
3438 std::string Name = getMemProfFuncName(A->getName(), I);
3439 auto *PrevA = M.getNamedAlias(Name);
3440 auto *NewA = GlobalAlias::create(A->getValueType(),
3441 A->getType()->getPointerAddressSpace(),
3442 A->getLinkage(), Name, NewF);
3443 NewA->copyAttributesFrom(A);
3444 if (PrevA) {
3445 // We might have created this when adjusting callsite in another
3446 // function. It should be a declaration.
3447 assert(PrevA->isDeclaration());
3448 NewA->takeName(PrevA);
3449 PrevA->replaceAllUsesWith(NewA);
3450 PrevA->eraseFromParent();
3451 }
3452 }
3453 }
3454 return VMaps;
3455}
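// The returned VMaps hold one value map per additional clone (clone I has
// VMaps[I-1]); callers below use them to locate the cloned call instruction
// corresponding to a call in the original function.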
3456
3457// Locate the summary for F. This is complicated by the fact that it might
3458// have been internalized or promoted.
3459static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
3460 const ModuleSummaryIndex *ImportSummary) {
3461 // FIXME: Ideally we would retain the original GUID in some fashion on the
3462 // function (e.g. as metadata), but for now do our best to locate the
3463 // summary without that information.
3464 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
3465 if (!TheFnVI)
3466 // See if theFn was internalized, by checking index directly with
3467 // original name (this avoids the name adjustment done by getGUID() for
3468 // internal symbols).
3469 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName()));
3470 if (TheFnVI)
3471 return TheFnVI;
3472 // Now query with the original name before any promotion was performed.
3473 StringRef OrigName =
3474 ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
3475 std::string OrigId = GlobalValue::getGlobalIdentifier(
3476 OrigName, GlobalValue::InternalLinkage, M.getSourceFileName());
3477 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
3478 if (TheFnVI)
3479 return TheFnVI;
3480 // Could be a promoted local imported from another module. We need to pass
3481 // down more info here to find the original module id. For now, try with
3482 // the OrigName which might have been stored in the OidGuidMap in the
3483 // index. This would not work if there were same-named locals in multiple
3484 // modules, however.
3485 auto OrigGUID =
3486 ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName));
3487 if (OrigGUID)
3488 TheFnVI = ImportSummary->getValueInfo(OrigGUID);
3489 return TheFnVI;
3490}
3491
3492bool MemProfContextDisambiguation::applyImport(Module &M) {
3493 assert(ImportSummary);
3494 bool Changed = false;
3495
3496 auto IsMemProfClone = [](const Function &F) {
3497 return F.getName().contains(MemProfCloneSuffix);
3498 };
3499
3500 // We also need to clone any aliases that reference cloned functions, because
3501 // the modified callsites may invoke via the alias. Keep track of the aliases
3502 // for each function.
3503 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
3504 FuncToAliasMap;
3505 for (auto &A : M.aliases()) {
3506 auto *Aliasee = A.getAliaseeObject();
3507 if (auto *F = dyn_cast<Function>(Aliasee))
3508 FuncToAliasMap[F].insert(&A);
3509 }
3510
3511 for (auto &F : M) {
3512 if (F.isDeclaration() || IsMemProfClone(F))
3513 continue;
3514
3515 OptimizationRemarkEmitter ORE(&F);
3516
3517 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
3518 bool ClonesCreated = false;
3519 unsigned NumClonesCreated = 0;
3520 auto CloneFuncIfNeeded = [&](unsigned NumClones) {
3521 // We should at least have version 0 which is the original copy.
3522 assert(NumClones > 0);
3523 // If only one copy needed use original.
3524 if (NumClones == 1)
3525 return;
3526 // If we already performed cloning of this function, confirm that the
3527 // requested number of clones matches (the thin link should ensure the
3528 // number of clones for each constituent callsite is consistent within
3529 // each function), before returning.
3530 if (ClonesCreated) {
3531 assert(NumClonesCreated == NumClones);
3532 return;
3533 }
3534 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
3535 // The first "clone" is the original copy, which doesn't have a VMap.
3536 assert(VMaps.size() == NumClones - 1);
3537 Changed = true;
3538 ClonesCreated = true;
3539 NumClonesCreated = NumClones;
3540 };
3541
3542 auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
3543 Function *CalledFunction) {
3544 // Perform cloning if not yet done.
3545 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
3546
3547 // Should have skipped indirect calls via mayHaveMemprofSummary.
3548 assert(CalledFunction);
3549 assert(!IsMemProfClone(*CalledFunction));
3550
3551 // Update the calls per the summary info.
3552 // Save orig name since it gets updated in the first iteration
3553 // below.
3554 auto CalleeOrigName = CalledFunction->getName();
3555 for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
3556 // Do nothing if this version calls the original version of its
3557 // callee.
3558 if (!StackNode.Clones[J])
3559 continue;
3560 auto NewF = M.getOrInsertFunction(
3561 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
3562 CalledFunction->getFunctionType());
3563 CallBase *CBClone;
3564 // Copy 0 is the original function.
3565 if (!J)
3566 CBClone = CB;
3567 else
3568 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
3569 CBClone->setCalledFunction(NewF);
3570 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
3571 << ore::NV("Call", CBClone) << " in clone "
3572 << ore::NV("Caller", CBClone->getFunction())
3573 << " assigned to call function clone "
3574 << ore::NV("Callee", NewF.getCallee()));
3575 }
3576 };
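 // CloneCallsite is used both for callsites matched via callsite metadata and
 // for tail calls matched through the synthesized callsite records handled
 // further below.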
3577
3578 // Locate the summary for F.
3579 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
3580 // If not found, this could be an imported local (see comment in
3581 // findValueInfoForFunc). Skip for now as it will be cloned in its original
3582 // module (where it would have been promoted to global scope so should
3583 // satisfy any reference in this module).
3584 if (!TheFnVI)
3585 continue;
3586
3587 auto *GVSummary =
3588 ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
3589 if (!GVSummary) {
3590 // Must have been imported, use the summary which matches the definition
3591 // (might be multiple if this was a linkonce_odr).
3592 auto SrcModuleMD = F.getMetadata("thinlto_src_module");
3593 assert(SrcModuleMD &&
3594 "enable-import-metadata is needed to emit thinlto_src_module");
3595 StringRef SrcModule =
3596 dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
3597 for (auto &GVS : TheFnVI.getSummaryList()) {
3598 if (GVS->modulePath() == SrcModule) {
3599 GVSummary = GVS.get();
3600 break;
3601 }
3602 }
3603 assert(GVSummary && GVSummary->modulePath() == SrcModule);
3604 }
3605
3606 // If this was an imported alias skip it as we won't have the function
3607 // summary, and it should be cloned in the original module.
3608 if (isa<AliasSummary>(GVSummary))
3609 continue;
3610
3611 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
3612
3613 if (FS->allocs().empty() && FS->callsites().empty())
3614 continue;
3615
3616 auto SI = FS->callsites().begin();
3617 auto AI = FS->allocs().begin();
3618
3619 // To handle callsite infos synthesized for tail calls which have missing
3620 // frames in the profiled context, map callee VI to the synthesized callsite
3621 // info.
3622 DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
3623 // Iterate the callsites for this function in reverse, since we place all
3624 // those synthesized for tail calls at the end.
3625 for (auto CallsiteIt = FS->callsites().rbegin();
3626 CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
3627 auto &Callsite = *CallsiteIt;
3628 // Stop as soon as we see a non-synthesized callsite info (see comment
3629 // above loop). All the entries added for discovered tail calls have empty
3630 // stack ids.
3631 if (!Callsite.StackIdIndices.empty())
3632 break;
3633 MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
3634 }
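 // The remaining (non-synthesized) callsite records are consumed in order via
 // SI as the instructions are walked below; the synthesized tail-call entries
 // are instead looked up by callee ValueInfo when a tail call is encountered.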
3635
3636 // Assume for now that the instructions are in the exact same order
3637 // as when the summary was created, but confirm this is correct by
3638 // matching the stack ids.
3639 for (auto &BB : F) {
3640 for (auto &I : BB) {
3641 auto *CB = dyn_cast<CallBase>(&I);
3642 // Same handling as when creating module summary.
3643 if (!mayHaveMemprofSummary(CB))
3644 continue;
3645
3646 auto *CalledValue = CB->getCalledOperand();
3647 auto *CalledFunction = CB->getCalledFunction();
3648 if (CalledValue && !CalledFunction) {
3649 CalledValue = CalledValue->stripPointerCasts();
3650 // Stripping pointer casts can reveal a called function.
3651 CalledFunction = dyn_cast<Function>(CalledValue);
3652 }
3653 // Check if this is an alias to a function. If so, get the
3654 // called aliasee for the checks below.
3655 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
3656 assert(!CalledFunction &&
3657 "Expected null called function in callsite for alias");
3658 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
3659 }
3660
3661 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
3662 I.getMetadata(LLVMContext::MD_callsite));
3663 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
3664
3665 // Include allocs that were already assigned a memprof function
3666 // attribute in the statistics.
3667 if (CB->getAttributes().hasFnAttr("memprof")) {
3668 assert(!MemProfMD);
3669 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
3670 ? AllocTypeColdThinBackend++
3671 : AllocTypeNotColdThinBackend++;
3672 OrigAllocsThinBackend++;
3673 AllocVersionsThinBackend++;
3674 if (!MaxAllocVersionsThinBackend)
3675 MaxAllocVersionsThinBackend = 1;
3676 // Remove any remaining callsite metadata and we can skip the rest of
3677 // the handling for this instruction, since no cloning needed.
3678 I.setMetadata(LLVMContext::MD_callsite, nullptr);
3679 continue;
3680 }
3681
3682 if (MemProfMD) {
3683 // Consult the next alloc node.
3684 assert(AI != FS->allocs().end());
3685 auto &AllocNode = *(AI++);
3686
3687 // Sanity check that the MIB stack ids match between the summary and
3688 // instruction metadata.
3689 auto MIBIter = AllocNode.MIBs.begin();
3690 for (auto &MDOp : MemProfMD->operands()) {
3691 assert(MIBIter != AllocNode.MIBs.end());
3692 LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter =
3693 MIBIter->StackIdIndices.begin();
3694 auto *MIBMD = cast<const MDNode>(MDOp);
3695 MDNode *StackMDNode = getMIBStackNode(MIBMD);
3696 assert(StackMDNode);
3697 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
3698 auto ContextIterBegin =
3699 StackContext.beginAfterSharedPrefix(CallsiteContext);
3700 // Skip the checking on the first iteration.
3701 uint64_t LastStackContextId =
3702 (ContextIterBegin != StackContext.end() &&
3703 *ContextIterBegin == 0)
3704 ? 1
3705 : 0;
3706 for (auto ContextIter = ContextIterBegin;
3707 ContextIter != StackContext.end(); ++ContextIter) {
3708 // If this is a direct recursion, simply skip the duplicate
3709 // entries, to be consistent with how the summary ids were
3710 // generated during ModuleSummaryAnalysis.
3711 if (LastStackContextId == *ContextIter)
3712 continue;
3713 LastStackContextId = *ContextIter;
3714 assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
3715 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
3716 *ContextIter);
3717 StackIdIndexIter++;
3718 }
3719 MIBIter++;
3720 }
3721
3722 // Perform cloning if not yet done.
3723 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());
3724
3725 OrigAllocsThinBackend++;
3726 AllocVersionsThinBackend += AllocNode.Versions.size();
3727 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
3728 MaxAllocVersionsThinBackend = AllocNode.Versions.size();
3729
3730 // If there is only one version that means we didn't end up
3731 // considering this function for cloning, and in that case the alloc
3732 // will still be none type or should have gotten the default NotCold.
3733 // Skip that after calling clone helper since that does some sanity
3734 // checks that confirm we haven't decided yet that we need cloning.
3735 if (AllocNode.Versions.size() == 1) {
3736 assert((AllocationType)AllocNode.Versions[0] ==
3737 AllocationType::NotCold ||
3738 (AllocationType)AllocNode.Versions[0] ==
3739 AllocationType::None);
3740 UnclonableAllocsThinBackend++;
3741 continue;
3742 }
3743
3744 // All versions should have a singular allocation type.
3745 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
3746 return Type == ((uint8_t)AllocationType::NotCold |
3747 (uint8_t)AllocationType::Cold);
3748 }));
3749
3750 // Update the allocation types per the summary info.
3751 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
3752 // Ignore any that didn't get an assigned allocation type.
3753 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
3754 continue;
3755 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
3756 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
3757 : AllocTypeNotColdThinBackend++;
3758 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
3759 auto A = llvm::Attribute::get(F.getContext(), "memprof",
3760 AllocTypeString);
3761 CallBase *CBClone;
3762 // Copy 0 is the original function.
3763 if (!J)
3764 CBClone = CB;
3765 else
3766 // Since VMaps are only created for new clones, we index with
3767 // clone J-1 (J==0 is the original clone and does not have a VMaps
3768 // entry).
3769 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
3770 CBClone->addFnAttr(A);
3771 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
3772 << ore::NV("AllocationCall", CBClone) << " in clone "
3773 << ore::NV("Caller", CBClone->getFunction())
3774 << " marked with memprof allocation attribute "
3775 << ore::NV("Attribute", AllocTypeString));
3776 }
3777 } else if (!CallsiteContext.empty()) {
3778 // Consult the next callsite node.
3779 assert(SI != FS->callsites().end());
3780 auto &StackNode = *(SI++);
3781
3782#ifndef NDEBUG
3783 // Sanity check that the stack ids match between the summary and
3784 // instruction metadata.
3785 auto StackIdIndexIter = StackNode.StackIdIndices.begin();
3786 for (auto StackId : CallsiteContext) {
3787 assert(StackIdIndexIter != StackNode.StackIdIndices.end());
3788 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
3789 StackId);
3790 StackIdIndexIter++;
3791 }
3792#endif
3793
3794 CloneCallsite(StackNode, CB, CalledFunction);
3795 } else if (CB->isTailCall()) {
3796 // Locate the synthesized callsite info for the callee VI, if any was
3797 // created, and use that for cloning.
3798 ValueInfo CalleeVI =
3799 findValueInfoForFunc(*CalledFunction, M, ImportSummary);
3800 if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
3801 auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
3802 assert(Callsite != MapTailCallCalleeVIToCallsite.end());
3803 CloneCallsite(Callsite->second, CB, CalledFunction);
3804 }
3805 }
3806 // Memprof and callsite metadata on memory allocations no longer needed.
3807 I.setMetadata(LLVMContext::MD_memprof, nullptr);
3808 I.setMetadata(LLVMContext::MD_callsite, nullptr);
3809 }
3810 }
3811 }
3812
3813 return Changed;
3814}
3815
3816template <typename DerivedCCG, typename FuncTy, typename CallTy>
3817bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
3818 if (DumpCCG) {
3819 dbgs() << "CCG before cloning:\n";
3820 dbgs() << *this;
3821 }
3822 if (ExportToDot)
3823 exportToDot("postbuild");
3824
3825 if (VerifyCCG) {
3826 check();
3827 }
3828
3829 identifyClones();
3830
3831 if (VerifyCCG) {
3832 check();
3833 }
3834
3835 if (DumpCCG) {
3836 dbgs() << "CCG after cloning:\n";
3837 dbgs() << *this;
3838 }
3839 if (ExportToDot)
3840 exportToDot("cloned");
3841
3842 bool Changed = assignFunctions();
3843
3844 if (DumpCCG) {
3845 dbgs() << "CCG after assigning function clones:\n";
3846 dbgs() << *this;
3847 }
3848 if (ExportToDot)
3849 exportToDot("clonefuncassign");
3850
3851 if (MemProfReportHintedSizes)
3852 printTotalSizes(errs());
3853
3854 return Changed;
3855}
3856
3857bool MemProfContextDisambiguation::processModule(
3858 Module &M,
3859 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
3860
3861 // If we have an import summary, then the cloning decisions were made during
3862 // the thin link on the index. Apply them and return.
3863 if (ImportSummary)
3864 return applyImport(M);
3865
3866 // TODO: If/when other types of memprof cloning are enabled beyond just for
3867 // hot and cold, we will need to change this to individually control the
3868 // AllocationType passed to addStackNodesForMIB during CCG construction.
3869 // Note that we specifically check this after applying imports above, so that
3870 // the option isn't needed to be passed to distributed ThinLTO backend
3871 // clang processes, which won't necessarily have visibility into the linker
3872 // dependences. Instead the information is communicated from the LTO link to
3873 // the backends via the combined summary index.
3874 if (!SupportsHotColdNew)
3875 return false;
3876
3877 ModuleCallsiteContextGraph CCG(M, OREGetter);
3878 return CCG.process();
3879}
3880
3881MemProfContextDisambiguation::MemProfContextDisambiguation(
3882 const ModuleSummaryIndex *Summary)
3883 : ImportSummary(Summary) {
3884 if (ImportSummary) {
3885 // The MemProfImportSummary should only be used for testing ThinLTO
3886 // distributed backend handling via opt, in which case we don't have a
3887 // summary from the pass pipeline.
3888 assert(MemProfImportSummary.empty());
3889 return;
3890 }
3891 if (MemProfImportSummary.empty())
3892 return;
3893
3894 auto ReadSummaryFile =
3895 errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
3896 if (!ReadSummaryFile) {
3897 logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
3898 "Error loading file '" + MemProfImportSummary +
3899 "': ");
3900 return;
3901 }
3902 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
3903 if (!ImportSummaryForTestingOrErr) {
3904 logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
3905 "Error parsing file '" + MemProfImportSummary +
3906 "': ");
3907 return;
3908 }
3909 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
3910 ImportSummary = ImportSummaryForTesting.get();
3911}
3912
3913PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
3914 ModuleAnalysisManager &AM) {
3915 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
3916 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
3917 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
3918 };
3919 if (!processModule(M, OREGetter))
3920 return PreservedAnalyses::all();
3921 return PreservedAnalyses::none();
3922}
3923
3924void MemProfContextDisambiguation::run(
3925 ModuleSummaryIndex &Index,
3926 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
3927 isPrevailing) {
3928 // TODO: If/when other types of memprof cloning are enabled beyond just for
3929 // hot and cold, we will need to change this to individually control the
3930 // AllocationType passed to addStackNodesForMIB during CCG construction.
3931 // The index was set from the option, so these should be in sync.
3932 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
3933 if (!SupportsHotColdNew)
3934 return;
3935
3936 IndexCallsiteContextGraph CCG(Index, isPrevailing);
3937 CCG.process();
3938}
aarch64 promote const
static msgpack::DocNode getNode(msgpack::DocNode DN, msgpack::Type Type, MCValue Val)
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
#define LLVM_ATTRIBUTE_UNUSED
Definition: Compiler.h:203
This file contains the declarations for the subclasses of Constant, which represent the different fla...
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
#define check(cond)
#define DEBUG_TYPE
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
This file implements a map that provides insertion order iteration.
static SmallVector< std::unique_ptr< ValueToValueMapTy >, 4 > createFunctionClones(Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE, std::map< const Function *, SmallPtrSet< const GlobalAlias *, 1 > > &FuncToAliasMap)
static cl::opt< unsigned > TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5), cl::Hidden, cl::desc("Max depth to recursively search for missing " "frames through tail calls."))
static cl::opt< bool > ExportToDot("memprof-export-to-dot", cl::init(false), cl::Hidden, cl::desc("Export graph to dot files."))
static void checkEdge(const std::shared_ptr< ContextEdge< DerivedCCG, FuncTy, CallTy > > &Edge)
bool checkColdOrNotCold(uint8_t AllocType)
static ValueInfo findValueInfoForFunc(const Function &F, const Module &M, const ModuleSummaryIndex *ImportSummary)
static std::string getAllocTypeString(uint8_t AllocTypes)
cl::opt< bool > MemProfReportHintedSizes
static cl::opt< bool > VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden, cl::desc("Perform verification checks on CallingContextGraph."))
static void checkNode(const ContextNode< DerivedCCG, FuncTy, CallTy > *Node, bool CheckEdges=true)
static std::string getMemProfFuncName(Twine Base, unsigned CloneNo)
static cl::opt< std::string > MemProfImportSummary("memprof-import-summary", cl::desc("Import summary to use for testing the ThinLTO backend via opt"), cl::Hidden)
#define DEBUG_TYPE
static const std::string MemProfCloneSuffix
static cl::opt< std::string > DotFilePathPrefix("memprof-dot-file-path-prefix", cl::init(""), cl::Hidden, cl::value_desc("filename"), cl::desc("Specify the path prefix of the MemProf dot files."))
static cl::opt< bool > VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden, cl::desc("Perform frequent verification checks on nodes."))
static cl::opt< bool > DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden, cl::desc("Dump CallingContextGraph to stdout after each stage."))
AllocType
cl::opt< bool > MemProfReportHintedSizes("memprof-report-hinted-sizes", cl::init(false), cl::Hidden, cl::desc("Report total allocation sizes of hinted allocations"))
This is the interface to build a ModuleSummaryIndex for a module.
ModuleSummaryIndex.h This file contains the declarations the classes that hold the module index and s...
Module.h This file contains the declarations for the Module class.
#define P(N)
Module * Mod
FunctionAnalysisManager FAM
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
This file defines generic set operations that may be used on set's of different types,...
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
void print(OutputBuffer &OB) const
DomTreeNode::const_iterator end() const
Definition: LoopUnroll.cpp:251
Alias summary information.
ValueInfo getAliaseeVI() const
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:405
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:94
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1236
void addFnAttr(Attribute::AttrKind Kind)
Adds the attribute to the function.
Definition: InstrTypes.h:1574
void setCalledFunction(Function *Fn)
Sets the function called, including updating the function type.
Definition: InstrTypes.h:1504
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
unsigned size() const
Definition: DenseMap.h:99
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
void reserve(size_type NumEntries)
Grow the densemap so that it can contain at least NumEntries items before resizing again.
Definition: DenseMap.h:103
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
Function summary information to aid decisions and implementation of importing.
static GlobalAlias * create(Type *Ty, unsigned AddressSpace, LinkageTypes Linkage, const Twine &Name, Constant *Aliasee, Module *Parent)
If a parent module is specified, the alias is automatically inserted into the end of the specified mo...
Definition: Globals.cpp:544
Function and variable summary information to aid decisions and implementation of importing.
static bool isLocalLinkage(LinkageTypes Linkage)
Definition: GlobalValue.h:409
GUID getGUID() const
Return a 64-bit global unique ID constructed from global value name (i.e.
Definition: GlobalValue.h:595
std::string getGlobalIdentifier() const
Return the modified name for this global value suitable to be used as the key for a global lookup (e....
Definition: Globals.cpp:178
@ InternalLinkage
Rename collisions when linking (static functions).
Definition: GlobalValue.h:59
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:563
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:70
Metadata node.
Definition: Metadata.h:1067
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
size_type count(const KeyT &Key) const
Definition: MapVector.h:165
std::pair< KeyT, ValueT > & back()
Definition: MapVector.h:85
MemProfContextDisambiguation(const ModuleSummaryIndex *Summary=nullptr)
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
static ErrorOr< std::unique_ptr< MemoryBuffer > > getFile(const Twine &Filename, bool IsText=false, bool RequiresNullTerminator=true, bool IsVolatile=false, std::optional< Align > Alignment=std::nullopt)
Open the specified file as a MemoryBuffer, returning a new MemoryBuffer if successful,...
Class to hold module path string table and global value map, and encapsulate methods for operating on...
static StringRef getOriginalNameBeforePromote(StringRef Name)
Helper to obtain the unpromoted name for a global value (or the original name if not promoted).
ValueInfo getValueInfo(const GlobalValueSummaryMapTy::value_type &R) const
Return a ValueInfo for the index value_type (convenient when iterating index).
uint64_t getStackIdAtIndex(unsigned Index) const
GlobalValueSummary * findSummaryInModule(ValueInfo VI, StringRef ModuleId) const
Find the summary for ValueInfo VI in module ModuleId, or nullptr if not found.
GlobalValue::GUID getGUIDFromOriginalID(GlobalValue::GUID OriginalID) const
Return the GUID for OriginalId in the OidGuidMap.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition: PointerUnion.h:118
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition: Analysis.h:114
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:479
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
size_t size() const
Definition: SmallVector.h:91
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void reserve(size_type N)
Definition: SmallVector.h:676
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
LLVM Value Representation.
Definition: Value.h:74
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
void reserve(size_t Size)
Grow the DenseSet so that it can contain at least NumEntries items before resizing again.
Definition: DenseSet.h:90
size_type size() const
Definition: DenseSet.h:81
void swap(DenseSetImpl &RHS)
Definition: DenseSet.h:105
bool erase(const ValueT &V)
Definition: DenseSet.h:101
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:97
An efficient, type-erasing, non-owning reference to a callable.
Helper class to iterate through stack ids in both metadata (memprof MIB and callsite) and the corresp...
CallStackIterator end() const
CallStackIterator beginAfterSharedPrefix(CallStack &Other)
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
StringRef AttributeString(unsigned Attribute)
Definition: Dwarf.cpp:72
@ Entry
Definition: COFF.h:811
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ CE
Windows NT (Windows on ARM)
@ FS
Definition: X86.h:210
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
AllocationType getMIBAllocType(const MDNode *MIB)
Returns the allocation type from an MIB metadata node.
uint64_t getMIBTotalSize(const MDNode *MIB)
Returns the total size from an MIB metadata node, or 0 if it was not recorded.
bool hasSingleAllocType(uint8_t AllocTypes)
True if the AllocTypes bitmask contains just a single type.
std::string getAllocTypeAttributeString(AllocationType Type)
Returns the string to use in attributes with the given type.
MDNode * getMIBStackNode(const MDNode *MIB)
Returns the stack node from an MIB metadata node.
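For illustration, a sketch that walks the MIB operands of an allocation's !memprof metadata using the helpers above; inspectMemProf and its MemProfMD parameter are hypothetical:
#include "llvm/Analysis/MemoryProfileInfo.h"
#include "llvm/IR/Metadata.h"
#include <string>
using namespace llvm;
using namespace llvm::memprof;
void inspectMemProf(const MDNode *MemProfMD) {
  for (const MDOperand &Op : MemProfMD->operands()) {
    const MDNode *MIB = cast<MDNode>(Op);
    AllocationType Type = getMIBAllocType(MIB);   // The context's allocation type.
    uint64_t Size = getMIBTotalSize(MIB);         // 0 when sizes were not recorded.
    const MDNode *Stack = getMIBStackNode(MIB);   // The context's stack id node.
    std::string Attr = getAllocTypeAttributeString(Type);
    (void)Size; (void)Stack; (void)Attr;
  }
}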
NodeAddr< FuncNode * > Func
Definition: RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
void logAllUnhandledErrors(Error E, raw_ostream &OS, Twine ErrorBanner={})
Log all errors (if any) in E to OS.
Definition: Error.cpp:65
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>'s and is nicer to use.
Definition: SetOperations.h:58
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void set_subtract(S1Ty &S1, const S2Ty &S2)
set_subtract(A, B) - Compute A := A - B
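A sketch (hypothetical helper, using DenseSet as the graph code does for its context id sets) of the in-place set algebra above:
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SetOperations.h"
#include <cstdint>
void intersectEdgeIds(llvm::DenseSet<uint32_t> &EdgeIds,
                      const llvm::DenseSet<uint32_t> &NodeIds) {
  if (llvm::set_is_subset(EdgeIds, NodeIds))
    return;                               // Nothing to trim.
  llvm::set_intersect(EdgeIds, NodeIds);  // EdgeIds := EdgeIds ^ NodeIds.
  llvm::DenseSet<uint32_t> Remaining = NodeIds;
  llvm::set_subtract(Remaining, EdgeIds); // Remaining := NodeIds - EdgeIds.
}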
raw_ostream & WriteGraph(raw_ostream &O, const GraphType &G, bool ShortNames=false, const Twine &Title="")
Definition: GraphWriter.h:359
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr)
Expected< std::unique_ptr< ModuleSummaryIndex > > getModuleSummaryIndex(MemoryBufferRef Buffer)
Parse the specified bitcode buffer, returning the module summary index.
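For illustration, a hypothetical loader that pairs this with logAllUnhandledErrors (declared above) to report parse failures on stderr:
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/MemoryBufferRef.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>
#include <utility>
std::unique_ptr<llvm::ModuleSummaryIndex> loadIndex(llvm::MemoryBufferRef Buffer) {
  auto IndexOrErr = llvm::getModuleSummaryIndex(Buffer);
  if (!IndexOrErr) {
    // Report the failure instead of asserting or crashing.
    llvm::logAllUnhandledErrors(IndexOrErr.takeError(), llvm::errs(),
                                "Error loading summary index: ");
    return nullptr;
  }
  return std::move(*IndexOrErr);
}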
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
bool set_union(S1Ty &S1, const S2Ty &S2)
set_union(A, B) - Compute A := A u B, return whether A changed.
Definition: SetOperations.h:43
cl::opt< bool > SupportsHotColdNew
Indicate we are linking with an allocator that supports hot/cold operator new interfaces.
S1Ty set_intersection(const S1Ty &S1, const S2Ty &S2)
set_intersection(A, B) - Return A ^ B
Definition: SetOperations.h:83
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
cl::opt< bool > EnableMemProfContextDisambiguation
Enable MemProf context disambiguation for thin link.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the given range Range.
Definition: STLExtras.h:1914
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
Definition: APFixedPoint.h:293
Expected< T > errorOrToExpected(ErrorOr< T > &&EO)
Convert an ErrorOr<T> to an Expected<T>.
Definition: Error.h:1226
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1849
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
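A sketch (hypothetical function) combining the range-based STLExtras wrappers listed above:
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
bool summarize(const llvm::SmallVector<int, 8> &Vals) {
  bool NoNegatives = llvm::none_of(Vals, [](int V) { return V < 0; });
  bool HasSeven = llvm::is_contained(Vals, 7);      // Exact-match membership.
  auto Zeros = llvm::count(Vals, 0);                // Number of zero entries.
  auto FirstOdd = llvm::find_if(Vals, [](int V) { return V % 2 != 0; });
  // drop_begin() views the same container without its first element.
  bool TailSorted = llvm::is_sorted(llvm::drop_begin(Vals));
  return NoNegatives && HasSeven && Zeros == 0 && FirstOdd != Vals.end() &&
         TailSorted;
}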
Function * CloneFunction(Function *F, ValueToValueMapTy &VMap, ClonedCodeInfo *CodeInfo=nullptr)
Return a copy of the specified function and add it to that function's module.
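A minimal sketch of cloning a function under a new name, in the spirit of the cloning this file performs; cloneWithSuffix and the suffix string are hypothetical:
#include "llvm/ADT/Twine.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/Cloning.h"
llvm::Function *cloneWithSuffix(llvm::Function &F, unsigned CloneNo) {
  llvm::ValueToValueMapTy VMap;           // Records old-to-new value mappings.
  llvm::Function *NewF = llvm::CloneFunction(&F, VMap);
  NewF->setName(F.getName() + ".memprof." + llvm::Twine(CloneNo));
  return NewF;
}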
bool mayHaveMemprofSummary(const CallBase *CB)
Returns true if the instruction could have memprof metadata, used to ensure consistency between summary analysis and the ThinLTO backend processing.
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
#define N
static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter, GraphType)
static const ContextNode< DerivedCCG, FuncTy, CallTy > * GetCallee(const EdgePtrTy &P)
std::shared_ptr< ContextEdge< DerivedCCG, FuncTy, CallTy > > EdgePtrTy
std::unique_ptr< ContextNode< DerivedCCG, FuncTy, CallTy > > NodePtrTy
Summary of memprof metadata on allocations.
SmallVector< uint8_t > Versions
Summary of memprof callsite metadata.
SmallVector< unsigned > Clones
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot' graphs.
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits methods.
An information struct used to provide DenseMap with the various necessary components for a given value type T.
Definition: DenseMapInfo.h:52
Used in the streaming interface as the general argument type.
typename GraphType::UnknownGraphTypeError NodeRef
Definition: GraphTraits.h:80
Struct that holds a reference to a particular GUID in a global value summary.
ArrayRef< std::unique_ptr< GlobalValueSummary > > getSummaryList() const
GlobalValue::GUID getGUID() const