1//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements support for context disambiguation of allocation
10// calls for profile guided heap optimization. Specifically, it uses Memprof
11// profiles which indicate context specific allocation behavior (currently
12// distinguishing cold vs hot memory allocations). Cloning is performed to
13// expose the cold allocation call contexts, and the allocation calls are
14// subsequently annotated with an attribute for later transformation.
15//
16// The transformations can be performed either directly on IR (regular LTO), or
17// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18// Both types of LTO operate on the same base graph representation, which
19// uses CRTP to support either IR or Index formats.
20//
21//===----------------------------------------------------------------------===//
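//
// Illustrative sketch (an addition for exposition, not part of the upstream
// file): the inputs consumed and outputs produced by this pass look roughly
// as follows; all metadata ids and stack ids below are made up.
//
//   %call = call ptr @_Znwm(i64 8), !memprof !0, !callsite !5
//   ...
//   !0 = !{!1, !3}             ; list of MIBs (one per profiled context)
//   !1 = !{!2, !"cold"}        ; a context and its allocation behavior
//   !2 = !{i64 123, i64 456}   ; leaf-first call stack ids for that context
//   !3 = !{!4, !"notcold"}
//   !4 = !{i64 123, i64 789}
//   !5 = !{i64 123}            ; stack id(s) of the allocation call itself
//
// After cloning exposes the cold context, the corresponding (possibly cloned)
// allocation call is annotated with a "memprof" string attribute (e.g. "cold")
// for the later transformation to hot/cold allocation interfaces.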
22
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/ADT/DenseSet.h"
26#include "llvm/ADT/MapVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/Module.h"
40#include "llvm/Pass.h"
44#include "llvm/Support/SHA1.h"
46#include "llvm/Transforms/IPO.h"
50#include <deque>
51#include <sstream>
52#include <unordered_map>
53#include <vector>
54using namespace llvm;
55using namespace llvm::memprof;
56
57#define DEBUG_TYPE "memprof-context-disambiguation"
58
59STATISTIC(FunctionClonesAnalysis,
60 "Number of function clones created during whole program analysis");
61STATISTIC(FunctionClonesThinBackend,
62 "Number of function clones created during ThinLTO backend");
63STATISTIC(FunctionsClonedThinBackend,
64 "Number of functions that had clones created during ThinLTO backend");
65STATISTIC(
66 FunctionCloneDuplicatesThinBackend,
67 "Number of function clone duplicates detected during ThinLTO backend");
68STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
69 "cloned) during whole program analysis");
70STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
71 "during whole program analysis");
72STATISTIC(AllocTypeNotColdThinBackend,
73 "Number of not cold static allocations (possibly cloned) during "
74 "ThinLTO backend");
75STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
76 "(possibly cloned) during ThinLTO backend");
77STATISTIC(OrigAllocsThinBackend,
78 "Number of original (not cloned) allocations with memprof profiles "
79 "during ThinLTO backend");
80STATISTIC(
81 AllocVersionsThinBackend,
82 "Number of allocation versions (including clones) during ThinLTO backend");
83STATISTIC(MaxAllocVersionsThinBackend,
84 "Maximum number of allocation versions created for an original "
85 "allocation during ThinLTO backend");
86STATISTIC(UnclonableAllocsThinBackend,
87 "Number of unclonable ambigous allocations during ThinLTO backend");
88STATISTIC(RemovedEdgesWithMismatchedCallees,
89 "Number of edges removed due to mismatched callees (profiled vs IR)");
90STATISTIC(FoundProfiledCalleeCount,
91 "Number of profiled callees found via tail calls");
92STATISTIC(FoundProfiledCalleeDepth,
93 "Aggregate depth of profiled callees found via tail calls");
94STATISTIC(FoundProfiledCalleeMaxDepth,
95 "Maximum depth of profiled callees found via tail calls");
96STATISTIC(FoundProfiledCalleeNonUniquelyCount,
97 "Number of profiled callees found via multiple tail call chains");
98STATISTIC(DeferredBackedges, "Number of backedges with deferred cloning");
99STATISTIC(NewMergedNodes, "Number of new nodes created during merging");
100STATISTIC(NonNewMergedNodes, "Number of non-new nodes used during merging");
101STATISTIC(MissingAllocForContextId,
102 "Number of missing alloc nodes for context ids");
103STATISTIC(SkippedCallsCloning,
104 "Number of calls skipped during cloning due to unexpected operand");
105STATISTIC(MismatchedCloneAssignments,
106 "Number of callsites assigned to call multiple non-matching clones");
107STATISTIC(TotalMergeInvokes, "Number of merge invocations for nodes");
108STATISTIC(TotalMergeIters, "Number of merge iterations for nodes");
109STATISTIC(MaxMergeIters, "Max merge iterations for nodes");
110STATISTIC(NumImportantContextIds, "Number of important context ids");
111STATISTIC(NumFixupEdgeIdsInserted, "Number of fixup edge ids inserted");
112STATISTIC(NumFixupEdgesAdded, "Number of fixup edges added");
113STATISTIC(NumFixedContexts, "Number of contexts with fixed edges");
114STATISTIC(AliaseesPrevailingInDiffModuleFromAlias,
115 "Number of aliasees prevailing in a different module than its alias");
116
118 "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
119 cl::value_desc("filename"),
120 cl::desc("Specify the path prefix of the MemProf dot files."));
121
122static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
124 cl::desc("Export graph to dot files."));
125
126// TODO: Remove this option once new handling is validated more widely.
128 "memprof-merge-iteration", cl::init(true), cl::Hidden,
129 cl::desc("Iteratively apply merging on a node to catch new callers"));
130
131// How much of the graph to export to dot.
132enum class DotScope {
133 All, // The full CCG graph.
134 Alloc, // Only contexts for the specified allocation.
135 Context, // Only the specified context.
136};
137
139 "memprof-dot-scope", cl::desc("Scope of graph to export to dot"),
142 clEnumValN(DotScope::All, "all", "Export full callsite graph"),
144 "Export only nodes with contexts feeding given "
145 "-memprof-dot-alloc-id"),
146 clEnumValN(DotScope::Context, "context",
147 "Export only nodes with given -memprof-dot-context-id")));
148
149static cl::opt<unsigned>
150 AllocIdForDot("memprof-dot-alloc-id", cl::init(0), cl::Hidden,
151 cl::desc("Id of alloc to export if -memprof-dot-scope=alloc "
152 "or to highlight if -memprof-dot-scope=all"));
153
155 "memprof-dot-context-id", cl::init(0), cl::Hidden,
156 cl::desc("Id of context to export if -memprof-dot-scope=context or to "
157 "highlight otherwise"));
158
159static cl::opt<bool>
160 DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
161 cl::desc("Dump CallingContextGraph to stdout after each stage."));
162
163static cl::opt<bool>
164 VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
165 cl::desc("Perform verification checks on CallingContextGraph."));
166
167static cl::opt<bool>
168 VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
169 cl::desc("Perform frequent verification checks on nodes."));
170
172 "memprof-import-summary",
173 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
174 cl::Hidden);
175
176static cl::opt<unsigned>
177 TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
178 cl::Hidden,
179 cl::desc("Max depth to recursively search for missing "
180 "frames through tail calls."));
181
182// Optionally enable cloning of callsites involved in recursive cycles.
183static cl::opt<bool> AllowRecursiveCallsites(
184 "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden,
185 cl::desc("Allow cloning of callsites involved in recursive cycles"));
186
188 "memprof-clone-recursive-contexts", cl::init(true), cl::Hidden,
189 cl::desc("Allow cloning of contexts through recursive cycles"));
190
191// Generally this is needed for correct assignment of allocation clones to
192// function clones; however, allow it to be disabled for debugging while the
193// functionality is new and being tested more widely.
194static cl::opt<bool>
195 MergeClones("memprof-merge-clones", cl::init(true), cl::Hidden,
196 cl::desc("Merge clones before assigning functions"));
197
198// When disabled, try to detect and prevent cloning of recursive contexts.
199// This is only necessary until we support cloning through recursive cycles.
200// Leave on by default for now, as disabling requires a little bit of compile
201// time overhead and doesn't affect correctness; it will just inflate the cold
202// hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled.
203static cl::opt<bool> AllowRecursiveContexts(
204 "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden,
205 cl::desc("Allow cloning of contexts having recursive cycles"));
206
207// Set the minimum absolute count threshold for allowing inlining of indirect
208// calls promoted during cloning.
210 "memprof-icp-noinline-threshold", cl::init(2), cl::Hidden,
211 cl::desc("Minimum absolute count for promoted target to be inlinable"));
212
213namespace llvm {
215 "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
216 cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
217
218// Indicate we are linking with an allocator that supports hot/cold operator
219// new interfaces.
221 "supports-hot-cold-new", cl::init(false), cl::Hidden,
222 cl::desc("Linking with hot/cold operator new interfaces"));
223
225 "memprof-require-definition-for-promotion", cl::init(false), cl::Hidden,
226 cl::desc(
227 "Require target function definition when promoting indirect calls"));
228
231
233 "memprof-top-n-important", cl::init(10), cl::Hidden,
234 cl::desc("Number of largest cold contexts to consider important"));
235
237 "memprof-fixup-important", cl::init(true), cl::Hidden,
238 cl::desc("Enables edge fixup for important contexts"));
239
241
242} // namespace llvm
243
244namespace {
245
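// Illustrative sketch of the CRTP dispatch used by the graph below (the names
// GraphBase/ModuleGraph here are hypothetical, for exposition only): the base
// class implements the graph algorithms once and forwards format-specific
// queries to the derived class via static_cast, so the same code serves both
// the IR (regular LTO) and summary index (ThinLTO) representations without
// virtual dispatch.
//
//   template <typename DerivedT> struct GraphBase {
//     uint64_t stackId(uint64_t IdOrIndex) const {
//       // Statically dispatches to the derived implementation.
//       return static_cast<const DerivedT *>(this)->stackId(IdOrIndex);
//     }
//   };
//   struct ModuleGraph : GraphBase<ModuleGraph> {
//     // The IR-based graph uses stack ids directly.
//     uint64_t stackId(uint64_t Id) const { return Id; }
//   };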
246/// CRTP base for graphs built from either IR or ThinLTO summary index.
247///
248/// The graph represents the call contexts in all memprof metadata on allocation
249/// calls, with nodes for the allocations themselves, as well as for the calls
250/// in each context. The graph is initially built from the allocation memprof
251/// metadata (or summary) MIBs. It is then updated to match calls with callsite
252/// metadata onto the nodes, updating it to reflect any inlining performed on
253/// those calls.
254///
255/// Each MIB (representing an allocation's call context with allocation
256/// behavior) is assigned a unique context id during the graph build. The edges
257/// and nodes in the graph are decorated with the context ids they carry. This
258/// is used to correctly update the graph when cloning is performed so that we
259/// can uniquify the context for a single (possibly cloned) allocation.
260template <typename DerivedCCG, typename FuncTy, typename CallTy>
261class CallsiteContextGraph {
262public:
263 CallsiteContextGraph() = default;
264 CallsiteContextGraph(const CallsiteContextGraph &) = default;
265 CallsiteContextGraph(CallsiteContextGraph &&) = default;
266
267 /// Main entry point to perform analysis and transformations on graph.
268 bool process();
269
270 /// Perform cloning on the graph necessary to uniquely identify the allocation
271 /// behavior of an allocation based on its context.
272 void identifyClones();
273
274 /// Assign callsite clones to functions, cloning functions as needed to
275 /// accommodate the combinations of their callsite clones reached by callers.
276 /// For regular LTO this clones functions and callsites in the IR, but for
277 /// ThinLTO the cloning decisions are noted in the summaries and later applied
278 /// in applyImport.
279 bool assignFunctions();
280
281 void dump() const;
282 void print(raw_ostream &OS) const;
283 void printTotalSizes(raw_ostream &OS) const;
284
285 friend raw_ostream &operator<<(raw_ostream &OS,
286 const CallsiteContextGraph &CCG) {
287 CCG.print(OS);
288 return OS;
289 }
290
291 friend struct GraphTraits<
292 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
293 friend struct DOTGraphTraits<
294 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
295
296 void exportToDot(std::string Label) const;
297
298 /// Represents a function clone via FuncTy pointer and clone number pair.
299 struct FuncInfo final
300 : public std::pair<FuncTy *, unsigned /*Clone number*/> {
301 using Base = std::pair<FuncTy *, unsigned>;
302 FuncInfo(const Base &B) : Base(B) {}
303 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
304 explicit operator bool() const { return this->first != nullptr; }
305 FuncTy *func() const { return this->first; }
306 unsigned cloneNo() const { return this->second; }
307 };
308
309 /// Represents a callsite clone via CallTy and clone number pair.
310 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
311 using Base = std::pair<CallTy, unsigned>;
312 CallInfo(const Base &B) : Base(B) {}
313 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
314 : Base(Call, CloneNo) {}
315 explicit operator bool() const { return (bool)this->first; }
316 CallTy call() const { return this->first; }
317 unsigned cloneNo() const { return this->second; }
318 void setCloneNo(unsigned N) { this->second = N; }
319 void print(raw_ostream &OS) const {
320 if (!operator bool()) {
321 assert(!cloneNo());
322 OS << "null Call";
323 return;
324 }
325 call()->print(OS);
326 OS << "\t(clone " << cloneNo() << ")";
327 }
328 void dump() const {
329 print(dbgs());
330 dbgs() << "\n";
331 }
332 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
333 Call.print(OS);
334 return OS;
335 }
336 };
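  // For example (illustrative): the original call C in the original function F
  // is tracked as CallInfo(C, 0) within FuncInfo(F, 0); if a function clone is
  // created during assignment, the corresponding call in that clone is tracked
  // as CallInfo(C', 1) within FuncInfo(F', 1). Clone number 0 denotes the
  // original.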
337
338 struct ContextEdge;
339
340 /// Node in the Callsite Context Graph
341 struct ContextNode {
342 // Assigned to nodes as they are created, useful for debugging.
343 unsigned NodeId = 0;
344
345 // Keep this for now since in the IR case where we have an Instruction* it
346 // is not as immediately discoverable. Used for printing richer information
347 // when dumping graph.
348 bool IsAllocation;
349
350 // Keeps track of when the Call was reset to null because there was
351 // recursion.
352 bool Recursive = false;
353
354 // This will be formed by ORing together the AllocationType enum values
355 // for contexts including this node.
356 uint8_t AllocTypes = 0;
357
358 // The corresponding allocation or interior call. This is the primary call
359 // for which we have created this node.
360 CallInfo Call;
361
362 // List of other calls that can be treated the same as the primary call
363 // through cloning. I.e. located in the same function and have the same
364 // (possibly pruned) stack ids. They will be updated the same way as the
365 // primary call when assigning to function clones.
366 SmallVector<CallInfo, 0> MatchingCalls;
367
368 // For alloc nodes this is a unique id assigned when constructed, and for
369 // callsite stack nodes it is the original stack id when the node is
370 // constructed from the memprof MIB metadata on the alloc nodes. Note that
371 // this is only used when matching callsite metadata onto the stack nodes
372 // created when processing the allocation memprof MIBs, and for labeling
373 // nodes in the dot graph. Therefore we don't bother to assign a value for
374 // clones.
375 uint64_t OrigStackOrAllocId = 0;
376
377 // Edges to all callees in the profiled call stacks.
378 // TODO: Should this be a map (from Callee node) for more efficient lookup?
379 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
380
381 // Edges to all callers in the profiled call stacks.
382 // TODO: Should this be a map (from Caller node) for more efficient lookup?
383 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
384
385 // Returns true if we also need to look at the caller edges when
386 // determining the node context ids and allocation type.
387 bool useCallerEdgesForContextInfo() const {
388 // Typically if the callee edges are empty either the caller edges are
389 // also empty, or this is an allocation (leaf node). However, if we are
390 // allowing recursive callsites and contexts this will be violated for
391 // incompletely cloned recursive cycles.
392 assert(!CalleeEdges.empty() || CallerEdges.empty() || IsAllocation ||
393 (AllowRecursiveCallsites && AllowRecursiveContexts));
394 // When cloning for a recursive context, during cloning we might be in the
395 // midst of cloning for a recurrence and have moved context ids off of a
396 // caller edge onto the clone but not yet off of the incoming caller
397 // (back) edge. If we don't look at those we miss the fact that this node
398 // still has context ids of interest.
399 return IsAllocation || CloneRecursiveContexts;
400 }
401
402 // Compute the context ids for this node from the union of its edge context
403 // ids.
404 DenseSet<uint32_t> getContextIds() const {
405 unsigned Count = 0;
406 // Compute the number of ids for reserve below. In general we only need to
407 // look at one set of edges, typically the callee edges, since other than
408 // allocations and in some cases during recursion cloning, all the context
409 // ids on the callers should also flow out via callee edges.
410 for (auto &Edge : CalleeEdges.empty() ? CallerEdges : CalleeEdges)
411 Count += Edge->getContextIds().size();
412 DenseSet<uint32_t> ContextIds;
413 ContextIds.reserve(Count);
414 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
415 CalleeEdges, useCallerEdgesForContextInfo()
416 ? CallerEdges
417 : std::vector<std::shared_ptr<ContextEdge>>());
418 for (const auto &Edge : Edges)
419 ContextIds.insert_range(Edge->getContextIds());
420 return ContextIds;
421 }
422
423 // Compute the allocation type for this node from the OR of its edge
424 // allocation types.
425 uint8_t computeAllocType() const {
426 uint8_t BothTypes =
427 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
428 uint8_t AllocType = (uint8_t)AllocationType::None;
429 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
430 CalleeEdges, useCallerEdgesForContextInfo()
431 ? CallerEdges
432 : std::vector<std::shared_ptr<ContextEdge>>());
433 for (const auto &Edge : Edges) {
434 AllocType |= Edge->AllocTypes;
435 // Bail early if alloc type reached both, no further refinement.
436 if (AllocType == BothTypes)
437 return AllocType;
438 }
439 return AllocType;
440 }
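    // Worked example (assuming the MemProf AllocationType bit encoding
    // None = 0, NotCold = 1, Cold = 2): a node with one edge carrying only
    // cold contexts (AllocTypes == 2) and another carrying only not-cold
    // contexts (AllocTypes == 1) computes 1 | 2 == 3, i.e. NotCold|Cold, and
    // the loop above bails out as soon as both bits are set.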
441
442 // The context ids set for this node is empty if its edge context ids are
443 // also all empty.
444 bool emptyContextIds() const {
445 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
446 CalleeEdges, useCallerEdgesForContextInfo()
447 ? CallerEdges
448 : std::vector<std::shared_ptr<ContextEdge>>());
449 for (const auto &Edge : Edges) {
450 if (!Edge->getContextIds().empty())
451 return false;
452 }
453 return true;
454 }
455
456 // List of clones of this ContextNode, initially empty.
457 std::vector<ContextNode *> Clones;
458
459 // If a clone, points to the original uncloned node.
460 ContextNode *CloneOf = nullptr;
461
462 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
463
464 ContextNode(bool IsAllocation, CallInfo C)
465 : IsAllocation(IsAllocation), Call(C) {}
466
467 void addClone(ContextNode *Clone) {
468 if (CloneOf) {
469 CloneOf->Clones.push_back(Clone);
470 Clone->CloneOf = CloneOf;
471 } else {
472 Clones.push_back(Clone);
473 assert(!Clone->CloneOf);
474 Clone->CloneOf = this;
475 }
476 }
477
478 ContextNode *getOrigNode() {
479 if (!CloneOf)
480 return this;
481 return CloneOf;
482 }
483
484 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
485 unsigned int ContextId);
486
487 ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
488 ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
489 void eraseCalleeEdge(const ContextEdge *Edge);
490 void eraseCallerEdge(const ContextEdge *Edge);
491
492 void setCall(CallInfo C) { Call = C; }
493
494 bool hasCall() const { return (bool)Call.call(); }
495
496 void printCall(raw_ostream &OS) const { Call.print(OS); }
497
498 // True if this node was effectively removed from the graph, in which case
499 // it should have an allocation type of None and empty context ids.
500 bool isRemoved() const {
501 // Typically if the callee edges are empty either the caller edges are
502 // also empty, or this is an allocation (leaf node). However, if we are
503 // allowing recursive callsites and contexts this will be violated for
504 // incompletely cloned recursive cycles.
505 assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
506 (AllocTypes == (uint8_t)AllocationType::None) ==
507 emptyContextIds());
508 return AllocTypes == (uint8_t)AllocationType::None;
509 }
510
511 void dump() const;
512 void print(raw_ostream &OS) const;
513
514 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
515 Node.print(OS);
516 return OS;
517 }
518 };
519
520 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
521 /// callee.
522 struct ContextEdge {
523 ContextNode *Callee;
524 ContextNode *Caller;
525
526 // This will be formed by ORing together the AllocationType enum values
527 // for contexts including this edge.
528 uint8_t AllocTypes = 0;
529
530 // Set just before initiating cloning when cloning of recursive contexts is
531 // enabled. Used to defer cloning of backedges until we have done cloning of
532 // the callee node for non-backedge caller edges. This exposes cloning
533 // opportunities through the backedge of the cycle.
534 // TODO: Note that this is not updated during cloning, and it is unclear
535 // whether that would be needed.
536 bool IsBackedge = false;
537
538 // The set of IDs for contexts including this edge.
539 DenseSet<uint32_t> ContextIds;
540
541 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
542 DenseSet<uint32_t> ContextIds)
543 : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
544 ContextIds(std::move(ContextIds)) {}
545
546 DenseSet<uint32_t> &getContextIds() { return ContextIds; }
547
548 // Helper to clear the fields of this edge when we are removing it from the
549 // graph.
550 inline void clear() {
551 ContextIds.clear();
552 AllocTypes = (uint8_t)AllocationType::None;
553 Caller = nullptr;
554 Callee = nullptr;
555 }
556
557 // Check if edge was removed from the graph. This is useful while iterating
558 // over a copy of edge lists when performing operations that mutate the
559 // graph in ways that might remove one of the edges.
560 inline bool isRemoved() const {
561 if (Callee || Caller)
562 return false;
563 // Any edges that have been removed from the graph but are still in a
564 // shared_ptr somewhere should have all fields null'ed out by clear()
565 // above.
566 assert(AllocTypes == (uint8_t)AllocationType::None);
567 assert(ContextIds.empty());
568 return true;
569 }
570
571 void dump() const;
572 void print(raw_ostream &OS) const;
573
574 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
575 Edge.print(OS);
576 return OS;
577 }
578 };
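  // Illustrative sketch of how context ids decorate edges (made-up ids): two
  // contexts reach the same allocation through different callers, so the
  // caller edges partition the ids while the edge into the allocation carries
  // both.
  //
  //     Caller1           Caller2
  //        \   ids {1}      /  ids {2}
  //         v              v
  //           Callsite node
  //                |  ids {1,2}
  //                v
  //            Alloc node
  //
  // Cloning the callsite node for context 1 moves id 1 (and its allocation
  // type) onto the edges to and from the new clone; the moveEdgeTo* helpers
  // declared below maintain this invariant.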
579
580 /// Helpers to remove edges that have allocation type None (due to not
581 /// carrying any context ids) after transformations.
582 void removeNoneTypeCalleeEdges(ContextNode *Node);
583 void removeNoneTypeCallerEdges(ContextNode *Node);
584 void
585 recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
586 DenseSet<const ContextNode *> &Visited);
587
588protected:
589 /// Get a list of nodes corresponding to the stack ids in the given callsite
590 /// context.
591 template <class NodeT, class IteratorT>
592 std::vector<uint64_t>
593 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
594
595 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
596 /// metadata (or summary).
597 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
598
599 /// Adds nodes for the given MIB stack ids.
600 template <class NodeT, class IteratorT>
601 void addStackNodesForMIB(
602 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
603 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
604 ArrayRef<ContextTotalSize> ContextSizeInfo,
605 std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold);
606
607 /// Matches all callsite metadata (or summary) to the nodes created for
608 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
609 /// inlining performed on those callsite instructions.
610 void updateStackNodes();
611
612 /// Optionally fixup edges for the N largest cold contexts to better enable
613 /// cloning. This is particularly helpful if the context includes recursion
614 /// as well as inlining, resulting in a single stack node for multiple stack
615 /// ids in the context. With recursion it is particularly difficult to get the
616 /// edge updates correct as in the general case we have lost the original
617 /// stack id ordering for the context. Do more expensive fixup for the largest
618 /// contexts, controlled by MemProfTopNImportant and MemProfFixupImportant.
619 void fixupImportantContexts();
620
621 /// Update graph to conservatively handle any callsite stack nodes that target
622 /// multiple different callee target functions.
623 void handleCallsitesWithMultipleTargets();
624
625 /// Mark backedges via the standard DFS based backedge algorithm.
626 void markBackedges();
627
628 /// Merge clones generated during cloning for different allocations but that
629 /// are called by the same caller node, to ensure proper function assignment.
630 void mergeClones();
631
632 // Try to partition calls on the given node (already placed into the AllCalls
633 // array) by callee function, creating new copies of Node as needed to hold
634 // calls with different callees, and moving the callee edges appropriately.
635 // Returns true if partitioning was successful.
636 bool partitionCallsByCallee(
637 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
638 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode);
639
640 /// Save lists of calls with MemProf metadata in each function, for faster
641 /// iteration.
642 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
643
644 /// Map from callsite node to the enclosing caller function.
645 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
646
647 // When exporting to dot, and an allocation id is specified, contains the
648 // context ids on that allocation.
649 DenseSet<uint32_t> DotAllocContextIds;
650
651private:
652 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
653
654 // Structure to keep track of information for each call as we are matching
655 // non-allocation callsites onto context nodes created from the allocation
656 // call metadata / summary contexts.
657 struct CallContextInfo {
658 // The callsite we're trying to match.
659 CallTy Call;
661 // The callsite's stack ids that have a context node in the graph.
661 std::vector<uint64_t> StackIds;
662 // The function containing this callsite.
663 const FuncTy *Func;
664 // Initially empty, if needed this will be updated to contain the context
665 // ids for use in a new context node created for this callsite.
666 DenseSet<uint32_t> ContextIds;
667 };
668
669 /// Helper to remove edge from graph, updating edge iterator if it is provided
670 /// (in which case CalleeIter indicates which edge list is being iterated).
671 /// This will also perform the necessary clearing of the ContextEdge members
672 /// to enable later checking if the edge has been removed (since we may have
673 /// other copies of the shared_ptr in existence, and in fact rely on this to
674 /// enable removal while iterating over a copy of a node's edge list).
675 void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr,
676 bool CalleeIter = true);
677
678 /// Assigns the given Node to calls at or inlined into the location with
679 /// the Node's stack id, after post order traversing and processing its
680 /// caller nodes. Uses the call information recorded in the given
681 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
682 /// as needed. Called by updateStackNodes which sets up the given
683 /// StackIdToMatchingCalls map.
684 void assignStackNodesPostOrder(
685 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
686 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
687 DenseMap<CallInfo, CallInfo> &CallToMatchingCall,
688 const DenseSet<uint32_t> &ImportantContextIds);
689
690 /// Duplicates the given set of context ids, updating the provided
691 /// map from each original id with the newly generated context ids,
692 /// and returning the new duplicated id set.
693 DenseSet<uint32_t> duplicateContextIds(
694 const DenseSet<uint32_t> &StackSequenceContextIds,
695 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
696
697 /// Propagates all duplicated context ids across the graph.
698 void propagateDuplicateContextIds(
699 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
700
701 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
702 /// else to its callers. Also updates OrigNode's edges to remove any context
703 /// ids moved to the newly created edge.
704 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
705 bool TowardsCallee,
706 DenseSet<uint32_t> RemainingContextIds);
707
708 /// Get the stack id corresponding to the given Id or Index (for IR this will
709 /// return itself, for a summary index this will return the id recorded in the
710 /// index for that stack id index value).
711 uint64_t getStackId(uint64_t IdOrIndex) const {
712 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
713 }
714
715 /// Returns true if the given call targets the callee of the given edge, or if
716 /// we were able to identify the call chain through intermediate tail calls.
717 /// In the latter case new context nodes are added to the graph for the
718 /// identified tail calls, and their synthesized nodes are added to
719 /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
720 /// the updated edges and to prepare it for an increment in the caller.
721 bool
722 calleesMatch(CallTy Call, EdgeIter &EI,
723 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
724
725 // Return the callee function of the given call, or nullptr if it can't be
726 // determined
727 const FuncTy *getCalleeFunc(CallTy Call) {
728 return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call);
729 }
730
731 /// Returns true if the given call targets the given function, or if we were
732 /// able to identify the call chain through intermediate tail calls (in which
733 /// case FoundCalleeChain will be populated).
734 bool calleeMatchesFunc(
735 CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
736 std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
737 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
738 Call, Func, CallerFunc, FoundCalleeChain);
739 }
740
741 /// Returns true if both call instructions have the same callee.
742 bool sameCallee(CallTy Call1, CallTy Call2) {
743 return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2);
744 }
745
746 /// Get a list of nodes corresponding to the stack ids in the given
747 /// callsite's context.
748 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
749 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
750 Call);
751 }
752
753 /// Get the last stack id in the context for callsite.
754 uint64_t getLastStackId(CallTy Call) {
755 return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
756 }
757
758 /// Update the allocation call to record type of allocated memory.
759 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
760 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
761 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
762 }
763
764 /// Get the AllocationType assigned to the given allocation instruction clone.
765 AllocationType getAllocationCallType(const CallInfo &Call) const {
766 return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
767 }
768
769 /// Update non-allocation call to invoke (possibly cloned) function
770 /// CalleeFunc.
771 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
772 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
773 }
774
775 /// Clone the given function for the given callsite, recording a mapping of
776 /// all of the function's tracked calls to their new versions in the CallMap.
777 /// Assigns new clones to clone number CloneNo.
778 FuncInfo cloneFunctionForCallsite(
779 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
780 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
781 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
782 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
783 }
784
785 /// Gets a label to use in the dot graph for the given call clone in the given
786 /// function.
787 std::string getLabel(const FuncTy *Func, const CallTy Call,
788 unsigned CloneNo) const {
789 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
790 }
791
792 // Create and return a new ContextNode.
793 ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr,
794 CallInfo C = CallInfo()) {
795 NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C));
796 auto *NewNode = NodeOwner.back().get();
797 if (F)
798 NodeToCallingFunc[NewNode] = F;
799 NewNode->NodeId = NodeOwner.size();
800 return NewNode;
801 }
802
803 /// Helpers to find the node corresponding to the given call or stackid.
804 ContextNode *getNodeForInst(const CallInfo &C);
805 ContextNode *getNodeForAlloc(const CallInfo &C);
806 ContextNode *getNodeForStackId(uint64_t StackId);
807
808 /// Computes the alloc type corresponding to the given context ids, by
809 /// unioning their recorded alloc types.
810 uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds) const;
811
812 /// Returns the allocation type of the intersection of the contexts of two
813 /// nodes (based on their provided context id sets), optimized for the case
814 /// when Node1Ids is smaller than Node2Ids.
815 uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
816 const DenseSet<uint32_t> &Node2Ids) const;
817
818 /// Returns the allocation type of the intersection of the contexts of two
819 /// nodes (based on their provided context id sets).
820 uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
821 const DenseSet<uint32_t> &Node2Ids) const;
822
823 /// Create a clone of Edge's callee and move Edge to that new callee node,
824 /// performing the necessary context id and allocation type updates.
825 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
826 /// moved to an edge to the new callee.
827 ContextNode *
828 moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
829 DenseSet<uint32_t> ContextIdsToMove = {});
830
831 /// Change the callee of Edge to existing callee clone NewCallee, performing
832 /// the necessary context id and allocation type updates.
833 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
834 /// moved to an edge to the new callee.
835 void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
836 ContextNode *NewCallee,
837 bool NewClone = false,
838 DenseSet<uint32_t> ContextIdsToMove = {});
839
840 /// Change the caller of the edge at the given callee edge iterator to be
841 /// NewCaller, performing the necessary context id and allocation type
842 /// updates. This is similar to the above moveEdgeToExistingCalleeClone, but
843 /// a simplified version of it as we always move the given edge and all of its
844 /// context ids.
845 void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
846 ContextNode *NewCaller);
847
848 /// Recursive helper for marking backedges via DFS.
849 void markBackedges(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
850 DenseSet<const ContextNode *> &CurrentStack);
851
852 /// Recursive helper for merging clones.
853 void
854 mergeClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
855 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
856 /// Main worker for merging callee clones for a given node.
857 void mergeNodeCalleeClones(
858 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
859 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
860 /// Helper to find other callers of the given set of callee edges that can
861 /// share the same callee merge node.
862 void findOtherCallersToShareMerge(
863 ContextNode *Node, std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
864 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
865 DenseSet<ContextNode *> &OtherCallersToShareMerge);
866
867 /// Recursively perform cloning on the graph for the given Node and its
868 /// callers, in order to uniquely identify the allocation behavior of an
869 /// allocation given its context. The context ids of the allocation being
870 /// processed are given in AllocContextIds.
871 void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
872 const DenseSet<uint32_t> &AllocContextIds);
873
874 /// Map from each context ID to the AllocationType assigned to that context.
875 DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
876
877 /// Map from each contextID to the profiled full contexts and their total
878 /// sizes (there may be more than one due to context trimming),
879 /// optionally populated when requested (via MemProfReportHintedSizes or
880 /// MinClonedColdBytePercent).
881 DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;
882
883 /// Identifies the context node created for a stack id when adding the MIB
884 /// contexts to the graph. This is used to locate the context nodes when
885 /// trying to assign the corresponding callsites with those stack ids to these
886 /// nodes.
887 DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
888
889 /// Saves information for the contexts identified as important (the largest
890 /// cold contexts up to MemProfTopNImportant).
891 struct ImportantContextInfo {
892 // The original list of leaf first stack ids corresponding to this context.
893 std::vector<uint64_t> StackIds;
894 // Max length of stack ids corresponding to a single stack ContextNode for
895 // this context (i.e. the max length of a key in StackIdsToNode below).
896 unsigned MaxLength = 0;
897 // Mapping of slices of the stack ids to the corresponding ContextNode
898 // (there can be multiple stack ids due to inlining). Populated when
899 // updating stack nodes while matching them to the IR or summary.
900 std::map<std::vector<uint64_t>, ContextNode *> StackIdsToNode;
901 };
902
903 // Map of important full context ids to information about each.
904 DenseMap<uint32_t, ImportantContextInfo> ImportantContextIdInfo;
905
906 // For each important context id found in Node (if any), records the list of
907 // stack ids that corresponded to the given callsite Node. There can be more
908 // than one in the case of inlining.
909 void recordStackNode(std::vector<uint64_t> &StackIds, ContextNode *Node,
910 // We pass in the Node's context ids to avoid the
911 // overhead of computing them as the caller already has
912 // them in some cases.
913 const DenseSet<uint32_t> &NodeContextIds,
914 const DenseSet<uint32_t> &ImportantContextIds) {
915 if (!MemProfFixupImportant) {
916 assert(ImportantContextIds.empty());
917 return;
918 }
919 auto Ids =
920 set_intersection(NodeContextIds, ImportantContextIds);
921 if (Ids.empty())
922 return;
923 auto Size = StackIds.size();
924 for (auto Id : Ids) {
925 auto &Entry = ImportantContextIdInfo[Id];
926 Entry.StackIdsToNode[StackIds] = Node;
927 // Keep track of the max to simplify later analysis.
928 if (Size > Entry.MaxLength)
929 Entry.MaxLength = Size;
930 }
931 }
932
933 /// Maps to track the calls to their corresponding nodes in the graph.
934 MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
935 MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
936
937 /// Owner of all ContextNode unique_ptrs.
938 std::vector<std::unique_ptr<ContextNode>> NodeOwner;
939
940 /// Perform sanity checks on graph when requested.
941 void check() const;
942
943 /// Keeps track of the last unique context id assigned.
944 unsigned int LastContextId = 0;
945};
946
947template <typename DerivedCCG, typename FuncTy, typename CallTy>
948using ContextNode =
949 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
950template <typename DerivedCCG, typename FuncTy, typename CallTy>
951using ContextEdge =
952 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
953template <typename DerivedCCG, typename FuncTy, typename CallTy>
954using FuncInfo =
955 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
956template <typename DerivedCCG, typename FuncTy, typename CallTy>
957using CallInfo =
958 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
959
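// The two CRTP instantiations below differ mainly in where the context
// information comes from:
//   - ModuleCallsiteContextGraph: FuncTy = Function, CallTy = Instruction *,
//     built from !memprof/!callsite metadata in IR (regular LTO).
//   - IndexCallsiteContextGraph: FuncTy = FunctionSummary, CallTy = IndexCall,
//     built from AllocInfo/CallsiteInfo records in the ThinLTO summary index.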
960/// CRTP derived class for graphs built from IR (regular LTO).
961class ModuleCallsiteContextGraph
962 : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
963 Instruction *> {
964public:
965 ModuleCallsiteContextGraph(
966 Module &M,
967 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
968
969private:
970 friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
971 Instruction *>;
972
973 uint64_t getStackId(uint64_t IdOrIndex) const;
974 const Function *getCalleeFunc(Instruction *Call);
975 bool calleeMatchesFunc(
976 Instruction *Call, const Function *Func, const Function *CallerFunc,
977 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
978 bool sameCallee(Instruction *Call1, Instruction *Call2);
979 bool findProfiledCalleeThroughTailCalls(
980 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
981 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
982 bool &FoundMultipleCalleeChains);
983 uint64_t getLastStackId(Instruction *Call);
984 std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
985 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
986 AllocationType getAllocationCallType(const CallInfo &Call) const;
987 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
988 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
989 Instruction *>::FuncInfo
990 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
991 DenseMap<CallInfo, CallInfo> &CallMap,
992 std::vector<CallInfo> &CallsWithMetadataInFunc,
993 unsigned CloneNo);
994 std::string getLabel(const Function *Func, const Instruction *Call,
995 unsigned CloneNo) const;
996
997 const Module &Mod;
998 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
999};
1000
1001/// Represents a call in the summary index graph, which can either be an
1002/// allocation or an interior callsite node in an allocation's context.
1003/// Holds a pointer to the corresponding data structure in the index.
1004struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
1005 IndexCall() : PointerUnion() {}
1006 IndexCall(std::nullptr_t) : IndexCall() {}
1007 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
1008 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
1009 IndexCall(PointerUnion PT) : PointerUnion(PT) {}
1010
1011 IndexCall *operator->() { return this; }
1012
1013 void print(raw_ostream &OS) const {
1014 PointerUnion<CallsiteInfo *, AllocInfo *> Base = *this;
1015 if (auto *AI = dyn_cast<AllocInfo *>(Base)) {
1016 OS << *AI;
1017 } else {
1018 auto *CI = dyn_cast<CallsiteInfo *>(Base);
1019 assert(CI);
1020 OS << *CI;
1021 }
1022 }
1023};
1024} // namespace
1025
1026namespace llvm {
1027template <> struct simplify_type<IndexCall> {
1028 using SimpleType = PointerUnion<CallsiteInfo *, AllocInfo *>;
1029 static SimpleType getSimplifiedValue(IndexCall &Val) { return Val; }
1030};
1031template <> struct simplify_type<const IndexCall> {
1032 using SimpleType = const PointerUnion<CallsiteInfo *, AllocInfo *>;
1033 static SimpleType getSimplifiedValue(const IndexCall &Val) { return Val; }
1034};
1035} // namespace llvm
1036
1037namespace {
1038/// CRTP derived class for graphs built from summary index (ThinLTO).
1039class IndexCallsiteContextGraph
1040 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1041 IndexCall> {
1042public:
1043 IndexCallsiteContextGraph(
1044 ModuleSummaryIndex &Index,
1045 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1046 isPrevailing);
1047
1048 ~IndexCallsiteContextGraph() {
1049 // Now that we are done with the graph it is safe to add the new
1050 // CallsiteInfo structs to the function summary vectors. The graph nodes
1051 // point into locations within these vectors, so we don't want to add them
1052 // any earlier.
1053 for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
1054 auto *FS = I.first;
1055 for (auto &Callsite : I.second)
1056 FS->addCallsite(*Callsite.second);
1057 }
1058 }
1059
1060private:
1061 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1062 IndexCall>;
1063
1064 uint64_t getStackId(uint64_t IdOrIndex) const;
1065 const FunctionSummary *getCalleeFunc(IndexCall &Call);
1066 bool calleeMatchesFunc(
1067 IndexCall &Call, const FunctionSummary *Func,
1068 const FunctionSummary *CallerFunc,
1069 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
1070 bool sameCallee(IndexCall &Call1, IndexCall &Call2);
1071 bool findProfiledCalleeThroughTailCalls(
1072 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
1073 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
1074 bool &FoundMultipleCalleeChains);
1075 uint64_t getLastStackId(IndexCall &Call);
1076 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
1077 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
1078 AllocationType getAllocationCallType(const CallInfo &Call) const;
1079 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
1080 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1081 IndexCall>::FuncInfo
1082 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
1083 DenseMap<CallInfo, CallInfo> &CallMap,
1084 std::vector<CallInfo> &CallsWithMetadataInFunc,
1085 unsigned CloneNo);
1086 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
1087 unsigned CloneNo) const;
1088 DenseSet<GlobalValue::GUID> findAliaseeGUIDsPrevailingInDifferentModule();
1089
1090 // Saves the mapping from function summaries containing memprof records
1091 // back to their VI, for use in checking and debugging.
1092 std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
1093
1094 const ModuleSummaryIndex &Index;
1095 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1096 isPrevailing;
1097
1098 // Saves/owns the callsite info structures synthesized for missing tail call
1099 // frames that we discover while building the graph.
1100 // It maps from the summary of the function making the tail call, to a map
1101 // of callee ValueInfo to corresponding synthesized callsite info.
1102 std::unordered_map<FunctionSummary *,
1103 std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
1104 FunctionCalleesToSynthesizedCallsiteInfos;
1105};
1106} // namespace
1107
1108template <>
1109struct llvm::DenseMapInfo<CallsiteContextGraph<
1110 ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
1111 : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
1112template <>
1113struct llvm::DenseMapInfo<CallsiteContextGraph<
1114 IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
1115 : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
1116template <>
1117struct llvm::DenseMapInfo<IndexCall>
1118 : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
1119
1120namespace {
1121
1122// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
1123// type we should actually use on the corresponding allocation.
1124// If we can't clone a node that has NotCold+Cold alloc type, we will fall
1125// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
1126// from NotCold.
1127AllocationType allocTypeToUse(uint8_t AllocTypes) {
1128 assert(AllocTypes != (uint8_t)AllocationType::None);
1129 if (AllocTypes ==
1130 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
1131 return AllocationType::NotCold;
1132 else
1133 return (AllocationType)AllocTypes;
1134}
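// For example (following directly from the above): allocTypeToUse(Cold) is
// Cold, while allocTypeToUse(NotCold | Cold) is NotCold, since an allocation
// whose contexts could not be fully disambiguated is conservatively treated
// as not cold.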
1135
1136// Helper to check if the alloc types for all edges recorded in the
1137// InAllocTypes vector match the alloc types for all edges in the Edges
1138// vector.
1139template <typename DerivedCCG, typename FuncTy, typename CallTy>
1140bool allocTypesMatch(
1141 const std::vector<uint8_t> &InAllocTypes,
1142 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
1143 &Edges) {
1144 // This should be called only when the InAllocTypes vector was computed for
1145 // this set of Edges. Make sure the sizes are the same.
1146 assert(InAllocTypes.size() == Edges.size());
1147 return std::equal(
1148 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(),
1149 [](const uint8_t &l,
1150 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
1151 // Can share if one of the edges is None type - don't
1152 // care about the type along that edge as it doesn't
1153 // exist for those context ids.
1154 if (l == (uint8_t)AllocationType::None ||
1155 r->AllocTypes == (uint8_t)AllocationType::None)
1156 return true;
1157 return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
1158 });
1159}
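// Worked example for allocTypesMatch (illustrative, using NotCold = 1,
// Cold = 2): InAllocTypes = {Cold, NotCold|Cold} matches edges whose alloc
// types are {Cold, NotCold}, because allocTypeToUse collapses NotCold|Cold to
// NotCold for the comparison; a None entry on either side always matches,
// since no context ids exist along that edge.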
1160
1161// Helper to check if the alloc types for all edges recorded in the
1162// InAllocTypes vector match the alloc types for callee edges in the given
1163// clone. Because the InAllocTypes were computed from the original node's callee
1164// edges, and other cloning could have happened after this clone was created, we
1165// need to find the matching clone callee edge, which may or may not exist.
1166template <typename DerivedCCG, typename FuncTy, typename CallTy>
1167bool allocTypesMatchClone(
1168 const std::vector<uint8_t> &InAllocTypes,
1169 const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) {
1170 const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf;
1171 assert(Node);
1172 // InAllocTypes should have been computed for the original node's callee
1173 // edges.
1174 assert(InAllocTypes.size() == Node->CalleeEdges.size());
1175 // First create a map of the clone callee edge callees to the edge alloc type.
1176 DenseMap<const ContextNode<DerivedCCG, FuncTy, CallTy> *, uint8_t>
1177 EdgeCalleeMap;
1178 for (const auto &E : Clone->CalleeEdges) {
1179 assert(!EdgeCalleeMap.contains(E->Callee));
1180 EdgeCalleeMap[E->Callee] = E->AllocTypes;
1181 }
1182 // Next, walk the original node's callees, and look for the corresponding
1183 // clone edge to that callee.
1184 for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) {
1185 auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee);
1186 // Not found is ok, we will simply add an edge if we use this clone.
1187 if (Iter == EdgeCalleeMap.end())
1188 continue;
1189 // Can share if one of the edges is None type - don't
1190 // care about the type along that edge as it doesn't
1191 // exist for those context ids.
1192 if (InAllocTypes[I] == (uint8_t)AllocationType::None ||
1193 Iter->second == (uint8_t)AllocationType::None)
1194 continue;
1195 if (allocTypeToUse(Iter->second) != allocTypeToUse(InAllocTypes[I]))
1196 return false;
1197 }
1198 return true;
1199}
1200
1201} // end anonymous namespace
1202
1203template <typename DerivedCCG, typename FuncTy, typename CallTy>
1204typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1205CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
1206 const CallInfo &C) {
1207 ContextNode *Node = getNodeForAlloc(C);
1208 if (Node)
1209 return Node;
1210
1211 return NonAllocationCallToContextNodeMap.lookup(C);
1212}
1213
1214template <typename DerivedCCG, typename FuncTy, typename CallTy>
1215typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1216CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
1217 const CallInfo &C) {
1218 return AllocationCallToContextNodeMap.lookup(C);
1219}
1220
1221template <typename DerivedCCG, typename FuncTy, typename CallTy>
1222typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1223CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
1224 uint64_t StackId) {
1225 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
1226 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
1227 return StackEntryNode->second;
1228 return nullptr;
1229}
1230
1231template <typename DerivedCCG, typename FuncTy, typename CallTy>
1232void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1233 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
1234 unsigned int ContextId) {
1235 for (auto &Edge : CallerEdges) {
1236 if (Edge->Caller == Caller) {
1237 Edge->AllocTypes |= (uint8_t)AllocType;
1238 Edge->getContextIds().insert(ContextId);
1239 return;
1240 }
1241 }
1242 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
1243 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
1244 CallerEdges.push_back(Edge);
1245 Caller->CalleeEdges.push_back(Edge);
1246}
1247
1248template <typename DerivedCCG, typename FuncTy, typename CallTy>
1249void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph(
1250 ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) {
1251 assert(!EI || (*EI)->get() == Edge);
1252 assert(!Edge->isRemoved());
1253 // Save the Caller and Callee pointers so we can erase Edge from their edge
1254 // lists after clearing Edge below. We do the clearing first in case it is
1255 // destructed after removing from the edge lists (if those were the last
1256 // shared_ptr references to Edge).
1257 auto *Callee = Edge->Callee;
1258 auto *Caller = Edge->Caller;
1259
1260 // Make sure the edge fields are cleared out so we can properly detect
1261 // removed edges if Edge is not destructed because there is still a shared_ptr
1262 // reference.
1263 Edge->clear();
1264
1265#ifndef NDEBUG
1266 auto CalleeCallerCount = Callee->CallerEdges.size();
1267 auto CallerCalleeCount = Caller->CalleeEdges.size();
1268#endif
1269 if (!EI) {
1270 Callee->eraseCallerEdge(Edge);
1271 Caller->eraseCalleeEdge(Edge);
1272 } else if (CalleeIter) {
1273 Callee->eraseCallerEdge(Edge);
1274 *EI = Caller->CalleeEdges.erase(*EI);
1275 } else {
1276 Caller->eraseCalleeEdge(Edge);
1277 *EI = Callee->CallerEdges.erase(*EI);
1278 }
1279 assert(Callee->CallerEdges.size() < CalleeCallerCount);
1280 assert(Caller->CalleeEdges.size() < CallerCalleeCount);
1281}
1282
1283template <typename DerivedCCG, typename FuncTy, typename CallTy>
1284void CallsiteContextGraph<
1285 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
1286 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1287 auto Edge = *EI;
1288 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1289 assert(Edge->ContextIds.empty());
1290 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
1291 } else
1292 ++EI;
1293 }
1294}
1295
1296template <typename DerivedCCG, typename FuncTy, typename CallTy>
1297void CallsiteContextGraph<
1298 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) {
1299 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
1300 auto Edge = *EI;
1301 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1302 assert(Edge->ContextIds.empty());
1303 Edge->Caller->eraseCalleeEdge(Edge.get());
1304 EI = Node->CallerEdges.erase(EI);
1305 } else
1306 ++EI;
1307 }
1308}
1309
1310template <typename DerivedCCG, typename FuncTy, typename CallTy>
1311typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1312CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1313 findEdgeFromCallee(const ContextNode *Callee) {
1314 for (const auto &Edge : CalleeEdges)
1315 if (Edge->Callee == Callee)
1316 return Edge.get();
1317 return nullptr;
1318}
1319
1320template <typename DerivedCCG, typename FuncTy, typename CallTy>
1321typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1322CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1323 findEdgeFromCaller(const ContextNode *Caller) {
1324 for (const auto &Edge : CallerEdges)
1325 if (Edge->Caller == Caller)
1326 return Edge.get();
1327 return nullptr;
1328}
1329
1330template <typename DerivedCCG, typename FuncTy, typename CallTy>
1331void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1332 eraseCalleeEdge(const ContextEdge *Edge) {
1333 auto EI = llvm::find_if(
1334 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
1335 return CalleeEdge.get() == Edge;
1336 });
1337 assert(EI != CalleeEdges.end());
1338 CalleeEdges.erase(EI);
1339}
1340
1341template <typename DerivedCCG, typename FuncTy, typename CallTy>
1342void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1343 eraseCallerEdge(const ContextEdge *Edge) {
1344 auto EI = llvm::find_if(
1345 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
1346 return CallerEdge.get() == Edge;
1347 });
1348 assert(EI != CallerEdges.end());
1349 CallerEdges.erase(EI);
1350}
1351
1352template <typename DerivedCCG, typename FuncTy, typename CallTy>
1353uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
1354 DenseSet<uint32_t> &ContextIds) const {
1355 uint8_t BothTypes =
1356 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1357 uint8_t AllocType = (uint8_t)AllocationType::None;
1358 for (auto Id : ContextIds) {
1359 AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1360 // Bail early if alloc type reached both, no further refinement.
1361 if (AllocType == BothTypes)
1362 return AllocType;
1363 }
1364 return AllocType;
1365}
1366
1367template <typename DerivedCCG, typename FuncTy, typename CallTy>
1368uint8_t
1369CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
1370 const DenseSet<uint32_t> &Node1Ids,
1371 const DenseSet<uint32_t> &Node2Ids) const {
1372 uint8_t BothTypes =
1373 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1374 uint8_t AllocType = (uint8_t)AllocationType::None;
1375 for (auto Id : Node1Ids) {
1376 if (!Node2Ids.count(Id))
1377 continue;
1378 AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1379 // Bail early if alloc type reached both, no further refinement.
1380 if (AllocType == BothTypes)
1381 return AllocType;
1382 }
1383 return AllocType;
1384}
1385
1386template <typename DerivedCCG, typename FuncTy, typename CallTy>
1387uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
1388 const DenseSet<uint32_t> &Node1Ids,
1389 const DenseSet<uint32_t> &Node2Ids) const {
1390 if (Node1Ids.size() < Node2Ids.size())
1391 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
1392 else
1393 return intersectAllocTypesImpl(Node2Ids, Node1Ids);
1394}
1395
1396template <typename DerivedCCG, typename FuncTy, typename CallTy>
1397typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1398CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
1399 CallInfo Call, const FuncTy *F) {
1400 assert(!getNodeForAlloc(Call));
1401 ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, Call);
1402 AllocationCallToContextNodeMap[Call] = AllocNode;
1403 // Use LastContextId as a unique id for MIB allocation nodes.
1404 AllocNode->OrigStackOrAllocId = LastContextId;
1405 // Alloc type should be updated as we add in the MIBs. We should assert
1406 // afterwards that it is not still None.
1407 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1408
1409 return AllocNode;
1410}
1411
1412static std::string getAllocTypeString(uint8_t AllocTypes) {
1413 if (!AllocTypes)
1414 return "None";
1415 std::string Str;
1416 if (AllocTypes & (uint8_t)AllocationType::NotCold)
1417 Str += "NotCold";
1418 if (AllocTypes & (uint8_t)AllocationType::Cold)
1419 Str += "Cold";
1420 return Str;
1421}
1422
1423template <typename DerivedCCG, typename FuncTy, typename CallTy>
1424template <class NodeT, class IteratorT>
1425void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
1426 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
1427 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
1428 ArrayRef<ContextTotalSize> ContextSizeInfo,
1429 std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold) {
1430 // Treat the hot alloc type as NotCold before the disambiguation for "hot"
1431 // is done.
1432 if (AllocType == AllocationType::Hot)
1433 AllocType = AllocationType::NotCold;
1434
1435 ContextIdToAllocationType[++LastContextId] = AllocType;
1436
1437 bool IsImportant = false;
1438 if (!ContextSizeInfo.empty()) {
1439 auto &Entry = ContextIdToContextSizeInfos[LastContextId];
1440 // If this is a cold allocation, and we are collecting non-zero largest
1441 // contexts, see if this is a candidate.
1442 if (AllocType == AllocationType::Cold && MemProfTopNImportant > 0) {
1443 uint64_t TotalCold = 0;
1444 for (auto &CSI : ContextSizeInfo)
1445 TotalCold += CSI.TotalSize;
1446 // Record this context if either we haven't found the first top-n largest
1447 // yet, or if it is larger than the smallest already recorded.
1448 if (TotalSizeToContextIdTopNCold.size() < MemProfTopNImportant ||
1449 // Since TotalSizeToContextIdTopNCold is a std::map, it is implicitly
1450 // sorted in ascending order of its key, which is the size.
1451 TotalCold > TotalSizeToContextIdTopNCold.begin()->first) {
1452 if (TotalSizeToContextIdTopNCold.size() == MemProfTopNImportant) {
1453 // Remove old one and its associated entries.
1454 auto IdToRemove = TotalSizeToContextIdTopNCold.begin()->second;
1455 TotalSizeToContextIdTopNCold.erase(
1456 TotalSizeToContextIdTopNCold.begin());
1457 assert(ImportantContextIdInfo.count(IdToRemove));
1458 ImportantContextIdInfo.erase(IdToRemove);
1459 }
1460 TotalSizeToContextIdTopNCold[TotalCold] = LastContextId;
1461 IsImportant = true;
1462 }
1463 }
1464 Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
1465 }
1466
1467 // Update alloc type and context ids for this MIB.
1468 AllocNode->AllocTypes |= (uint8_t)AllocType;
1469
1470 // Now add or update nodes for each stack id in alloc's context.
1471 // Later when processing the stack ids on non-alloc callsites we will adjust
1472 // for any inlining in the context.
1473 ContextNode *PrevNode = AllocNode;
1474 // Look for recursion (direct recursion should have been collapsed by
1475 // module summary analysis, here we should just be detecting mutual
1476 // recursion). Mark these nodes so we don't try to clone.
1477 SmallSet<uint64_t, 8> StackIdSet;
1478 // Skip any on the allocation call (inlining).
1479 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
1480 ContextIter != StackContext.end(); ++ContextIter) {
1481 auto StackId = getStackId(*ContextIter);
1482 if (IsImportant)
1483 ImportantContextIdInfo[LastContextId].StackIds.push_back(StackId);
1484 ContextNode *StackNode = getNodeForStackId(StackId);
1485 if (!StackNode) {
1486 StackNode = createNewNode(/*IsAllocation=*/false);
1487 StackEntryIdToContextNodeMap[StackId] = StackNode;
1488 StackNode->OrigStackOrAllocId = StackId;
1489 }
1490 // Marking a node recursive will prevent its cloning completely, even for
1491 // non-recursive contexts flowing through it.
1492 if (!AllowRecursiveCallsites) {
1493 auto Ins = StackIdSet.insert(StackId);
1494 if (!Ins.second)
1495 StackNode->Recursive = true;
1496 }
1497 StackNode->AllocTypes |= (uint8_t)AllocType;
1498 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1499 PrevNode = StackNode;
1500 }
1501}
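// Illustrative example of the top-N cold context tracking above (assuming
// MemProfTopNImportant == 2): for cold contexts with total sizes 100, 300 and
// then 200 (in that order), the first two are recorded; when 200 arrives the
// map is full and 200 > 100 (the smallest key), so the entry for 100 and its
// ImportantContextIdInfo record are evicted, leaving sizes {200, 300}.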
1502
1503template <typename DerivedCCG, typename FuncTy, typename CallTy>
1504DenseSet<uint32_t>
1505CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1506 const DenseSet<uint32_t> &StackSequenceContextIds,
1507 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1508 DenseSet<uint32_t> NewContextIds;
1509 for (auto OldId : StackSequenceContextIds) {
1510 NewContextIds.insert(++LastContextId);
1511 OldToNewContextIds[OldId].insert(LastContextId);
1512 assert(ContextIdToAllocationType.count(OldId));
1513 // The new context has the same allocation type and size info as original.
1514 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1515 auto CSI = ContextIdToContextSizeInfos.find(OldId);
1516 if (CSI != ContextIdToContextSizeInfos.end())
1517 ContextIdToContextSizeInfos[LastContextId] = CSI->second;
1518 if (DotAllocContextIds.contains(OldId))
1519 DotAllocContextIds.insert(LastContextId);
1520 }
1521 return NewContextIds;
1522}
1523
1524template <typename DerivedCCG, typename FuncTy, typename CallTy>
1525void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1526 propagateDuplicateContextIds(
1527 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1528 // Build a set of duplicated context ids corresponding to the input id set.
1529 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1530 DenseSet<uint32_t> NewIds;
1531 for (auto Id : ContextIds)
1532 if (auto NewId = OldToNewContextIds.find(Id);
1533 NewId != OldToNewContextIds.end())
1534 NewIds.insert_range(NewId->second);
1535 return NewIds;
1536 };
1537
1538 // Recursively update context ids sets along caller edges.
1539 auto UpdateCallers = [&](ContextNode *Node,
1540 DenseSet<const ContextEdge *> &Visited,
1541 auto &&UpdateCallers) -> void {
1542 for (const auto &Edge : Node->CallerEdges) {
1543 auto Inserted = Visited.insert(Edge.get());
1544 if (!Inserted.second)
1545 continue;
1546 ContextNode *NextNode = Edge->Caller;
1547 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1548 // Only need to recursively iterate to NextNode via this caller edge if
1549 // it resulted in any added ids to NextNode.
1550 if (!NewIdsToAdd.empty()) {
1551 Edge->getContextIds().insert_range(NewIdsToAdd);
1552 UpdateCallers(NextNode, Visited, UpdateCallers);
1553 }
1554 }
1555 };
1556
1557 DenseSet<const ContextEdge *> Visited;
1558 for (auto &Entry : AllocationCallToContextNodeMap) {
1559 auto *Node = Entry.second;
1560 UpdateCallers(Node, Visited, UpdateCallers);
1561 }
1562}
1563
1564template <typename DerivedCCG, typename FuncTy, typename CallTy>
1565void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1566 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
1567 // This must be passed by value to make a copy since it will be adjusted
1568 // as ids are moved.
1569 DenseSet<uint32_t> RemainingContextIds) {
1570 auto &OrigEdges =
1571 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1572 DenseSet<uint32_t> RecursiveContextIds;
1573 DenseSet<uint32_t> AllCallerContextIds;
1574 if (AllowRecursiveCallsites) {
1575 // Identify which context ids are recursive, which is needed to properly
1576 // update the RemainingContextIds set. The relevant recursive context ids
1577 // are those that are in multiple edges.
1578 for (auto &CE : OrigEdges) {
1579 AllCallerContextIds.reserve(CE->getContextIds().size());
1580 for (auto Id : CE->getContextIds())
1581 if (!AllCallerContextIds.insert(Id).second)
1582 RecursiveContextIds.insert(Id);
1583 }
1584 }
1585 // Increment iterator in loop so that we can remove edges as needed.
1586 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1587 auto Edge = *EI;
1588 DenseSet<uint32_t> NewEdgeContextIds;
1589 DenseSet<uint32_t> NotFoundContextIds;
1590 // Remove any matching context ids from Edge, return set that were found and
1591 // removed, these are the new edge's context ids. Also update the remaining
1592 // (not found ids).
1593 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1594 NotFoundContextIds);
1595 // Update the remaining context ids set for the later edges. This is a
1596 // compile time optimization.
1597 if (RecursiveContextIds.empty()) {
1598 // No recursive ids, so all of the previously remaining context ids that
1599 // were not seen on this edge are the new remaining set.
1600 RemainingContextIds.swap(NotFoundContextIds);
1601 } else {
1602 // Keep the recursive ids in the remaining set as we expect to see those
1603 // on another edge. We can remove the non-recursive remaining ids that
1604 // were seen on this edge, however. We already have the set of remaining
1605 // ids that were on this edge (in NewEdgeContextIds). Figure out which are
1606 // non-recursive and only remove those. Note that despite the higher
1607 // overhead of updating the remaining context ids set when recursion
1608 // handling is enabled, it was found to be at worst performance neutral
1609 // and in one case a clear win.
1610 DenseSet<uint32_t> NonRecursiveRemainingCurEdgeIds =
1611 set_difference(NewEdgeContextIds, RecursiveContextIds);
1612 set_subtract(RemainingContextIds, NonRecursiveRemainingCurEdgeIds);
1613 }
1614 // If no matching context ids for this edge, skip it.
1615 if (NewEdgeContextIds.empty()) {
1616 ++EI;
1617 continue;
1618 }
1619 if (TowardsCallee) {
1620 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1621 auto NewEdge = std::make_shared<ContextEdge>(
1622 Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
1623 NewNode->CalleeEdges.push_back(NewEdge);
1624 NewEdge->Callee->CallerEdges.push_back(NewEdge);
1625 } else {
1626 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1627 auto NewEdge = std::make_shared<ContextEdge>(
1628 NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
1629 NewNode->CallerEdges.push_back(NewEdge);
1630 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1631 }
1632 // Remove old edge if context ids empty.
1633 if (Edge->getContextIds().empty()) {
1634 removeEdgeFromGraph(Edge.get(), &EI, TowardsCallee);
1635 continue;
1636 }
1637 ++EI;
1638 }
1639}
1640
1641template <typename DerivedCCG, typename FuncTy, typename CallTy>
1642static void checkEdge(
1643 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1644 // Confirm that alloc type is not None and that we have at least one context
1645 // id.
1646 assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1647 assert(!Edge->ContextIds.empty());
1648}
1649
1650template <typename DerivedCCG, typename FuncTy, typename CallTy>
1651static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
1652 bool CheckEdges = true) {
1653 if (Node->isRemoved())
1654 return;
1655#ifndef NDEBUG
1656 // Compute node's context ids once for use in asserts.
1657 auto NodeContextIds = Node->getContextIds();
1658#endif
1659 // Node's context ids should be the union of both its callee and caller edge
1660 // context ids.
1661 if (Node->CallerEdges.size()) {
1662 DenseSet<uint32_t> CallerEdgeContextIds(
1663 Node->CallerEdges.front()->ContextIds);
1664 for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
1665 if (CheckEdges)
1666 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1667 set_union(CallerEdgeContextIds, Edge->ContextIds);
1668 }
1669 // Node can have more context ids than callers if some contexts terminate at
1670 // node and some are longer. If we are allowing recursive callsites and
1671 // contexts this will be violated for incompletely cloned recursive cycles,
1672 // so skip the checking in that case.
1673 assert(AllowRecursiveCallsites || AllowRecursiveContexts ||
1674 NodeContextIds == CallerEdgeContextIds ||
1675 set_is_subset(CallerEdgeContextIds, NodeContextIds));
1676 }
1677 if (Node->CalleeEdges.size()) {
1678 DenseSet<uint32_t> CalleeEdgeContextIds(
1679 Node->CalleeEdges.front()->ContextIds);
1680 for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
1681 if (CheckEdges)
1682 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1683 set_union(CalleeEdgeContextIds, Edge->getContextIds());
1684 }
1685 // If we are allowing recursive callsites and contexts this will be violated
1686 // for incompletely cloned recursive cycles, so skip the checking in that
1687 // case.
1688 assert(AllowRecursiveCallsites || AllowRecursiveContexts ||
1689 NodeContextIds == CalleeEdgeContextIds);
1690 }
1691 // FIXME: Since this checking is only invoked under an option, we should
1692 // change the error checking from using assert to something that will trigger
1693 // an error on a release build.
1694#ifndef NDEBUG
1695 // Make sure we don't end up with duplicate edges between the same caller and
1696 // callee.
1697 DenseSet<ContextNode<DerivedCCG, FuncTy, CallTy> *> NodeSet;
1698 for (const auto &E : Node->CalleeEdges)
1699 NodeSet.insert(E->Callee);
1700 assert(NodeSet.size() == Node->CalleeEdges.size());
1701#endif
1702}
1703
1704template <typename DerivedCCG, typename FuncTy, typename CallTy>
1705void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1706 assignStackNodesPostOrder(ContextNode *Node,
1707 DenseSet<const ContextNode *> &Visited,
1708 DenseMap<uint64_t, std::vector<CallContextInfo>>
1709 &StackIdToMatchingCalls,
1710 DenseMap<CallInfo, CallInfo> &CallToMatchingCall,
1711 const DenseSet<uint32_t> &ImportantContextIds) {
1712 auto Inserted = Visited.insert(Node);
1713 if (!Inserted.second)
1714 return;
1715 // Post order traversal. Iterate over a copy since we may add nodes and
1716 // therefore new callers during the recursive call, invalidating any
1717 // iterator over the original edge vector. We don't need to process these
1718 // new nodes as they were already processed on creation.
1719 auto CallerEdges = Node->CallerEdges;
1720 for (auto &Edge : CallerEdges) {
1721 // Skip any that have been removed during the recursion.
1722 if (Edge->isRemoved()) {
1723 assert(!is_contained(Node->CallerEdges, Edge));
1724 continue;
1725 }
1726 assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
1727 CallToMatchingCall, ImportantContextIds);
1728 }
1729
1730 // If this node's stack id is in the map, update the graph to contain new
1731 // nodes representing any inlining at interior callsites. Note we move the
1732 // associated context ids over to the new nodes.
1733
1734 // Ignore this node if it is for an allocation or we didn't record any
1735 // stack id lists ending at it.
1736 if (Node->IsAllocation ||
1737 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1738 return;
1739
1740 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1741 // Handle the simple case first. A single call with a single stack id.
1742 // In this case there is no need to create any new context nodes, simply
1743 // assign the context node for stack id to this Call.
1744 if (Calls.size() == 1) {
1745 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1746 if (Ids.size() == 1) {
1747 assert(SavedContextIds.empty());
1748 // It should be this Node
1749 assert(Node == getNodeForStackId(Ids[0]));
1750 if (Node->Recursive)
1751 return;
1752 Node->setCall(Call);
1753 NonAllocationCallToContextNodeMap[Call] = Node;
1754 NodeToCallingFunc[Node] = Func;
1755 recordStackNode(Ids, Node, Node->getContextIds(), ImportantContextIds);
1756 return;
1757 }
1758 }
1759
1760#ifndef NDEBUG
1761 // Find the node for the last stack id, which should be the same
1762 // across all calls recorded for this id, and is this node's id.
1763 uint64_t LastId = Node->OrigStackOrAllocId;
1764 ContextNode *LastNode = getNodeForStackId(LastId);
1765 // We should only have kept stack ids that had nodes.
1766 assert(LastNode);
1767 assert(LastNode == Node);
1768#else
1769 ContextNode *LastNode = Node;
1770#endif
1771
1772 // Compute the last node's context ids once, as it is shared by all calls in
1773 // this entry.
1774 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1775
1776 [[maybe_unused]] bool PrevIterCreatedNode = false;
1777 bool CreatedNode = false;
1778 for (unsigned I = 0; I < Calls.size();
1779 I++, PrevIterCreatedNode = CreatedNode) {
1780 CreatedNode = false;
1781 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1782 // Skip any for which we didn't assign any ids, these don't get a node in
1783 // the graph.
1784 if (SavedContextIds.empty()) {
1785 // If this call has a matching call (located in the same function and
1786 // having the same stack ids), simply add it to the context node created
1787 // for its matching call earlier. These can be treated the same through
1788 // cloning and get updated at the same time.
1789 if (!CallToMatchingCall.contains(Call))
1790 continue;
1791 auto MatchingCall = CallToMatchingCall[Call];
1792 if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
1793 // This should only happen if we had a prior iteration, and it didn't
1794 // create a node because of the below recomputation of context ids
1795 // finding none remaining and continuing early.
1796 assert(I > 0 && !PrevIterCreatedNode);
1797 continue;
1798 }
1799 NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
1800 Call);
1801 continue;
1802 }
1803
1804 assert(LastId == Ids.back());
1805
1806 // Recompute the context ids for this stack id sequence (the
1807 // intersection of the context ids of the corresponding nodes).
1808 // Start with the ids we saved in the map for this call, which could be
1809 // duplicated context ids. We have to recompute as we might have overlap
1810 // between the saved context ids for different last nodes, and
1811 // removed them already during the post order traversal.
1812 set_intersect(SavedContextIds, LastNodeContextIds);
1813 ContextNode *PrevNode = LastNode;
1814 bool Skip = false;
1815 // Iterate backwards through the stack Ids, starting after the last Id
1816 // in the list, which was handled once outside for all Calls.
1817 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1818 auto Id = *IdIter;
1819 ContextNode *CurNode = getNodeForStackId(Id);
1820 // We should only have kept stack ids that had nodes and weren't
1821 // recursive.
1822 assert(CurNode);
1823 assert(!CurNode->Recursive);
1824
1825 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1826 if (!Edge) {
1827 Skip = true;
1828 break;
1829 }
1830 PrevNode = CurNode;
1831
1832 // Update the context ids, which is the intersection of the ids along
1833 // all edges in the sequence.
1834 set_intersect(SavedContextIds, Edge->getContextIds());
1835
1836 // If we now have no context ids for clone, skip this call.
1837 if (SavedContextIds.empty()) {
1838 Skip = true;
1839 break;
1840 }
1841 }
1842 if (Skip)
1843 continue;
1844
1845 // Create new context node.
1846 ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, Func, Call);
1847 NonAllocationCallToContextNodeMap[Call] = NewNode;
1848 CreatedNode = true;
1849 NewNode->AllocTypes = computeAllocType(SavedContextIds);
1850
1851 ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1852 assert(FirstNode);
1853
1854 // Connect to callees of innermost stack frame in inlined call chain.
1855 // This updates context ids for FirstNode's callee's to reflect those
1856 // moved to NewNode.
1857 connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);
1858
1859 // Connect to callers of outermost stack frame in inlined call chain.
1860 // This updates context ids for FirstNode's caller's to reflect those
1861 // moved to NewNode.
1862 connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
1863
1864 // Now we need to remove context ids from edges/nodes between First and
1865 // Last Node.
1866 PrevNode = nullptr;
1867 for (auto Id : Ids) {
1868 ContextNode *CurNode = getNodeForStackId(Id);
1869 // We should only have kept stack ids that had nodes.
1870 assert(CurNode);
1871
1872 // Remove the context ids moved to NewNode from CurNode, and the
1873 // edge from the prior node.
1874 if (PrevNode) {
1875 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1876 // If the sequence contained recursion, we might have already removed
1877 // some edges during the connectNewNode calls above.
1878 if (!PrevEdge) {
1879 PrevNode = CurNode;
1880 continue;
1881 }
1882 set_subtract(PrevEdge->getContextIds(), SavedContextIds);
1883 if (PrevEdge->getContextIds().empty())
1884 removeEdgeFromGraph(PrevEdge);
1885 }
1886 // Since we update the edges from leaf to tail, only look at the callee
1887 // edges. This isn't an alloc node, so if there are no callee edges, the
1888 // alloc type is None.
1889 CurNode->AllocTypes = CurNode->CalleeEdges.empty()
1890 ? (uint8_t)AllocationType::None
1891 : CurNode->computeAllocType();
1892 PrevNode = CurNode;
1893 }
1894
1895 recordStackNode(Ids, NewNode, SavedContextIds, ImportantContextIds);
1896
1897 if (VerifyNodes) {
1898 checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
1899 for (auto Id : Ids) {
1900 ContextNode *CurNode = getNodeForStackId(Id);
1901 // We should only have kept stack ids that had nodes.
1902 assert(CurNode);
1903 checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
1904 }
1905 }
1906 }
1907}
1908
1909template <typename DerivedCCG, typename FuncTy, typename CallTy>
1910void CallsiteContextGraph<DerivedCCG, FuncTy,
1911 CallTy>::fixupImportantContexts() {
1912 if (ImportantContextIdInfo.empty())
1913 return;
1914
1915 // Update statistics as we are done building this map at this point.
1916 NumImportantContextIds = ImportantContextIdInfo.size();
1917
1919 return;
1920
1921 if (ExportToDot)
1922 exportToDot("beforestackfixup");
1923
1924 // For each context we identified as important, walk through the saved context
1925 // stack ids in order from leaf upwards, and make sure all edges are correct.
1926 // These can be difficult to get right when updating the graph while mapping
1927 // nodes onto summary or IR, especially when there is recursion. In
1928 // particular, when we have created new nodes to reflect inlining, it is
1929 // sometimes impossible to know exactly how to update the edges in the face of
1930 // recursion, as we have lost the original ordering of the stack ids in the
1931 // contexts.
1932 // TODO: Consider only doing this if we detect the context has recursive
1933 // cycles.
1934 //
1935 // I.e. assume we have a context with stack ids like: {A B A C A D E}
1936 // and let's say A was inlined into B, C, and D. The original graph will have
1937 // multiple recursive cycles through A. When we match the original context
1938 // nodes onto the IR or summary, we will merge {A B} into one context node,
1939 // {A C} onto another, and {A D} onto another. Looking at the stack sequence
1940 // above, we should end up with a non-cyclic set of edges like:
1941 // {AB} <- {AC} <- {AD} <- E. However, because we normally have lost the
1942 // original ordering, we won't get the edges correct initially (it's
1943 // impossible without the original ordering). Here we do the fixup (add and
1944 // removing edges where necessary) for this context. In the
1945 // ImportantContextInfo struct in this case we should have a MaxLength = 2,
1946 // and map entries for {A B}, {A C}, {A D}, and {E}.
1947 for (auto &[CurContextId, Info] : ImportantContextIdInfo) {
1948 if (Info.StackIdsToNode.empty())
1949 continue;
1950 bool Changed = false;
1951 ContextNode *PrevNode = nullptr;
1952 ContextNode *CurNode = nullptr;
1953 DenseSet<const ContextEdge *> VisitedEdges;
1954 ArrayRef<uint64_t> AllStackIds(Info.StackIds);
1955 // Try to identify what callsite ContextNode maps to which slice of the
1956 // context's ordered stack ids.
1957 for (unsigned I = 0; I < AllStackIds.size(); I++, PrevNode = CurNode) {
1958 // We will do this greedily, trying up to MaxLength stack ids in a row, to
1959 // see if we recorded a context node for that sequence.
1960 auto Len = Info.MaxLength;
1961 auto LenToEnd = AllStackIds.size() - I;
1962 if (Len > LenToEnd)
1963 Len = LenToEnd;
1964 CurNode = nullptr;
1965 // Try to find a recorded context node starting with the longest length
1966 // recorded, and on down until we check for just a single stack node.
1967 for (; Len > 0; Len--) {
1968 // Get the slice of the original stack id sequence to check.
1969 auto CheckStackIds = AllStackIds.slice(I, Len);
1970 auto EntryIt = Info.StackIdsToNode.find(CheckStackIds);
1971 if (EntryIt == Info.StackIdsToNode.end())
1972 continue;
1973 CurNode = EntryIt->second;
1974 // Skip forward so we don't try to look for the ones we just matched.
1975 // We increment by Len - 1, because the outer for loop will increment I.
1976 I += Len - 1;
1977 break;
1978 }
1979 // Give up if we couldn't find a node. Since we need to clone from the
1980 // leaf allocation upwards, there is no sense in doing any more fixup further up
1981 // the context if we couldn't match part of the original stack context
1982 // onto a callsite node.
1983 if (!CurNode)
1984 break;
1985 // No edges to fix up until we have a pair of nodes that should be
1986 // adjacent in the graph.
1987 if (!PrevNode)
1988 continue;
1989 // See if we already have a call edge from CurNode to PrevNode.
1990 auto *CurEdge = PrevNode->findEdgeFromCaller(CurNode);
1991 if (CurEdge) {
1992 // We already have an edge. Make sure it contains this context id.
1993 if (CurEdge->getContextIds().insert(CurContextId).second) {
1994 NumFixupEdgeIdsInserted++;
1995 Changed = true;
1996 }
1997 } else {
1998 // No edge exists - add one.
1999 NumFixupEdgesAdded++;
2000 DenseSet<uint32_t> ContextIds({CurContextId});
2001 auto AllocType = computeAllocType(ContextIds);
2002 auto NewEdge = std::make_shared<ContextEdge>(
2003 PrevNode, CurNode, AllocType, std::move(ContextIds));
2004 PrevNode->CallerEdges.push_back(NewEdge);
2005 CurNode->CalleeEdges.push_back(NewEdge);
2006 // Save the new edge for the below handling.
2007 CurEdge = NewEdge.get();
2008 Changed = true;
2009 }
2010 VisitedEdges.insert(CurEdge);
2011 // Now remove this context id from any other caller edges calling
2012 // PrevNode.
2013 for (auto &Edge : PrevNode->CallerEdges) {
2014 // Skip the edge updating/created above and edges we have already
2015 // visited (due to recursion).
2016 if (Edge.get() != CurEdge && !VisitedEdges.contains(Edge.get()))
2017 Edge->getContextIds().erase(CurContextId);
2018 }
2019 }
2020 if (Changed)
2021 NumFixedContexts++;
2022 }
2023}
2024
2025template <typename DerivedCCG, typename FuncTy, typename CallTy>
2026void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
2027 // Map of stack id to all calls with that as the last (outermost caller)
2028 // callsite id that has a context node (some might not due to pruning
2029 // performed during matching of the allocation profile contexts).
2030 // The CallContextInfo contains the Call and a list of its stack ids with
2031 // ContextNodes, the function containing Call, and the set of context ids
2032 // the analysis will eventually identify for use in any new node created
2033 // for that callsite.
2034 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
2035 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
2036 for (auto &Call : CallsWithMetadata) {
2037 // Ignore allocations, already handled.
2038 if (AllocationCallToContextNodeMap.count(Call))
2039 continue;
2040 auto StackIdsWithContextNodes =
2041 getStackIdsWithContextNodesForCall(Call.call());
2042 // If there were no nodes created for MIBs on allocs (maybe this was in
2043 // the unambiguous part of the MIB stack that was pruned), ignore.
2044 if (StackIdsWithContextNodes.empty())
2045 continue;
2046 // Otherwise, record this Call along with the list of ids for the last
2047 // (outermost caller) stack id with a node.
2048 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
2049 {Call.call(), StackIdsWithContextNodes, Func, {}});
2050 }
2051 }
2052
2053 // First make a pass through all stack ids that correspond to a call,
2054 // as identified in the above loop. Compute the context ids corresponding to
2055 // each of these calls when they correspond to multiple stack ids due to
2056 // inlining. Perform any duplication of context ids required when
2057 // there is more than one call with the same stack ids. Their (possibly newly
2058 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
2059 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
2060 // Save a map from each call to any that are found to match it. I.e. located
2061 // in the same function and have the same (possibly pruned) stack ids. We use
2062 // this to avoid creating extra graph nodes as they can be treated the same.
2063 DenseMap<CallInfo, CallInfo> CallToMatchingCall;
2064 for (auto &It : StackIdToMatchingCalls) {
2065 auto &Calls = It.getSecond();
2066 // Skip single calls with a single stack id. These don't need a new node.
2067 if (Calls.size() == 1) {
2068 auto &Ids = Calls[0].StackIds;
2069 if (Ids.size() == 1)
2070 continue;
2071 }
2072 // In order to do the best and maximal matching of inlined calls to context
2073 // node sequences we will sort the vectors of stack ids in descending order
2074 // of length, and within each length, lexicographically by stack id. The
2075 // latter is so that we can specially handle calls that have identical stack
2076 // id sequences (either due to cloning or artificially because of the MIB
2077 // context pruning). Those with the same Ids are then sorted by function to
2078 // facilitate efficiently mapping them to the same context node.
2079 // Because the functions are pointers, to get a deterministic sort order first assign
2080 // each function pointer to its first index in the Calls array, and then use
2081 // that to sort by.
2082 DenseMap<const FuncTy *, unsigned> FuncToIndex;
2083 for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
2084 FuncToIndex.insert({CallCtxInfo.Func, Idx});
2085 llvm::stable_sort(
2086 Calls,
2087 [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
2088 return A.StackIds.size() > B.StackIds.size() ||
2089 (A.StackIds.size() == B.StackIds.size() &&
2090 (A.StackIds < B.StackIds ||
2091 (A.StackIds == B.StackIds &&
2092 FuncToIndex[A.Func] < FuncToIndex[B.Func])));
2093 });
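// For example: stack id sequences {1,2,3}, {1,3}, {1,2}, {4} sort as
// {1,2,3}, {1,2}, {1,3}, {4} - longest first, then lexicographically by stack
// id, with identical sequences ordered by the first index of their containing
// function in Calls (a deterministic stand-in for the function pointer).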
2094
2095 // Find the node for the last stack id, which should be the same
2096 // across all calls recorded for this id, and is the id for this
2097 // entry in the StackIdToMatchingCalls map.
2098 uint64_t LastId = It.getFirst();
2099 ContextNode *LastNode = getNodeForStackId(LastId);
2100 // We should only have kept stack ids that had nodes.
2101 assert(LastNode);
2102
2103 if (LastNode->Recursive)
2104 continue;
2105
2106 // Initialize the context ids with the last node's. We will subsequently
2107 // refine the context ids by computing the intersection along all edges.
2108 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
2109 assert(!LastNodeContextIds.empty());
2110
2111#ifndef NDEBUG
2112 // Save the set of functions seen for a particular set of the same stack
2113 // ids. This is used to ensure that they have been correctly sorted to be
2114 // adjacent in the Calls list, since we rely on that to efficiently place
2115 // all such matching calls onto the same context node.
2116 DenseSet<const FuncTy *> MatchingIdsFuncSet;
2117#endif
2118
2119 for (unsigned I = 0; I < Calls.size(); I++) {
2120 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
2121 assert(SavedContextIds.empty());
2122 assert(LastId == Ids.back());
2123
2124#ifndef NDEBUG
2125 // If this call has a different set of ids than the last one, clear the
2126 // set used to ensure they are sorted properly.
2127 if (I > 0 && Ids != Calls[I - 1].StackIds)
2128 MatchingIdsFuncSet.clear();
2129#endif
2130
2131 // First compute the context ids for this stack id sequence (the
2132 // intersection of the context ids of the corresponding nodes).
2133 // Start with the remaining saved ids for the last node.
2134 assert(!LastNodeContextIds.empty());
2135 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
2136
2137 ContextNode *PrevNode = LastNode;
2138 ContextNode *CurNode = LastNode;
2139 bool Skip = false;
2140
2141 // Iterate backwards through the stack Ids, starting after the last Id
2142 // in the list, which was handled once outside for all Calls.
2143 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
2144 auto Id = *IdIter;
2145 CurNode = getNodeForStackId(Id);
2146 // We should only have kept stack ids that had nodes.
2147 assert(CurNode);
2148
2149 if (CurNode->Recursive) {
2150 Skip = true;
2151 break;
2152 }
2153
2154 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
2155 // If there is no edge then the nodes belong to different MIB contexts,
2156 // and we should skip this inlined context sequence. For example, this
2157 // particular inlined context may include stack ids A->B, and we may
2158 // indeed have nodes for both A and B, but it is possible that they were
2159 // never profiled in sequence in a single MIB for any allocation (i.e.
2160 // we might have profiled an allocation that involves the callsite A,
2161 // but through a different one of its callee callsites, and we might
2162 // have profiled an allocation that involves callsite B, but reached
2163 // from a different caller callsite).
2164 if (!Edge) {
2165 Skip = true;
2166 break;
2167 }
2168 PrevNode = CurNode;
2169
2170 // Update the context ids, which is the intersection of the ids along
2171 // all edges in the sequence.
2172 set_intersect(StackSequenceContextIds, Edge->getContextIds());
2173
2174 // If we now have no context ids for clone, skip this call.
2175 if (StackSequenceContextIds.empty()) {
2176 Skip = true;
2177 break;
2178 }
2179 }
2180 if (Skip)
2181 continue;
2182
2183 // If some of this call's stack ids did not have corresponding nodes (due
2184 // to pruning), don't include any context ids for contexts that extend
2185 // beyond these nodes. Otherwise we would be matching part of unrelated /
2186 // not fully matching stack contexts. To do this, subtract any context ids
2187 // found in caller nodes of the last node found above.
2188 if (Ids.back() != getLastStackId(Call)) {
2189 for (const auto &PE : LastNode->CallerEdges) {
2190 set_subtract(StackSequenceContextIds, PE->getContextIds());
2191 if (StackSequenceContextIds.empty())
2192 break;
2193 }
2194 // If we now have no context ids for clone, skip this call.
2195 if (StackSequenceContextIds.empty())
2196 continue;
2197 }
2198
2199#ifndef NDEBUG
2200 // If the prior call had the same stack ids this set would not be empty.
2201 // Check if we already have a call that "matches" because it is located
2202 // in the same function. If the Calls list was sorted properly we should
2203 // not encounter this situation as all such entries should be adjacent
2204 // and processed in bulk further below.
2205 assert(!MatchingIdsFuncSet.contains(Func));
2206
2207 MatchingIdsFuncSet.insert(Func);
2208#endif
2209
2210 // Check if the next set of stack ids is the same (since the Calls vector
2211 // of tuples is sorted by the stack ids we can just look at the next one).
2212 // If so, save them in the CallToMatchingCall map so that they get
2213 // assigned to the same context node, and skip them.
2214 bool DuplicateContextIds = false;
2215 for (unsigned J = I + 1; J < Calls.size(); J++) {
2216 auto &CallCtxInfo = Calls[J];
2217 auto &NextIds = CallCtxInfo.StackIds;
2218 if (NextIds != Ids)
2219 break;
2220 auto *NextFunc = CallCtxInfo.Func;
2221 if (NextFunc != Func) {
2222 // We have another Call with the same ids but that cannot share this
2223 // node, must duplicate ids for it.
2224 DuplicateContextIds = true;
2225 break;
2226 }
2227 auto &NextCall = CallCtxInfo.Call;
2228 CallToMatchingCall[NextCall] = Call;
2229 // Update I so that it gets incremented correctly to skip this call.
2230 I = J;
2231 }
2232
2233 // If we don't have duplicate context ids, then we can assign all the
2234 // context ids computed for the original node sequence to this call.
2235 // If there are duplicate calls with the same stack ids then we synthesize
2236 // new context ids that are duplicates of the originals. These are
2237 // assigned to SavedContextIds, which is a reference into the map entry
2238 // for this call, allowing us to access these ids later on.
2239 OldToNewContextIds.reserve(OldToNewContextIds.size() +
2240 StackSequenceContextIds.size());
2241 SavedContextIds =
2242 DuplicateContextIds
2243 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
2244 : StackSequenceContextIds;
2245 assert(!SavedContextIds.empty());
2246
2247 if (!DuplicateContextIds) {
2248 // Update saved last node's context ids to remove those that are
2249 // assigned to other calls, so that it is ready for the next call at
2250 // this stack id.
2251 set_subtract(LastNodeContextIds, StackSequenceContextIds);
2252 if (LastNodeContextIds.empty())
2253 break;
2254 }
2255 }
2256 }
2257
2258 // Propagate the duplicate context ids over the graph.
2259 propagateDuplicateContextIds(OldToNewContextIds);
2260
2261 if (VerifyCCG)
2262 check();
2263
2264 // Now perform a post-order traversal over the graph, starting with the
2265 // allocation nodes, essentially processing nodes from callers to callees.
2266 // For any that contains an id in the map, update the graph to contain new
2267 // nodes representing any inlining at interior callsites. Note we move the
2268 // associated context ids over to the new nodes.
2269 DenseSet<const ContextNode *> Visited;
2270 DenseSet<uint32_t> ImportantContextIds(llvm::from_range,
2271 ImportantContextIdInfo.keys());
2272 for (auto &Entry : AllocationCallToContextNodeMap)
2273 assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
2274 CallToMatchingCall, ImportantContextIds);
2275
2276 fixupImportantContexts();
2277
2278 if (VerifyCCG)
2279 check();
2280}
2281
2282uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
2283 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2284 Call->getMetadata(LLVMContext::MD_callsite));
2285 return CallsiteContext.back();
2286}
2287
2288uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
2289 assert(isa<CallsiteInfo *>(Call));
2290 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2291 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
2292 // Need to convert index into stack id.
2293 return Index.getStackIdAtIndex(CallsiteContext.back());
2294}
2295
2296static const std::string MemProfCloneSuffix = ".memprof.";
2297
2298static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
2299 // We use CloneNo == 0 to refer to the original version, which doesn't get
2300 // renamed with a suffix.
2301 if (!CloneNo)
2302 return Base.str();
2303 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
2304}
2305
2306static bool isMemProfClone(const Function &F) {
2307 return F.getName().contains(MemProfCloneSuffix);
2308}
2309
2310// Return the clone number of the given function by extracting it from the
2311// memprof suffix. Assumes the caller has already confirmed it is a memprof
2312// clone.
2313static unsigned getMemProfCloneNum(const Function &F) {
2314 assert(isMemProfClone(F));
2315 auto Pos = F.getName().find_last_of('.');
2316 assert(Pos > 0);
2317 unsigned CloneNo;
2318 bool Err = F.getName().drop_front(Pos + 1).getAsInteger(10, CloneNo);
2319 assert(!Err);
2320 (void)Err;
2321 return CloneNo;
2322}
2323
2324std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
2325 const Instruction *Call,
2326 unsigned CloneNo) const {
2327 return (Twine(Call->getFunction()->getName()) + " -> " +
2328 cast<CallBase>(Call)->getCalledFunction()->getName())
2329 .str();
2330}
2331
2332std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
2333 const IndexCall &Call,
2334 unsigned CloneNo) const {
2335 auto VI = FSToVIMap.find(Func);
2336 assert(VI != FSToVIMap.end());
2337 std::string CallerName = getMemProfFuncName(VI->second.name(), CloneNo);
2338 if (isa<AllocInfo *>(Call))
2339 return CallerName + " -> alloc";
2340 else {
2341 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call);
2342 return CallerName + " -> " +
2343 getMemProfFuncName(Callsite->Callee.name(),
2344 Callsite->Clones[CloneNo]);
2345 }
2346}
2347
2348std::vector<uint64_t>
2349ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
2350 Instruction *Call) {
2351 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2352 Call->getMetadata(LLVMContext::MD_callsite));
2353 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
2354 CallsiteContext);
2355}
2356
2357std::vector<uint64_t>
2358IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
2359 assert(isa<CallsiteInfo *>(Call));
2360 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2361 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
2362 return getStackIdsWithContextNodes<CallsiteInfo,
2363 SmallVector<unsigned>::const_iterator>(
2364 CallsiteContext);
2365}
2366
2367template <typename DerivedCCG, typename FuncTy, typename CallTy>
2368template <class NodeT, class IteratorT>
2369std::vector<uint64_t>
2370CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
2371 CallStack<NodeT, IteratorT> &CallsiteContext) {
2372 std::vector<uint64_t> StackIds;
2373 for (auto IdOrIndex : CallsiteContext) {
2374 auto StackId = getStackId(IdOrIndex);
2375 ContextNode *Node = getNodeForStackId(StackId);
2376 if (!Node)
2377 break;
2378 StackIds.push_back(StackId);
2379 }
2380 return StackIds;
2381}
2382
2383ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
2384 Module &M,
2385 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
2386 : Mod(M), OREGetter(OREGetter) {
2387 // Map for keeping track of the largest cold contexts up to the number given
2388 // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys
2389 // must be sorted.
2390 std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold;
2391 for (auto &F : M) {
2392 std::vector<CallInfo> CallsWithMetadata;
2393 for (auto &BB : F) {
2394 for (auto &I : BB) {
2395 if (!isa<CallBase>(I))
2396 continue;
2397 if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
2398 CallsWithMetadata.push_back(&I);
2399 auto *AllocNode = addAllocNode(&I, &F);
2400 auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
2401 assert(CallsiteMD);
2402 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
2403 // Add all of the MIBs and their stack nodes.
2404 for (auto &MDOp : MemProfMD->operands()) {
2405 auto *MIBMD = cast<const MDNode>(MDOp);
2406 std::vector<ContextTotalSize> ContextSizeInfo;
2407 // Collect the context size information if it exists.
2408 if (MIBMD->getNumOperands() > 2) {
2409 for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
2410 MDNode *ContextSizePair =
2411 dyn_cast<MDNode>(MIBMD->getOperand(I));
2412 assert(ContextSizePair->getNumOperands() == 2);
2413 uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
2414 ContextSizePair->getOperand(0))
2415 ->getZExtValue();
2416 uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>(
2417 ContextSizePair->getOperand(1))
2418 ->getZExtValue();
2419 ContextSizeInfo.push_back({FullStackId, TotalSize});
2420 }
2421 }
2422 MDNode *StackNode = getMIBStackNode(MIBMD);
2423 assert(StackNode);
2424 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
2425 addStackNodesForMIB<MDNode, MDNode::op_iterator>(
2426 AllocNode, StackContext, CallsiteContext,
2427 getMIBAllocType(MIBMD), ContextSizeInfo,
2428 TotalSizeToContextIdTopNCold);
2429 }
2430 // If exporting the graph to dot and an allocation id of interest was
2431 // specified, record all the context ids for this allocation node.
2432 if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2433 DotAllocContextIds = AllocNode->getContextIds();
2434 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2435 // Memprof and callsite metadata on memory allocations no longer
2436 // needed.
2437 I.setMetadata(LLVMContext::MD_memprof, nullptr);
2438 I.setMetadata(LLVMContext::MD_callsite, nullptr);
2439 }
2440 // For callsite metadata, add to list for this function for later use.
2441 else if (I.getMetadata(LLVMContext::MD_callsite)) {
2442 CallsWithMetadata.push_back(&I);
2443 }
2444 }
2445 }
2446 if (!CallsWithMetadata.empty())
2447 FuncToCallsWithMetadata[&F] = CallsWithMetadata;
2448 }
2449
2450 if (DumpCCG) {
2451 dbgs() << "CCG before updating call stack chains:\n";
2452 dbgs() << *this;
2453 }
2454
2455 if (ExportToDot)
2456 exportToDot("prestackupdate");
2457
2458 updateStackNodes();
2459
2460 if (ExportToDot)
2461 exportToDot("poststackupdate");
2462
2463 handleCallsitesWithMultipleTargets();
2464
2465 markBackedges();
2466
2467 // Strip off remaining callsite metadata, no longer needed.
2468 for (auto &FuncEntry : FuncToCallsWithMetadata)
2469 for (auto &Call : FuncEntry.second)
2470 Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
2471}
2472
2473// Finds the set of GUIDs for weak aliasees that are prevailing in different
2474// modules than any of their aliases. We need to handle these specially.
2475 DenseSet<GlobalValue::GUID>
2476 IndexCallsiteContextGraph::findAliaseeGUIDsPrevailingInDifferentModule() {
2477 DenseSet<GlobalValue::GUID> AliaseeGUIDs;
2478 for (auto &I : Index) {
2479 auto VI = Index.getValueInfo(I);
2480 for (auto &S : VI.getSummaryList()) {
2481 // We only care about aliases to functions.
2482 auto *AS = dyn_cast<AliasSummary>(S.get());
2483 if (!AS)
2484 continue;
2485 auto *AliaseeSummary = &AS->getAliasee();
2486 auto *AliaseeFS = dyn_cast<FunctionSummary>(AliaseeSummary);
2487 if (!AliaseeFS)
2488 continue;
2489 // Skip this summary if it is not for the prevailing symbol for this GUID.
2490 // The linker doesn't resolve local linkage values so don't check whether
2491 // those are prevailing.
2492 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2493 !isPrevailing(VI.getGUID(), S.get()))
2494 continue;
2495 // Prevailing aliasee could be in a different module only if it is weak.
2496 if (!GlobalValue::isWeakForLinker(AliaseeSummary->linkage()))
2497 continue;
2498 auto AliaseeGUID = AS->getAliaseeGUID();
2499 // If the aliasee copy in this module is not prevailing, record it.
2500 if (!isPrevailing(AliaseeGUID, AliaseeSummary))
2501 AliaseeGUIDs.insert(AliaseeGUID);
2502 }
2503 }
2504 AliaseesPrevailingInDiffModuleFromAlias += AliaseeGUIDs.size();
2505 return AliaseeGUIDs;
2506}
2507
2508IndexCallsiteContextGraph::IndexCallsiteContextGraph(
2509 ModuleSummaryIndex &Index,
2510 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
2511 isPrevailing)
2512 : Index(Index), isPrevailing(isPrevailing) {
2513 // Since we use the aliasee summary info to create the necessary clones for
2514 // its aliases, conservatively skip recording the aliasee function's callsites
2515 // in the CCG for any that are prevailing in a different module than one of
2516 // its aliases. We could record the necessary information to do this in the
2517 // summary, but this case should not be common.
2518 DenseSet<GlobalValue::GUID> GUIDsToSkip =
2519 findAliaseeGUIDsPrevailingInDifferentModule();
2520 // Map for keeping track of the largest cold contexts up to the number given
2521 // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys
2522 // must be sorted.
2523 std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold;
2524 for (auto &I : Index) {
2525 auto VI = Index.getValueInfo(I);
2526 if (GUIDsToSkip.contains(VI.getGUID()))
2527 continue;
2528 for (auto &S : VI.getSummaryList()) {
2529 // We should only add the prevailing nodes. Otherwise we may try to clone
2530 // in a weak copy that won't be linked (and may be different than the
2531 // prevailing version).
2532 // We only keep the memprof summary on the prevailing copy now when
2533 // building the combined index, as a space optimization; however, don't
2534 // rely on this optimization. The linker doesn't resolve local linkage
2535 // values so don't check whether those are prevailing.
2536 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2537 !isPrevailing(VI.getGUID(), S.get()))
2538 continue;
2539 auto *FS = dyn_cast<FunctionSummary>(S.get());
2540 if (!FS)
2541 continue;
2542 std::vector<CallInfo> CallsWithMetadata;
2543 if (!FS->allocs().empty()) {
2544 for (auto &AN : FS->mutableAllocs()) {
2545 // This can happen because of recursion elimination handling that
2546 // currently exists in ModuleSummaryAnalysis. Skip these for now.
2547 // We still added them to the summary because we need to be able to
2548 // correlate properly in applyImport in the backends.
2549 if (AN.MIBs.empty())
2550 continue;
2551 IndexCall AllocCall(&AN);
2552 CallsWithMetadata.push_back(AllocCall);
2553 auto *AllocNode = addAllocNode(AllocCall, FS);
2554 // Pass an empty CallStack to the CallsiteContext (second)
2555 // parameter, since for ThinLTO we already collapsed out the inlined
2556 // stack ids on the allocation call during ModuleSummaryAnalysis.
2557 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2558 EmptyContext;
2559 unsigned I = 0;
2560 assert(!metadataMayIncludeContextSizeInfo() ||
2561 AN.ContextSizeInfos.size() == AN.MIBs.size());
2562 // Now add all of the MIBs and their stack nodes.
2563 for (auto &MIB : AN.MIBs) {
2564 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2565 StackContext(&MIB);
2566 std::vector<ContextTotalSize> ContextSizeInfo;
2567 if (!AN.ContextSizeInfos.empty()) {
2568 for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
2569 ContextSizeInfo.push_back({FullStackId, TotalSize});
2570 }
2571 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
2572 AllocNode, StackContext, EmptyContext, MIB.AllocType,
2573 ContextSizeInfo, TotalSizeToContextIdTopNCold);
2574 I++;
2575 }
2576 // If exporting the graph to dot and an allocation id of interest was
2577 // specified, record all the context ids for this allocation node.
2578 if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2579 DotAllocContextIds = AllocNode->getContextIds();
2580 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2581 // Initialize version 0 on the summary alloc node to the current alloc
2582 // type, unless it has both types in which case make it default, so
2583 // that, in the case where we aren't able to clone, the original version
2584 // always ends up with the default allocation behavior.
2585 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
2586 }
2587 }
2588 // For callsite metadata, add to list for this function for later use.
2589 if (!FS->callsites().empty())
2590 for (auto &SN : FS->mutableCallsites()) {
2591 IndexCall StackNodeCall(&SN);
2592 CallsWithMetadata.push_back(StackNodeCall);
2593 }
2594
2595 if (!CallsWithMetadata.empty())
2596 FuncToCallsWithMetadata[FS] = CallsWithMetadata;
2597
2598 if (!FS->allocs().empty() || !FS->callsites().empty())
2599 FSToVIMap[FS] = VI;
2600 }
2601 }
2602
2603 if (DumpCCG) {
2604 dbgs() << "CCG before updating call stack chains:\n";
2605 dbgs() << *this;
2606 }
2607
2608 if (ExportToDot)
2609 exportToDot("prestackupdate");
2610
2611 updateStackNodes();
2612
2613 if (ExportToDot)
2614 exportToDot("poststackupdate");
2615
2616 handleCallsitesWithMultipleTargets();
2617
2618 markBackedges();
2619}
2620
2621template <typename DerivedCCG, typename FuncTy, typename CallTy>
2622void CallsiteContextGraph<DerivedCCG, FuncTy,
2623 CallTy>::handleCallsitesWithMultipleTargets() {
2624 // Look for and workaround callsites that call multiple functions.
2625 // This can happen for indirect calls, which needs better handling, and in
2626 // more rare cases (e.g. macro expansion).
2627 // TODO: To fix this for indirect calls we will want to perform speculative
2628 // devirtualization using either the normal PGO info with ICP, or using the
2629 // information in the profiled MemProf contexts. We can do this prior to
2630 // this transformation for regular LTO, and for ThinLTO we can simulate that
2631 // effect in the summary and perform the actual speculative devirtualization
2632 // while cloning in the ThinLTO backend.
2633
2634 // Keep track of the new nodes synthesized for discovered tail calls missing
2635 // from the profiled contexts.
2636 MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
2637
2638 std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode;
2639 for (auto &Entry : NonAllocationCallToContextNodeMap) {
2640 auto *Node = Entry.second;
2641 assert(Node->Clones.empty());
2642 // Check all node callees and see if in the same function.
2643 // We need to check all of the calls recorded in this Node, because in some
2644 // cases we may have had multiple calls with the same debug info calling
2645 // different callees. This can happen, for example, when an object is
2646 // constructed in the parameter list - the destructor call of the object has
2647 // the same debug info (line/col) as the call the object was passed to.
2648 // Here we will prune any that don't match all callee nodes.
2649 std::vector<CallInfo> AllCalls;
2650 AllCalls.reserve(Node->MatchingCalls.size() + 1);
2651 AllCalls.push_back(Node->Call);
2652 llvm::append_range(AllCalls, Node->MatchingCalls);
2653
2654 // First see if we can partition the calls by callee function, creating new
2655 // nodes to host each set of calls calling the same callees. This is
2656 // necessary to support indirect calls with ThinLTO, for which we
2657 // synthesized CallsiteInfo records for each target. They will all have the
2658 // same callsite stack ids and would be sharing a context node at this
2659 // point. We need to perform separate cloning for each, which will be
2660 // applied along with speculative devirtualization in the ThinLTO backends
2661 // as needed. Note this does not currently support looking through tail
2662 // calls, it is unclear if we need that for indirect call targets.
2663 // First partition calls by callee func. Map indexed by func, value is
2664 // struct with list of matching calls, assigned node.
2665 if (partitionCallsByCallee(Node, AllCalls, NewCallToNode))
2666 continue;
2667
2668 auto It = AllCalls.begin();
2669 // Iterate through the calls until we find the first that matches.
2670 for (; It != AllCalls.end(); ++It) {
2671 auto ThisCall = *It;
2672 bool Match = true;
2673 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
2674 ++EI) {
2675 auto Edge = *EI;
2676 if (!Edge->Callee->hasCall())
2677 continue;
2678 assert(NodeToCallingFunc.count(Edge->Callee));
2679 // Check if the called function matches that of the callee node.
2680 if (!calleesMatch(ThisCall.call(), EI, TailCallToContextNodeMap)) {
2681 Match = false;
2682 break;
2683 }
2684 }
2685 // Found a call that matches the callee nodes, we can quit now.
2686 if (Match) {
2687 // If the first match is not the primary call on the Node, update it
2688 // now. We will update the list of matching calls further below.
2689 if (Node->Call != ThisCall) {
2690 Node->setCall(ThisCall);
2691 // We need to update the NonAllocationCallToContextNodeMap, but don't
2692 // want to do this during iteration over that map, so save the calls
2693 // that need updated entries.
2694 NewCallToNode.push_back({ThisCall, Node});
2695 }
2696 break;
2697 }
2698 }
2699 // We will update this list below (or leave it cleared if there was no
2700 // match found above).
2701 Node->MatchingCalls.clear();
2702 // If we hit the end of the AllCalls vector, no call matching the callee
2703 // nodes was found, clear the call information in the node.
2704 if (It == AllCalls.end()) {
2705 RemovedEdgesWithMismatchedCallees++;
2706 // Work around by setting Node to have a null call, so it gets
2707 // skipped during cloning. Otherwise assignFunctions will assert
2708 // because its data structures are not designed to handle this case.
2709 Node->setCall(CallInfo());
2710 continue;
2711 }
2712 // Now add back any matching calls that call the same function as the
2713 // matching primary call on Node.
2714 for (++It; It != AllCalls.end(); ++It) {
2715 auto ThisCall = *It;
2716 if (!sameCallee(Node->Call.call(), ThisCall.call()))
2717 continue;
2718 Node->MatchingCalls.push_back(ThisCall);
2719 }
2720 }
2721
2722 // Remove all mismatched nodes identified in the above loop from the node map
2723 // (checking whether they have a null call which is set above). For a
2724 // MapVector like NonAllocationCallToContextNodeMap it is much more efficient
2725 // to do the removal via remove_if than by individually erasing entries above.
2726 // Also remove any entries if we updated the node's primary call above.
2727 NonAllocationCallToContextNodeMap.remove_if([](const auto &it) {
2728 return !it.second->hasCall() || it.second->Call != it.first;
2729 });
2730
2731 // Add entries for any new primary calls recorded above.
2732 for (auto &[Call, Node] : NewCallToNode)
2733 NonAllocationCallToContextNodeMap[Call] = Node;
2734
2735 // Add the new nodes after the above loop so that the iteration is not
2736 // invalidated.
2737 for (auto &[Call, Node] : TailCallToContextNodeMap)
2738 NonAllocationCallToContextNodeMap[Call] = Node;
2739}
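// For example (Foo and Bar are hypothetical functions): if the profiled
// context recorded a call to Foo at this callsite, but the IR call with the
// matching debug location actually invokes Bar and no unique tail call chain
// from Bar reaches Foo, then no entry in AllCalls matches all callee nodes.
// The loop above then clears the node's call via setCall(CallInfo()), bumps
// RemovedEdgesWithMismatchedCallees, and the remove_if prunes the node from
// NonAllocationCallToContextNodeMap so that cloning skips it.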
2740
2741template <typename DerivedCCG, typename FuncTy, typename CallTy>
2742bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee(
2743 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
2744 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) {
2745 // Struct to keep track of all the calls having the same callee function,
2746 // and the node we eventually assign to them. Eventually we will record the
2747 // context node assigned to this group of calls.
2748 struct CallsWithSameCallee {
2749 std::vector<CallInfo> Calls;
2750 ContextNode *Node = nullptr;
2751 };
2752
2753 // First partition calls by callee function. Build map from each function
2754 // to the list of matching calls.
2755 DenseMap<const FuncTy *, CallsWithSameCallee> CalleeFuncToCallInfo;
2756 for (auto ThisCall : AllCalls) {
2757 auto *F = getCalleeFunc(ThisCall.call());
2758 if (F)
2759 CalleeFuncToCallInfo[F].Calls.push_back(ThisCall);
2760 }
2761
2762 // Next, walk through all callee edges. For each callee node, get its
2763 // containing function and see if it was recorded in the above map (meaning we
2764 // have at least one matching call). Build another map from each callee node
2765 // with a matching call to the structure instance created above containing all
2766 // the calls.
2767 DenseMap<ContextNode *, CallsWithSameCallee *> CalleeNodeToCallInfo;
2768 for (const auto &Edge : Node->CalleeEdges) {
2769 if (!Edge->Callee->hasCall())
2770 continue;
2771 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2772 if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc))
2773 CalleeNodeToCallInfo[Edge->Callee] =
2774 &CalleeFuncToCallInfo[ProfiledCalleeFunc];
2775 }
2776
2777 // If there are no entries in the second map, then there were no matching
2778 // calls/callees and nothing to do here. Return so we can go to the handling that
2779 // looks through tail calls.
2780 if (CalleeNodeToCallInfo.empty())
2781 return false;
2782
2783 // Walk through all callee edges again. Any and all callee edges that didn't
2784 // match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a
2785 // new caller node (UnmatchedCalleesNode) which gets a null call so that it is
2786 // ignored during cloning. If it is in the map, then we use the node recorded
2787 // in that entry (creating it if needed), and move the callee edge to it.
2788 // The first callee will use the original node instead of creating a new one.
2789 // Note that any of the original calls on this node (in AllCalls) that didn't
2790 // have a callee function automatically get dropped from the node as part of
2791 // this process.
2792 ContextNode *UnmatchedCalleesNode = nullptr;
2793 // Track whether we already assigned original node to a callee.
2794 bool UsedOrigNode = false;
2795 assert(NodeToCallingFunc[Node]);
2796 // Iterate over a copy of Node's callee edges, since we may need to remove
2797 // edges in moveCalleeEdgeToNewCaller, and this simplifies the handling and
2798 // makes it less error-prone.
2799 auto CalleeEdges = Node->CalleeEdges;
2800 for (auto &Edge : CalleeEdges) {
2801 if (!Edge->Callee->hasCall())
2802 continue;
2803
2804 // Will be updated below to point to whatever (caller) node this callee edge
2805 // should be moved to.
2806 ContextNode *CallerNodeToUse = nullptr;
2807
2808 // Handle the case where there were no matching calls first. Move this
2809 // callee edge to the UnmatchedCalleesNode, creating it if needed.
2810 if (!CalleeNodeToCallInfo.contains(Edge->Callee)) {
2811 if (!UnmatchedCalleesNode)
2812 UnmatchedCalleesNode =
2813 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2814 CallerNodeToUse = UnmatchedCalleesNode;
2815 } else {
2816 // Look up the information recorded for this callee node, and use the
2817 // recorded caller node (creating it if needed).
2818 auto *Info = CalleeNodeToCallInfo[Edge->Callee];
2819 if (!Info->Node) {
2820 // If we haven't assigned any callees to the original node use it.
2821 if (!UsedOrigNode) {
2822 Info->Node = Node;
2823 // Clear the set of matching calls which will be updated below.
2824 Node->MatchingCalls.clear();
2825 UsedOrigNode = true;
2826 } else
2827 Info->Node =
2828 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2829 assert(!Info->Calls.empty());
2830 // The first call becomes the primary call for this caller node, and the
2831 // rest go in the matching calls list.
2832 Info->Node->setCall(Info->Calls.front());
2833 llvm::append_range(Info->Node->MatchingCalls,
2834 llvm::drop_begin(Info->Calls));
2835 // Save the primary call to node correspondence so that we can update
2836 // the NonAllocationCallToContextNodeMap, which is being iterated in the
2837 // caller of this function.
2838 NewCallToNode.push_back({Info->Node->Call, Info->Node});
2839 }
2840 CallerNodeToUse = Info->Node;
2841 }
2842
2843 // Don't need to move the edge if we are using the original node.
2844 if (CallerNodeToUse == Node)
2845 continue;
2846
2847 moveCalleeEdgeToNewCaller(Edge, CallerNodeToUse);
2848 }
2849 // Now that we are done moving edges, clean up any caller edges that ended
2850 // up with no type or context ids. During moveCalleeEdgeToNewCaller all
2851 // caller edges from Node are replicated onto the new callers, and it
2852 // simplifies the handling to leave them until we have moved all
2853 // edges/context ids.
2854 for (auto &I : CalleeNodeToCallInfo)
2855 removeNoneTypeCallerEdges(I.second->Node);
2856 if (UnmatchedCalleesNode)
2857 removeNoneTypeCallerEdges(UnmatchedCalleesNode);
2858 removeNoneTypeCallerEdges(Node);
2859
2860 return true;
2861}
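// A sketch of the effect (A and B are hypothetical indirect call targets):
// with ThinLTO, an indirect call profiled to reach both A and B gets one
// synthesized CallsiteInfo per target, all sharing the same callsite stack
// ids and therefore initially one context node. The partitioning above keeps
// the calls whose callee is A (plus the callee edge into A) on the original
// Node, and moves the calls and callee edge for B to a newly created node, so
// each target can later be cloned and speculatively devirtualized
// independently. Callee edges that match none of the calls are parked on
// UnmatchedCalleesNode with a null call so cloning ignores them.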
2862
2863uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2864 // In the Module (IR) case this is already the Id.
2865 return IdOrIndex;
2866}
2867
2868uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2869 // In the Index case this is an index into the stack id list in the summary
2870 // index, convert it to an Id.
2871 return Index.getStackIdAtIndex(IdOrIndex);
2872}
2873
2874template <typename DerivedCCG, typename FuncTy, typename CallTy>
2875bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
2876 CallTy Call, EdgeIter &EI,
2877 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
2878 auto Edge = *EI;
2879 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2880 const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
2881 // Will be populated in order of callee to caller if we find a chain of tail
2882 // calls between the profiled caller and callee.
2883 std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
2884 if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
2885 FoundCalleeChain))
2886 return false;
2887
2888 // The usual case where the profiled callee matches that of the IR/summary.
2889 if (FoundCalleeChain.empty())
2890 return true;
2891
2892 auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
2893 auto *CurEdge = Callee->findEdgeFromCaller(Caller);
2894 // If there is already an edge between these nodes, simply update it and
2895 // return.
2896 if (CurEdge) {
2897 CurEdge->ContextIds.insert_range(Edge->ContextIds);
2898 CurEdge->AllocTypes |= Edge->AllocTypes;
2899 return;
2900 }
2901 // Otherwise, create a new edge and insert it into the caller and callee
2902 // lists.
2903 auto NewEdge = std::make_shared<ContextEdge>(
2904 Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
2905 Callee->CallerEdges.push_back(NewEdge);
2906 if (Caller == Edge->Caller) {
2907 // If we are inserting the new edge into the current edge's caller, insert
2908 // the new edge before the current iterator position, and then increment
2909 // back to the current edge.
2910 EI = Caller->CalleeEdges.insert(EI, NewEdge);
2911 ++EI;
2912 assert(*EI == Edge &&
2913 "Iterator position not restored after insert and increment");
2914 } else
2915 Caller->CalleeEdges.push_back(NewEdge);
2916 };
2917
2918 // Create new nodes for each found callee and connect in between the profiled
2919 // caller and callee.
2920 auto *CurCalleeNode = Edge->Callee;
2921 for (auto &[NewCall, Func] : FoundCalleeChain) {
2922 ContextNode *NewNode = nullptr;
2923 // First check if we have already synthesized a node for this tail call.
2924 if (TailCallToContextNodeMap.count(NewCall)) {
2925 NewNode = TailCallToContextNodeMap[NewCall];
2926 NewNode->AllocTypes |= Edge->AllocTypes;
2927 } else {
2928 FuncToCallsWithMetadata[Func].push_back({NewCall});
2929 // Create Node and record node info.
2930 NewNode = createNewNode(/*IsAllocation=*/false, Func, NewCall);
2931 TailCallToContextNodeMap[NewCall] = NewNode;
2932 NewNode->AllocTypes = Edge->AllocTypes;
2933 }
2934
2935 // Hook up node to its callee node
2936 AddEdge(NewNode, CurCalleeNode);
2937
2938 CurCalleeNode = NewNode;
2939 }
2940
2941 // Hook up edge's original caller to new callee node.
2942 AddEdge(Edge->Caller, CurCalleeNode);
2943
2944#ifndef NDEBUG
2945 // Save this because Edge's fields get cleared below when removed.
2946 auto *Caller = Edge->Caller;
2947#endif
2948
2949 // Remove old edge
2950 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
2951
2952 // To simplify the increment of EI in the caller, subtract one from EI.
2953 // In the final AddEdge call we would have either added a new callee edge,
2954 // to Edge->Caller, or found an existing one. Either way we are guaranteed
2955 // that there is at least one callee edge.
2956 assert(!Caller->CalleeEdges.empty());
2957 --EI;
2958
2959 return true;
2960}
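// A sketch of the tail call fixup (names are hypothetical): the profile
// records the context Caller -> ProfiledCallee, but in the IR Caller actually
// calls T, which tail calls ProfiledCallee, so T's frame is missing from the
// profiled context. calleeMatchesFunc discovers the chain through T, a node
// for T is synthesized (and cached in TailCallToContextNodeMap), and the
// original edge is replaced by Caller -> T -> ProfiledCallee carrying the
// same context ids and alloc types.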
2961
2962bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2963 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
2964 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
2965 bool &FoundMultipleCalleeChains) {
2966 // Stop recursive search if we have already explored the maximum specified
2967 // depth.
2968 if (Depth > TailCallSearchDepth)
2969 return false;
2970
2971 auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
2972 FoundCalleeChain.push_back({Callsite, F});
2973 };
2974
2975 auto *CalleeFunc = dyn_cast<Function>(CurCallee);
2976 if (!CalleeFunc) {
2977 auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
2978 assert(Alias);
2979 CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
2980 assert(CalleeFunc);
2981 }
2982
2983 // Look for tail calls in this function, and check if they either call the
2984 // profiled callee directly, or indirectly (via a recursive search).
2985 // Only succeed if there is a single unique tail call chain found between the
2986 // profiled caller and callee, otherwise we could perform incorrect cloning.
2987 bool FoundSingleCalleeChain = false;
2988 for (auto &BB : *CalleeFunc) {
2989 for (auto &I : BB) {
2990 auto *CB = dyn_cast<CallBase>(&I);
2991 if (!CB || !CB->isTailCall())
2992 continue;
2993 auto *CalledValue = CB->getCalledOperand();
2994 auto *CalledFunction = CB->getCalledFunction();
2995 if (CalledValue && !CalledFunction) {
2996 CalledValue = CalledValue->stripPointerCasts();
2997 // Stripping pointer casts can reveal a called function.
2998 CalledFunction = dyn_cast<Function>(CalledValue);
2999 }
3000 // Check if this is an alias to a function. If so, get the
3001 // called aliasee for the checks below.
3002 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
3003 assert(!CalledFunction &&
3004 "Expected null called function in callsite for alias");
3005 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
3006 }
3007 if (!CalledFunction)
3008 continue;
3009 if (CalledFunction == ProfiledCallee) {
3010 if (FoundSingleCalleeChain) {
3011 FoundMultipleCalleeChains = true;
3012 return false;
3013 }
3014 FoundSingleCalleeChain = true;
3015 FoundProfiledCalleeCount++;
3016 FoundProfiledCalleeDepth += Depth;
3017 if (Depth > FoundProfiledCalleeMaxDepth)
3018 FoundProfiledCalleeMaxDepth = Depth;
3019 SaveCallsiteInfo(&I, CalleeFunc);
3020 } else if (findProfiledCalleeThroughTailCalls(
3021 ProfiledCallee, CalledFunction, Depth + 1,
3022 FoundCalleeChain, FoundMultipleCalleeChains)) {
3023 // findProfiledCalleeThroughTailCalls should not have returned
3024 // true if FoundMultipleCalleeChains.
3025 assert(!FoundMultipleCalleeChains);
3026 if (FoundSingleCalleeChain) {
3027 FoundMultipleCalleeChains = true;
3028 return false;
3029 }
3030 FoundSingleCalleeChain = true;
3031 SaveCallsiteInfo(&I, CalleeFunc);
3032 } else if (FoundMultipleCalleeChains)
3033 return false;
3034 }
3035 }
3036
3037 return FoundSingleCalleeChain;
3038}
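// For example (hypothetical functions): if the IR callee B tail calls both T1
// and T2, and each of those eventually tail calls the profiled callee, two
// distinct chains reach it. FoundMultipleCalleeChains is set and the search
// fails, since we cannot tell which chain the profiled context actually took
// and cloning along the wrong one would be incorrect.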
3039
3040const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) {
3041 auto *CB = dyn_cast<CallBase>(Call);
3042 if (!CB->getCalledOperand() || CB->isIndirectCall())
3043 return nullptr;
3044 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
3045 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
3046 if (Alias)
3047 return dyn_cast<Function>(Alias->getAliasee());
3048 return dyn_cast<Function>(CalleeVal);
3049}
3050
3051bool ModuleCallsiteContextGraph::calleeMatchesFunc(
3052 Instruction *Call, const Function *Func, const Function *CallerFunc,
3053 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
3054 auto *CB = dyn_cast<CallBase>(Call);
3055 if (!CB->getCalledOperand() || CB->isIndirectCall())
3056 return false;
3057 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
3058 auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
3059 if (CalleeFunc == Func)
3060 return true;
3061 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
3062 if (Alias && Alias->getAliasee() == Func)
3063 return true;
3064
3065 // Recursively search for the profiled callee through tail calls starting with
3066 // the actual Callee. The discovered tail call chain is saved in
3067 // FoundCalleeChain, and we will fixup the graph to include these callsites
3068 // after returning.
3069 // FIXME: We will currently redo the same recursive walk if we find the same
3070 // mismatched callee from another callsite. We can improve this with more
3071 // bookkeeping of the created chain of new nodes for each mismatch.
3072 unsigned Depth = 1;
3073 bool FoundMultipleCalleeChains = false;
3074 if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
3075 FoundCalleeChain,
3076 FoundMultipleCalleeChains)) {
3077 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
3078 << Func->getName() << " from " << CallerFunc->getName()
3079 << " that actually called " << CalleeVal->getName()
3080 << (FoundMultipleCalleeChains
3081 ? " (found multiple possible chains)"
3082 : "")
3083 << "\n");
3084 if (FoundMultipleCalleeChains)
3085 FoundProfiledCalleeNonUniquelyCount++;
3086 return false;
3087 }
3088
3089 return true;
3090}
3091
3092bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1,
3093 Instruction *Call2) {
3094 auto *CB1 = cast<CallBase>(Call1);
3095 if (!CB1->getCalledOperand() || CB1->isIndirectCall())
3096 return false;
3097 auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts();
3098 auto *CalleeFunc1 = dyn_cast<Function>(CalleeVal1);
3099 auto *CB2 = cast<CallBase>(Call2);
3100 if (!CB2->getCalledOperand() || CB2->isIndirectCall())
3101 return false;
3102 auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts();
3103 auto *CalleeFunc2 = dyn_cast<Function>(CalleeVal2);
3104 return CalleeFunc1 == CalleeFunc2;
3105}
3106
3107bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
3108 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
3109 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
3110 bool &FoundMultipleCalleeChains) {
3111 // Stop recursive search if we have already explored the maximum specified
3112 // depth.
3113 if (Depth > TailCallSearchDepth)
3114 return false;
3115
3116 auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
3117 // Make a CallsiteInfo for each discovered callee, if one hasn't already
3118 // been synthesized.
3119 if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
3120 !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
3121 // StackIds is empty (we don't have debug info available in the index for
3122 // these callsites)
3123 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
3124 std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
3125 CallsiteInfo *NewCallsiteInfo =
3126 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
3127 FoundCalleeChain.push_back({NewCallsiteInfo, FS});
3128 };
3129
3130 // Look for tail calls in this function, and check if they either call the
3131 // profiled callee directly, or indirectly (via a recursive search).
3132 // Only succeed if there is a single unique tail call chain found between the
3133 // profiled caller and callee, otherwise we could perform incorrect cloning.
3134 bool FoundSingleCalleeChain = false;
3135 for (auto &S : CurCallee.getSummaryList()) {
3136 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
3137 !isPrevailing(CurCallee.getGUID(), S.get()))
3138 continue;
3139 auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
3140 if (!FS)
3141 continue;
3142 auto FSVI = CurCallee;
3143 auto *AS = dyn_cast<AliasSummary>(S.get());
3144 if (AS)
3145 FSVI = AS->getAliaseeVI();
3146 for (auto &CallEdge : FS->calls()) {
3147 if (!CallEdge.second.hasTailCall())
3148 continue;
3149 if (CallEdge.first == ProfiledCallee) {
3150 if (FoundSingleCalleeChain) {
3151 FoundMultipleCalleeChains = true;
3152 return false;
3153 }
3154 FoundSingleCalleeChain = true;
3155 FoundProfiledCalleeCount++;
3156 FoundProfiledCalleeDepth += Depth;
3157 if (Depth > FoundProfiledCalleeMaxDepth)
3158 FoundProfiledCalleeMaxDepth = Depth;
3159 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
3160 // Add FS to FSToVIMap in case it isn't already there.
3161 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
3162 FSToVIMap[FS] = FSVI;
3163 } else if (findProfiledCalleeThroughTailCalls(
3164 ProfiledCallee, CallEdge.first, Depth + 1,
3165 FoundCalleeChain, FoundMultipleCalleeChains)) {
3166 // findProfiledCalleeThroughTailCalls should not have returned
3167 // true if FoundMultipleCalleeChains.
3168 assert(!FoundMultipleCalleeChains);
3169 if (FoundSingleCalleeChain) {
3170 FoundMultipleCalleeChains = true;
3171 return false;
3172 }
3173 FoundSingleCalleeChain = true;
3174 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
3175 // Add FS to FSToVIMap in case it isn't already there.
3176 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
3177 FSToVIMap[FS] = FSVI;
3178 } else if (FoundMultipleCalleeChains)
3179 return false;
3180 }
3181 }
3182
3183 return FoundSingleCalleeChain;
3184}
3185
3186const FunctionSummary *
3187IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) {
3188 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
3189 if (Callee.getSummaryList().empty())
3190 return nullptr;
3191 return dyn_cast<FunctionSummary>(Callee.getSummaryList()[0]->getBaseObject());
3192}
3193
3194bool IndexCallsiteContextGraph::calleeMatchesFunc(
3195 IndexCall &Call, const FunctionSummary *Func,
3196 const FunctionSummary *CallerFunc,
3197 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
3198 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
3199 // If there is no summary list then this is a call to an externally defined
3200 // symbol.
3201 AliasSummary *Alias =
3202 Callee.getSummaryList().empty()
3203 ? nullptr
3204 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
3205 assert(FSToVIMap.count(Func));
3206 auto FuncVI = FSToVIMap[Func];
3207 if (Callee == FuncVI ||
3208 // If callee is an alias, check the aliasee, since only function
3209 // summary base objects will contain the stack node summaries and thus
3210 // get a context node.
3211 (Alias && Alias->getAliaseeVI() == FuncVI))
3212 return true;
3213
3214 // Recursively search for the profiled callee through tail calls starting with
3215 // the actual Callee. The discovered tail call chain is saved in
3216 // FoundCalleeChain, and we will fixup the graph to include these callsites
3217 // after returning.
3218 // FIXME: We will currently redo the same recursive walk if we find the same
3219 // mismatched callee from another callsite. We can improve this with more
3220 // bookkeeping of the created chain of new nodes for each mismatch.
3221 unsigned Depth = 1;
3222 bool FoundMultipleCalleeChains = false;
3223 if (!findProfiledCalleeThroughTailCalls(
3224 FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
3225 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
3226 << " from " << FSToVIMap[CallerFunc]
3227 << " that actually called " << Callee
3228 << (FoundMultipleCalleeChains
3229 ? " (found multiple possible chains)"
3230 : "")
3231 << "\n");
3232 if (FoundMultipleCalleeChains)
3233 FoundProfiledCalleeNonUniquelyCount++;
3234 return false;
3235 }
3236
3237 return true;
3238}
3239
3240bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) {
3241 ValueInfo Callee1 = dyn_cast_if_present<CallsiteInfo *>(Call1)->Callee;
3242 ValueInfo Callee2 = dyn_cast_if_present<CallsiteInfo *>(Call2)->Callee;
3243 return Callee1 == Callee2;
3244}
3245
3246template <typename DerivedCCG, typename FuncTy, typename CallTy>
3247void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
3248 const {
3249 print(dbgs());
3250 dbgs() << "\n";
3251}
3252
3253template <typename DerivedCCG, typename FuncTy, typename CallTy>
3254void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
3255 raw_ostream &OS) const {
3256 OS << "Node " << this << "\n";
3257 OS << "\t";
3258 printCall(OS);
3259 if (Recursive)
3260 OS << " (recursive)";
3261 OS << "\n";
3262 if (!MatchingCalls.empty()) {
3263 OS << "\tMatchingCalls:\n";
3264 for (auto &MatchingCall : MatchingCalls) {
3265 OS << "\t";
3266 MatchingCall.print(OS);
3267 OS << "\n";
3268 }
3269 }
3270 OS << "\tNodeId: " << NodeId << "\n";
3271 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
3272 OS << "\tContextIds:";
3273 // Make a copy of the computed context ids that we can sort for stability.
3274 auto ContextIds = getContextIds();
3275 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3276 std::sort(SortedIds.begin(), SortedIds.end());
3277 for (auto Id : SortedIds)
3278 OS << " " << Id;
3279 OS << "\n";
3280 OS << "\tCalleeEdges:\n";
3281 for (auto &Edge : CalleeEdges)
3282 OS << "\t\t" << *Edge << " (Callee NodeId: " << Edge->Callee->NodeId
3283 << ")\n";
3284 OS << "\tCallerEdges:\n";
3285 for (auto &Edge : CallerEdges)
3286 OS << "\t\t" << *Edge << " (Caller NodeId: " << Edge->Caller->NodeId
3287 << ")\n";
3288 if (!Clones.empty()) {
3289 OS << "\tClones: ";
3290 ListSeparator LS;
3291 for (auto *C : Clones)
3292 OS << LS << C << " NodeId: " << C->NodeId;
3293 OS << "\n";
3294 } else if (CloneOf) {
3295 OS << "\tClone of " << CloneOf << " NodeId: " << CloneOf->NodeId << "\n";
3296 }
3297}
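// The output of print() has roughly the following shape (pointer values, the
// printed call, and the ids shown here are illustrative only):
//
//   Node 0x55d1c2a3b4c0
//           <printed call>
//   NodeId: 7
//   AllocTypes: Cold
//   ContextIds: 1 3
//   CalleeEdges:
//           <edge> (Callee NodeId: 4)
//   CallerEdges:
//           <edge> (Caller NodeId: 9)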
3298
3299template <typename DerivedCCG, typename FuncTy, typename CallTy>
3300void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
3301 const {
3302 print(dbgs());
3303 dbgs() << "\n";
3304}
3305
3306template <typename DerivedCCG, typename FuncTy, typename CallTy>
3307void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
3308 raw_ostream &OS) const {
3309 OS << "Edge from Callee " << Callee << " to Caller: " << Caller
3310 << (IsBackedge ? " (BE)" : "")
3311 << " AllocTypes: " << getAllocTypeString(AllocTypes);
3312 OS << " ContextIds:";
3313 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3314 std::sort(SortedIds.begin(), SortedIds.end());
3315 for (auto Id : SortedIds)
3316 OS << " " << Id;
3317}
3318
3319template <typename DerivedCCG, typename FuncTy, typename CallTy>
3320void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
3321 print(dbgs());
3322}
3323
3324template <typename DerivedCCG, typename FuncTy, typename CallTy>
3325void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
3326 raw_ostream &OS) const {
3327 OS << "Callsite Context Graph:\n";
3328 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3329 for (const auto Node : nodes<GraphType>(this)) {
3330 if (Node->isRemoved())
3331 continue;
3332 Node->print(OS);
3333 OS << "\n";
3334 }
3335}
3336
3337template <typename DerivedCCG, typename FuncTy, typename CallTy>
3338void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
3339 raw_ostream &OS) const {
3340 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3341 for (const auto Node : nodes<GraphType>(this)) {
3342 if (Node->isRemoved())
3343 continue;
3344 if (!Node->IsAllocation)
3345 continue;
3346 DenseSet<uint32_t> ContextIds = Node->getContextIds();
3347 auto AllocTypeFromCall = getAllocationCallType(Node->Call);
3348 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3349 std::sort(SortedIds.begin(), SortedIds.end());
3350 for (auto Id : SortedIds) {
3351 auto TypeI = ContextIdToAllocationType.find(Id);
3352 assert(TypeI != ContextIdToAllocationType.end());
3353 auto CSI = ContextIdToContextSizeInfos.find(Id);
3354 if (CSI != ContextIdToContextSizeInfos.end()) {
3355 for (auto &Info : CSI->second) {
3356 OS << "MemProf hinting: "
3357 << getAllocTypeString((uint8_t)TypeI->second)
3358 << " full allocation context " << Info.FullStackId
3359 << " with total size " << Info.TotalSize << " is "
3360 << getAllocTypeString(Node->AllocTypes) << " after cloning";
3361 if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
3362 OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall)
3363 << " due to cold byte percent";
3364 // Print the internal context id to aid debugging and visualization.
3365 OS << " (context id " << Id << ")";
3366 OS << "\n";
3367 }
3368 }
3369 }
3370 }
3371}
3372
3373template <typename DerivedCCG, typename FuncTy, typename CallTy>
3374void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
3375 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3376 for (const auto Node : nodes<GraphType>(this)) {
3377 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3378 for (auto &Edge : Node->CallerEdges)
3379 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
3380 }
3381}
3382
3383template <typename DerivedCCG, typename FuncTy, typename CallTy>
3384struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
3385 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3386 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
3387
3388 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
3389 static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
3390
3391 using nodes_iterator =
3392 mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
3393 decltype(&getNode)>;
3394
3395 static nodes_iterator nodes_begin(GraphType G) {
3396 return nodes_iterator(G->NodeOwner.begin(), &getNode);
3397 }
3398
3399 static nodes_iterator nodes_end(GraphType G) {
3400 return nodes_iterator(G->NodeOwner.end(), &getNode);
3401 }
3402
3403 static NodeRef getEntryNode(GraphType G) {
3404 return G->NodeOwner.begin()->get();
3405 }
3406
3407 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
3408 static const ContextNode<DerivedCCG, FuncTy, CallTy> *
3409 GetCallee(const EdgePtrTy &P) {
3410 return P->Callee;
3411 }
3412
3413 using ChildIteratorType =
3414 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
3415 DerivedCCG, FuncTy, CallTy>>>::const_iterator,
3416 decltype(&GetCallee)>;
3417
3418 static ChildIteratorType child_begin(NodeRef N) {
3419 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
3420 }
3421
3422 static ChildIteratorType child_end(NodeRef N) {
3423 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
3424 }
3425};
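// This GraphTraits specialization is what lets the generic graph machinery
// operate on the context graph: nodes<GraphType>(this) in print(), check()
// and printTotalSizes() iterates NodeOwner through nodes_begin()/nodes_end(),
// and WriteGraph in exportToDot() below walks callee edges through
// child_begin()/child_end() when emitting the DOT file.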
3426
3427template <typename DerivedCCG, typename FuncTy, typename CallTy>
3428struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
3429 : public DefaultDOTGraphTraits {
3430 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {
3431 // If the user requested the full graph to be exported, but provided an
3432 // allocation id, or if the user gave a context id and requested more than
3433 // just a specific context to be exported, note that highlighting is
3434 // enabled.
3435 DoHighlight =
3436 (AllocIdForDot.getNumOccurrences() && DotGraphScope == DotScope::All) ||
3437 (ContextIdForDot.getNumOccurrences() &&
3438 DotGraphScope != DotScope::Context);
3439 }
3440
3441 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3442 using GTraits = GraphTraits<GraphType>;
3443 using NodeRef = typename GTraits::NodeRef;
3444 using ChildIteratorType = typename GTraits::ChildIteratorType;
3445
3446 static std::string getNodeLabel(NodeRef Node, GraphType G) {
3447 std::string LabelString =
3448 (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
3449 Twine(Node->OrigStackOrAllocId) + " NodeId: " + Twine(Node->NodeId))
3450 .str();
3451 LabelString += "\n";
3452 if (Node->hasCall()) {
3453 auto Func = G->NodeToCallingFunc.find(Node);
3454 assert(Func != G->NodeToCallingFunc.end());
3455 LabelString +=
3456 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
3457 for (auto &MatchingCall : Node->MatchingCalls) {
3458 LabelString += "\n";
3459 LabelString += G->getLabel(Func->second, MatchingCall.call(),
3460 MatchingCall.cloneNo());
3461 }
3462 } else {
3463 LabelString += "null call";
3464 if (Node->Recursive)
3465 LabelString += " (recursive)";
3466 else
3467 LabelString += " (external)";
3468 }
3469 return LabelString;
3470 }
3471
3472 static std::string getNodeAttributes(NodeRef Node, GraphType G) {
3473 auto ContextIds = Node->getContextIds();
3474 // If highlighting enabled, see if this node contains any of the context ids
3475 // of interest. If so, it will use a different color and a larger fontsize
3476 // (which makes the node larger as well).
3477 bool Highlight = false;
3478 if (DoHighlight) {
3479 assert(ContextIdForDot.getNumOccurrences() ||
3480 AllocIdForDot.getNumOccurrences());
3481 if (ContextIdForDot.getNumOccurrences())
3482 Highlight = ContextIds.contains(ContextIdForDot);
3483 else
3484 Highlight = set_intersects(ContextIds, G->DotAllocContextIds);
3485 }
3486 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
3487 getContextIds(ContextIds) + "\"")
3488 .str();
3489 // Default fontsize is 14
3490 if (Highlight)
3491 AttributeString += ",fontsize=\"30\"";
3492 AttributeString +=
3493 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes, Highlight) + "\"")
3494 .str();
3495 if (Node->CloneOf) {
3496 AttributeString += ",color=\"blue\"";
3497 AttributeString += ",style=\"filled,bold,dashed\"";
3498 } else
3499 AttributeString += ",style=\"filled\"";
3500 return AttributeString;
3501 }
3502
3503 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
3504 GraphType G) {
3505 auto &Edge = *(ChildIter.getCurrent());
3506 // If highlighting enabled, see if this edge contains any of the context ids
3507 // of interest. If so, it will use a different color and a heavier arrow
3508 // size and weight (the larger weight makes the highlighted path
3509 // straighter).
3510 bool Highlight = false;
3511 if (DoHighlight) {
3512 assert(ContextIdForDot.getNumOccurrences() ||
3513 AllocIdForDot.getNumOccurrences());
3514 if (ContextIdForDot.getNumOccurrences())
3515 Highlight = Edge->ContextIds.contains(ContextIdForDot);
3516 else
3517 Highlight = set_intersects(Edge->ContextIds, G->DotAllocContextIds);
3518 }
3519 auto Color = getColor(Edge->AllocTypes, Highlight);
3520 std::string AttributeString =
3521 (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
3522 // fillcolor is the arrow head and color is the line
3523 Twine(",fillcolor=\"") + Color + "\"" + Twine(",color=\"") + Color +
3524 "\"")
3525 .str();
3526 if (Edge->IsBackedge)
3527 AttributeString += ",style=\"dotted\"";
3528 // Default penwidth and weight are both 1.
3529 if (Highlight)
3530 AttributeString += ",penwidth=\"2.0\",weight=\"2\"";
3531 return AttributeString;
3532 }
3533
3534 // Since the NodeOwners list includes nodes that are no longer connected to
3535 // the graph, skip them here.
3536 static bool isNodeHidden(NodeRef Node, GraphType G) {
3537 if (Node->isRemoved())
3538 return true;
3539 // If a scope smaller than the full graph was requested, see if this node
3540 // contains any of the context ids of interest.
3541 if (DotGraphScope == DotScope::Alloc)
3542 return !set_intersects(Node->getContextIds(), G->DotAllocContextIds);
3543 if (DotGraphScope == DotScope::Context)
3544 return !Node->getContextIds().contains(ContextIdForDot);
3545 return false;
3546 }
3547
3548private:
3549 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
3550 std::string IdString = "ContextIds:";
3551 if (ContextIds.size() < 100) {
3552 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3553 std::sort(SortedIds.begin(), SortedIds.end());
3554 for (auto Id : SortedIds)
3555 IdString += (" " + Twine(Id)).str();
3556 } else {
3557 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
3558 }
3559 return IdString;
3560 }
3561
3562 static std::string getColor(uint8_t AllocTypes, bool Highlight) {
3563 // If DoHighlight is not enabled, we want to use the highlight colors for
3564 // NotCold and Cold, and the non-highlight color for NotCold+Cold. This is
3565 // both compatible with the color scheme before highlighting was supported,
3566 // and for the NotCold+Cold color the non-highlight color is a bit more
3567 // readable.
3568 if (AllocTypes == (uint8_t)AllocationType::NotCold)
3569 // Color "brown1" actually looks like a lighter red.
3570 return !DoHighlight || Highlight ? "brown1" : "lightpink";
3571 if (AllocTypes == (uint8_t)AllocationType::Cold)
3572 return !DoHighlight || Highlight ? "cyan" : "lightskyblue";
3573 if (AllocTypes ==
3574 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
3575 return Highlight ? "magenta" : "mediumorchid1";
3576 return "gray";
3577 }
3578
3579 static std::string getNodeId(NodeRef Node) {
3580 std::stringstream SStream;
3581 SStream << std::hex << "N0x" << (unsigned long long)Node;
3582 std::string Result = SStream.str();
3583 return Result;
3584 }
3585
3586 // True if we should highlight a specific context or allocation's contexts in
3587 // the emitted graph.
3588 static bool DoHighlight;
3589};
3590
3591template <typename DerivedCCG, typename FuncTy, typename CallTy>
3592bool DOTGraphTraits<
3593 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>::DoHighlight =
3594 false;
3595
3596template <typename DerivedCCG, typename FuncTy, typename CallTy>
3597void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
3598 std::string Label) const {
3599 WriteGraph(this, "", false, Label,
3600 DotFilePathPrefix + "ccg." + Label + ".dot");
3601}
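// A sketch of how this is typically used (the label and prefix values are
// hypothetical): exportToDot("cloned") with DotFilePathPrefix set to "/tmp/"
// writes the graph to /tmp/ccg.cloned.dot, which can then be rendered with a
// standard Graphviz command such as:
//
//   dot -Tsvg /tmp/ccg.cloned.dot -o /tmp/ccg.cloned.svg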
3602
3603template <typename DerivedCCG, typename FuncTy, typename CallTy>
3604typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
3605CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
3606 const std::shared_ptr<ContextEdge> &Edge,
3607 DenseSet<uint32_t> ContextIdsToMove) {
3608 ContextNode *Node = Edge->Callee;
3609 assert(NodeToCallingFunc.count(Node));
3610 ContextNode *Clone =
3611 createNewNode(Node->IsAllocation, NodeToCallingFunc[Node], Node->Call);
3612 Node->addClone(Clone);
3613 Clone->MatchingCalls = Node->MatchingCalls;
3614 moveEdgeToExistingCalleeClone(Edge, Clone, /*NewClone=*/true,
3615 ContextIdsToMove);
3616 return Clone;
3617}
3618
3619template <typename DerivedCCG, typename FuncTy, typename CallTy>
3620void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3621 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
3622 ContextNode *NewCallee, bool NewClone,
3623 DenseSet<uint32_t> ContextIdsToMove) {
3624 // NewCallee and Edge's current callee must be clones of the same original
3625 // node (Edge's current callee may be the original node too).
3626 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
3627
3628 bool EdgeIsRecursive = Edge->Callee == Edge->Caller;
3629
3630 ContextNode *OldCallee = Edge->Callee;
3631
3632 // We might already have an edge to the new callee from earlier cloning for a
3633 // different allocation. If one exists we will reuse it.
3634 auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
3635
3636 // Callers will pass an empty ContextIdsToMove set when they want to move the
3637 // edge. Copy in Edge's ids for simplicity.
3638 if (ContextIdsToMove.empty())
3639 ContextIdsToMove = Edge->getContextIds();
3640
3641 // If we are moving all of Edge's ids, then just move the whole Edge.
3642 // Otherwise only move the specified subset, to a new edge if needed.
3643 if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
3644 // First, update the alloc types on New Callee from Edge.
3645 // Do this before we potentially clear Edge's fields below!
3646 NewCallee->AllocTypes |= Edge->AllocTypes;
3647 // Moving the whole Edge.
3648 if (ExistingEdgeToNewCallee) {
3649 // Since we already have an edge to NewCallee, simply move the ids
3650 // onto it, and remove the existing Edge.
3651 ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3652 ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
3653 assert(Edge->ContextIds == ContextIdsToMove);
3654 removeEdgeFromGraph(Edge.get());
3655 } else {
3656 // Otherwise just reconnect Edge to NewCallee.
3657 Edge->Callee = NewCallee;
3658 NewCallee->CallerEdges.push_back(Edge);
3659 // Remove it from callee where it was previously connected.
3660 OldCallee->eraseCallerEdge(Edge.get());
3661 // Don't need to update Edge's context ids since we are simply
3662 // reconnecting it.
3663 }
3664 } else {
3665 // Only moving a subset of Edge's ids.
3666 // Compute the alloc type of the subset of ids being moved.
3667 auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
3668 if (ExistingEdgeToNewCallee) {
3669 // Since we already have an edge to NewCallee, simply move the ids
3670 // onto it.
3671 ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3672 ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
3673 } else {
3674 // Otherwise, create a new edge to NewCallee for the ids being moved.
3675 auto NewEdge = std::make_shared<ContextEdge>(
3676 NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
3677 Edge->Caller->CalleeEdges.push_back(NewEdge);
3678 NewCallee->CallerEdges.push_back(NewEdge);
3679 }
3680 // In either case, need to update the alloc types on NewCallee, and remove
3681 // those ids and update the alloc type on the original Edge.
3682 NewCallee->AllocTypes |= CallerEdgeAllocType;
3683 set_subtract(Edge->ContextIds, ContextIdsToMove);
3684 Edge->AllocTypes = computeAllocType(Edge->ContextIds);
3685 }
3686 // Now walk the old callee node's callee edges and move Edge's context ids
3687 // over to the corresponding edge into the clone (which is created here if
3688 // this is a newly created clone).
3689 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
3690 ContextNode *CalleeToUse = OldCalleeEdge->Callee;
3691 // If this is a direct recursion edge, use NewCallee (the clone) as the
3692 // callee as well, so that any edge updated/created here is also direct
3693 // recursive.
3694 if (CalleeToUse == OldCallee) {
3695 // If this is a recursive edge, see if we already moved a recursive edge
3696 // (which would have to have been this one) - if we were only moving a
3697 // subset of context ids it would still be on OldCallee.
3698 if (EdgeIsRecursive) {
3699 assert(OldCalleeEdge == Edge);
3700 continue;
3701 }
3702 CalleeToUse = NewCallee;
3703 }
3704 // The context ids moving to the new callee are the subset of this edge's
3705 // context ids and the context ids on the caller edge being moved.
3706 DenseSet<uint32_t> EdgeContextIdsToMove =
3707 set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
3708 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
3709 OldCalleeEdge->AllocTypes =
3710 computeAllocType(OldCalleeEdge->getContextIds());
3711 if (!NewClone) {
3712 // Update context ids / alloc type on corresponding edge to NewCallee.
3713 // There is a chance this may not exist if we are reusing an existing
3714 // clone, specifically during function assignment, where we would have
3715 // removed none type edges after creating the clone. If we can't find
3716 // a corresponding edge there, fall through to the cloning below.
3717 if (auto *NewCalleeEdge = NewCallee->findEdgeFromCallee(CalleeToUse)) {
3718 NewCalleeEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3719 NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
3720 continue;
3721 }
3722 }
3723 auto NewEdge = std::make_shared<ContextEdge>(
3724 CalleeToUse, NewCallee, computeAllocType(EdgeContextIdsToMove),
3725 EdgeContextIdsToMove);
3726 NewCallee->CalleeEdges.push_back(NewEdge);
3727 NewEdge->Callee->CallerEdges.push_back(NewEdge);
3728 }
3729 // Recompute the node alloc type now that its callee edges have been
3730 // updated (since we will compute from those edges).
3731 OldCallee->AllocTypes = OldCallee->computeAllocType();
3732 // OldCallee alloc type should be None iff its context id set is now empty.
3733 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
3734 OldCallee->emptyContextIds());
3735 if (VerifyCCG) {
3736 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
3737 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
3738 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
3739 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
3740 /*CheckEdges=*/false);
3741 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
3742 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
3743 /*CheckEdges=*/false);
3744 }
3745}
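// Illustrative example (edge and context ids are hypothetical): suppose Edge
// is Caller -> OldCallee with ContextIds {1, 2} and ContextIdsToMove is {2}.
// The code above creates (or reuses) an edge Caller -> NewCallee carrying
// {2}, subtracts {2} from the original edge (leaving {1}), and then splits
// each of OldCallee's callee edges the same way so the ids flowing below
// NewCallee stay consistent with the ids flowing into it. Passing an empty
// ContextIdsToMove instead moves the entire edge to NewCallee.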
3746
3747template <typename DerivedCCG, typename FuncTy, typename CallTy>
3748void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3749 moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
3750 ContextNode *NewCaller) {
3751 auto *OldCallee = Edge->Callee;
3752 auto *NewCallee = OldCallee;
3753 // If this edge was direct recursive, make any new/updated edge also direct
3754 // recursive to NewCaller.
3755 bool Recursive = Edge->Caller == Edge->Callee;
3756 if (Recursive)
3757 NewCallee = NewCaller;
3758
3759 ContextNode *OldCaller = Edge->Caller;
3760 OldCaller->eraseCalleeEdge(Edge.get());
3761
3762 // We might already have an edge to the new caller. If one exists we will
3763 // reuse it.
3764 auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(NewCallee);
3765
3766 if (ExistingEdgeToNewCaller) {
3767 // Since we already have an edge to NewCaller, simply move the ids
3768 // onto it, and remove the existing Edge.
3769 ExistingEdgeToNewCaller->getContextIds().insert_range(
3770 Edge->getContextIds());
3771 ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes;
3772 Edge->ContextIds.clear();
3773 Edge->AllocTypes = (uint8_t)AllocationType::None;
3774 OldCallee->eraseCallerEdge(Edge.get());
3775 } else {
3776 // Otherwise just reconnect Edge to NewCaller.
3777 Edge->Caller = NewCaller;
3778 NewCaller->CalleeEdges.push_back(Edge);
3779 if (Recursive) {
3780 assert(NewCallee == NewCaller);
3781 // In the case of (direct) recursive edges, we update the callee as well
3782 // so that it becomes recursive on the new caller.
3783 Edge->Callee = NewCallee;
3784 NewCallee->CallerEdges.push_back(Edge);
3785 OldCallee->eraseCallerEdge(Edge.get());
3786 }
3787 // Don't need to update Edge's context ids since we are simply
3788 // reconnecting it.
3789 }
3790 // In either case, need to update the alloc types on New Caller.
3791 NewCaller->AllocTypes |= Edge->AllocTypes;
3792
3793 // Now walk the old caller node's caller edges and move Edge's context ids
3794 // over to the corresponding edge into the node (which is created here if
3795 // this is a newly created node). We can tell whether this is a newly created
3796 // node by seeing if it has any caller edges yet.
3797#ifndef NDEBUG
3798 bool IsNewNode = NewCaller->CallerEdges.empty();
3799#endif
3800 // If we just moved a direct recursive edge, presumably its context ids should
3801 // also flow out of OldCaller via some other non-recursive callee edge. We
3802 // don't want to remove the recursive context ids from other caller edges yet,
3803 // otherwise the context ids get into an inconsistent state on OldCaller.
3804 // We will update these context ids on the non-recursive caller edge when and
3805 // if they are updated on the non-recursive callee.
3806 if (!Recursive) {
3807 for (auto &OldCallerEdge : OldCaller->CallerEdges) {
3808 auto OldCallerCaller = OldCallerEdge->Caller;
3809 // The context ids moving to the new caller are the subset of this edge's
3810 // context ids and the context ids on the callee edge being moved.
3811 DenseSet<uint32_t> EdgeContextIdsToMove = set_intersection(
3812 OldCallerEdge->getContextIds(), Edge->getContextIds());
3813 if (OldCaller == OldCallerCaller) {
3814 OldCallerCaller = NewCaller;
3815 // Don't actually move this one. The caller will move it directly via a
3816 // call to this function with this as the Edge if it is appropriate to
3817 // move to a different node that has a matching callee (itself).
3818 continue;
3819 }
3820 set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
3821 OldCallerEdge->AllocTypes =
3822 computeAllocType(OldCallerEdge->getContextIds());
3823 // In this function we expect that any pre-existing node already has edges
3824 // from the same callers as the old node. That should be true in the
3825 // current use case, where we will remove None-type edges after copying
3826 // over all caller edges from the callee.
3827 auto *ExistingCallerEdge = NewCaller->findEdgeFromCaller(OldCallerCaller);
3828 // Since we would have skipped caller edges when moving a direct recursive
3829 // edge, this may not hold true when recursive handling is enabled.
3830 assert(IsNewNode || ExistingCallerEdge || AllowRecursiveCallsites);
3831 if (ExistingCallerEdge) {
3832 ExistingCallerEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3833 ExistingCallerEdge->AllocTypes |=
3834 computeAllocType(EdgeContextIdsToMove);
3835 continue;
3836 }
3837 auto NewEdge = std::make_shared<ContextEdge>(
3838 NewCaller, OldCallerCaller, computeAllocType(EdgeContextIdsToMove),
3839 EdgeContextIdsToMove);
3840 NewCaller->CallerEdges.push_back(NewEdge);
3841 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
3842 }
3843 }
3844 // Recompute the node alloc type now that its caller edges have been
3845 // updated (since we will compute from those edges).
3846 OldCaller->AllocTypes = OldCaller->computeAllocType();
3847 // OldCaller alloc type should be None iff its context id set is now empty.
3848 assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) ==
3849 OldCaller->emptyContextIds());
3850 if (VerifyCCG) {
3851 checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false);
3852 checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false);
3853 for (const auto &OldCallerEdge : OldCaller->CallerEdges)
3854 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller,
3855 /*CheckEdges=*/false);
3856 for (const auto &NewCallerEdge : NewCaller->CallerEdges)
3857 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller,
3858 /*CheckEdges=*/false);
3859 }
3860}
3861
3862template <typename DerivedCCG, typename FuncTy, typename CallTy>
3863void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3864 recursivelyRemoveNoneTypeCalleeEdges(
3865 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
3866 auto Inserted = Visited.insert(Node);
3867 if (!Inserted.second)
3868 return;
3869
3870 removeNoneTypeCalleeEdges(Node);
3871
3872 for (auto *Clone : Node->Clones)
3873 recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
3874
3875 // The recursive call may remove some of this Node's caller edges.
3876 // Iterate over a copy and skip any that were removed.
3877 auto CallerEdges = Node->CallerEdges;
3878 for (auto &Edge : CallerEdges) {
3879 // Skip any that have been removed by an earlier recursive call.
3880 if (Edge->isRemoved()) {
3881 assert(!is_contained(Node->CallerEdges, Edge));
3882 continue;
3883 }
3884 recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
3885 }
3886}
3887
3888// This is the standard DFS based backedge discovery algorithm.
3889template <typename DerivedCCG, typename FuncTy, typename CallTy>
3890void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges() {
3891 // If we are cloning recursive contexts, find and mark backedges from all root
3892 // callers, using the typical DFS based backedge analysis.
3893 if (!CloneRecursiveContexts)
3894 return;
3895 DenseSet<const ContextNode *> Visited;
3896 DenseSet<const ContextNode *> CurrentStack;
3897 for (auto &Entry : NonAllocationCallToContextNodeMap) {
3898 auto *Node = Entry.second;
3899 if (Node->isRemoved())
3900 continue;
3901 // It is a root if it doesn't have callers.
3902 if (!Node->CallerEdges.empty())
3903 continue;
3904 markBackedges(Node, Visited, CurrentStack);
3905 assert(CurrentStack.empty());
3906 }
3907}
3908
3909// Recursive helper for above markBackedges method.
3910template <typename DerivedCCG, typename FuncTy, typename CallTy>
3911void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges(
3912 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3913 DenseSet<const ContextNode *> &CurrentStack) {
3914 auto I = Visited.insert(Node);
3915 // We should only call this for unvisited nodes.
3916 assert(I.second);
3917 (void)I;
3918 for (auto &CalleeEdge : Node->CalleeEdges) {
3919 auto *Callee = CalleeEdge->Callee;
3920 if (Visited.count(Callee)) {
3921 // Since this was already visited we need to check if it is currently on
3922 // the recursive stack in which case it is a backedge.
3923 if (CurrentStack.count(Callee))
3924 CalleeEdge->IsBackedge = true;
3925 continue;
3926 }
3927 CurrentStack.insert(Callee);
3928 markBackedges(Callee, Visited, CurrentStack);
3929 CurrentStack.erase(Callee);
3930 }
3931}
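// For example, with a recursive context whose frames form the cycle
// A -> B -> A, the DFS from a root caller visits A's node, then B's node;
// when B's callee edge back to A is examined, A is still in CurrentStack, so
// that edge has IsBackedge set. Such edges are drawn dotted in the DOT output
// and have their cloning deferred in identifyClones below.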
3932
3933template <typename DerivedCCG, typename FuncTy, typename CallTy>
3934void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
3935 DenseSet<const ContextNode *> Visited;
3936 for (auto &Entry : AllocationCallToContextNodeMap) {
3937 Visited.clear();
3938 identifyClones(Entry.second, Visited, Entry.second->getContextIds());
3939 }
3940 Visited.clear();
3941 for (auto &Entry : AllocationCallToContextNodeMap)
3942 recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
3943 if (VerifyCCG)
3944 check();
3945}
3946
3947 // Helper function to check whether an AllocType is cold, notcold, or both.
3948 bool checkColdOrNotCold(uint8_t AllocType) {
3949 return (AllocType == (uint8_t)AllocationType::Cold) ||
3950 (AllocType == (uint8_t)AllocationType::NotCold) ||
3951 (AllocType ==
3952 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
3953 }
3954
3955template <typename DerivedCCG, typename FuncTy, typename CallTy>
3956void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
3957 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3958 const DenseSet<uint32_t> &AllocContextIds) {
3959 if (VerifyNodes)
3960 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3961 assert(!Node->CloneOf);
3962
3963 // If Node has a null call, then either it wasn't found in the module (regular
3964 // LTO) or summary index (ThinLTO), or there were other conditions blocking
3965 // cloning (e.g. recursion, calls multiple targets, etc).
3966 // Do this here so that we don't try to recursively clone callers below, which
3967 // isn't useful at least for this node.
3968 if (!Node->hasCall())
3969 return;
3970
3971 // No need to look at any callers if allocation type already unambiguous.
3972 if (hasSingleAllocType(Node->AllocTypes))
3973 return;
3974
3975#ifndef NDEBUG
3976 auto Insert =
3977#endif
3978 Visited.insert(Node);
3979 // We should not have visited this node yet.
3980 assert(Insert.second);
3981 // The recursive call to identifyClones may delete the current edge from the
3982 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
3983 // in an iterator and having recursive call erase from it. Other edges may
3984 // also get removed during the recursion, which will have null Callee and
3985 // Caller pointers (and are deleted later), so we skip those below.
3986 {
3987 auto CallerEdges = Node->CallerEdges;
3988 for (auto &Edge : CallerEdges) {
3989 // Skip any that have been removed by an earlier recursive call.
3990 if (Edge->isRemoved()) {
3991 assert(!is_contained(Node->CallerEdges, Edge));
3992 continue;
3993 }
3994 // Defer backedges. See comments further below where these edges are
3995 // handled during the cloning of this Node.
3996 if (Edge->IsBackedge) {
3997 // We should only mark these if cloning recursive contexts, where we
3998 // need to do this deferral.
3999 assert(CloneRecursiveContexts);
4000 continue;
4001 }
4002 // Ignore any caller we previously visited via another edge.
4003 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
4004 identifyClones(Edge->Caller, Visited, AllocContextIds);
4005 }
4006 }
4007 }
4008
4009 // Check if we reached an unambiguous call or have only a single caller.
4010 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
4011 return;
4012
4013 // We need to clone.
4014
4015 // Try to keep the original version as alloc type NotCold. This will make
4016 // cases with indirect calls or any other situation with an unknown call to
4017 // the original function get the default behavior. We do this by sorting the
4018 // CallerEdges of the Node we will clone by alloc type.
4019 //
4020 // Give NotCold edge the lowest sort priority so those edges are at the end of
4021 // the caller edges vector, and stay on the original version (since the below
4022 // code clones greedily until it finds all remaining edges have the same type
4023 // and leaves the remaining ones on the original Node).
4024 //
4025 // We shouldn't actually have any None type edges, so the sorting priority for
4026 // that is arbitrary, and we assert in that case below.
4027 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
4028 /*Cold*/ 1,
4029 /*NotColdCold*/ 2};
4030 llvm::stable_sort(Node->CallerEdges,
4031 [&](const std::shared_ptr<ContextEdge> &A,
4032 const std::shared_ptr<ContextEdge> &B) {
4033 // Nodes with non-empty context ids should be sorted
4034 // before those with empty context ids.
4035 if (A->ContextIds.empty())
4036 // Either B ContextIds are non-empty (in which case we
4037 // should return false because B < A), or B ContextIds
4038 // are empty, in which case they are equal, and we
4039 // should maintain the original relative ordering.
4040 return false;
4041 if (B->ContextIds.empty())
4042 return true;
4043
4044 if (A->AllocTypes == B->AllocTypes)
4045 // Use the first context id for each edge as a
4046 // tie-breaker.
4047 return *A->ContextIds.begin() < *B->ContextIds.begin();
4048 return AllocTypeCloningPriority[A->AllocTypes] <
4049 AllocTypeCloningPriority[B->AllocTypes];
4050 });
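// For example, if Node has caller edges with alloc types NotCold, Cold and
// NotCold+Cold, the sort above orders them Cold (priority 1), NotCold+Cold
// (2), NotCold (4). The loop below then peels the cold-leaning callers off
// into clones first, leaving the NotCold callers attached to the original
// node so that indirect or otherwise unknown callers of the original function
// keep the default not-cold behavior.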
4051
4052 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
4053
4054 DenseSet<uint32_t> RecursiveContextIds;
4056 // If we are allowing recursive callsites, but have also disabled recursive
4057 // contexts, look for context ids that show up in multiple caller edges.
4058 if (AllowRecursiveCallsites && !AllowRecursiveContexts) {
4059 DenseSet<uint32_t> AllCallerContextIds;
4060 for (auto &CE : Node->CallerEdges) {
4061 // Resize to the largest set of caller context ids, since we know the
4062 // final set will be at least that large.
4063 AllCallerContextIds.reserve(CE->getContextIds().size());
4064 for (auto Id : CE->getContextIds())
4065 if (!AllCallerContextIds.insert(Id).second)
4066 RecursiveContextIds.insert(Id);
4067 }
4068 }
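// At this point RecursiveContextIds holds any context id that appears on more
// than one of Node's caller edges, i.e. a context that passes through Node
// multiple times. These ids are subtracted from each caller edge's id set
// below so that they do not drive cloning decisions for that edge.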
4069
4070 // Iterate until we find no more opportunities for disambiguating the alloc
4071 // types via cloning. In most cases this loop will terminate once the Node
4072 // has a single allocation type, in which case no more cloning is needed.
4073 // Iterate over a copy of Node's caller edges, since we may need to remove
4074 // edges in the moveEdgeTo* methods, and this simplifies the handling and
4075 // makes it less error-prone.
4076 auto CallerEdges = Node->CallerEdges;
4077 for (auto &CallerEdge : CallerEdges) {
4078 // Skip any that have been removed by an earlier recursive call.
4079 if (CallerEdge->isRemoved()) {
4080 assert(!is_contained(Node->CallerEdges, CallerEdge));
4081 continue;
4082 }
4083 assert(CallerEdge->Callee == Node);
4084
4085 // See if cloning the prior caller edge left this node with a single alloc
4086 // type or a single caller. In that case no more cloning of Node is needed.
4087 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
4088 break;
4089
4090 // If the caller was not successfully matched to a call in the IR/summary,
4091 // there is no point in trying to clone for it as we can't update that call.
4092 if (!CallerEdge->Caller->hasCall())
4093 continue;
4094
4095 // Only need to process the ids along this edge pertaining to the given
4096 // allocation.
4097 auto CallerEdgeContextsForAlloc =
4098 set_intersection(CallerEdge->getContextIds(), AllocContextIds);
4099 if (!RecursiveContextIds.empty())
4100 CallerEdgeContextsForAlloc =
4101 set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds);
4102 if (CallerEdgeContextsForAlloc.empty())
4103 continue;
4104
4105 auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
4106
4107 // Compute the node callee edge alloc types corresponding to the context ids
4108 // for this caller edge.
4109 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
4110 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
4111 for (auto &CalleeEdge : Node->CalleeEdges)
4112 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
4113 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
4114
4115 // Don't clone if doing so will not disambiguate any alloc types amongst
4116 // caller edges (including the callee edges that would be cloned).
4117 // Otherwise we will simply move all edges to the clone.
4118 //
4119 // First check if by cloning we will disambiguate the caller allocation
4120 // type from node's allocation type. Query allocTypeToUse so that we don't
4121 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
4122 // neither of these should be None type.
4123 //
4124 // Then check if by cloning node at least one of the callee edges will be
4125 // disambiguated by splitting out different context ids.
4126 //
4127 // However, always do the cloning if this is a backedge, in which case we
4128 // have not yet cloned along this caller edge.
4129 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
4130 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
4131 if (!CallerEdge->IsBackedge &&
4132 allocTypeToUse(CallerAllocTypeForAlloc) ==
4133 allocTypeToUse(Node->AllocTypes) &&
4134 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
4135 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
4136 continue;
4137 }
4138
4139 if (CallerEdge->IsBackedge) {
4140 // We should only mark these if cloning recursive contexts, where we
4141 // need to do this deferral.
4142 assert(CloneRecursiveContexts);
4143 DeferredBackedges++;
4144 }
4145
4146 // If this is a backedge, we now do recursive cloning starting from its
4147 // caller since we may have moved unambiguous caller contexts to a clone
4148 // of this Node in a previous iteration of the current loop, giving more
4149 // opportunity for cloning through the backedge. Because we sorted the
4150 // caller edges earlier so that cold caller edges are first, we would have
4151 // visited and cloned this node for any unambiguously cold non-recursive
4152 // callers before any ambiguous backedge callers. Note that we don't do this
4153 // if the caller is already cloned or visited during cloning (e.g. via a
4154 // different context path from the allocation).
4155 // TODO: Can we do better in the case where the caller was already visited?
4156 if (CallerEdge->IsBackedge && !CallerEdge->Caller->CloneOf &&
4157 !Visited.count(CallerEdge->Caller)) {
4158 const auto OrigIdCount = CallerEdge->getContextIds().size();
4159 // Now do the recursive cloning of this backedge's caller, which was
4160 // deferred earlier.
4161 identifyClones(CallerEdge->Caller, Visited, CallerEdgeContextsForAlloc);
4162 removeNoneTypeCalleeEdges(CallerEdge->Caller);
4163 // See if the recursive call to identifyClones moved the context ids to a
4164 // new edge from this node to a clone of caller, and switch to looking at
4165 // that new edge so that we clone Node for the new caller clone.
4166 bool UpdatedEdge = false;
4167 if (OrigIdCount > CallerEdge->getContextIds().size()) {
4168 for (auto E : Node->CallerEdges) {
4169 // Only interested in clones of the current edge's caller.
4170 if (E->Caller->CloneOf != CallerEdge->Caller)
4171 continue;
4172 // See if this edge contains any of the context ids originally on the
4173 // current caller edge.
4174 auto CallerEdgeContextsForAllocNew =
4175 set_intersection(CallerEdgeContextsForAlloc, E->getContextIds());
4176 if (CallerEdgeContextsForAllocNew.empty())
4177 continue;
4178 // Make sure we don't pick a previously existing caller edge of this
4179 // Node, which would be processed on a different iteration of the
4180 // outer loop over the saved CallerEdges.
4181 if (llvm::is_contained(CallerEdges, E))
4182 continue;
4183 // The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge
4184 // are updated further below for all cases where we just invoked
4185 // identifyClones recursively.
4186 CallerEdgeContextsForAlloc.swap(CallerEdgeContextsForAllocNew);
4187 CallerEdge = E;
4188 UpdatedEdge = true;
4189 break;
4190 }
4191 }
4192 // If cloning removed this edge (and we didn't update it to a new edge
4193 // above), we're done with this edge. It's possible we moved all of the
4194 // context ids to an existing clone, in which case there's no need to do
4195 // further processing for them.
4196 if (CallerEdge->isRemoved())
4197 continue;
4198
4199 // Now we need to update the information used for the cloning decisions
4200 // further below, as we may have modified edges and their context ids.
4201
4202 // Note if we changed the CallerEdge above we would have already updated
4203 // the context ids.
4204 if (!UpdatedEdge) {
4205 CallerEdgeContextsForAlloc = set_intersection(
4206 CallerEdgeContextsForAlloc, CallerEdge->getContextIds());
4207 if (CallerEdgeContextsForAlloc.empty())
4208 continue;
4209 }
4210 // Update the other information that depends on the edges and on the now
4211 // updated CallerEdgeContextsForAlloc.
4212 CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
4213 CalleeEdgeAllocTypesForCallerEdge.clear();
4214 for (auto &CalleeEdge : Node->CalleeEdges) {
4215 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
4216 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
4217 }
4218 }
4219
4220 // First see if we can use an existing clone. Check each clone and its
4221 // callee edges for matching alloc types.
4222 ContextNode *Clone = nullptr;
4223 for (auto *CurClone : Node->Clones) {
4224 if (allocTypeToUse(CurClone->AllocTypes) !=
4225 allocTypeToUse(CallerAllocTypeForAlloc))
4226 continue;
4227
4228 bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) &&
4229 hasSingleAllocType(CallerAllocTypeForAlloc);
4230 // The above check should mean that if both have single alloc types that
4231 // they should be equal.
4232 assert(!BothSingleAlloc ||
4233 CurClone->AllocTypes == CallerAllocTypeForAlloc);
4234
4235 // If either both have a single alloc type (which are the same), or if the
4236 // clone's callee edges have the same alloc types as those for the current
4237 // allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge),
4238 // then we can reuse this clone.
4239 if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>(
4240 CalleeEdgeAllocTypesForCallerEdge, CurClone)) {
4241 Clone = CurClone;
4242 break;
4243 }
4244 }
4245
4246 // The edge iterator is adjusted when we move the CallerEdge to the clone.
4247 if (Clone)
4248 moveEdgeToExistingCalleeClone(CallerEdge, Clone, /*NewClone=*/false,
4249 CallerEdgeContextsForAlloc);
4250 else
4251 Clone = moveEdgeToNewCalleeClone(CallerEdge, CallerEdgeContextsForAlloc);
4252
4253 // Sanity check that no alloc types on clone or its edges are None.
4254 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
4255 }
4256
4257 // We should still have some context ids on the original Node.
4258 assert(!Node->emptyContextIds());
4259
4260 // Sanity check that no alloc types on node or edges are None.
4261 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
4262
4263 if (VerifyNodes)
4264 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
4265}
4266
4267void ModuleCallsiteContextGraph::updateAllocationCall(
4268 CallInfo &Call, AllocationType AllocType) {
4269 std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
4271 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
4272 "memprof", AllocTypeString);
4273 cast<CallBase>(Call.call())->addFnAttr(A);
4274 OREGetter(Call.call()->getFunction())
4275 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
4276 << ore::NV("AllocationCall", Call.call()) << " in clone "
4277 << ore::NV("Caller", Call.call()->getFunction())
4278 << " marked with memprof allocation attribute "
4279 << ore::NV("Attribute", AllocTypeString));
4280}
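// After this update the IR allocation call carries a string function
// attribute such as "memprof"="cold" (or the corresponding not-cold value
// produced by getAllocTypeAttributeString), which later transformations use
// to select the appropriate allocation behavior.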
4281
4282 void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
4283 AllocationType AllocType) {
4284 auto *AI = cast<AllocInfo *>(Call.call());
4285 assert(AI);
4286 assert(AI->Versions.size() > Call.cloneNo());
4287 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
4288}
4289
4290 AllocationType
4291 ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
4292 const auto *CB = cast<CallBase>(Call.call());
4293 if (!CB->getAttributes().hasFnAttr("memprof"))
4294 return AllocationType::None;
4295 return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
4296 ? AllocationType::Cold
4297 : AllocationType::NotCold;
4298}
4299
4300 AllocationType
4301 IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
4302 const auto *AI = cast<AllocInfo *>(Call.call());
4303 assert(AI->Versions.size() > Call.cloneNo());
4304 return (AllocationType)AI->Versions[Call.cloneNo()];
4305}
4306
4307void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
4308 FuncInfo CalleeFunc) {
4309 auto *CurF = getCalleeFunc(CallerCall.call());
4310 auto NewCalleeCloneNo = CalleeFunc.cloneNo();
4311 if (isMemProfClone(*CurF)) {
4312 // If we already assigned this callsite to call a specific non-default
4313 // clone (i.e. not the original function which is clone 0), ensure that we
4314 // aren't trying to now update it to call a different clone, which is
4315 // indicative of a bug in the graph or function assignment.
4316 auto CurCalleeCloneNo = getMemProfCloneNum(*CurF);
4317 if (CurCalleeCloneNo != NewCalleeCloneNo) {
4318 LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
4319 << CurCalleeCloneNo << " now " << NewCalleeCloneNo
4320 << "\n");
4321 MismatchedCloneAssignments++;
4322 }
4323 }
4324 if (NewCalleeCloneNo > 0)
4325 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
4326 OREGetter(CallerCall.call()->getFunction())
4327 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
4328 << ore::NV("Call", CallerCall.call()) << " in clone "
4329 << ore::NV("Caller", CallerCall.call()->getFunction())
4330 << " assigned to call function clone "
4331 << ore::NV("Callee", CalleeFunc.func()));
4332}
4333
4334void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
4335 FuncInfo CalleeFunc) {
4336 auto *CI = cast<CallsiteInfo *>(CallerCall.call());
4337 assert(CI &&
4338 "Caller cannot be an allocation which should not have profiled calls");
4339 assert(CI->Clones.size() > CallerCall.cloneNo());
4340 auto NewCalleeCloneNo = CalleeFunc.cloneNo();
4341 auto &CurCalleeCloneNo = CI->Clones[CallerCall.cloneNo()];
4342 // If we already assigned this callsite to call a specific non-default
4343 // clone (i.e. not the original function which is clone 0), ensure that we
4344 // aren't trying to now update it to call a different clone, which is
4345 // indicative of a bug in the graph or function assignment.
4346 if (CurCalleeCloneNo != 0 && CurCalleeCloneNo != NewCalleeCloneNo) {
4347 LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
4348 << CurCalleeCloneNo << " now " << NewCalleeCloneNo
4349 << "\n");
4350 MismatchedCloneAssignments++;
4351 }
4352 CurCalleeCloneNo = NewCalleeCloneNo;
4353}
4354
4355// Update the debug information attached to NewFunc to use the clone Name. Note
4356// this needs to be done for both any existing DISubprogram for the definition,
4357 // as well as any separate declaration DISubprogram.
4358 static void updateSubprogramLinkageName(Function *NewFunc, StringRef Name) {
4359 assert(Name == NewFunc->getName());
4360 auto *SP = NewFunc->getSubprogram();
4361 if (!SP)
4362 return;
4363 auto *MDName = MDString::get(NewFunc->getParent()->getContext(), Name);
4364 SP->replaceLinkageName(MDName);
4365 DISubprogram *Decl = SP->getDeclaration();
4366 if (!Decl)
4367 return;
4368 TempDISubprogram NewDecl = Decl->clone();
4369 NewDecl->replaceLinkageName(MDName);
4370 SP->replaceDeclaration(MDNode::replaceWithUniqued(std::move(NewDecl)));
4371}
4372
4373CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
4374 Instruction *>::FuncInfo
4375ModuleCallsiteContextGraph::cloneFunctionForCallsite(
4376 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
4377 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4378 // Use existing LLVM facilities for cloning and obtaining Call in clone
4379 ValueToValueMapTy VMap;
4380 auto *NewFunc = CloneFunction(Func.func(), VMap);
4381 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
4382 assert(!Func.func()->getParent()->getFunction(Name));
4383 NewFunc->setName(Name);
4384 updateSubprogramLinkageName(NewFunc, Name);
4385 for (auto &Inst : CallsWithMetadataInFunc) {
4386 // This map always has the initial version in it.
4387 assert(Inst.cloneNo() == 0);
4388 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
4389 }
4390 OREGetter(Func.func())
4391 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
4392 << "created clone " << ore::NV("NewFunction", NewFunc));
4393 return {NewFunc, CloneNo};
4394}
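// Clones produced here are renamed via getMemProfFuncName, which appends a
// ".memprof.<CloneNo>" suffix to the original name, so e.g. clone 2 of a
// function "foo" becomes "foo.memprof.2" (clone 0 keeps the original name).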
4395
4396CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
4397 IndexCall>::FuncInfo
4398IndexCallsiteContextGraph::cloneFunctionForCallsite(
4399 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
4400 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4401 // Check how many clones we have of Call (and therefore function).
4402 // The next clone number is the current size of versions array.
4403 // Confirm this matches the CloneNo provided by the caller, which is based on
4404 // the number of function clones we have.
4405 assert(CloneNo == (isa<AllocInfo *>(Call.call())
4406 ? cast<AllocInfo *>(Call.call())->Versions.size()
4407 : cast<CallsiteInfo *>(Call.call())->Clones.size()));
4408 // Walk all the instructions in this function. Create a new version for
4409 // each (by adding an entry to the Versions/Clones summary array), and copy
4410 // over the version being called for the function clone being cloned here.
4411 // Additionally, add an entry to the CallMap for the new function clone,
4412 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
4413 // to the new call clone.
4414 for (auto &Inst : CallsWithMetadataInFunc) {
4415 // This map always has the initial version in it.
4416 assert(Inst.cloneNo() == 0);
4417 if (auto *AI = dyn_cast<AllocInfo *>(Inst.call())) {
4418 assert(AI->Versions.size() == CloneNo);
4419 // We assign the allocation type later (in updateAllocationCall), just add
4420 // an entry for it here.
4421 AI->Versions.push_back(0);
4422 } else {
4423 auto *CI = cast<CallsiteInfo *>(Inst.call());
4424 assert(CI && CI->Clones.size() == CloneNo);
4425 // We assign the clone number later (in updateCall), just add an entry for
4426 // it here.
4427 CI->Clones.push_back(0);
4428 }
4429 CallMap[Inst] = {Inst.call(), CloneNo};
4430 }
4431 return {Func.func(), CloneNo};
4432}
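// Note that no IR is duplicated here in the index case: "cloning" just
// extends the per-callsite Versions/Clones summary arrays, and the actual
// function clones are materialized later when the backend applies the index.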
4433
4434// We perform cloning for each allocation node separately. However, this
4435// sometimes results in a situation where the same node calls multiple
4436// clones of the same callee, created for different allocations. This
4437// causes issues when assigning functions to these clones, as each node can
4438// in reality only call a single callee clone.
4439//
4440// To address this, before assigning functions, merge callee clone nodes as
4441// needed using a post order traversal from the allocations. We attempt to
4442// use existing clones as the merge node when legal, and to share them
4443// among callers with the same properties (callers calling the same set of
4444// callee clone nodes for the same allocations).
4445//
4446// Without this fix, in some cases incorrect function assignment will lead
4447// to calling the wrong allocation clone.
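// For example, suppose callsite node C has edges to clone D.1 (created while
// cloning for allocation A1) and clone D.2 (created for allocation A2). The
// single call in C's function cannot target two different callee clones, so
// D.1 and D.2 are merged (possibly via a new merge clone) and C's edges are
// redirected to the single merged node.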
4448template <typename DerivedCCG, typename FuncTy, typename CallTy>
4449void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones() {
4450 if (!MergeClones)
4451 return;
4452
4453 // Generate a map from context id to the associated allocation node for use
4454 // when merging clones.
4455 DenseMap<uint32_t, ContextNode *> ContextIdToAllocationNode;
4456 for (auto &Entry : AllocationCallToContextNodeMap) {
4457 auto *Node = Entry.second;
4458 for (auto Id : Node->getContextIds())
4459 ContextIdToAllocationNode[Id] = Node->getOrigNode();
4460 for (auto *Clone : Node->Clones) {
4461 for (auto Id : Clone->getContextIds())
4462 ContextIdToAllocationNode[Id] = Clone->getOrigNode();
4463 }
4464 }
4465
4466 // Post order traversal starting from allocations to ensure each callsite
4467 // calls a single clone of its callee. Callee nodes that are clones of each
4468 // other are merged (via new merge nodes if needed) to achieve this.
4469 DenseSet<const ContextNode *> Visited;
4470 for (auto &Entry : AllocationCallToContextNodeMap) {
4471 auto *Node = Entry.second;
4472
4473 mergeClones(Node, Visited, ContextIdToAllocationNode);
4474
4475 // Make a copy so the recursive post order traversal that may create new
4476 // clones doesn't mess up iteration. Note that the recursive traversal
4477 // itself does not call mergeClones on any of these nodes, which are all
4478 // (clones of) allocations.
4479 auto Clones = Node->Clones;
4480 for (auto *Clone : Clones)
4481 mergeClones(Clone, Visited, ContextIdToAllocationNode);
4482 }
4483
4484 if (DumpCCG) {
4485 dbgs() << "CCG after merging:\n";
4486 dbgs() << *this;
4487 }
4488 if (ExportToDot)
4489 exportToDot("aftermerge");
4490
4491 if (VerifyCCG) {
4492 check();
4493 }
4494}
4495
4496// Recursive helper for above mergeClones method.
4497template <typename DerivedCCG, typename FuncTy, typename CallTy>
4498void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones(
4499 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4500 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4501 auto Inserted = Visited.insert(Node);
4502 if (!Inserted.second)
4503 return;
4504
4505 // Iteratively perform merging on this node to handle new caller nodes created
4506 // during the recursive traversal. We could do something more elegant such as
4507 // maintain a worklist, but this is a simple approach that doesn't cause a
4508 // measurable compile time effect, as most nodes don't have many caller
4509 // edges to check.
4510 bool FoundUnvisited = true;
4511 unsigned Iters = 0;
4512 while (FoundUnvisited) {
4513 Iters++;
4514 FoundUnvisited = false;
4515 // Make a copy since the recursive call may move a caller edge to a new
4516 // callee, messing up the iterator.
4517 auto CallerEdges = Node->CallerEdges;
4518 for (auto CallerEdge : CallerEdges) {
4519 // Skip any caller edge moved onto a different callee during recursion.
4520 if (CallerEdge->Callee != Node)
4521 continue;
4522 // If we found an unvisited caller, note that we should check the caller
4523 // edges again as mergeClones may add or change caller nodes.
4524 if (DoMergeIteration && !Visited.contains(CallerEdge->Caller))
4525 FoundUnvisited = true;
4526 mergeClones(CallerEdge->Caller, Visited, ContextIdToAllocationNode);
4527 }
4528 }
4529
4530 TotalMergeInvokes++;
4531 TotalMergeIters += Iters;
4532 if (Iters > MaxMergeIters)
4533 MaxMergeIters = Iters;
4534
4535 // Merge for this node after we handle its callers.
4536 mergeNodeCalleeClones(Node, Visited, ContextIdToAllocationNode);
4537}
4538
4539template <typename DerivedCCG, typename FuncTy, typename CallTy>
4540void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeNodeCalleeClones(
4541 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4542 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4543 // Ignore Node if we moved all of its contexts to clones.
4544 if (Node->emptyContextIds())
4545 return;
4546
4547 // First identify groups of clones among Node's callee edges, by building
4548 // a map from each callee base node to the associated callee edges from Node.
4549 MapVector<ContextNode *, std::vector<std::shared_ptr<ContextEdge>>>
4550 OrigNodeToCloneEdges;
4551 for (const auto &E : Node->CalleeEdges) {
4552 auto *Callee = E->Callee;
4553 if (!Callee->CloneOf && Callee->Clones.empty())
4554 continue;
4555 ContextNode *Base = Callee->getOrigNode();
4556 OrigNodeToCloneEdges[Base].push_back(E);
4557 }
4558
4559 // Helper for callee edge sorting below. Return true if A's callee has fewer
4560 // caller edges than B's callee, or if A's callee is a clone and B's is not,
4561 // or if A's first context id is smaller than B's.
4562 auto CalleeCallerEdgeLessThan = [](const std::shared_ptr<ContextEdge> &A,
4563 const std::shared_ptr<ContextEdge> &B) {
4564 if (A->Callee->CallerEdges.size() != B->Callee->CallerEdges.size())
4565 return A->Callee->CallerEdges.size() < B->Callee->CallerEdges.size();
4566 if (A->Callee->CloneOf && !B->Callee->CloneOf)
4567 return true;
4568 else if (!A->Callee->CloneOf && B->Callee->CloneOf)
4569 return false;
4570 // Use the first context id for each edge as a
4571 // tie-breaker.
4572 return *A->ContextIds.begin() < *B->ContextIds.begin();
4573 };
4574
4575 // Process each set of callee clones called by Node, performing the needed
4576 // merging.
4577 for (auto Entry : OrigNodeToCloneEdges) {
4578 // CalleeEdges is the set of edges from Node reaching callees that are
4579 // mutual clones of each other.
4580 auto &CalleeEdges = Entry.second;
4581 auto NumCalleeClones = CalleeEdges.size();
4582 // A single edge means there is no merging needed.
4583 if (NumCalleeClones == 1)
4584 continue;
4585 // Sort the CalleeEdges calling this group of clones in ascending order of
4586 // their caller edge counts, putting the original non-clone node first in
4587 // cases of a tie. This simplifies finding an existing node to use as the
4588 // merge node.
4589 llvm::stable_sort(CalleeEdges, CalleeCallerEdgeLessThan);
4590
4591 /// Find other callers of the given set of callee edges that can
4592 /// share the same callee merge node. See the comments at this method
4593 /// definition for details.
4594 DenseSet<ContextNode *> OtherCallersToShareMerge;
4595 findOtherCallersToShareMerge(Node, CalleeEdges, ContextIdToAllocationNode,
4596 OtherCallersToShareMerge);
4597
4598 // Now do the actual merging. Identify existing or create a new MergeNode
4599 // during the first iteration. Move each callee over, along with edges from
4600 // other callers we've determined above can share the same merge node.
4601 ContextNode *MergeNode = nullptr;
4602 DenseMap<ContextNode *, unsigned> CallerToMoveCount;
4603 for (auto CalleeEdge : CalleeEdges) {
4604 auto *OrigCallee = CalleeEdge->Callee;
4605 // If we don't have a MergeNode yet (only happens on the first iteration,
4606 // as a new one will be created when we go to move the first callee edge
4607 // over as needed), see if we can use this callee.
4608 if (!MergeNode) {
4609 // If there are no other callers, simply use this callee.
4610 if (CalleeEdge->Callee->CallerEdges.size() == 1) {
4611 MergeNode = OrigCallee;
4612 NonNewMergedNodes++;
4613 continue;
4614 }
4615 // Otherwise, if we have identified other caller nodes that can share
4616 // the merge node with Node, see if all of OrigCallee's callers are
4617 // going to share the same merge node. In that case we can use callee
4618 // (since all of its callers would move to the new merge node).
4619 if (!OtherCallersToShareMerge.empty()) {
4620 bool MoveAllCallerEdges = true;
4621 for (auto CalleeCallerE : OrigCallee->CallerEdges) {
4622 if (CalleeCallerE == CalleeEdge)
4623 continue;
4624 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller)) {
4625 MoveAllCallerEdges = false;
4626 break;
4627 }
4628 }
4629 // If we are going to move all callers over, we can use this callee as
4630 // the MergeNode.
4631 if (MoveAllCallerEdges) {
4632 MergeNode = OrigCallee;
4633 NonNewMergedNodes++;
4634 continue;
4635 }
4636 }
4637 }
4638 // Move this callee edge, creating a new merge node if necessary.
4639 if (MergeNode) {
4640 assert(MergeNode != OrigCallee);
4641 moveEdgeToExistingCalleeClone(CalleeEdge, MergeNode,
4642 /*NewClone*/ false);
4643 } else {
4644 MergeNode = moveEdgeToNewCalleeClone(CalleeEdge);
4645 NewMergedNodes++;
4646 }
4647 // Now move all identified edges from other callers over to the merge node
4648 // as well.
4649 if (!OtherCallersToShareMerge.empty()) {
4650 // Make and iterate over a copy of OrigCallee's caller edges because
4651 // some of these will be moved off of the OrigCallee and that would mess
4652 // up the iteration from OrigCallee.
4653 auto OrigCalleeCallerEdges = OrigCallee->CallerEdges;
4654 for (auto &CalleeCallerE : OrigCalleeCallerEdges) {
4655 if (CalleeCallerE == CalleeEdge)
4656 continue;
4657 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller))
4658 continue;
4659 CallerToMoveCount[CalleeCallerE->Caller]++;
4660 moveEdgeToExistingCalleeClone(CalleeCallerE, MergeNode,
4661 /*NewClone*/ false);
4662 }
4663 }
4664 removeNoneTypeCalleeEdges(OrigCallee);
4665 removeNoneTypeCalleeEdges(MergeNode);
4666 }
4667 }
4668}
4669
4670// Look for other nodes that have edges to the same set of callee
4671// clones as the current Node. Those can share the eventual merge node
4672// (reducing cloning and binary size overhead) iff:
4673// - they have edges to the same set of callee clones
4674// - each callee edge reaches a subset of the same allocations as Node's
4675// corresponding edge to the same callee clone.
4676// The second requirement is to ensure that we don't undo any of the
4677// necessary cloning to distinguish contexts with different allocation
4678// behavior.
4679// FIXME: This is somewhat conservative, as we really just need to ensure
4680// that they don't reach the same allocations as contexts on edges from Node
4681// going to any of the *other* callee clones being merged. However, that
4682// requires more tracking and checking to get right.
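// For example, if caller nodes N and M both have edges to the same callee
// clones D.1 and D.2, and each of M's edges only reaches allocations also
// reached by N's corresponding edge to that clone, then M can be redirected
// to the merge node created for N rather than forcing additional cloning.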
4683template <typename DerivedCCG, typename FuncTy, typename CallTy>
4684void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
4685 findOtherCallersToShareMerge(
4686 ContextNode *Node,
4687 std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
4688 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
4689 DenseSet<ContextNode *> &OtherCallersToShareMerge) {
4690 auto NumCalleeClones = CalleeEdges.size();
4691 // This map counts, for each other caller node, how many of these callee
4692 // clones it also has an edge to.
4693 DenseMap<ContextNode *, unsigned> OtherCallersToSharedCalleeEdgeCount;
4694 // Counts the number of other caller nodes that have edges to all callee
4695 // clones that don't violate the allocation context checking.
4696 unsigned PossibleOtherCallerNodes = 0;
4697
4698 // We only need to look at other Caller nodes if the first callee edge has
4699 // multiple callers (recall they are sorted in ascending order above).
4700 if (CalleeEdges[0]->Callee->CallerEdges.size() < 2)
4701 return;
4702
4703 // For each callee edge:
4704 // - Collect the count of other caller nodes calling the same callees.
4705 // - Collect the alloc nodes reached by contexts on each callee edge.
4706 DenseMap<ContextEdge *, DenseSet<ContextNode *>> CalleeEdgeToAllocNodes;
4707 for (auto CalleeEdge : CalleeEdges) {
4708 assert(CalleeEdge->Callee->CallerEdges.size() > 1);
4709 // For each other caller of the same callee, increment the count of
4710 // edges reaching the same callee clone.
4711 for (auto CalleeCallerEdges : CalleeEdge->Callee->CallerEdges) {
4712 if (CalleeCallerEdges->Caller == Node) {
4713 assert(CalleeCallerEdges == CalleeEdge);
4714 continue;
4715 }
4716 OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller]++;
4717 // If this caller edge now reaches all of the same callee clones,
4718 // increment the count of candidate other caller nodes.
4719 if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller] ==
4720 NumCalleeClones)
4721 PossibleOtherCallerNodes++;
4722 }
4723 // Collect the alloc nodes reached by contexts on each callee edge, for
4724 // later analysis.
4725 for (auto Id : CalleeEdge->getContextIds()) {
4726 auto *Alloc = ContextIdToAllocationNode.lookup(Id);
4727 if (!Alloc) {
4728 // FIXME: unclear why this happens occasionally, presumably due to
4729 // imperfect graph updates, possibly involving recursion.
4730 MissingAllocForContextId++;
4731 continue;
4732 }
4733 CalleeEdgeToAllocNodes[CalleeEdge.get()].insert(Alloc);
4734 }
4735 }
4736
4737 // Now walk the callee edges again, and make sure that for each candidate
4738 // caller node all of its edges to the callees reach the same allocs (or
4739 // a subset) as those along the corresponding callee edge from Node.
4740 for (auto CalleeEdge : CalleeEdges) {
4741 assert(CalleeEdge->Callee->CallerEdges.size() > 1);
4742 // Stop if we do not have any (more) candidate other caller nodes.
4743 if (!PossibleOtherCallerNodes)
4744 break;
4745 auto &CurCalleeAllocNodes = CalleeEdgeToAllocNodes[CalleeEdge.get()];
4746 // Check each other caller of this callee clone.
4747 for (auto &CalleeCallerE : CalleeEdge->Callee->CallerEdges) {
4748 // Not interested in the callee edge from Node itself.
4749 if (CalleeCallerE == CalleeEdge)
4750 continue;
4751 // Skip any callers that didn't have callee edges to all the same
4752 // callee clones.
4753 if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] !=
4754 NumCalleeClones)
4755 continue;
4756 // Make sure that each context along edge from candidate caller node
4757 // reaches an allocation also reached by this callee edge from Node.
4758 for (auto Id : CalleeCallerE->getContextIds()) {
4759 auto *Alloc = ContextIdToAllocationNode.lookup(Id);
4760 if (!Alloc)
4761 continue;
4762 // If not, simply reset the map entry to 0 so caller is ignored, and
4763 // reduce the count of candidate other caller nodes.
4764 if (!CurCalleeAllocNodes.contains(Alloc)) {
4765 OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] = 0;
4766 PossibleOtherCallerNodes--;
4767 break;
4768 }
4769 }
4770 }
4771 }
4772
4773 if (!PossibleOtherCallerNodes)
4774 return;
4775
4776 // Build the set of other caller nodes that can use the same callee merge
4777 // node.
4778 for (auto &[OtherCaller, Count] : OtherCallersToSharedCalleeEdgeCount) {
4779 if (Count != NumCalleeClones)
4780 continue;
4781 OtherCallersToShareMerge.insert(OtherCaller);
4782 }
4783}
4784
4785// This method assigns cloned callsites to functions, cloning the functions as
4786// needed. The assignment is greedy and proceeds roughly as follows:
4787//
4788// For each function Func:
4789// For each call with graph Node having clones:
4790// Initialize ClonesWorklist to Node and its clones
4791// Initialize NodeCloneCount to 0
4792// While ClonesWorklist is not empty:
4793// Clone = pop front ClonesWorklist
4794// NodeCloneCount++
4795// If Func has been cloned less than NodeCloneCount times:
4796// If NodeCloneCount is 1:
4797// Assign Clone to original Func
4798// Continue
4799// Create a new function clone
4800// If other callers not assigned to call a function clone yet:
4801// Assign them to call new function clone
4802// Continue
4803// Assign any other caller calling the cloned version to new clone
4804//
4805// For each caller of Clone:
4806// If caller is assigned to call a specific function clone:
4807// If we cannot assign Clone to that function clone:
4808// Create new callsite Clone NewClone
4809// Add NewClone to ClonesWorklist
4810// Continue
4811// Assign Clone to existing caller's called function clone
4812// Else:
4813// If Clone not already assigned to a function clone:
4814// Assign to first function clone without assignment
4815// Assign caller to selected function clone
4816// For each call with graph Node having clones:
4817// If number func clones > number call's callsite Node clones:
4818// Record func CallInfo clones without Node clone in UnassignedCallClones
4819// For callsite Nodes in DFS order from allocations:
4820// If IsAllocation:
4821// Update allocation with alloc type
4822// Else:
4823// For Call, all MatchingCalls, and associated UnnassignedCallClones:
4824// Update call to call recorded callee clone
4825//
4826template <typename DerivedCCG, typename FuncTy, typename CallTy>
4827bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
4828 bool Changed = false;
4829
4830 mergeClones();
4831
4832 // Keep track of the assignment of nodes (callsites) to function clones they
4833 // call.
4834 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
4835
4836 // Update caller node to call function version CalleeFunc, by recording the
4837 // assignment in CallsiteToCalleeFuncCloneMap.
4838 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
4839 const FuncInfo &CalleeFunc) {
4840 assert(Caller->hasCall());
4841 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
4842 };
4843
4844 // Information for a single clone of this Func.
4845 struct FuncCloneInfo {
4846 // The function clone.
4847 FuncInfo FuncClone;
4848 // Remappings of each call of interest (from original uncloned call to the
4849 // corresponding cloned call in this function clone).
4850 DenseMap<CallInfo, CallInfo> CallMap;
4851 };
4852
4853 // Map to keep track of information needed to update calls in function clones
4854 // when their corresponding callsite node was not itself cloned for that
4855 // function clone. Because of call context pruning (i.e. we only keep as much
4856 // caller information as needed to distinguish hot vs cold), we may not have
4857 // caller edges coming to each callsite node from all possible function
4858 // callers. A function clone may get created for other callsites in the
4859 // function for which there are caller edges that were not pruned. Any other
4860 // callsites in that function clone, which were not themselves cloned for
4861 // that function clone, should get updated the same way as the corresponding
4862 // callsite in the original function (which may call a clone of its callee).
4863 //
4864 // We build this map after completing function cloning for each function, so
4865 // that we can record the information from its call maps before they are
4866 // destructed. The map will be used as we update calls to update any still
4867 // unassigned call clones. Note that we may create new node clones as we clone
4868 // other functions, so later on we check which node clones were still not
4869 // created. To this end, the inner map is a map from function clone number to
4870 // the list of calls cloned for that function (can be more than one due to the
4871 // Node's MatchingCalls array).
4872 //
4873 // The alternative is creating new callsite clone nodes below as we clone the
4874 // function, but that is trickier to get right and likely has more overhead.
4875 //
4876 // Inner map is a std::map so sorted by key (clone number), in order to get
4877 // ordered remarks in the full LTO case.
4878 DenseMap<const ContextNode *, std::map<unsigned, SmallVector<CallInfo, 0>>>
4879 UnassignedCallClones;
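// For example, if callsites C1 and C2 live in the same function, C1's node
// required three function clones but C2's node was only cloned once, then the
// copies of C2's call in the function clones without a matching node clone
// are recorded here (keyed by C2's node and the function clone number) and
// later updated the same way as C2's original call.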
4880
4881 // Walk all functions for which we saw calls with memprof metadata, and handle
4882 // cloning for each of its calls.
4883 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
4884 FuncInfo OrigFunc(Func);
4885 // Map from each clone number of OrigFunc to information about that function
4886 // clone (the function clone FuncInfo and call remappings). The index into
4887 // the vector is the clone number, as function clones are created and
4888 // numbered sequentially.
4889 std::vector<FuncCloneInfo> FuncCloneInfos;
4890 for (auto &Call : CallsWithMetadata) {
4891 ContextNode *Node = getNodeForInst(Call);
4892 // Skip call if we do not have a node for it (all uses of its stack ids
4893 // were either on inlined chains or pruned from the MIBs), or if we did
4894 // not create any clones for it.
4895 if (!Node || Node->Clones.empty())
4896 continue;
4897 assert(Node->hasCall() &&
4898 "Not having a call should have prevented cloning");
4899
4900 // Track the assignment of function clones to clones of the current
4901 // callsite Node being handled.
4902 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
4903
4904 // Assign callsite version CallsiteClone to function version FuncClone,
4905 // and also assign (possibly cloned) Call to CallsiteClone.
4906 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
4907 CallInfo &Call,
4908 ContextNode *CallsiteClone,
4909 bool IsAlloc) {
4910 // Record the clone of callsite node assigned to this function clone.
4911 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
4912
4913 assert(FuncCloneInfos.size() > FuncClone.cloneNo());
4914 DenseMap<CallInfo, CallInfo> &CallMap =
4915 FuncCloneInfos[FuncClone.cloneNo()].CallMap;
4916 CallInfo CallClone(Call);
4917 if (auto It = CallMap.find(Call); It != CallMap.end())
4918 CallClone = It->second;
4919 CallsiteClone->setCall(CallClone);
4920 // Need to do the same for all matching calls.
4921 for (auto &MatchingCall : Node->MatchingCalls) {
4922 CallInfo CallClone(MatchingCall);
4923 if (auto It = CallMap.find(MatchingCall); It != CallMap.end())
4924 CallClone = It->second;
4925 // Updates the call in the list.
4926 MatchingCall = CallClone;
4927 }
4928 };
4929
4930 // Invokes moveEdgeToNewCalleeClone which creates a new clone, and then
4931 // performs the necessary fixups (removing none type edges, and
4932 // importantly, propagating any function call assignment of the original
4933 // node to the new clone).
4934 auto MoveEdgeToNewCalleeCloneAndSetUp =
4935 [&](const std::shared_ptr<ContextEdge> &Edge) {
4936 ContextNode *OrigCallee = Edge->Callee;
4937 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge);
4938 removeNoneTypeCalleeEdges(NewClone);
4939 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
4940 // If the original Callee was already assigned to call a specific
4941 // function version, make sure its new clone is assigned to call
4942 // that same function clone.
4943 if (CallsiteToCalleeFuncCloneMap.count(OrigCallee))
4944 RecordCalleeFuncOfCallsite(
4945 NewClone, CallsiteToCalleeFuncCloneMap[OrigCallee]);
4946 return NewClone;
4947 };
4948
4949 // Keep track of the clones of callsite Node that need to be assigned to
4950 // function clones. This list may be expanded in the loop body below if we
4951 // find additional cloning is required.
4952 std::deque<ContextNode *> ClonesWorklist;
4953 // Ignore original Node if we moved all of its contexts to clones.
4954 if (!Node->emptyContextIds())
4955 ClonesWorklist.push_back(Node);
4956 llvm::append_range(ClonesWorklist, Node->Clones);
4957
4958 // Now walk through all of the clones of this callsite Node that we need,
4959 // and determine the assignment to a corresponding clone of the current
4960 // function (creating new function clones as needed).
4961 unsigned NodeCloneCount = 0;
4962 while (!ClonesWorklist.empty()) {
4963 ContextNode *Clone = ClonesWorklist.front();
4964 ClonesWorklist.pop_front();
4965 NodeCloneCount++;
4966 if (VerifyNodes)
4967 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
4968
4969 // Need to create a new function clone if we have more callsite clones
4970 // than existing function clones, which would have been assigned to an
4971 // earlier clone in the list (we assign callsite clones to function
4972 // clones greedily).
4973 if (FuncCloneInfos.size() < NodeCloneCount) {
4974 // If this is the first callsite copy, assign to original function.
4975 if (NodeCloneCount == 1) {
4976 // Since FuncCloneInfos is empty in this case, no clones have
4977 // been created for this function yet, and no callers should have
4978 // been assigned a function clone for this callee node yet.
4979 assert(llvm::none_of(
4980 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
4981 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
4982 }));
4983 // Initialize with empty call map, assign Clone to original function
4984 // and its callers, and skip to the next clone.
4985 FuncCloneInfos.push_back(
4986 {OrigFunc, DenseMap<CallInfo, CallInfo>()});
4987 AssignCallsiteCloneToFuncClone(
4988 OrigFunc, Call, Clone,
4989 AllocationCallToContextNodeMap.count(Call));
4990 for (auto &CE : Clone->CallerEdges) {
4991 // Ignore any caller that does not have a recorded callsite Call.
4992 if (!CE->Caller->hasCall())
4993 continue;
4994 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
4995 }
4996 continue;
4997 }
4998
4999 // First locate which copy of OrigFunc to clone again. If a caller
5000 // of this callsite clone was already assigned to call a particular
5001 // function clone, we need to redirect all of those callers to the
5002 // new function clone, and update their other callees within this
5003 // function.
5004 FuncInfo PreviousAssignedFuncClone;
5005 auto EI = llvm::find_if(
5006 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
5007 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
5008 });
5009 bool CallerAssignedToCloneOfFunc = false;
5010 if (EI != Clone->CallerEdges.end()) {
5011 const std::shared_ptr<ContextEdge> &Edge = *EI;
5012 PreviousAssignedFuncClone =
5013 CallsiteToCalleeFuncCloneMap[Edge->Caller];
5014 CallerAssignedToCloneOfFunc = true;
5015 }
5016
5017 // Clone function and save it along with the CallInfo map created
5018 // during cloning in the FuncCloneInfos.
5019 DenseMap<CallInfo, CallInfo> NewCallMap;
5020 unsigned CloneNo = FuncCloneInfos.size();
5021 assert(CloneNo > 0 && "Clone 0 is the original function, which "
5022 "should already exist in the map");
5023 FuncInfo NewFuncClone = cloneFunctionForCallsite(
5024 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
5025 FuncCloneInfos.push_back({NewFuncClone, std::move(NewCallMap)});
5026 FunctionClonesAnalysis++;
5027 Changed = true;
5028
5029 // If no caller callsites were already assigned to a clone of this
5030 // function, we can simply assign this clone to the new func clone
5031 // and update all callers to it, then skip to the next clone.
5032 if (!CallerAssignedToCloneOfFunc) {
5033 AssignCallsiteCloneToFuncClone(
5034 NewFuncClone, Call, Clone,
5035 AllocationCallToContextNodeMap.count(Call));
5036 for (auto &CE : Clone->CallerEdges) {
5037 // Ignore any caller that does not have a recorded callsite Call.
5038 if (!CE->Caller->hasCall())
5039 continue;
5040 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
5041 }
5042 continue;
5043 }
5044
5045 // We may need to do additional node cloning in this case.
5046 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
5047 // that were previously assigned to call PreviousAssignedFuncClone,
5048 // to record that they now call NewFuncClone.
5049 // The none type edge removal may remove some of this Clone's caller
5050 // edges, if it is reached via another of its caller's callees.
5051 // Iterate over a copy and skip any that were removed.
5052 auto CallerEdges = Clone->CallerEdges;
5053 for (auto CE : CallerEdges) {
5054 // Skip any that have been removed on an earlier iteration.
5055 if (CE->isRemoved()) {
5056 assert(!is_contained(Clone->CallerEdges, CE));
5057 continue;
5058 }
5059 assert(CE);
5060 // Ignore any caller that does not have a recorded callsite Call.
5061 if (!CE->Caller->hasCall())
5062 continue;
5063
5064 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
5065 // We subsequently fall through to later handling that
5066 // will perform any additional cloning required for
5067 // callers that were calling other function clones.
5068 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
5069 PreviousAssignedFuncClone)
5070 continue;
5071
5072 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
5073
5074 // If we are cloning a function that was already assigned to some
5075 // callers, then essentially we are creating new callsite clones
5076 // of the other callsites in that function that are reached by those
5077 // callers. Clone the other callees of the current callsite's caller
5078 // that were already assigned to PreviousAssignedFuncClone
5079 // accordingly. This is important since we subsequently update the
5080 // calls from the nodes in the graph and their assignments to callee
5081 // functions recorded in CallsiteToCalleeFuncCloneMap.
5082 // The none type edge removal may remove some of this caller's
5083 // callee edges, if it is reached via another of its callees.
5084 // Iterate over a copy and skip any that were removed.
5085 auto CalleeEdges = CE->Caller->CalleeEdges;
5086 for (auto CalleeEdge : CalleeEdges) {
5087 // Skip any that have been removed on an earlier iteration when
5088 // cleaning up newly None type callee edges.
5089 if (CalleeEdge->isRemoved()) {
5090 assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge));
5091 continue;
5092 }
5093 assert(CalleeEdge);
5094 ContextNode *Callee = CalleeEdge->Callee;
5095 // Skip the current callsite (we are looking for other
5096 // callsites Caller calls), as well as any that does not have a
5097 // recorded callsite Call.
5098 if (Callee == Clone || !Callee->hasCall())
5099 continue;
5100 // Skip direct recursive calls. We don't need/want to clone the
5101 // caller node again, and this loop will not behave as expected if
5102 // we tried.
5103 if (Callee == CalleeEdge->Caller)
5104 continue;
5105 ContextNode *NewClone =
5106 MoveEdgeToNewCalleeCloneAndSetUp(CalleeEdge);
5107 // Moving the edge may have resulted in some none type
5108 // callee edges on the original Callee.
5109 removeNoneTypeCalleeEdges(Callee);
5110 // Update NewClone with the new Call clone of this callsite's Call
5111 // created for the new function clone created earlier.
5112 // Recall that we have already ensured when building the graph
5113 // that each caller can only call callsites within the same
5114 // function, so we are guaranteed that Callee Call is in the
5115 // current OrigFunc.
5116 // CallMap is set up as indexed by original Call at clone 0.
5117 CallInfo OrigCall(Callee->getOrigNode()->Call);
5118 OrigCall.setCloneNo(0);
5119 DenseMap<CallInfo, CallInfo> &CallMap =
5120 FuncCloneInfos[NewFuncClone.cloneNo()].CallMap;
5121 assert(CallMap.count(OrigCall));
5122 CallInfo NewCall(CallMap[OrigCall]);
5123 assert(NewCall);
5124 NewClone->setCall(NewCall);
5125 // Need to do the same for all matching calls.
5126 for (auto &MatchingCall : NewClone->MatchingCalls) {
5127 CallInfo OrigMatchingCall(MatchingCall);
5128 OrigMatchingCall.setCloneNo(0);
5129 assert(CallMap.count(OrigMatchingCall));
5130 CallInfo NewCall(CallMap[OrigMatchingCall]);
5131 assert(NewCall);
5132 // Updates the call in the list.
5133 MatchingCall = NewCall;
5134 }
5135 }
5136 }
5137 // Fall through to handling below to perform the recording of the
5138 // function for this callsite clone. This enables handling of cases
5139 // where the callers were assigned to different clones of a function.
5140 }
5141
5142 auto FindFirstAvailFuncClone = [&]() {
5143 // Find first function in FuncCloneInfos without an assigned
5144 // clone of this callsite Node. We should always have one
5145 // available at this point due to the earlier cloning when the
5146 // FuncCloneInfos size was smaller than the clone number.
5147 for (auto &CF : FuncCloneInfos) {
5148 if (!FuncCloneToCurNodeCloneMap.count(CF.FuncClone))
5149 return CF.FuncClone;
5150 }
5151 llvm_unreachable(
5152 "Expected an available func clone for this callsite clone");
5153 };
5154
5155 // See if we can use existing function clone. Walk through
5156 // all caller edges to see if any have already been assigned to
5157 // a clone of this callsite's function. If we can use it, do so. If not,
5158 // because that function clone is already assigned to a different clone
5159 // of this callsite, then we need to clone again.
5160 // Basically, this checking is needed to handle the case where different
5161 // caller functions/callsites may need versions of this function
5162 // containing different mixes of callsite clones across the different
5163 // callsites within the function. If that happens, we need to create
5164 // additional function clones to handle the various combinations.
5165 //
5166 // Keep track of any new clones of this callsite created by the
5167 // following loop, as well as any existing clone that we decided to
5168 // assign this clone to.
5169 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
5170 FuncInfo FuncCloneAssignedToCurCallsiteClone;
5171 // Iterate over a copy of Clone's caller edges, since we may need to
5172 // remove edges in the moveEdgeTo* methods, and this simplifies the
5173 // handling and makes it less error-prone.
5174 auto CloneCallerEdges = Clone->CallerEdges;
5175 for (auto &Edge : CloneCallerEdges) {
5176 // Skip removed edges (due to direct recursive edges updated when
5177 // updating callee edges when moving an edge and subsequently
5178 // removed by call to removeNoneTypeCalleeEdges on the Clone).
5179 if (Edge->isRemoved())
5180 continue;
5181 // Ignore any caller that does not have a recorded callsite Call.
5182 if (!Edge->Caller->hasCall())
5183 continue;
5184 // If this caller already assigned to call a version of OrigFunc, need
5185 // to ensure we can assign this callsite clone to that function clone.
5186 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
5187 FuncInfo FuncCloneCalledByCaller =
5188 CallsiteToCalleeFuncCloneMap[Edge->Caller];
5189 // First we need to confirm that this function clone is available
5190 // for use by this callsite node clone.
5191 //
5192 // While FuncCloneToCurNodeCloneMap is built only for this Node and
5193 // its callsite clones, one of those callsite clones X could have
5194 // been assigned to the same function clone called by Edge's caller
5195 // - if Edge's caller calls another callsite within Node's original
5196 // function, and that callsite has another caller reaching clone X.
5197 // We need to clone Node again in this case.
5198 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
5199 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
5200 Clone) ||
5201 // Detect when we have multiple callers of this callsite that
5202 // have already been assigned to specific, and different, clones
5203 // of OrigFunc (due to other unrelated callsites in Func they
5204 // reach via call contexts). Is this Clone of callsite Node
5205 // assigned to a different clone of OrigFunc? If so, clone Node
5206 // again.
5207 (FuncCloneAssignedToCurCallsiteClone &&
5208 FuncCloneAssignedToCurCallsiteClone !=
5209 FuncCloneCalledByCaller)) {
5210 // We need to use a different newly created callsite clone, in
5211 // order to assign it to another new function clone on a
5212 // subsequent iteration over the Clones array (adjusted below).
5213 // Note we specifically do not reset the
5214 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
5215 // when this new clone is processed later we know which version of
5216 // the function to copy (so that other callsite clones we have
5217 // assigned to that function clone are properly cloned over). See
5218 // comments in the function cloning handling earlier.
5219
5220 // Check if we already have cloned this callsite again while
5221 // walking through caller edges, for a caller calling the same
5222 // function clone. If so, we can move this edge to that new clone
5223 // rather than creating yet another new clone.
5224 if (FuncCloneToNewCallsiteCloneMap.count(
5225 FuncCloneCalledByCaller)) {
5226 ContextNode *NewClone =
5227 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
5228 moveEdgeToExistingCalleeClone(Edge, NewClone);
5229 // Cleanup any none type edges cloned over.
5230 removeNoneTypeCalleeEdges(NewClone);
5231 } else {
5232 // Create a new callsite clone.
5233 ContextNode *NewClone = MoveEdgeToNewCalleeCloneAndSetUp(Edge);
5234 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
5235 NewClone;
5236 // Add to list of clones and process later.
5237 ClonesWorklist.push_back(NewClone);
5238 }
5239 // Moving the caller edge may have resulted in some none type
5240 // callee edges.
5241 removeNoneTypeCalleeEdges(Clone);
5242 // We will handle the newly created callsite clone in a subsequent
5243 // iteration over this Node's Clones.
5244 continue;
5245 }
5246
5247 // Otherwise, we can use the function clone already assigned to this
5248 // caller.
5249 if (!FuncCloneAssignedToCurCallsiteClone) {
5250 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
5251 // Assign Clone to FuncCloneCalledByCaller
5252 AssignCallsiteCloneToFuncClone(
5253 FuncCloneCalledByCaller, Call, Clone,
5254 AllocationCallToContextNodeMap.count(Call));
5255 } else
5256 // Don't need to do anything - callsite is already calling this
5257 // function clone.
5258 assert(FuncCloneAssignedToCurCallsiteClone ==
5259 FuncCloneCalledByCaller);
5260
5261 } else {
5262 // We have not already assigned this caller to a version of
5263 // OrigFunc. Do the assignment now.
5264
5265 // First check if we have already assigned this callsite clone to a
5266 // clone of OrigFunc for another caller during this iteration over
5267 // its caller edges.
5268 if (!FuncCloneAssignedToCurCallsiteClone) {
5269 FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
5270 assert(FuncCloneAssignedToCurCallsiteClone);
5271 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
5272 AssignCallsiteCloneToFuncClone(
5273 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
5274 AllocationCallToContextNodeMap.count(Call));
5275 } else
5276 assert(FuncCloneToCurNodeCloneMap
5277 [FuncCloneAssignedToCurCallsiteClone] == Clone);
5278 // Update callers to record function version called.
5279 RecordCalleeFuncOfCallsite(Edge->Caller,
5280 FuncCloneAssignedToCurCallsiteClone);
5281 }
5282 }
5283 // If we didn't assign a function clone to this callsite clone yet, e.g.
5284 // none of its callers has a non-null call, do the assignment here.
5285 // We want to ensure that every callsite clone is assigned to some
5286 // function clone, so that the call updates below work as expected.
5287 // In particular if this is the original callsite, we want to ensure it
5288 // is assigned to the original function, otherwise the original function
5289 // will appear available for assignment to other callsite clones,
5290 // leading to unintended effects. For one, the unknown and not updated
5291 // callers will call into cloned paths leading to the wrong hints,
5292 // because they still call the original function (clone 0). Also,
5293 // because all callsites start out as being clone 0 by default, we can't
5294 // easily distinguish between callsites explicitly assigned to clone 0
5295 // vs those never assigned, which can lead to multiple updates of the
5296 // calls when invoking updateCall below, with mismatched clone values.
5297 // TODO: Add a flag to the callsite nodes or some other mechanism to
5298 // better distinguish and identify callsite clones that are not getting
5299 // assigned to function clones as expected.
5300 if (!FuncCloneAssignedToCurCallsiteClone) {
5301 FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
5302 assert(FuncCloneAssignedToCurCallsiteClone &&
5303 "No available func clone for this callsite clone");
5304 AssignCallsiteCloneToFuncClone(
5305 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
5306 /*IsAlloc=*/AllocationCallToContextNodeMap.contains(Call));
5307 }
5308 }
5309      if (VerifyCCG) {
5310        checkNode<DerivedCCG, FuncTy, CallTy>(Node);
5311        for (const auto &PE : Node->CalleeEdges)
5312          checkEdge<DerivedCCG, FuncTy, CallTy>(PE);
5313        for (const auto &CE : Node->CallerEdges)
5314          checkEdge<DerivedCCG, FuncTy, CallTy>(CE);
5315        for (auto *Clone : Node->Clones) {
5316          checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
5317          for (const auto &PE : Clone->CalleeEdges)
5318            checkEdge<DerivedCCG, FuncTy, CallTy>(PE);
5319          for (const auto &CE : Clone->CallerEdges)
5320            checkEdge<DerivedCCG, FuncTy, CallTy>(CE);
5321        }
5322      }
5323 }
5324
5325 if (FuncCloneInfos.size() < 2)
5326 continue;
5327
5328 // In this case there is more than just the original function copy.
5329 // Record call clones of any callsite nodes in the function that did not
5330 // themselves get cloned for all of the function clones.
5331 for (auto &Call : CallsWithMetadata) {
5332 ContextNode *Node = getNodeForInst(Call);
5333 if (!Node || !Node->hasCall() || Node->emptyContextIds())
5334 continue;
5335 // If Node has enough clones already to cover all function clones, we can
5336 // skip it. Need to add one for the original copy.
5337 // Use >= in case there were clones that were skipped due to having empty
5338 // context ids
5339 if (Node->Clones.size() + 1 >= FuncCloneInfos.size())
5340 continue;
5341 // First collect all function clones we cloned this callsite node for.
5342      // They may not be sequential, e.g. because empty clones were skipped.
5343 DenseSet<unsigned> NodeCallClones;
5344 for (auto *C : Node->Clones)
5345 NodeCallClones.insert(C->Call.cloneNo());
5346 unsigned I = 0;
5347 // Now check all the function clones.
5348 for (auto &FC : FuncCloneInfos) {
5349 // Function clones should be sequential.
5350 assert(FC.FuncClone.cloneNo() == I);
5351 // Skip the first clone which got the original call.
5352 // Also skip any other clones created for this Node.
5353 if (++I == 1 || NodeCallClones.contains(I)) {
5354 continue;
5355 }
5356 // Record the call clones created for this callsite in this function
5357 // clone.
5358 auto &CallVector = UnassignedCallClones[Node][I];
5359 DenseMap<CallInfo, CallInfo> &CallMap = FC.CallMap;
5360 if (auto It = CallMap.find(Call); It != CallMap.end()) {
5361 CallInfo CallClone = It->second;
5362 CallVector.push_back(CallClone);
5363 } else {
5364 // All but the original clone (skipped earlier) should have an entry
5365 // for all calls.
5366 assert(false && "Expected to find call in CallMap");
5367 }
5368 // Need to do the same for all matching calls.
5369 for (auto &MatchingCall : Node->MatchingCalls) {
5370 if (auto It = CallMap.find(MatchingCall); It != CallMap.end()) {
5371 CallInfo CallClone = It->second;
5372 CallVector.push_back(CallClone);
5373 } else {
5374 // All but the original clone (skipped earlier) should have an entry
5375 // for all calls.
5376 assert(false && "Expected to find call in CallMap");
5377 }
5378 }
5379 }
5380 }
5381 }
5382
5383 uint8_t BothTypes =
5384 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
5385
5386 auto UpdateCalls = [&](ContextNode *Node,
5387 DenseSet<const ContextNode *> &Visited,
5388 auto &&UpdateCalls) {
5389 auto Inserted = Visited.insert(Node);
5390 if (!Inserted.second)
5391 return;
5392
5393 for (auto *Clone : Node->Clones)
5394 UpdateCalls(Clone, Visited, UpdateCalls);
5395
5396 for (auto &Edge : Node->CallerEdges)
5397 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
5398
5399 // Skip if either no call to update, or if we ended up with no context ids
5400 // (we moved all edges onto other clones).
5401 if (!Node->hasCall() || Node->emptyContextIds())
5402 return;
5403
5404 if (Node->IsAllocation) {
5405 auto AT = allocTypeToUse(Node->AllocTypes);
5406 // If the allocation type is ambiguous, and more aggressive hinting
5407 // has been enabled via the MinClonedColdBytePercent flag, see if this
5408      // allocation should be hinted cold anyway because the fraction of
5409      // bytes it allocates cold is at least the given threshold.
5410 if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
5411 !ContextIdToContextSizeInfos.empty()) {
5412 uint64_t TotalCold = 0;
5413 uint64_t Total = 0;
5414 for (auto Id : Node->getContextIds()) {
5415 auto TypeI = ContextIdToAllocationType.find(Id);
5416 assert(TypeI != ContextIdToAllocationType.end());
5417 auto CSI = ContextIdToContextSizeInfos.find(Id);
5418 if (CSI != ContextIdToContextSizeInfos.end()) {
5419 for (auto &Info : CSI->second) {
5420 Total += Info.TotalSize;
5421 if (TypeI->second == AllocationType::Cold)
5422 TotalCold += Info.TotalSize;
5423 }
5424 }
5425 }
5426 if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
5427 AT = AllocationType::Cold;
5428 }
5429 updateAllocationCall(Node->Call, AT);
5430 assert(Node->MatchingCalls.empty());
5431 return;
5432 }
5433
5434 if (!CallsiteToCalleeFuncCloneMap.count(Node))
5435 return;
5436
5437 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
5438 updateCall(Node->Call, CalleeFunc);
5439 // Update all the matching calls as well.
5440 for (auto &Call : Node->MatchingCalls)
5441 updateCall(Call, CalleeFunc);
5442
5443 // Now update all calls recorded earlier that are still in function clones
5444 // which don't have a clone of this callsite node.
5445 if (!UnassignedCallClones.contains(Node))
5446 return;
5447 DenseSet<unsigned> NodeCallClones;
5448 for (auto *C : Node->Clones)
5449 NodeCallClones.insert(C->Call.cloneNo());
5450 // Note that we already confirmed Node is in this map a few lines above.
5451 auto &ClonedCalls = UnassignedCallClones[Node];
5452 for (auto &[CloneNo, CallVector] : ClonedCalls) {
5453      // Should start at 1 as we never create an entry for the original node.
5454 assert(CloneNo > 0);
5455 // If we subsequently created a clone, skip this one.
5456 if (NodeCallClones.contains(CloneNo))
5457 continue;
5458 // Use the original Node's CalleeFunc.
5459 for (auto &Call : CallVector)
5460 updateCall(Call, CalleeFunc);
5461 }
5462 };
5463
5464 // Performs DFS traversal starting from allocation nodes to update calls to
5465 // reflect cloning decisions recorded earlier. For regular LTO this will
5466 // update the actual calls in the IR to call the appropriate function clone
5467 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
5468 // are recorded in the summary entries.
5469 DenseSet<const ContextNode *> Visited;
5470 for (auto &Entry : AllocationCallToContextNodeMap)
5471 UpdateCalls(Entry.second, Visited, UpdateCalls);
5472
5473 return Changed;
5474}
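
The aggressive-hinting decision in the UpdateCalls lambda above reduces to the integer comparison TotalCold * 100 >= Total * MinClonedColdBytePercent over the context size records. A minimal standalone sketch of that decision, using plain integers and hypothetical names rather than the pass's context-id maps:

#include <cstdint>
#include <vector>

// Illustrative only: mirrors the cold-byte threshold check used when an
// allocation's type is still ambiguous after cloning.
struct ContextBytes {
  uint64_t TotalSize; // bytes attributed to this allocation context
  bool IsCold;        // profiled allocation type of the context
};

bool shouldHintCold(const std::vector<ContextBytes> &Contexts,
                    unsigned MinColdBytePercent) {
  uint64_t Total = 0, TotalCold = 0;
  for (const ContextBytes &C : Contexts) {
    Total += C.TotalSize;
    if (C.IsCold)
      TotalCold += C.TotalSize;
  }
  // Same integer comparison as above, avoiding floating point.
  return Total != 0 && TotalCold * 100 >= Total * MinColdBytePercent;
}

For example, with 70 cold bytes out of 100 total and an 80 percent threshold, 70 * 100 = 7000 is less than 100 * 80 = 8000, so the allocation is not forced cold.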
5475
5476// Compute a SHA1 hash of the callsite and alloc version information of clone I
5477// in the summary, to use in detection of duplicate clones.
5478static uint64_t ComputeHash(const FunctionSummary *FS, unsigned I) {
5479  SHA1 Hasher;
5480 // Update hash with any callsites that call non-default (non-zero) callee
5481 // versions.
5482 for (auto &SN : FS->callsites()) {
5483 // In theory all callsites and allocs in this function should have the same
5484 // number of clone entries, but handle any discrepancies gracefully below
5485 // for NDEBUG builds.
5486 assert(
5487 SN.Clones.size() > I &&
5488 "Callsite summary has fewer entries than other summaries in function");
5489 if (SN.Clones.size() <= I || !SN.Clones[I])
5490 continue;
5491 uint8_t Data[sizeof(SN.Clones[I])];
5492 support::endian::write32le(Data, SN.Clones[I]);
5493 Hasher.update(Data);
5494 }
5495 // Update hash with any allocs that have non-default (non-None) hints.
5496 for (auto &AN : FS->allocs()) {
5497 // In theory all callsites and allocs in this function should have the same
5498 // number of clone entries, but handle any discrepancies gracefully below
5499 // for NDEBUG builds.
5500 assert(AN.Versions.size() > I &&
5501 "Alloc summary has fewer entries than other summaries in function");
5502 if (AN.Versions.size() <= I ||
5503 (AllocationType)AN.Versions[I] == AllocationType::None)
5504 continue;
5505 Hasher.update(ArrayRef<uint8_t>(&AN.Versions[I], 1));
5506 }
5507 return support::endian::read64le(Hasher.result().data());
5508}
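
ComputeHash above folds the per-clone callsite and allocation version bytes through SHA1 and keeps only the first eight bytes of the digest as the key used for duplicate detection. A small sketch of the same pattern over a hypothetical list of callee clone numbers (llvm::SHA1 and the endian helpers are the same APIs used above):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/SHA1.h"
#include <cstdint>

// Hash a list of callee clone numbers (a stand-in for the summary's
// SN.Clones[I] values) down to a 64-bit key, as ComputeHash does.
uint64_t hashCloneAssignment(llvm::ArrayRef<uint32_t> CalleeCloneNos) {
  llvm::SHA1 Hasher;
  for (uint32_t CloneNo : CalleeCloneNos) {
    uint8_t Data[sizeof(CloneNo)];
    llvm::support::endian::write32le(Data, CloneNo);
    Hasher.update(Data);
  }
  // Truncate the 20-byte SHA1 digest to its leading 8 bytes.
  return llvm::support::endian::read64le(Hasher.result().data());
}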
5509
5510static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
5511    Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
5512    std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
5513        &FuncToAliasMap,
5514    FunctionSummary *FS) {
5515 auto TakeDeclNameAndReplace = [](GlobalValue *DeclGV, GlobalValue *NewGV) {
5516    // We might have created this when adjusting a callsite in another
5517 // function. It should be a declaration.
5518 assert(DeclGV->isDeclaration());
5519 NewGV->takeName(DeclGV);
5520 DeclGV->replaceAllUsesWith(NewGV);
5521 DeclGV->eraseFromParent();
5522 };
5523
5524 // Handle aliases to this function, and create analogous alias clones to the
5525 // provided clone of this function.
5526 auto CloneFuncAliases = [&](Function *NewF, unsigned I) {
5527 if (!FuncToAliasMap.count(&F))
5528 return;
5529 for (auto *A : FuncToAliasMap[&F]) {
5530 std::string AliasName = getMemProfFuncName(A->getName(), I);
5531 auto *PrevA = M.getNamedAlias(AliasName);
5532 auto *NewA = GlobalAlias::create(A->getValueType(),
5533 A->getType()->getPointerAddressSpace(),
5534 A->getLinkage(), AliasName, NewF);
5535 NewA->copyAttributesFrom(A);
5536 if (PrevA)
5537 TakeDeclNameAndReplace(PrevA, NewA);
5538 }
5539 };
5540
5541  // The first "clone" is the original copy; we should only call this if we
5542 // needed to create new clones.
5543 assert(NumClones > 1);
5544  SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
5545  VMaps.reserve(NumClones - 1);
5546 FunctionsClonedThinBackend++;
5547
5548 // Map of hash of callsite/alloc versions to the instantiated function clone
5549 // (possibly the original) implementing those calls. Used to avoid
5550 // instantiating duplicate function clones.
5551 // FIXME: Ideally the thin link would not generate such duplicate clones to
5552 // start with, but right now it happens due to phase ordering in the function
5553  // assignment and the new clones it may produce. We simply make each
5554 // duplicate an alias to the matching instantiated clone recorded in the map
5555 // (except for available_externally which are made declarations as they would
5556 // be aliases in the prevailing module, and available_externally aliases are
5557 // not well supported right now).
5558  DenseMap<uint64_t, Function *> HashToFunc;
5559
5560 // Save the hash of the original function version.
5561 HashToFunc[ComputeHash(FS, 0)] = &F;
5562
5563 for (unsigned I = 1; I < NumClones; I++) {
5564 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
5565 std::string Name = getMemProfFuncName(F.getName(), I);
5566 auto Hash = ComputeHash(FS, I);
5567 // If this clone would duplicate a previously seen clone, don't generate the
5568 // duplicate clone body, just make an alias to satisfy any (potentially
5569 // cross-module) references.
5570 if (HashToFunc.contains(Hash)) {
5571 FunctionCloneDuplicatesThinBackend++;
5572 auto *Func = HashToFunc[Hash];
5573 if (Func->hasAvailableExternallyLinkage()) {
5574 // Skip these as EliminateAvailableExternallyPass does not handle
5575 // available_externally aliases correctly and we end up with an
5576 // available_externally alias to a declaration. Just create a
5577 // declaration for now as we know we will have a definition in another
5578 // module.
5579 auto Decl = M.getOrInsertFunction(Name, Func->getFunctionType());
5580 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5581 << "created clone decl " << ore::NV("Decl", Decl.getCallee()));
5582 continue;
5583 }
5584 auto *PrevF = M.getFunction(Name);
5585 auto *Alias = GlobalAlias::create(Name, Func);
5586 if (PrevF)
5587 TakeDeclNameAndReplace(PrevF, Alias);
5588 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5589 << "created clone alias " << ore::NV("Alias", Alias));
5590
5591 // Now handle aliases to this function, and clone those as well.
5592 CloneFuncAliases(Func, I);
5593 continue;
5594 }
5595 auto *NewF = CloneFunction(&F, *VMaps.back());
5596 HashToFunc[Hash] = NewF;
5597 FunctionClonesThinBackend++;
5598 // Strip memprof and callsite metadata from clone as they are no longer
5599 // needed.
5600 for (auto &BB : *NewF) {
5601 for (auto &Inst : BB) {
5602 Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
5603 Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
5604 }
5605 }
5606 auto *PrevF = M.getFunction(Name);
5607 if (PrevF)
5608 TakeDeclNameAndReplace(PrevF, NewF);
5609 else
5610 NewF->setName(Name);
5611 updateSubprogramLinkageName(NewF, Name);
5612 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5613 << "created clone " << ore::NV("NewFunction", NewF));
5614
5615 // Now handle aliases to this function, and clone those as well.
5616 CloneFuncAliases(NewF, I);
5617 }
5618 return VMaps;
5619}
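
Every clone body, alias, and declaration created above is named through getMemProfFuncName. A hedged sketch of that naming convention, assuming the ".memprof." suffix this pass appends (memProfCloneName is a hypothetical stand-in, not the pass's helper):

#include "llvm/ADT/Twine.h"
#include <string>

// Clone 0 keeps the original symbol name; clone N > 0 gets ".memprof.N".
std::string memProfCloneName(const llvm::Twine &BaseName, unsigned CloneNo) {
  if (CloneNo == 0)
    return BaseName.str();
  return (BaseName + ".memprof." + llvm::Twine(CloneNo)).str();
}
// e.g. memProfCloneName("foo", 2) == "foo.memprof.2", which is the symbol a
// callsite assigned to clone 2 of foo is rewritten to call.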
5620
5621// Locate the summary for F. This is complicated by the fact that it might
5622// have been internalized or promoted.
5623static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
5624                                      const ModuleSummaryIndex *ImportSummary,
5625 const Function *CallingFunc = nullptr) {
5626 // FIXME: Ideally we would retain the original GUID in some fashion on the
5627 // function (e.g. as metadata), but for now do our best to locate the
5628 // summary without that information.
5629 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
5630 if (!TheFnVI)
5631    // See if the function was internalized, by checking the index directly with
5632 // original name (this avoids the name adjustment done by getGUID() for
5633 // internal symbols).
5634    TheFnVI = ImportSummary->getValueInfo(
5635        GlobalValue::getGUIDAssumingExternalLinkage(F.getName()));
5636 if (TheFnVI)
5637 return TheFnVI;
5638 // Now query with the original name before any promotion was performed.
5639  StringRef OrigName =
5640      ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
5641 // When this pass is enabled, we always add thinlto_src_file provenance
5642 // metadata to imported function definitions, which allows us to recreate the
5643 // original internal symbol's GUID.
5644 auto SrcFileMD = F.getMetadata("thinlto_src_file");
5645 // If this is a call to an imported/promoted local for which we didn't import
5646 // the definition, the metadata will not exist on the declaration. However,
5647 // since we are doing this early, before any inlining in the LTO backend, we
5648 // can simply look at the metadata on the calling function which must have
5649 // been from the same module if F was an internal symbol originally.
5650 if (!SrcFileMD && F.isDeclaration()) {
5651 // We would only call this for a declaration for a direct callsite, in which
5652 // case the caller would have provided the calling function pointer.
5653 assert(CallingFunc);
5654 SrcFileMD = CallingFunc->getMetadata("thinlto_src_file");
5655 // If this is a promoted local (OrigName != F.getName()), since this is a
5656 // declaration, it must be imported from a different module and therefore we
5657 // should always find the metadata on its calling function. Any call to a
5658 // promoted local that came from this module should still be a definition.
5659 assert(SrcFileMD || OrigName == F.getName());
5660 }
5661 StringRef SrcFile = M.getSourceFileName();
5662 if (SrcFileMD)
5663 SrcFile = dyn_cast<MDString>(SrcFileMD->getOperand(0))->getString();
5664 std::string OrigId = GlobalValue::getGlobalIdentifier(
5665 OrigName, GlobalValue::InternalLinkage, SrcFile);
5666  TheFnVI = ImportSummary->getValueInfo(
5667      GlobalValue::getGUIDAssumingExternalLinkage(OrigId));
5668 // Internal func in original module may have gotten a numbered suffix if we
5669 // imported an external function with the same name. This happens
5670 // automatically during IR linking for naming conflicts. It would have to
5671 // still be internal in that case (otherwise it would have been renamed on
5672 // promotion in which case we wouldn't have a naming conflict).
5673 if (!TheFnVI && OrigName == F.getName() && F.hasLocalLinkage() &&
5674 F.getName().contains('.')) {
5675 OrigName = F.getName().rsplit('.').first;
5676    OrigId = GlobalValue::getGlobalIdentifier(
5677        OrigName, GlobalValue::InternalLinkage, SrcFile);
5678    TheFnVI = ImportSummary->getValueInfo(
5679        GlobalValue::getGUIDAssumingExternalLinkage(OrigId));
5680  }
5681 // The only way we may not have a VI is if this is a declaration created for
5682 // an imported reference. For distributed ThinLTO we may not have a VI for
5683 // such declarations in the distributed summary.
5684 assert(TheFnVI || F.isDeclaration());
5685 return TheFnVI;
5686}
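
For a promoted local, the lookup above rebuilds the GUID the symbol had before promotion from its original (unpromoted) name and the source file recorded in thinlto_src_file metadata. A compact sketch of just that computation using the same GlobalValue helpers (the summary query itself is omitted):

#include "llvm/ADT/StringRef.h"
#include "llvm/IR/GlobalValue.h"
#include <string>

// Recreate the GUID an internal (local) symbol had before ThinLTO promotion,
// given its original name and the source file of its defining module.
llvm::GlobalValue::GUID originalLocalGUID(llvm::StringRef OrigName,
                                          llvm::StringRef SrcFile) {
  // Locals are keyed in the index by an identifier that embeds the defining
  // module's source file name alongside the symbol name.
  std::string OrigId = llvm::GlobalValue::getGlobalIdentifier(
      OrigName, llvm::GlobalValue::InternalLinkage, SrcFile);
  return llvm::GlobalValue::getGUIDAssumingExternalLinkage(OrigId);
}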
5687
5688bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo(
5689 Module &M) {
5690 ICallAnalysis = std::make_unique<ICallPromotionAnalysis>();
5691 Symtab = std::make_unique<InstrProfSymtab>();
5692  // Don't add canonical names, to avoid adding multiple functions to the
5693  // symtab when they share the same root name with "." suffixes stripped.
5694 // If we pick the wrong one then this could lead to incorrect ICP and calling
5695 // a memprof clone that we don't actually create (resulting in linker unsats).
5696 // What this means is that the GUID of the function (or its PGOFuncName
5697 // metadata) *must* match that in the VP metadata to allow promotion.
5698 // In practice this should not be a limitation, since local functions should
5699 // have PGOFuncName metadata and global function names shouldn't need any
5700 // special handling (they should not get the ".llvm.*" suffix that the
5701 // canonicalization handling is attempting to strip).
5702 if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) {
5703 std::string SymtabFailure = toString(std::move(E));
5704 M.getContext().emitError("Failed to create symtab: " + SymtabFailure);
5705 return false;
5706 }
5707 return true;
5708}
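
The symtab built here is what performICP later uses to map a value-profile target MD5 back to a Function in the module. A minimal sketch of that create-and-lookup pattern (error handling reduced to consumeError; resolveProfiledTarget is a hypothetical helper):

#include "llvm/IR/Module.h"
#include "llvm/ProfileData/InstrProf.h"
#include "llvm/Support/Error.h"
#include <cstdint>

// Build the symtab the same way as above and resolve one profiled target.
llvm::Function *resolveProfiledTarget(llvm::Module &M, uint64_t TargetMD5) {
  llvm::InstrProfSymtab Symtab;
  // InLTO mode relies on PGO name metadata; canonical-name entries are
  // skipped so functions differing only in a "." suffix cannot collide.
  if (llvm::Error E =
          Symtab.create(M, /*InLTO=*/true, /*AddCanonical=*/false)) {
    llvm::consumeError(std::move(E));
    return nullptr;
  }
  // Null if the profiled target is not present in this module.
  return Symtab.getFunction(TargetMD5);
}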
5709
5710#ifndef NDEBUG
5711// Sanity check that the MIB stack ids match between the summary and
5712// instruction metadata.
5713static void checkAllocContextIds(
5714    const AllocInfo &AllocNode, const MDNode *MemProfMD,
5715 const CallStack<MDNode, MDNode::op_iterator> &CallsiteContext,
5716 const ModuleSummaryIndex *ImportSummary) {
5717 auto MIBIter = AllocNode.MIBs.begin();
5718 for (auto &MDOp : MemProfMD->operands()) {
5719 assert(MIBIter != AllocNode.MIBs.end());
5720 auto StackIdIndexIter = MIBIter->StackIdIndices.begin();
5721 auto *MIBMD = cast<const MDNode>(MDOp);
5722 MDNode *StackMDNode = getMIBStackNode(MIBMD);
5723 assert(StackMDNode);
5724 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
5725 auto ContextIterBegin =
5726 StackContext.beginAfterSharedPrefix(CallsiteContext);
5727 // Skip the checking on the first iteration.
5728 uint64_t LastStackContextId =
5729 (ContextIterBegin != StackContext.end() && *ContextIterBegin == 0) ? 1
5730 : 0;
5731 for (auto ContextIter = ContextIterBegin; ContextIter != StackContext.end();
5732 ++ContextIter) {
5733 // If this is a direct recursion, simply skip the duplicate
5734 // entries, to be consistent with how the summary ids were
5735 // generated during ModuleSummaryAnalysis.
5736 if (LastStackContextId == *ContextIter)
5737 continue;
5738 LastStackContextId = *ContextIter;
5739 assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
5740 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
5741 *ContextIter);
5742 StackIdIndexIter++;
5743 }
5744 MIBIter++;
5745 }
5746}
5747#endif
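
The debug check above treats each operand of the !memprof metadata as one MIB (one profiled allocation context) whose first operand is the context's call stack. A hedged sketch that walks the same metadata to count cold contexts, assuming the MIB layout of a stack node followed by an allocation-type string:

#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"

// Count the contexts recorded as "cold" on an allocation call's !memprof
// metadata. Returns 0 if the call carries no memprof metadata.
unsigned countColdContexts(const llvm::CallBase &AllocCall) {
  unsigned NumCold = 0;
  llvm::MDNode *MemProfMD =
      AllocCall.getMetadata(llvm::LLVMContext::MD_memprof);
  if (!MemProfMD)
    return 0;
  for (const llvm::MDOperand &MDOp : MemProfMD->operands()) {
    auto *MIBMD = llvm::cast<const llvm::MDNode>(MDOp);
    // Assumed layout: operand 0 is the stack node, operand 1 the type string.
    if (auto *TypeStr = llvm::dyn_cast<llvm::MDString>(MIBMD->getOperand(1)))
      if (TypeStr->getString() == "cold")
        ++NumCold;
  }
  return NumCold;
}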
5748
5749bool MemProfContextDisambiguation::applyImport(Module &M) {
5750 assert(ImportSummary);
5751 bool Changed = false;
5752
5753 // We also need to clone any aliases that reference cloned functions, because
5754 // the modified callsites may invoke via the alias. Keep track of the aliases
5755 // for each function.
5756 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
5757 FuncToAliasMap;
5758 for (auto &A : M.aliases()) {
5759 auto *Aliasee = A.getAliaseeObject();
5760 if (auto *F = dyn_cast<Function>(Aliasee))
5761 FuncToAliasMap[F].insert(&A);
5762 }
5763
5764 if (!initializeIndirectCallPromotionInfo(M))
5765 return false;
5766
5767 for (auto &F : M) {
5768 if (F.isDeclaration() || isMemProfClone(F))
5769 continue;
5770
5771 OptimizationRemarkEmitter ORE(&F);
5772
5773    SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
5774    bool ClonesCreated = false;
5775 unsigned NumClonesCreated = 0;
5776 auto CloneFuncIfNeeded = [&](unsigned NumClones, FunctionSummary *FS) {
5777 // We should at least have version 0 which is the original copy.
5778 assert(NumClones > 0);
5779      // If only one copy is needed, use the original.
5780 if (NumClones == 1)
5781 return;
5782 // If we already performed cloning of this function, confirm that the
5783 // requested number of clones matches (the thin link should ensure the
5784 // number of clones for each constituent callsite is consistent within
5785 // each function), before returning.
5786 if (ClonesCreated) {
5787 assert(NumClonesCreated == NumClones);
5788 return;
5789 }
5790 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap, FS);
5791 // The first "clone" is the original copy, which doesn't have a VMap.
5792 assert(VMaps.size() == NumClones - 1);
5793 Changed = true;
5794 ClonesCreated = true;
5795 NumClonesCreated = NumClones;
5796 };
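
Everything below uses the same indexing convention: clone 0 is the original function, and VMaps[J - 1] maps original values to the values of clone J (an empty map marks a clone that was emitted as an alias or declaration). A small hypothetical helper, not part of the pass, expressing that lookup:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <memory>

// Return the copy of CB in clone J. Clone 0 is the original; clone J > 0 is
// found through its value map. Null means clone J has no cloned body.
llvm::CallBase *getCallInClone(
    llvm::CallBase *CB,
    llvm::ArrayRef<std::unique_ptr<llvm::ValueToValueMapTy>> VMaps,
    unsigned J) {
  if (J == 0)
    return CB;
  llvm::ValueToValueMapTy &VMap = *VMaps[J - 1];
  if (VMap.empty())
    return nullptr;
  return llvm::cast<llvm::CallBase>(VMap[CB]);
}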
5797
5798 auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
5799 Function *CalledFunction, FunctionSummary *FS) {
5800 // Perform cloning if not yet done.
5801 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size(), FS);
5802
5803 assert(!isMemProfClone(*CalledFunction));
5804
5805 // Because we update the cloned calls by calling setCalledOperand (see
5806 // comment below), out of an abundance of caution make sure the called
5807 // function was actually the called operand (or its aliasee). We also
5808 // strip pointer casts when looking for calls (to match behavior during
5809 // summary generation), however, with opaque pointers in theory this
5810 // should not be an issue. Note we still clone the current function
5811 // (containing this call) above, as that could be needed for its callers.
5812 auto *GA = dyn_cast_or_null<GlobalAlias>(CB->getCalledOperand());
5813 if (CalledFunction != CB->getCalledOperand() &&
5814 (!GA || CalledFunction != GA->getAliaseeObject())) {
5815 SkippedCallsCloning++;
5816 return;
5817 }
5818 // Update the calls per the summary info.
5819 // Save orig name since it gets updated in the first iteration
5820 // below.
5821 auto CalleeOrigName = CalledFunction->getName();
5822 for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
5823 // If the VMap is empty, this clone was a duplicate of another and was
5824 // created as an alias or a declaration.
5825 if (J > 0 && VMaps[J - 1]->empty())
5826 continue;
5827 // Do nothing if this version calls the original version of its
5828 // callee.
5829 if (!StackNode.Clones[J])
5830 continue;
5831 auto NewF = M.getOrInsertFunction(
5832 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
5833 CalledFunction->getFunctionType());
5834 CallBase *CBClone;
5835 // Copy 0 is the original function.
5836 if (!J)
5837 CBClone = CB;
5838 else
5839 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5840 // Set the called operand directly instead of calling setCalledFunction,
5841 // as the latter mutates the function type on the call. In rare cases
5842 // we may have a slightly different type on a callee function
5843 // declaration due to it being imported from a different module with
5844 // incomplete types. We really just want to change the name of the
5845 // function to the clone, and not make any type changes.
5846 CBClone->setCalledOperand(NewF.getCallee());
5847 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
5848 << ore::NV("Call", CBClone) << " in clone "
5849 << ore::NV("Caller", CBClone->getFunction())
5850 << " assigned to call function clone "
5851 << ore::NV("Callee", NewF.getCallee()));
5852 }
5853 };
5854
5855 // Locate the summary for F.
5856 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
5857 // If not found, this could be an imported local (see comment in
5858 // findValueInfoForFunc). Skip for now as it will be cloned in its original
5859 // module (where it would have been promoted to global scope so should
5860 // satisfy any reference in this module).
5861 if (!TheFnVI)
5862 continue;
5863
5864 auto *GVSummary =
5865 ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
5866 if (!GVSummary) {
5867      // Must have been imported, use the summary which matches the definition
5868 // (might be multiple if this was a linkonce_odr).
5869 auto SrcModuleMD = F.getMetadata("thinlto_src_module");
5870 assert(SrcModuleMD &&
5871 "enable-import-metadata is needed to emit thinlto_src_module");
5872 StringRef SrcModule =
5873 dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
5874 for (auto &GVS : TheFnVI.getSummaryList()) {
5875 if (GVS->modulePath() == SrcModule) {
5876 GVSummary = GVS.get();
5877 break;
5878 }
5879 }
5880 assert(GVSummary && GVSummary->modulePath() == SrcModule);
5881 }
5882
5883 // If this was an imported alias skip it as we won't have the function
5884 // summary, and it should be cloned in the original module.
5885 if (isa<AliasSummary>(GVSummary))
5886 continue;
5887
5888 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
5889
5890 if (FS->allocs().empty() && FS->callsites().empty())
5891 continue;
5892
5893 auto SI = FS->callsites().begin();
5894 auto AI = FS->allocs().begin();
5895
5896 // To handle callsite infos synthesized for tail calls which have missing
5897 // frames in the profiled context, map callee VI to the synthesized callsite
5898 // info.
5899 DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
5900 // Iterate the callsites for this function in reverse, since we place all
5901 // those synthesized for tail calls at the end.
5902 for (auto CallsiteIt = FS->callsites().rbegin();
5903 CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
5904 auto &Callsite = *CallsiteIt;
5905 // Stop as soon as we see a non-synthesized callsite info (see comment
5906 // above loop). All the entries added for discovered tail calls have empty
5907 // stack ids.
5908 if (!Callsite.StackIdIndices.empty())
5909 break;
5910 MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
5911 }
5912
5913 // Keeps track of needed ICP for the function.
5914 SmallVector<ICallAnalysisData> ICallAnalysisInfo;
5915
5916 // Assume for now that the instructions are in the exact same order
5917 // as when the summary was created, but confirm this is correct by
5918 // matching the stack ids.
5919 for (auto &BB : F) {
5920 for (auto &I : BB) {
5921 auto *CB = dyn_cast<CallBase>(&I);
5922 // Same handling as when creating module summary.
5923 if (!mayHaveMemprofSummary(CB))
5924 continue;
5925
5926 auto *CalledValue = CB->getCalledOperand();
5927 auto *CalledFunction = CB->getCalledFunction();
5928 if (CalledValue && !CalledFunction) {
5929 CalledValue = CalledValue->stripPointerCasts();
5930 // Stripping pointer casts can reveal a called function.
5931 CalledFunction = dyn_cast<Function>(CalledValue);
5932 }
5933 // Check if this is an alias to a function. If so, get the
5934 // called aliasee for the checks below.
5935 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
5936 assert(!CalledFunction &&
5937 "Expected null called function in callsite for alias");
5938 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
5939 }
5940
5941 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
5942 I.getMetadata(LLVMContext::MD_callsite));
5943 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
5944
5945 // Include allocs that were already assigned a memprof function
5946 // attribute in the statistics. Only do this for those that do not have
5947 // memprof metadata, since we add an "ambiguous" memprof attribute by
5948 // default.
5949 if (CB->getAttributes().hasFnAttr("memprof") && !MemProfMD) {
5950 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
5951 ? AllocTypeColdThinBackend++
5952 : AllocTypeNotColdThinBackend++;
5953 OrigAllocsThinBackend++;
5954 AllocVersionsThinBackend++;
5955 if (!MaxAllocVersionsThinBackend)
5956 MaxAllocVersionsThinBackend = 1;
5957 continue;
5958 }
5959
5960 if (MemProfMD) {
5961 // Consult the next alloc node.
5962 assert(AI != FS->allocs().end());
5963 auto &AllocNode = *(AI++);
5964
5965#ifndef NDEBUG
5966 checkAllocContextIds(AllocNode, MemProfMD, CallsiteContext,
5967 ImportSummary);
5968#endif
5969
5970 // Perform cloning if not yet done.
5971 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size(), FS);
5972
5973 OrigAllocsThinBackend++;
5974 AllocVersionsThinBackend += AllocNode.Versions.size();
5975 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
5976 MaxAllocVersionsThinBackend = AllocNode.Versions.size();
5977
5978          // If there is only one version, that means we didn't end up
5979          // considering this function for cloning, and in that case the alloc
5980          // will still be none type or should have gotten the default NotCold.
5981          // Skip it, but only after calling the clone helper, since that does
5982          // some sanity checks confirming we haven't yet decided that we need
5983          // cloning. We might have a single version that is cold due to the
5984          // MinClonedColdBytePercent heuristic; make sure we don't skip in
5985          // that case.
5986 if (AllocNode.Versions.size() == 1 &&
5987 (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
5988 assert((AllocationType)AllocNode.Versions[0] ==
5989 AllocationType::NotCold ||
5990 (AllocationType)AllocNode.Versions[0] ==
5991 AllocationType::None);
5992 UnclonableAllocsThinBackend++;
5993 continue;
5994 }
5995
5996 // All versions should have a singular allocation type.
5997 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
5998 return Type == ((uint8_t)AllocationType::NotCold |
5999 (uint8_t)AllocationType::Cold);
6000 }));
6001
6002 // Update the allocation types per the summary info.
6003 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
6004 // If the VMap is empty, this clone was a duplicate of another and
6005 // was created as an alias or a declaration.
6006 if (J > 0 && VMaps[J - 1]->empty())
6007 continue;
6008 // Ignore any that didn't get an assigned allocation type.
6009 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
6010 continue;
6011 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
6012 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
6013 : AllocTypeNotColdThinBackend++;
6014 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
6015 auto A = llvm::Attribute::get(F.getContext(), "memprof",
6016 AllocTypeString);
6017 CallBase *CBClone;
6018 // Copy 0 is the original function.
6019 if (!J)
6020 CBClone = CB;
6021 else
6022 // Since VMaps are only created for new clones, we index with
6023 // clone J-1 (J==0 is the original clone and does not have a VMaps
6024 // entry).
6025 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
6027 CBClone->addFnAttr(A);
6028 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
6029 << ore::NV("AllocationCall", CBClone) << " in clone "
6030 << ore::NV("Caller", CBClone->getFunction())
6031 << " marked with memprof allocation attribute "
6032 << ore::NV("Attribute", AllocTypeString));
6033 }
6034 } else if (!CallsiteContext.empty()) {
6035 if (!CalledFunction) {
6036#ifndef NDEBUG
6037 // We should have skipped inline assembly calls.
6038 auto *CI = dyn_cast<CallInst>(CB);
6039 assert(!CI || !CI->isInlineAsm());
6040#endif
6041 // We should have skipped direct calls via a Constant.
6042 assert(CalledValue && !isa<Constant>(CalledValue));
6043
6044 // This is an indirect call, see if we have profile information and
6045 // whether any clones were recorded for the profiled targets (that
6046 // we synthesized CallsiteInfo summary records for when building the
6047 // index).
6048 auto NumClones =
6049 recordICPInfo(CB, FS->callsites(), SI, ICallAnalysisInfo);
6050
6051 // Perform cloning if not yet done. This is done here in case
6052 // we don't need to do ICP, but might need to clone this
6053 // function as it is the target of other cloned calls.
6054 if (NumClones)
6055 CloneFuncIfNeeded(NumClones, FS);
6056 }
6057
6058 else {
6059 // Consult the next callsite node.
6060 assert(SI != FS->callsites().end());
6061 auto &StackNode = *(SI++);
6062
6063#ifndef NDEBUG
6064 // Sanity check that the stack ids match between the summary and
6065 // instruction metadata.
6066 auto StackIdIndexIter = StackNode.StackIdIndices.begin();
6067 for (auto StackId : CallsiteContext) {
6068 assert(StackIdIndexIter != StackNode.StackIdIndices.end());
6069 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
6070 StackId);
6071 StackIdIndexIter++;
6072 }
6073#endif
6074
6075 CloneCallsite(StackNode, CB, CalledFunction, FS);
6076 }
6077 } else if (CB->isTailCall() && CalledFunction) {
6078 // Locate the synthesized callsite info for the callee VI, if any was
6079 // created, and use that for cloning.
6080 ValueInfo CalleeVI =
6081 findValueInfoForFunc(*CalledFunction, M, ImportSummary, &F);
6082 if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
6083 auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
6084 assert(Callsite != MapTailCallCalleeVIToCallsite.end());
6085 CloneCallsite(Callsite->second, CB, CalledFunction, FS);
6086 }
6087 }
6088 }
6089 }
6090
6091 // Now do any promotion required for cloning.
6092 performICP(M, FS->callsites(), VMaps, ICallAnalysisInfo, ORE);
6093 }
6094
6095 // We skip some of the functions and instructions above, so remove all the
6096 // metadata in a single sweep here.
6097 for (auto &F : M) {
6098 // We can skip memprof clones because createFunctionClones already strips
6099 // the metadata from the newly created clones.
6100 if (F.isDeclaration() || isMemProfClone(F))
6101 continue;
6102 for (auto &BB : F) {
6103 for (auto &I : BB) {
6104 if (!isa<CallBase>(I))
6105 continue;
6106 I.setMetadata(LLVMContext::MD_memprof, nullptr);
6107 I.setMetadata(LLVMContext::MD_callsite, nullptr);
6108 }
6109 }
6110 }
6111
6112 return Changed;
6113}
6114
6115unsigned MemProfContextDisambiguation::recordICPInfo(
6116 CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
6117    ArrayRef<CallsiteInfo>::iterator &SI,
6118    SmallVector<ICallAnalysisData> &ICallAnalysisInfo) {
6119 // First see if we have profile information for this indirect call.
6120 uint32_t NumCandidates;
6121 uint64_t TotalCount;
6122 auto CandidateProfileData =
6123 ICallAnalysis->getPromotionCandidatesForInstruction(
6124 CB, TotalCount, NumCandidates, MaxSummaryIndirectEdges);
6125 if (CandidateProfileData.empty())
6126 return 0;
6127
6128 // Iterate through all of the candidate profiled targets along with the
6129 // CallsiteInfo summary records synthesized for them when building the index,
6130 // and see if any are cloned and/or refer to clones.
6131 bool ICPNeeded = false;
6132 unsigned NumClones = 0;
6133 size_t CallsiteInfoStartIndex = std::distance(AllCallsites.begin(), SI);
6134 for (const auto &Candidate : CandidateProfileData) {
6135#ifndef NDEBUG
6136 auto CalleeValueInfo =
6137#endif
6138 ImportSummary->getValueInfo(Candidate.Value);
6139 // We might not have a ValueInfo if this is a distributed
6140 // ThinLTO backend and decided not to import that function.
6141 assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
6142 assert(SI != AllCallsites.end());
6143 auto &StackNode = *(SI++);
6144 // See if any of the clones of the indirect callsite for this
6145 // profiled target should call a cloned version of the profiled
6146 // target. We only need to do the ICP here if so.
6147 ICPNeeded |= llvm::any_of(StackNode.Clones,
6148 [](unsigned CloneNo) { return CloneNo != 0; });
6149 // Every callsite in the same function should have been cloned the same
6150 // number of times.
6151 assert(!NumClones || NumClones == StackNode.Clones.size());
6152 NumClones = StackNode.Clones.size();
6153 }
6154 if (!ICPNeeded)
6155 return NumClones;
6156 // Save information for ICP, which is performed later to avoid messing up the
6157 // current function traversal.
6158 ICallAnalysisInfo.push_back({CB, CandidateProfileData.vec(), NumCandidates,
6159 TotalCount, CallsiteInfoStartIndex});
6160 return NumClones;
6161}
6162
6163void MemProfContextDisambiguation::performICP(
6164 Module &M, ArrayRef<CallsiteInfo> AllCallsites,
6165 ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
6166 ArrayRef<ICallAnalysisData> ICallAnalysisInfo,
6167 OptimizationRemarkEmitter &ORE) {
6168 // Now do any promotion required for cloning. Specifically, for each
6169 // recorded ICP candidate (which was only recorded because one clone of that
6170 // candidate should call a cloned target), we perform ICP (speculative
6171 // devirtualization) for each clone of the callsite, and update its callee
6172 // to the appropriate clone. Note that the ICP compares against the original
6173 // version of the target, which is what is in the vtable.
6174 for (auto &Info : ICallAnalysisInfo) {
6175 auto *CB = Info.CB;
6176 auto CallsiteIndex = Info.CallsiteInfoStartIndex;
6177 auto TotalCount = Info.TotalCount;
6178 unsigned NumPromoted = 0;
6179 unsigned NumClones = 0;
6180
6181 for (auto &Candidate : Info.CandidateProfileData) {
6182 auto &StackNode = AllCallsites[CallsiteIndex++];
6183
6184 // All calls in the same function must have the same number of clones.
6185 assert(!NumClones || NumClones == StackNode.Clones.size());
6186 NumClones = StackNode.Clones.size();
6187
6188 // See if the target is in the module. If it wasn't imported, it is
6189 // possible that this profile could have been collected on a different
6190 // target (or version of the code), and we need to be conservative
6191 // (similar to what is done in the ICP pass).
6192 Function *TargetFunction = Symtab->getFunction(Candidate.Value);
6193 if (TargetFunction == nullptr ||
6194 // Any ThinLTO global dead symbol removal should have already
6195 // occurred, so it should be safe to promote when the target is a
6196 // declaration.
6197 // TODO: Remove internal option once more fully tested.
6199 TargetFunction->isDeclaration())) {
6200 ORE.emit([&]() {
6201 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB)
6202 << "Memprof cannot promote indirect call: target with md5sum "
6203 << ore::NV("target md5sum", Candidate.Value) << " not found";
6204 });
6205 // FIXME: See if we can use the new declaration importing support to
6206 // at least get the declarations imported for this case. Hot indirect
6207 // targets should have been imported normally, however.
6208 continue;
6209 }
6210
6211 // Check if legal to promote
6212 const char *Reason = nullptr;
6213 if (!isLegalToPromote(*CB, TargetFunction, &Reason)) {
6214 ORE.emit([&]() {
6215 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
6216 << "Memprof cannot promote indirect call to "
6217 << ore::NV("TargetFunction", TargetFunction)
6218 << " with count of " << ore::NV("TotalCount", TotalCount)
6219 << ": " << Reason;
6220 });
6221 continue;
6222 }
6223
6224 assert(!isMemProfClone(*TargetFunction));
6225
6226 // Handle each call clone, applying ICP so that each clone directly
6227 // calls the specified callee clone, guarded by the appropriate ICP
6228 // check.
6229 CallBase *CBClone = CB;
6230 for (unsigned J = 0; J < NumClones; J++) {
6231 // If the VMap is empty, this clone was a duplicate of another and was
6232 // created as an alias or a declaration.
6233 if (J > 0 && VMaps[J - 1]->empty())
6234 continue;
6235 // Copy 0 is the original function.
6236 if (J > 0)
6237 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
6238 // We do the promotion using the original name, so that the comparison
6239 // is against the name in the vtable. Then just below, change the new
6240 // direct call to call the cloned function.
6241 auto &DirectCall =
6242 pgo::promoteIndirectCall(*CBClone, TargetFunction, Candidate.Count,
6243 TotalCount, isSamplePGO, &ORE);
6244 auto *TargetToUse = TargetFunction;
6245 // Call original if this version calls the original version of its
6246 // callee.
6247 if (StackNode.Clones[J]) {
6248 TargetToUse =
6249 cast<Function>(M.getOrInsertFunction(
6250 getMemProfFuncName(TargetFunction->getName(),
6251 StackNode.Clones[J]),
6252 TargetFunction->getFunctionType())
6253 .getCallee());
6254 }
6255 DirectCall.setCalledFunction(TargetToUse);
6256 // During matching we generate synthetic VP metadata for indirect calls
6257 // not already having any, from the memprof profile's callee GUIDs. If
6258 // we subsequently promote and inline those callees, we currently lose
6259 // the ability to generate this synthetic VP metadata. Optionally apply
6260 // a noinline attribute to promoted direct calls, where the threshold is
6261 // set to capture synthetic VP metadata targets which get a count of 1.
6263 Candidate.Count < MemProfICPNoInlineThreshold)
6264 DirectCall.setIsNoInline();
6265 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
6266 << ore::NV("Call", CBClone) << " in clone "
6267 << ore::NV("Caller", CBClone->getFunction())
6268 << " promoted and assigned to call function clone "
6269 << ore::NV("Callee", TargetToUse));
6270 }
6271
6272 // Update TotalCount (all clones should get same count above)
6273 TotalCount -= Candidate.Count;
6274 NumPromoted++;
6275 }
6276 // Adjust the MD.prof metadata for all clones, now that we have the new
6277 // TotalCount and the number promoted.
6278 CallBase *CBClone = CB;
6279 for (unsigned J = 0; J < NumClones; J++) {
6280 // If the VMap is empty, this clone was a duplicate of another and was
6281 // created as an alias or a declaration.
6282 if (J > 0 && VMaps[J - 1]->empty())
6283 continue;
6284 // Copy 0 is the original function.
6285 if (J > 0)
6286 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
6287 // First delete the old one.
6288 CBClone->setMetadata(LLVMContext::MD_prof, nullptr);
6289 // If all promoted, we don't need the MD.prof metadata.
6290      // Otherwise we need to update it with the remaining un-promoted records.
6291 if (TotalCount != 0)
6292        annotateValueSite(
6293            M, *CBClone, ArrayRef(Info.CandidateProfileData).slice(NumPromoted),
6294 TotalCount, IPVK_IndirectCallTarget, Info.NumCandidates);
6295 }
6296 }
6297}
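
The key subtlety above is that the ICP guard produced by pgo::promoteIndirectCall compares against the original target, since that is what the profile (and any vtable slot) refers to; only the resulting direct call is then pointed at the memprof clone. A hedged sketch of the retargeting step alone (promotion itself omitted; CloneName is assumed to already be the clone's mangled name, e.g. from getMemProfFuncName):

#include "llvm/ADT/StringRef.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Module.h"

// After ICP has produced DirectCall, guarded by a comparison against the
// original Target, retarget the direct call to the requested clone. Using
// getOrInsertFunction creates a declaration if the clone is defined in
// another module.
void redirectToMemProfClone(llvm::Module &M, llvm::CallBase &DirectCall,
                            llvm::Function &Target,
                            llvm::StringRef CloneName) {
  if (CloneName == Target.getName())
    return; // Clone 0: keep calling the original target.
  llvm::FunctionCallee Clone =
      M.getOrInsertFunction(CloneName, Target.getFunctionType());
  DirectCall.setCalledFunction(Clone);
}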
6298
6299template <typename DerivedCCG, typename FuncTy, typename CallTy>
6300bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
6301 if (DumpCCG) {
6302 dbgs() << "CCG before cloning:\n";
6303 dbgs() << *this;
6304 }
6305 if (ExportToDot)
6306 exportToDot("postbuild");
6307
6308 if (VerifyCCG) {
6309 check();
6310 }
6311
6312 identifyClones();
6313
6314 if (VerifyCCG) {
6315 check();
6316 }
6317
6318 if (DumpCCG) {
6319 dbgs() << "CCG after cloning:\n";
6320 dbgs() << *this;
6321 }
6322 if (ExportToDot)
6323 exportToDot("cloned");
6324
6325 bool Changed = assignFunctions();
6326
6327 if (DumpCCG) {
6328 dbgs() << "CCG after assigning function clones:\n";
6329 dbgs() << *this;
6330 }
6331 if (ExportToDot)
6332 exportToDot("clonefuncassign");
6333
6335 printTotalSizes(errs());
6336
6337 return Changed;
6338}
6339
6340bool MemProfContextDisambiguation::processModule(
6341 Module &M,
6342 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
6343
6344 // If we have an import summary, then the cloning decisions were made during
6345 // the thin link on the index. Apply them and return.
6346 if (ImportSummary)
6347 return applyImport(M);
6348
6349 // TODO: If/when other types of memprof cloning are enabled beyond just for
6350 // hot and cold, we will need to change this to individually control the
6351 // AllocationType passed to addStackNodesForMIB during CCG construction.
6352 // Note that we specifically check this after applying imports above, so that
6353 // the option isn't needed to be passed to distributed ThinLTO backend
6354 // clang processes, which won't necessarily have visibility into the linker
6355 // dependences. Instead the information is communicated from the LTO link to
6356 // the backends via the combined summary index.
6357 if (!SupportsHotColdNew)
6358 return false;
6359
6360 ModuleCallsiteContextGraph CCG(M, OREGetter);
6361 return CCG.process();
6362}
6363
6364MemProfContextDisambiguation::MemProfContextDisambiguation(
6365    const ModuleSummaryIndex *Summary, bool isSamplePGO)
6366 : ImportSummary(Summary), isSamplePGO(isSamplePGO) {
6367 // Check the dot graph printing options once here, to make sure we have valid
6368 // and expected combinations.
6369  if (DotGraphScope == DotScope::Alloc && !AllocIdForDot.getNumOccurrences())
6370    llvm::report_fatal_error(
6371        "-memprof-dot-scope=alloc requires -memprof-dot-alloc-id");
6372  if (DotGraphScope == DotScope::Context &&
6373      !ContextIdForDot.getNumOccurrences())
6374    llvm::report_fatal_error(
6375        "-memprof-dot-scope=context requires -memprof-dot-context-id");
6376  if (DotGraphScope == DotScope::All && AllocIdForDot.getNumOccurrences() &&
6377      ContextIdForDot.getNumOccurrences())
6378    llvm::report_fatal_error(
6379        "-memprof-dot-scope=all can't have both -memprof-dot-alloc-id and "
6380        "-memprof-dot-context-id");
6381 if (ImportSummary) {
6382 // The MemProfImportSummary should only be used for testing ThinLTO
6383 // distributed backend handling via opt, in which case we don't have a
6384 // summary from the pass pipeline.
6385    assert(MemProfImportSummary.empty());
6386    return;
6387 }
6388 if (MemProfImportSummary.empty())
6389 return;
6390
6391  auto ReadSummaryFile =
6392      errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
6393 if (!ReadSummaryFile) {
6394 logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
6395 "Error loading file '" + MemProfImportSummary +
6396 "': ");
6397 return;
6398 }
6399 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
6400 if (!ImportSummaryForTestingOrErr) {
6401 logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
6402 "Error parsing file '" + MemProfImportSummary +
6403 "': ");
6404 return;
6405 }
6406 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
6407 ImportSummary = ImportSummaryForTesting.get();
6408}
6409
6410PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
6411                                                    ModuleAnalysisManager &AM) {
6412  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
6413 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
6414 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
6415 };
6416 if (!processModule(M, OREGetter))
6417 return PreservedAnalyses::all();
6418 return PreservedAnalyses::none();
6419}
6420
6421void MemProfContextDisambiguation::run(
6422    ModuleSummaryIndex &Index,
6423    llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
6424        isPrevailing) {
6425 // TODO: If/when other types of memprof cloning are enabled beyond just for
6426 // hot and cold, we will need to change this to individually control the
6427 // AllocationType passed to addStackNodesForMIB during CCG construction.
6428 // The index was set from the option, so these should be in sync.
6429 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
6430 if (!SupportsHotColdNew)
6431 return;
6432
6433 IndexCallsiteContextGraph CCG(Index, isPrevailing);
6434 CCG.process();
6435}
6436
6437// Strips MemProf attributes and metadata. Can be invoked by the pass pipeline
6438// when we don't have an index that has recorded that we are linking with
6439// allocation libraries containing the necessary APIs for downstream
6440// transformations.
6442 // The profile matcher applies hotness attributes directly for allocations,
6443 // and those will cause us to generate calls to the hot/cold interfaces
6444 // unconditionally. If supports-hot-cold-new was not enabled in the LTO
6445 // link then assume we don't want these calls (e.g. not linking with
6446 // the appropriate library, or otherwise trying to disable this behavior).
6447 bool Changed = false;
6448 for (auto &F : M) {
6449 for (auto &BB : F) {
6450 for (auto &I : BB) {
6451 auto *CI = dyn_cast<CallBase>(&I);
6452 if (!CI)
6453 continue;
6454 if (CI->hasFnAttr("memprof")) {
6455 CI->removeFnAttr("memprof");
6456 Changed = true;
6457 }
6458 if (!CI->hasMetadata(LLVMContext::MD_callsite)) {
6459 assert(!CI->hasMetadata(LLVMContext::MD_memprof));
6460 continue;
6461 }
6462 // Strip off all memprof metadata as it is no longer needed.
6463 // Importantly, this avoids the addition of new memprof attributes
6464 // after inlining propagation.
6465 CI->setMetadata(LLVMContext::MD_memprof, nullptr);
6466 CI->setMetadata(LLVMContext::MD_callsite, nullptr);
6467 Changed = true;
6468 }
6469 }
6470 }
6471 if (!Changed)
6472 return PreservedAnalyses::all();
6473 return PreservedAnalyses::none();
6474}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
aarch64 promote const
AMDGPU Prepare AGPR Alloc
Unify divergent function exit nodes
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
#define DEBUG_TYPE
Module.h This file contains the declarations for the Module class.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
Machine Check Debug Module
This file implements a map that provides insertion order iteration.
static cl::opt< unsigned > TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5), cl::Hidden, cl::desc("Max depth to recursively search for missing " "frames through tail calls."))
uint64_t ComputeHash(const FunctionSummary *FS, unsigned I)
static cl::opt< DotScope > DotGraphScope("memprof-dot-scope", cl::desc("Scope of graph to export to dot"), cl::Hidden, cl::init(DotScope::All), cl::values(clEnumValN(DotScope::All, "all", "Export full callsite graph"), clEnumValN(DotScope::Alloc, "alloc", "Export only nodes with contexts feeding given " "-memprof-dot-alloc-id"), clEnumValN(DotScope::Context, "context", "Export only nodes with given -memprof-dot-context-id")))
static cl::opt< bool > DoMergeIteration("memprof-merge-iteration", cl::init(true), cl::Hidden, cl::desc("Iteratively apply merging on a node to catch new callers"))
static bool isMemProfClone(const Function &F)
static cl::opt< unsigned > AllocIdForDot("memprof-dot-alloc-id", cl::init(0), cl::Hidden, cl::desc("Id of alloc to export if -memprof-dot-scope=alloc " "or to highlight if -memprof-dot-scope=all"))
static cl::opt< unsigned > ContextIdForDot("memprof-dot-context-id", cl::init(0), cl::Hidden, cl::desc("Id of context to export if -memprof-dot-scope=context or to " "highlight otherwise"))
static cl::opt< bool > ExportToDot("memprof-export-to-dot", cl::init(false), cl::Hidden, cl::desc("Export graph to dot files."))
static void checkEdge(const std::shared_ptr< ContextEdge< DerivedCCG, FuncTy, CallTy > > &Edge)
static cl::opt< bool > AllowRecursiveCallsites("memprof-allow-recursive-callsites", cl::init(true), cl::Hidden, cl::desc("Allow cloning of callsites involved in recursive cycles"))
bool checkColdOrNotCold(uint8_t AllocType)
static ValueInfo findValueInfoForFunc(const Function &F, const Module &M, const ModuleSummaryIndex *ImportSummary, const Function *CallingFunc=nullptr)
static cl::opt< bool > CloneRecursiveContexts("memprof-clone-recursive-contexts", cl::init(true), cl::Hidden, cl::desc("Allow cloning of contexts through recursive cycles"))
static std::string getAllocTypeString(uint8_t AllocTypes)
static cl::opt< unsigned > MemProfICPNoInlineThreshold("memprof-icp-noinline-threshold", cl::init(2), cl::Hidden, cl::desc("Minimum absolute count for promoted target to be inlinable"))
bool DOTGraphTraits< constCallsiteContextGraph< DerivedCCG, FuncTy, CallTy > * >::DoHighlight
static unsigned getMemProfCloneNum(const Function &F)
static SmallVector< std::unique_ptr< ValueToValueMapTy >, 4 > createFunctionClones(Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE, std::map< const Function *, SmallPtrSet< const GlobalAlias *, 1 > > &FuncToAliasMap, FunctionSummary *FS)
static cl::opt< bool > VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden, cl::desc("Perform verification checks on CallingContextGraph."))
static void checkNode(const ContextNode< DerivedCCG, FuncTy, CallTy > *Node, bool CheckEdges=true)
static cl::opt< bool > MergeClones("memprof-merge-clones", cl::init(true), cl::Hidden, cl::desc("Merge clones before assigning functions"))
static std::string getMemProfFuncName(Twine Base, unsigned CloneNo)
static cl::opt< std::string > MemProfImportSummary("memprof-import-summary", cl::desc("Import summary to use for testing the ThinLTO backend via opt"), cl::Hidden)
static const std::string MemProfCloneSuffix
static void updateSubprogramLinkageName(Function *NewFunc, StringRef Name)
static cl::opt< bool > AllowRecursiveContexts("memprof-allow-recursive-contexts", cl::init(true), cl::Hidden, cl::desc("Allow cloning of contexts having recursive cycles"))
static cl::opt< std::string > DotFilePathPrefix("memprof-dot-file-path-prefix", cl::init(""), cl::Hidden, cl::value_desc("filename"), cl::desc("Specify the path prefix of the MemProf dot files."))
static cl::opt< bool > VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden, cl::desc("Perform frequent verification checks on nodes."))
static void checkAllocContextIds(const AllocInfo &AllocNode, const MDNode *MemProfMD, const CallStack< MDNode, MDNode::op_iterator > &CallsiteContext, const ModuleSummaryIndex *ImportSummary)
static cl::opt< bool > DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden, cl::desc("Dump CallingContextGraph to stdout after each stage."))
AllocType
This is the interface to build a ModuleSummaryIndex for a module.
ModuleSummaryIndex.h This file contains the declarations the classes that hold the module index and s...
#define P(N)
if(auto Err=PB.parsePassPipeline(MPM, Passes)) return wrap(std MPM run * Mod
FunctionAnalysisManager FAM
if(PassOpts->AAPipeline)
std::pair< BasicBlock *, BasicBlock * > Edge
This file defines generic set operations that may be used on set's of different types,...
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:114
void print(OutputBuffer &OB) const
ValueInfo getAliaseeVI() const
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
iterator end() const
Definition ArrayRef.h:131
const_pointer iterator
Definition ArrayRef.h:47
iterator begin() const
Definition ArrayRef.h:130
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
static LLVM_ABI Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
void addFnAttr(Attribute::AttrKind Kind)
Adds the attribute to the function.
void setCalledOperand(Value *V)
Subprogram description. Uses SubclassData1.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition DenseMap.h:174
iterator end()
Definition DenseMap.h:81
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
void reserve(size_type NumEntries)
Grow the densemap so that it can contain at least NumEntries items before resizing again.
Definition DenseMap.h:114
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
Function summary information to aid decisions and implementation of importing.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
DISubprogram * getSubprogram() const
Get the attached subprogram.
const Function & getFunction() const
Definition Function.h:164
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
static LLVM_ABI GlobalAlias * create(Type *Ty, unsigned AddressSpace, LinkageTypes Linkage, const Twine &Name, Constant *Aliasee, Module *Parent)
If a parent module is specified, the alias is automatically inserted into the end of the specified mo...
Definition Globals.cpp:598
Function and variable summary information to aid decisions and implementation of importing.
static LLVM_ABI GUID getGUIDAssumingExternalLinkage(StringRef GlobalName)
Return a 64-bit global unique ID constructed from the name of a global symbol.
Definition Globals.cpp:77
static bool isLocalLinkage(LinkageTypes Linkage)
LLVM_ABI bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition Globals.cpp:328
uint64_t GUID
Declare a type to represent a global unique identifier for a global value.
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI void eraseFromParent()
This method unlinks 'this' from the containing module and deletes it.
Definition Globals.cpp:93
static LLVM_ABI std::string getGlobalIdentifier(StringRef Name, GlobalValue::LinkageTypes Linkage, StringRef FileName)
Return the modified name for a global value suitable to be used as the key for a global lookup (e....
Definition Globals.cpp:161
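A hedged sketch combining getGlobalIdentifier and getGUIDAssumingExternalLinkage to compute the summary GUID of an internal-linkage symbol; the helper name and parameters are illustrative.
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/GlobalValue.h"
#include <string>
using namespace llvm;
// Locals are first rewritten to a source-file-qualified identifier so their
// GUID is unique across modules, then hashed like an external symbol.
GlobalValue::GUID guidForLocal(StringRef Name, StringRef SourceFileName) {
  std::string Id = GlobalValue::getGlobalIdentifier(
      Name, GlobalValue::InternalLinkage, SourceFileName);
  return GlobalValue::getGUIDAssumingExternalLinkage(Id);
}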
bool isWeakForLinker() const
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Metadata node.
Definition Metadata.h:1078
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1442
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1440
unsigned getNumOperands() const
Return number of MDNode operands.
Definition Metadata.h:1448
LLVM_ABI TempMDNode clone() const
Create a (temporary) clone of this.
Definition Metadata.cpp:675
static std::enable_if_t< std::is_base_of< MDNode, T >::value, T * > replaceWithUniqued(std::unique_ptr< T, TempMDNodeDeleter > N)
Replace a temporary node with a uniqued one.
Definition Metadata.h:1317
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:614
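A hedged sketch of the clone/re-unique pattern these entries support: copy a node into a mutable temporary, edit it, unique the result, and reattach it. The specific edit (swapping operand 0 for an MDString) exists purely to exercise the API and is not the transformation performed by this file.
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Metadata.h"
using namespace llvm;
void rewriteFirstOperand(Instruction &I, unsigned KindID) {
  MDNode *Orig = I.getMetadata(KindID);
  if (!Orig || Orig->getNumOperands() == 0)
    return;
  TempMDNode Tmp = Orig->clone();   // temporary, mutable copy
  Tmp->replaceOperandWith(0, MDString::get(Orig->getContext(), "example"));
  MDNode *Uniqued = MDNode::replaceWithUniqued(std::move(Tmp));
  I.setMetadata(KindID, Uniqued);   // reattach the rewritten node
}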
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
size_type count(const KeyT &Key) const
Definition MapVector.h:150
MemProfContextDisambiguation(const ModuleSummaryIndex *Summary=nullptr, bool isSamplePGO=false)
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
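A hedged sketch of invoking the pass directly through the new pass manager; the analysis-manager registration is standard boilerplate, and in practice the pass is scheduled by the (Thin)LTO pipelines rather than by hand.
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
using namespace llvm;
void runMemProfDisambiguation(Module &M) {
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
  ModulePassManager MPM;
  // Regular-LTO style invocation: no ThinLTO summary, not sample PGO.
  MPM.addPass(MemProfContextDisambiguation(/*Summary=*/nullptr,
                                           /*isSamplePGO=*/false));
  PreservedAnalyses PA = MPM.run(M, MAM);
  (void)PA;
}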
static ErrorOr< std::unique_ptr< MemoryBuffer > > getFile(const Twine &Filename, bool IsText=false, bool RequiresNullTerminator=true, bool IsVolatile=false, std::optional< Align > Alignment=std::nullopt)
Open the specified file as a MemoryBuffer, returning a new MemoryBuffer if successful,...
Class to hold module path string table and global value map, and encapsulate methods for operating on...
static StringRef getOriginalNameBeforePromote(StringRef Name)
Helper to obtain the unpromoted name for a global value (or the original name if not promoted).
ValueInfo getValueInfo(const GlobalValueSummaryMapTy::value_type &R) const
Return a ValueInfo for the index value_type (convenient when iterating index).
uint64_t getStackIdAtIndex(unsigned Index) const
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
LLVMContext & getContext() const
Get the global data context.
Definition Module.h:285
A NodeSet contains a set of SUnit DAG nodes with additional information that assigns a priority to th...
unsigned size() const
bool insert(SUnit *SU)
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition Analysis.h:115
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
A class that wrap the SHA1 algorithm.
Definition SHA1.h:27
LLVM_ABI void update(ArrayRef< uint8_t > Data)
Digest more data.
Definition SHA1.cpp:208
LLVM_ABI std::array< uint8_t, 20 > result()
Return the current raw 160-bits SHA1 for the digested data since the last call to init().
Definition SHA1.cpp:288
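A short sketch of the SHA1 interface above: digest a byte buffer and render the 160-bit result as hex (toHex is the StringExtras helper).
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/SHA1.h"
#include <array>
#include <cstdint>
#include <string>
using namespace llvm;
std::string sha1Hex(ArrayRef<uint8_t> Data) {
  SHA1 Hasher;
  Hasher.update(Data);   // may be called repeatedly to digest more data
  std::array<uint8_t, 20> Digest = Hasher.result();
  return toHex(Digest);  // uppercase hex string of the 20-byte digest
}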
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
void reserve(size_t Size)
Grow the DenseSet so that it can contain at least NumEntries items before resizing again.
Definition DenseSet.h:96
void insert_range(Range &&R)
Definition DenseSet.h:228
size_type size() const
Definition DenseSet.h:87
void swap(DenseSetImpl &RHS)
Definition DenseSet.h:102
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
bool erase(const ValueT &V)
Definition DenseSet.h:100
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
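A compact sketch of the DenseSet operations above; the element type and contents are illustrative.
#include "llvm/ADT/DenseSet.h"
#include <cstdint>
using namespace llvm;
void denseSetDemo() {
  DenseSet<uint32_t> ContextIds;
  ContextIds.reserve(8);
  ContextIds.insert(1);               // {iterator, true} on first insertion
  ContextIds.insert(2);
  bool Removed = ContextIds.erase(2); // true if an element was removed
  if (ContextIds.contains(1) && ContextIds.count(3) == 0)
    ContextIds.insert(3);
  (void)Removed;
}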
An efficient, type-erasing, non-owning reference to a callable.
Helper class to iterate through stack ids in both metadata (memprof MIB and callsite) and the corresp...
CallStackIterator beginAfterSharedPrefix(const CallStack &Other)
CallStackIterator end() const
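A hedged sketch of how the CallStack helper above is typically instantiated over memprof metadata (templated on MDNode and its operand iterator); the function and parameter names are illustrative.
#include "llvm/Analysis/MemoryProfileInfo.h"
#include "llvm/IR/Metadata.h"
#include <cstdint>
using namespace llvm;
using namespace llvm::memprof;
void walkStackIds(MDNode *MIBStackNode, MDNode *CallsiteNode) {
  CallStack<MDNode, MDNode::op_iterator> StackContext(MIBStackNode);
  CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteNode);
  // Skip the frames shared with the enclosing callsite context, then visit
  // the remaining stack id hashes.
  for (auto It = StackContext.beginAfterSharedPrefix(CallsiteContext);
       It != StackContext.end(); ++It) {
    uint64_t StackId = *It;
    (void)StackId;
  }
}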
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
CallInst * Call
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Entry
Definition COFF.h:862
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ CE
Windows NT (Windows on ARM)
Definition MCAsmInfo.h:48
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > dyn_extract(Y &&MD)
Extract a Value from Metadata, if any.
Definition Metadata.h:695
LLVM_ABI AllocationType getMIBAllocType(const MDNode *MIB)
Returns the allocation type from an MIB metadata node.
LLVM_ABI bool metadataMayIncludeContextSizeInfo()
Whether the alloc memprof metadata may include context size info for some MIBs (but possibly not all)...
LLVM_ABI bool hasSingleAllocType(uint8_t AllocTypes)
True if the AllocTypes bitmask contains just a single type.
LLVM_ABI std::string getAllocTypeAttributeString(AllocationType Type)
Returns the string to use in attributes with the given type.
LLVM_ABI MDNode * getMIBStackNode(const MDNode *MIB)
Returns the stack node from an MIB metadata node.
LLVM_ABI void removeAnyExistingAmbiguousAttribute(CallBase *CB)
Removes any existing "ambiguous" memprof attribute.
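A hedged sketch tying the memprof helpers above together: walk the MIB nodes attached to an allocation call and count the cold ones. The helper name countColdMIBs is illustrative; MD_memprof is the fixed metadata kind for !memprof.
#include "llvm/Analysis/MemoryProfileInfo.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/Casting.h"
using namespace llvm;
using namespace llvm::memprof;
unsigned countColdMIBs(CallBase &AllocCall) {
  MDNode *MemProfMD = AllocCall.getMetadata(LLVMContext::MD_memprof);
  if (!MemProfMD)
    return 0;
  unsigned NumCold = 0;
  for (const MDOperand &Op : MemProfMD->operands()) {
    MDNode *MIB = cast<MDNode>(Op.get());
    MDNode *StackNode = getMIBStackNode(MIB); // the MIB's stack id list
    (void)StackNode;
    if (getMIBAllocType(MIB) == AllocationType::Cold)
      ++NumCold;
  }
  return NumCold;
}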
DiagnosticInfoOptimizationBase::Argument NV
LLVM_ABI CallBase & promoteIndirectCall(CallBase &CB, Function *F, uint64_t Count, uint64_t TotalCount, bool AttachProfToDirectCall, OptimizationRemarkEmitter *ORE)
uint32_t NodeId
Definition RDFGraph.h:262
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
bool empty() const
Definition BasicBlock.h:101
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
uint64_t read64le(const void *P)
Definition Endian.h:435
void write32le(void *P, uint32_t V)
Definition Endian.h:475
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
cl::opt< unsigned > MinClonedColdBytePercent("memprof-cloning-cold-threshold", cl::init(100), cl::Hidden, cl::desc("Min percent of cold bytes to hint alloc cold during cloning"))
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI void logAllUnhandledErrors(Error E, raw_ostream &OS, Twine ErrorBanner={})
Log all errors (if any) in E to OS.
Definition Error.cpp:61
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void stable_sort(R &&Range)
Definition STLExtras.h:2106
cl::opt< bool > MemProfReportHintedSizes("memprof-report-hinted-sizes", cl::init(false), cl::Hidden, cl::desc("Report total allocation sizes of hinted allocations"))
LLVM_ABI bool isLegalToPromote(const CallBase &CB, Function *Callee, const char **FailureReason=nullptr)
Return true if the given indirect call site can be made to call Callee.
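A minimal sketch of the legality-check-then-promote flow. The file references pgo::promoteIndirectCall (listed above), which per its signature can also attach profile metadata to the direct call; the sketch below uses the lower-level promoteCall from CallPromotionUtils.h and an illustrative helper name.
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/Transforms/Utils/CallPromotionUtils.h"
using namespace llvm;
bool tryDirectCallPromotion(CallBase &CB, Function *Target) {
  const char *Reason = nullptr;
  if (!isLegalToPromote(CB, Target, &Reason))
    return false;            // e.g. signature mismatch; Reason explains why
  promoteCall(CB, Target);   // rewrite the indirect call to call Target
  return true;
}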
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2544
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI bool mayHaveMemprofSummary(const CallBase *CB)
Returns true if the instruction could have memprof metadata, used to ensure consistency between summa...
constexpr from_range_t from_range
static cl::opt< bool > MemProfRequireDefinitionForPromotion("memprof-require-definition-for-promotion", cl::init(false), cl::Hidden, cl::desc("Require target function definition when promoting indirect calls"))
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
cl::opt< unsigned > MemProfTopNImportant("memprof-top-n-important", cl::init(10), cl::Hidden, cl::desc("Number of largest cold contexts to consider important"))
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2198
void set_subtract(S1Ty &S1, const S2Ty &S2)
set_subtract(A, B) - Compute A := A - B
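A small sketch of the SetOperations.h helpers referenced here, applied to DenseSet<uint32_t> values; the contents are arbitrary.
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SetOperations.h"
#include <cstdint>
using namespace llvm;
void setOpsDemo() {
  DenseSet<uint32_t> A = {1, 2, 3, 4};
  DenseSet<uint32_t> B = {3, 4, 5};
  set_intersect(A, B);             // A := A ^ B, i.e. {3, 4}
  bool Sub = set_is_subset(A, B);  // true: every element of A is in B
  set_subtract(B, A);              // B := B - A, i.e. {5}
  (void)Sub;
}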
InnerAnalysisManagerProxy< FunctionAnalysisManager, Module > FunctionAnalysisManagerModuleProxy
Provide the FunctionAnalysisManager to Module proxy.
raw_ostream & WriteGraph(raw_ostream &O, const GraphType &G, bool ShortNames=false, const Twine &Title="")
bool set_intersects(const S1Ty &S1, const S2Ty &S2)
set_intersects(A, B) - Return true iff A ^ B is non-empty
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1150
LLVM_ABI Expected< std::unique_ptr< ModuleSummaryIndex > > getModuleSummaryIndex(MemoryBufferRef Buffer)
Parse the specified bitcode buffer, returning the module summary index.
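A hedged sketch: open a bitcode file with MemoryBuffer::getFile (listed earlier) and parse its summary with getModuleSummaryIndex, reporting failures via logAllUnhandledErrors. The function name and error banner are illustrative.
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>
using namespace llvm;
std::unique_ptr<ModuleSummaryIndex> loadSummary(const char *Path) {
  auto BufOr = MemoryBuffer::getFile(Path);
  if (!BufOr) {
    errs() << "cannot open " << Path << ": " << BufOr.getError().message()
           << "\n";
    return nullptr;
  }
  Expected<std::unique_ptr<ModuleSummaryIndex>> IndexOr =
      getModuleSummaryIndex((*BufOr)->getMemBufferRef());
  if (!IndexOr) {
    logAllUnhandledErrors(IndexOr.takeError(), errs(), "summary parse: ");
    return nullptr;
  }
  return std::move(*IndexOr);
}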
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
LLVM_ABI void annotateValueSite(Module &M, Instruction &Inst, const InstrProfRecord &InstrProfR, InstrProfValueKind ValueKind, uint32_t SiteIndx, uint32_t MaxMDCount=3)
Get the value profile data for value site SiteIdx from InstrProfR and annotate the instruction Inst w...
cl::opt< unsigned > MaxSummaryIndirectEdges("module-summary-max-indirect-edges", cl::init(0), cl::Hidden, cl::desc("Max number of summary edges added from " "indirect call profile metadata"))
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
bool set_union(S1Ty &S1, const S2Ty &S2)
set_union(A, B) - Compute A := A u B, return whether A changed.
cl::opt< bool > SupportsHotColdNew
Indicate we are linking with an allocator that supports hot/cold operator new interfaces.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
S1Ty set_intersection(const S1Ty &S1, const S2Ty &S2)
set_intersection(A, B) - Return A ^ B
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
cl::opt< bool > EnableMemProfContextDisambiguation
Enable MemProf context disambiguation for thin link.
S1Ty set_difference(const S1Ty &S1, const S2Ty &S2)
set_difference(A, B) - Return A - B
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
Expected< T > errorOrToExpected(ErrorOr< T > &&EO)
Convert an ErrorOr<T> to an Expected<T>.
Definition Error.h:1245
ArrayRef(const T &OneElt) -> ArrayRef< T >
std::string toString(const APInt &I, unsigned Radix, bool Signed, bool formatAsCLiteral=false, bool UpperCase=true, bool InsertSeparators=false)
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
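A brief sketch of the casting helpers above in the style of a typical IR walk; the helper name is illustrative.
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
using namespace llvm;
bool isDirectCallTo(const Value *V, const Function *F) {
  // dyn_cast yields null when V is not a CallBase; cast would assert instead.
  if (const auto *CB = dyn_cast<CallBase>(V))
    return CB->getCalledFunction() == F;
  return false;
}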
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1770
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1945
LLVM_ABI Function * CloneFunction(Function *F, ValueToValueMapTy &VMap, ClonedCodeInfo *CodeInfo=nullptr)
Return a copy of the specified function and add it to that function's module.
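A hedged sketch of materializing a function clone with CloneFunction; the ".memprof." suffix and clone-number scheme are illustrative of how a clone might be renamed, not a statement of this file's exact convention.
#include "llvm/ADT/Twine.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
using namespace llvm;
Function *makeClone(Function &F, unsigned CloneNo) {
  ValueToValueMapTy VMap;                    // maps old values to their copies
  Function *NewF = CloneFunction(&F, VMap);  // inserted into F's module
  NewF->setName(F.getName() + ".memprof." + Twine(CloneNo));
  return NewF;
}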
AnalysisManager< Module > ModuleAnalysisManager
Convenience typedef for the Module analysis manager.
Definition MIRParser.h:39
cl::opt< bool > MemProfFixupImportant("memprof-fixup-important", cl::init(true), cl::Hidden, cl::desc("Enables edge fixup for important contexts"))
#define N
static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter, GraphType G)
static const ContextNode< DerivedCCG, FuncTy, CallTy > * GetCallee(const EdgePtrTy &P)
std::unique_ptr< ContextNode< DerivedCCG, FuncTy, CallTy > > NodePtrTy
mapped_iterator< typename std::vector< std::shared_ptr< ContextEdge< DerivedCCG, FuncTy, CallTy > > >::const_iterator, decltype(&GetCallee)> ChildIteratorType
mapped_iterator< typename std::vector< NodePtrTy >::const_iterator, decltype(&getNode)> nodes_iterator
std::shared_ptr< ContextEdge< DerivedCCG, FuncTy, CallTy > > EdgePtrTy
Summary of memprof metadata on allocations.
std::vector< MIBInfo > MIBs
SmallVector< unsigned > StackIdIndices
SmallVector< unsigned > Clones
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits(bool simple=false)
An information struct used to provide DenseMap with the various necessary components for a given valu...
typename GraphType::UnknownGraphTypeError NodeRef
Definition GraphTraits.h:95
Struct that holds a reference to a particular GUID in a global value summary.
ArrayRef< std::unique_ptr< GlobalValueSummary > > getSummaryList() const
GlobalValue::GUID getGUID() const
PointerUnion< CallsiteInfo *, AllocInfo * > SimpleType
static SimpleType getSimplifiedValue(IndexCall &Val)
const PointerUnion< CallsiteInfo *, AllocInfo * > SimpleType
static SimpleType getSimplifiedValue(const IndexCall &Val)
Define a template that can be specialized by smart pointers to reflect the fact that they are automat...
Definition Casting.h:34