1//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements support for context disambiguation of allocation
10// calls for profile guided heap optimization. Specifically, it uses Memprof
11// profiles which indicate context specific allocation behavior (currently
12// distinguishing cold vs hot memory allocations). Cloning is performed to
13// expose the cold allocation call contexts, and the allocation calls are
14// subsequently annotated with an attribute for later transformation.
15//
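// As an illustrative sketch only (simplified, with made-up stack ids and
// metadata node numbers), an allocation call carrying MemProf profile
// information might look like:
//
//   %call = call ptr @_Znwm(i64 8), !memprof !0, !callsite !5
//   !0 = !{!1, !3}
//   !1 = !{!2, !"notcold"}
//   !2 = !{i64 123, i64 456, i64 789}   ; leaf-first stack: new, foo, main1
//   !3 = !{!4, !"cold"}
//   !4 = !{i64 123, i64 456, i64 1000}  ; leaf-first stack: new, foo, main2
//   !5 = !{i64 123}                     ; stack id of this allocation call
//
// After cloning exposes the purely cold context, the corresponding allocation
// call clone is annotated with a "memprof"="cold" attribute, which the later
// transformation uses when linking with an allocator that supports the
// hot/cold operator new interfaces.
//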
16// The transformations can be performed either directly on IR (regular LTO), or
17// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18// Both types of LTO operate on the same base graph representation, which
19// uses CRTP to support either IR or Index formats.
20//
21//===----------------------------------------------------------------------===//
22
23#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/ADT/DenseSet.h"
26#include "llvm/ADT/MapVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/Module.h"
40#include "llvm/Pass.h"
44#include "llvm/Support/SHA1.h"
46#include "llvm/Transforms/IPO.h"
50#include <deque>
51#include <sstream>
52#include <unordered_map>
53#include <vector>
54using namespace llvm;
55using namespace llvm::memprof;
56
57#define DEBUG_TYPE "memprof-context-disambiguation"
58
59STATISTIC(FunctionClonesAnalysis,
60 "Number of function clones created during whole program analysis");
61STATISTIC(FunctionClonesThinBackend,
62 "Number of function clones created during ThinLTO backend");
63STATISTIC(FunctionsClonedThinBackend,
64 "Number of functions that had clones created during ThinLTO backend");
65STATISTIC(
66 FunctionCloneDuplicatesThinBackend,
67 "Number of function clone duplicates detected during ThinLTO backend");
68STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
69 "cloned) during whole program analysis");
70STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
71 "during whole program analysis");
72STATISTIC(AllocTypeNotColdThinBackend,
73 "Number of not cold static allocations (possibly cloned) during "
74 "ThinLTO backend");
75STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
76 "(possibly cloned) during ThinLTO backend");
77STATISTIC(OrigAllocsThinBackend,
78 "Number of original (not cloned) allocations with memprof profiles "
79 "during ThinLTO backend");
80STATISTIC(
81 AllocVersionsThinBackend,
82 "Number of allocation versions (including clones) during ThinLTO backend");
83STATISTIC(MaxAllocVersionsThinBackend,
84 "Maximum number of allocation versions created for an original "
85 "allocation during ThinLTO backend");
86STATISTIC(UnclonableAllocsThinBackend,
87 "Number of unclonable ambigous allocations during ThinLTO backend");
88STATISTIC(RemovedEdgesWithMismatchedCallees,
89 "Number of edges removed due to mismatched callees (profiled vs IR)");
90STATISTIC(FoundProfiledCalleeCount,
91 "Number of profiled callees found via tail calls");
92STATISTIC(FoundProfiledCalleeDepth,
93 "Aggregate depth of profiled callees found via tail calls");
94STATISTIC(FoundProfiledCalleeMaxDepth,
95 "Maximum depth of profiled callees found via tail calls");
96STATISTIC(FoundProfiledCalleeNonUniquelyCount,
97 "Number of profiled callees found via multiple tail call chains");
98STATISTIC(DeferredBackedges, "Number of backedges with deferred cloning");
99STATISTIC(NewMergedNodes, "Number of new nodes created during merging");
100STATISTIC(NonNewMergedNodes, "Number of non new nodes used during merging");
101STATISTIC(MissingAllocForContextId,
102 "Number of missing alloc nodes for context ids");
103STATISTIC(SkippedCallsCloning,
104 "Number of calls skipped during cloning due to unexpected operand");
105STATISTIC(MismatchedCloneAssignments,
106 "Number of callsites assigned to call multiple non-matching clones");
107STATISTIC(TotalMergeInvokes, "Number of merge invocations for nodes");
108STATISTIC(TotalMergeIters, "Number of merge iterations for nodes");
109STATISTIC(MaxMergeIters, "Max merge iterations for nodes");
110STATISTIC(NumImportantContextIds, "Number of important context ids");
111STATISTIC(NumFixupEdgeIdsInserted, "Number of fixup edge ids inserted");
112STATISTIC(NumFixupEdgesAdded, "Number of fixup edges added");
113STATISTIC(NumFixedContexts, "Number of contexts with fixed edges");
114
116 "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
117 cl::value_desc("filename"),
118 cl::desc("Specify the path prefix of the MemProf dot files."));
119
120static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
121 cl::Hidden,
122 cl::desc("Export graph to dot files."));
123
124// TODO: Remove this option once new handling is validated more widely.
126 "memprof-merge-iteration", cl::init(true), cl::Hidden,
127 cl::desc("Iteratively apply merging on a node to catch new callers"));
128
129// How much of the graph to export to dot.
130enum DotScope {
131 All, // The full CCG graph.
132 Alloc, // Only contexts for the specified allocation.
133 Context, // Only the specified context.
134};
135
137 "memprof-dot-scope", cl::desc("Scope of graph to export to dot"),
140 clEnumValN(DotScope::All, "all", "Export full callsite graph"),
142 "Export only nodes with contexts feeding given "
143 "-memprof-dot-alloc-id"),
144 clEnumValN(DotScope::Context, "context",
145 "Export only nodes with given -memprof-dot-context-id")));
146
147static cl::opt<unsigned>
148 AllocIdForDot("memprof-dot-alloc-id", cl::init(0), cl::Hidden,
149 cl::desc("Id of alloc to export if -memprof-dot-scope=alloc "
150 "or to highlight if -memprof-dot-scope=all"));
151
153 "memprof-dot-context-id", cl::init(0), cl::Hidden,
154 cl::desc("Id of context to export if -memprof-dot-scope=context or to "
155 "highlight otherwise"));
156
157static cl::opt<bool>
158 DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
159 cl::desc("Dump CallingContextGraph to stdout after each stage."));
160
161static cl::opt<bool>
162 VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
163 cl::desc("Perform verification checks on CallingContextGraph."));
164
165static cl::opt<bool>
166 VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
167 cl::desc("Perform frequent verification checks on nodes."));
168
170 "memprof-import-summary",
171 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
172 cl::Hidden);
173
174static cl::opt<unsigned>
175 TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
176 cl::Hidden,
177 cl::desc("Max depth to recursively search for missing "
178 "frames through tail calls."));
179
180// Optionally enable cloning of callsites involved with recursive cycles.
181static cl::opt<bool> AllowRecursiveCallsites(
182 "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden,
183 cl::desc("Allow cloning of callsites involved in recursive cycles"));
184
186 "memprof-clone-recursive-contexts", cl::init(true), cl::Hidden,
187 cl::desc("Allow cloning of contexts through recursive cycles"));
188
189// Generally this is needed for correct assignment of allocation clones to
190// function clones; however, we allow it to be disabled for debugging while
191// the functionality is new and being tested more widely.
192static cl::opt<bool>
193 MergeClones("memprof-merge-clones", cl::init(true), cl::Hidden,
194 cl::desc("Merge clones before assigning functions"));
195
196// When disabled, try to detect and prevent cloning of recursive contexts.
197// This is only necessary until we support cloning through recursive cycles.
198// Leave on by default for now, as disabling requires a little bit of compile
199// time overhead and doesn't affect correctness; it will just inflate the cold
200// hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled.
202 "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden,
203 cl::desc("Allow cloning of contexts having recursive cycles"));
204
205// Set the minimum absolute count threshold for allowing inlining of indirect
206// calls promoted during cloning.
208 "memprof-icp-noinline-threshold", cl::init(2), cl::Hidden,
209 cl::desc("Minimum absolute count for promoted target to be inlinable"));
210
211namespace llvm {
213 "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
214 cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
215
216// Indicate we are linking with an allocator that supports hot/cold operator
217// new interfaces.
219 "supports-hot-cold-new", cl::init(false), cl::Hidden,
220 cl::desc("Linking with hot/cold operator new interfaces"));
221
223 "memprof-require-definition-for-promotion", cl::init(false), cl::Hidden,
224 cl::desc(
225 "Require target function definition when promoting indirect calls"));
226
229
231 "memprof-top-n-important", cl::init(10), cl::Hidden,
232 cl::desc("Number of largest cold contexts to consider important"));
233
235 "memprof-fixup-important", cl::init(true), cl::Hidden,
236 cl::desc("Enables edge fixup for important contexts"));
237
239
240} // namespace llvm
241
242namespace {
243
244/// CRTP base for graphs built from either IR or ThinLTO summary index.
245///
246/// The graph represents the call contexts in all memprof metadata on allocation
247/// calls, with nodes for the allocations themselves, as well as for the calls
248/// in each context. The graph is initially built from the allocation memprof
249/// metadata (or summary) MIBs. It is then updated to match calls with callsite
250/// metadata onto the nodes, updating it to reflect any inlining performed on
251/// those calls.
252///
253/// Each MIB (representing an allocation's call context with allocation
254/// behavior) is assigned a unique context id during the graph build. The edges
255/// and nodes in the graph are decorated with the context ids they carry. This
256/// is used to correctly update the graph when cloning is performed so that we
257/// can uniquify the context for a single (possibly cloned) allocation.
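/// For example (an illustrative sketch with arbitrary names and ids): two
/// profiled contexts reaching the same allocation,
///   main1 -> foo -> new   (notcold, assigned context id 1)
///   main2 -> foo -> new   (cold,    assigned context id 2)
/// produce an allocation node for the call to new and stack nodes for foo,
/// main1 and main2, with edges decorated as:
///   main1 --{1}--> foo --{1,2}--> new
///   main2 --{2}--> foo
/// Cloning foo (and the allocation) along context id 2 then separates the
/// purely cold context so its allocation can be annotated independently.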
258template <typename DerivedCCG, typename FuncTy, typename CallTy>
259class CallsiteContextGraph {
260public:
261 CallsiteContextGraph() = default;
262 CallsiteContextGraph(const CallsiteContextGraph &) = default;
263 CallsiteContextGraph(CallsiteContextGraph &&) = default;
264
265 /// Main entry point to perform analysis and transformations on graph.
266 bool process();
267
268 /// Perform cloning on the graph necessary to uniquely identify the allocation
269 /// behavior of an allocation based on its context.
270 void identifyClones();
271
272 /// Assign callsite clones to functions, cloning functions as needed to
273 /// accommodate the combinations of their callsite clones reached by callers.
274 /// For regular LTO this clones functions and callsites in the IR, but for
275 /// ThinLTO the cloning decisions are noted in the summaries and later applied
276 /// in applyImport.
277 bool assignFunctions();
278
279 void dump() const;
280 void print(raw_ostream &OS) const;
281 void printTotalSizes(raw_ostream &OS) const;
282
283 friend raw_ostream &operator<<(raw_ostream &OS,
284 const CallsiteContextGraph &CCG) {
285 CCG.print(OS);
286 return OS;
287 }
288
289 friend struct GraphTraits<
290 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
291 friend struct DOTGraphTraits<
292 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
293
294 void exportToDot(std::string Label) const;
295
296 /// Represents a function clone via FuncTy pointer and clone number pair.
297 struct FuncInfo final
298 : public std::pair<FuncTy *, unsigned /*Clone number*/> {
299 using Base = std::pair<FuncTy *, unsigned>;
300 FuncInfo(const Base &B) : Base(B) {}
301 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
302 explicit operator bool() const { return this->first != nullptr; }
303 FuncTy *func() const { return this->first; }
304 unsigned cloneNo() const { return this->second; }
305 };
306
307 /// Represents a callsite clone via CallTy and clone number pair.
308 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
309 using Base = std::pair<CallTy, unsigned>;
310 CallInfo(const Base &B) : Base(B) {}
311 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
312 : Base(Call, CloneNo) {}
313 explicit operator bool() const { return (bool)this->first; }
314 CallTy call() const { return this->first; }
315 unsigned cloneNo() const { return this->second; }
316 void setCloneNo(unsigned N) { this->second = N; }
317 void print(raw_ostream &OS) const {
318 if (!operator bool()) {
319 assert(!cloneNo());
320 OS << "null Call";
321 return;
322 }
323 call()->print(OS);
324 OS << "\t(clone " << cloneNo() << ")";
325 }
326 void dump() const {
327 print(dbgs());
328 dbgs() << "\n";
329 }
330 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
331 Call.print(OS);
332 return OS;
333 }
334 };
335
336 struct ContextEdge;
337
338 /// Node in the Callsite Context Graph
339 struct ContextNode {
340 // Assigned to nodes as they are created, useful for debugging.
341 unsigned NodeId = 0;
342
343 // Keep this for now since in the IR case where we have an Instruction* it
344 // is not as immediately discoverable. Used for printing richer information
345 // when dumping graph.
346 bool IsAllocation;
347
348 // Keeps track of when the Call was reset to null because there was
349 // recursion.
350 bool Recursive = false;
351
352 // This will be formed by ORing together the AllocationType enum values
353 // for contexts including this node.
354 uint8_t AllocTypes = 0;
355
356 // The corresponding allocation or interior call. This is the primary call
357 // for which we have created this node.
358 CallInfo Call;
359
360 // List of other calls that can be treated the same as the primary call
361 // through cloning. I.e. located in the same function and have the same
362 // (possibly pruned) stack ids. They will be updated the same way as the
363 // primary call when assigning to function clones.
364 SmallVector<CallInfo, 0> MatchingCalls;
365
366 // For alloc nodes this is a unique id assigned when constructed, and for
367 // callsite stack nodes it is the original stack id when the node is
368 // constructed from the memprof MIB metadata on the alloc nodes. Note that
369 // this is only used when matching callsite metadata onto the stack nodes
370 // created when processing the allocation memprof MIBs, and for labeling
371 // nodes in the dot graph. Therefore we don't bother to assign a value for
372 // clones.
373 uint64_t OrigStackOrAllocId = 0;
374
375 // Edges to all callees in the profiled call stacks.
376 // TODO: Should this be a map (from Callee node) for more efficient lookup?
377 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
378
379 // Edges to all callers in the profiled call stacks.
380 // TODO: Should this be a map (from Caller node) for more efficient lookup?
381 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
382
383 // Returns true if we need to also look at the caller edges for determining the
384 // node context ids and allocation type.
385 bool useCallerEdgesForContextInfo() const {
386 // Typically if the callee edges are empty either the caller edges are
387 // also empty, or this is an allocation (leaf node). However, if we are
388 // allowing recursive callsites and contexts this will be violated for
389 // incompletely cloned recursive cycles.
390 assert(!CalleeEdges.empty() || CallerEdges.empty() || IsAllocation ||
391 (AllowRecursiveCallsites && AllowRecursiveContexts));
392 // When cloning for a recursive context, during cloning we might be in the
393 // midst of cloning for a recurrence and have moved context ids off of a
394 // caller edge onto the clone but not yet off of the incoming caller
395 // (back) edge. If we don't look at those we miss the fact that this node
396 // still has context ids of interest.
397 return IsAllocation || CloneRecursiveContexts;
398 }
399
400 // Compute the context ids for this node from the union of its edge context
401 // ids.
402 DenseSet<uint32_t> getContextIds() const {
403 unsigned Count = 0;
404 // Compute the number of ids for reserve below. In general we only need to
405 // look at one set of edges, typically the callee edges, since other than
406 // allocations and in some cases during recursion cloning, all the context
407 // ids on the callers should also flow out via callee edges.
408 for (auto &Edge : CalleeEdges.empty() ? CallerEdges : CalleeEdges)
409 Count += Edge->getContextIds().size();
410 DenseSet<uint32_t> ContextIds;
411 ContextIds.reserve(Count);
412 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
413 CalleeEdges, useCallerEdgesForContextInfo()
414 ? CallerEdges
415 : std::vector<std::shared_ptr<ContextEdge>>());
416 for (const auto &Edge : Edges)
417 ContextIds.insert_range(Edge->getContextIds());
418 return ContextIds;
419 }
420
421 // Compute the allocation type for this node from the OR of its edge
422 // allocation types.
423 uint8_t computeAllocType() const {
424 uint8_t BothTypes =
425 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
426 uint8_t AllocType = (uint8_t)AllocationType::None;
427 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
428 CalleeEdges, useCallerEdgesForContextInfo()
429 ? CallerEdges
430 : std::vector<std::shared_ptr<ContextEdge>>());
431 for (const auto &Edge : Edges) {
432 AllocType |= Edge->AllocTypes;
433 // Bail early if alloc type reached both, no further refinement.
434 if (AllocType == BothTypes)
435 return AllocType;
436 }
437 return AllocType;
438 }
439
440 // The context ids set for this node is empty if its edge context ids are
441 // also all empty.
442 bool emptyContextIds() const {
443 auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
444 CalleeEdges, useCallerEdgesForContextInfo()
445 ? CallerEdges
446 : std::vector<std::shared_ptr<ContextEdge>>());
447 for (const auto &Edge : Edges) {
448 if (!Edge->getContextIds().empty())
449 return false;
450 }
451 return true;
452 }
453
454 // List of clones of this ContextNode, initially empty.
455 std::vector<ContextNode *> Clones;
456
457 // If a clone, points to the original uncloned node.
458 ContextNode *CloneOf = nullptr;
459
460 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
461
462 ContextNode(bool IsAllocation, CallInfo C)
463 : IsAllocation(IsAllocation), Call(C) {}
464
465 void addClone(ContextNode *Clone) {
466 if (CloneOf) {
467 CloneOf->Clones.push_back(Clone);
468 Clone->CloneOf = CloneOf;
469 } else {
470 Clones.push_back(Clone);
471 assert(!Clone->CloneOf);
472 Clone->CloneOf = this;
473 }
474 }
475
476 ContextNode *getOrigNode() {
477 if (!CloneOf)
478 return this;
479 return CloneOf;
480 }
481
482 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
483 unsigned int ContextId);
484
485 ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
486 ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
487 void eraseCalleeEdge(const ContextEdge *Edge);
488 void eraseCallerEdge(const ContextEdge *Edge);
489
490 void setCall(CallInfo C) { Call = C; }
491
492 bool hasCall() const { return (bool)Call.call(); }
493
494 void printCall(raw_ostream &OS) const { Call.print(OS); }
495
496 // True if this node was effectively removed from the graph, in which case
497 // it should have an allocation type of None and empty context ids.
498 bool isRemoved() const {
499 // Typically if the callee edges are empty either the caller edges are
500 // also empty, or this is an allocation (leaf node). However, if we are
501 // allowing recursive callsites and contexts this will be violated for
502 // incompletely cloned recursive cycles.
503 assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
504 (AllocTypes == (uint8_t)AllocationType::None) ==
505 emptyContextIds());
506 return AllocTypes == (uint8_t)AllocationType::None;
507 }
508
509 void dump() const;
510 void print(raw_ostream &OS) const;
511
512 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
513 Node.print(OS);
514 return OS;
515 }
516 };
517
518 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
519 /// callee.
520 struct ContextEdge {
521 ContextNode *Callee;
522 ContextNode *Caller;
523
524 // This will be formed by ORing together the AllocationType enum values
525 // for contexts including this edge.
526 uint8_t AllocTypes = 0;
527
528 // Set just before initiating cloning when cloning of recursive contexts is
529 // enabled. Used to defer cloning of backedges until we have done cloning of
530 // the callee node for non-backedge caller edges. This exposes cloning
531 // opportunities through the backedge of the cycle.
532 // TODO: Note that this is not updated during cloning, and it is unclear
533 // whether that would be needed.
534 bool IsBackedge = false;
535
536 // The set of IDs for contexts including this edge.
537 DenseSet<uint32_t> ContextIds;
538
539 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
540 DenseSet<uint32_t> ContextIds)
541 : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
542 ContextIds(std::move(ContextIds)) {}
543
544 DenseSet<uint32_t> &getContextIds() { return ContextIds; }
545
546 // Helper to clear the fields of this edge when we are removing it from the
547 // graph.
548 inline void clear() {
549 ContextIds.clear();
550 AllocTypes = (uint8_t)AllocationType::None;
551 Caller = nullptr;
552 Callee = nullptr;
553 }
554
555 // Check if edge was removed from the graph. This is useful while iterating
556 // over a copy of edge lists when performing operations that mutate the
557 // graph in ways that might remove one of the edges.
558 inline bool isRemoved() const {
559 if (Callee || Caller)
560 return false;
561 // Any edges that have been removed from the graph but are still in a
562 // shared_ptr somewhere should have all fields null'ed out by clear()
563 // above.
564 assert(AllocTypes == (uint8_t)AllocationType::None);
565 assert(ContextIds.empty());
566 return true;
567 }
568
569 void dump() const;
570 void print(raw_ostream &OS) const;
571
572 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
573 Edge.print(OS);
574 return OS;
575 }
576 };
577
578 /// Helpers to remove edges that have allocation type None (due to not
579 /// carrying any context ids) after transformations.
580 void removeNoneTypeCalleeEdges(ContextNode *Node);
581 void removeNoneTypeCallerEdges(ContextNode *Node);
582 void
583 recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
584 DenseSet<const ContextNode *> &Visited);
585
586protected:
587 /// Get a list of nodes corresponding to the stack ids in the given callsite
588 /// context.
589 template <class NodeT, class IteratorT>
590 std::vector<uint64_t>
591 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
592
593 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
594 /// metadata (or summary).
595 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
596
597 /// Adds nodes for the given MIB stack ids.
598 template <class NodeT, class IteratorT>
599 void addStackNodesForMIB(
600 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
601 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
602 ArrayRef<ContextTotalSize> ContextSizeInfo,
603 std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold);
604
605 /// Matches all callsite metadata (or summary) to the nodes created for
606 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
607 /// inlining performed on those callsite instructions.
608 void updateStackNodes();
609
610 /// Optionally fixup edges for the N largest cold contexts to better enable
611 /// cloning. This is particularly helpful if the context includes recursion
612 /// as well as inlining, resulting in a single stack node for multiple stack
613 /// ids in the context. With recursion it is particularly difficult to get the
614 /// edge updates correct as in the general case we have lost the original
615 /// stack id ordering for the context. Do more expensive fixup for the largest
616 /// contexts, controlled by MemProfTopNImportant and MemProfFixupImportant.
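 /// As a rough illustration (hypothetical stack ids): consider a cold context
 ///   A (id 1) -> B (id 2) -> B (id 2) -> C (id 3)
 /// in which B both recurses and has been inlined. Several of its stack ids
 /// may be matched onto a single callsite node, and the original frame
 /// ordering can no longer be recovered from the node alone. For the recorded
 /// important contexts we keep the full leaf-first stack id list and the nodes
 /// matched to each slice of it (see ImportantContextInfo), which this fixup
 /// can use to add or update edges along that node sequence so the context
 /// stays connected.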
617 void fixupImportantContexts();
618
619 /// Update graph to conservatively handle any callsite stack nodes that target
620 /// multiple different callee target functions.
621 void handleCallsitesWithMultipleTargets();
622
623 /// Mark backedges via the standard DFS based backedge algorithm.
624 void markBackedges();
625
626 /// Merge clones generated during cloning for different allocations but that
627 /// are called by the same caller node, to ensure proper function assignment.
628 void mergeClones();
629
630 // Try to partition calls on the given node (already placed into the AllCalls
631 // array) by callee function, creating new copies of Node as needed to hold
632 // calls with different callees, and moving the callee edges appropriately.
633 // Returns true if partitioning was successful.
634 bool partitionCallsByCallee(
635 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
636 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode);
637
638 /// Save lists of calls with MemProf metadata in each function, for faster
639 /// iteration.
640 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
641
642 /// Map from callsite node to the enclosing caller function.
643 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
644
645 // When exporting to dot, and an allocation id is specified, contains the
646 // context ids on that allocation.
647 DenseSet<uint32_t> DotAllocContextIds;
648
649private:
650 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
651
652 // Structure to keep track of information for each call as we are matching
653 // non-allocation callsites onto context nodes created from the allocation
654 // call metadata / summary contexts.
655 struct CallContextInfo {
656 // The callsite we're trying to match.
657 CallTy Call;
658 // The callsites stack ids that have a context node in the graph.
659 std::vector<uint64_t> StackIds;
660 // The function containing this callsite.
661 const FuncTy *Func;
662 // Initially empty, if needed this will be updated to contain the context
663 // ids for use in a new context node created for this callsite.
664 DenseSet<uint32_t> ContextIds;
665 };
666
667 /// Helper to remove edge from graph, updating edge iterator if it is provided
668 /// (in which case CalleeIter indicates which edge list is being iterated).
669 /// This will also perform the necessary clearing of the ContextEdge members
670 /// to enable later checking if the edge has been removed (since we may have
671 /// other copies of the shared_ptr in existence, and in fact rely on this to
672 /// enable removal while iterating over a copy of a node's edge list).
673 void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr,
674 bool CalleeIter = true);
675
676 /// Assigns the given Node to calls at or inlined into the location with
677 /// the Node's stack id, after post order traversing and processing its
678 /// caller nodes. Uses the call information recorded in the given
679 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
680 /// as needed. Called by updateStackNodes which sets up the given
681 /// StackIdToMatchingCalls map.
682 void assignStackNodesPostOrder(
683 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
684 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
685 DenseMap<CallInfo, CallInfo> &CallToMatchingCall,
686 const DenseSet<uint32_t> &ImportantContextIds);
687
688 /// Duplicates the given set of context ids, updating the provided
689 /// map from each original id with the newly generated context ids,
690 /// and returning the new duplicated id set.
691 DenseSet<uint32_t> duplicateContextIds(
692 const DenseSet<uint32_t> &StackSequenceContextIds,
693 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
694
695 /// Propagates all duplicated context ids across the graph.
696 void propagateDuplicateContextIds(
697 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
698
699 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
700 /// else to its callers. Also updates OrigNode's edges to remove any context
701 /// ids moved to the newly created edge.
702 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
703 bool TowardsCallee,
704 DenseSet<uint32_t> RemainingContextIds);
705
706 /// Get the stack id corresponding to the given Id or Index (for IR this will
707 /// return itself, for a summary index this will return the id recorded in the
708 /// index for that stack id index value).
709 uint64_t getStackId(uint64_t IdOrIndex) const {
710 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
711 }
712
713 /// Returns true if the given call targets the callee of the given edge, or if
714 /// we were able to identify the call chain through intermediate tail calls.
715 /// In the latter case new context nodes are added to the graph for the
716 /// identified tail calls, and their synthesized nodes are added to
717 /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
718 /// the updated edges and to prepare it for an increment in the caller.
719 bool
720 calleesMatch(CallTy Call, EdgeIter &EI,
721 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
722
723 // Return the callee function of the given call, or nullptr if it can't be
724 // determined.
725 const FuncTy *getCalleeFunc(CallTy Call) {
726 return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call);
727 }
728
729 /// Returns true if the given call targets the given function, or if we were
730 /// able to identify the call chain through intermediate tail calls (in which
731 /// case FoundCalleeChain will be populated).
732 bool calleeMatchesFunc(
733 CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
734 std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
735 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
736 Call, Func, CallerFunc, FoundCalleeChain);
737 }
738
739 /// Returns true if both call instructions have the same callee.
740 bool sameCallee(CallTy Call1, CallTy Call2) {
741 return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2);
742 }
743
744 /// Get a list of nodes corresponding to the stack ids in the given
745 /// callsite's context.
746 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
747 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
748 Call);
749 }
750
751 /// Get the last stack id in the context for callsite.
752 uint64_t getLastStackId(CallTy Call) {
753 return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
754 }
755
756 /// Update the allocation call to record type of allocated memory.
757 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
758 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
759 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
760 }
761
762 /// Get the AllocationType assigned to the given allocation instruction clone.
763 AllocationType getAllocationCallType(const CallInfo &Call) const {
764 return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
765 }
766
767 /// Update non-allocation call to invoke (possibly cloned) function
768 /// CalleeFunc.
769 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
770 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
771 }
772
773 /// Clone the given function for the given callsite, recording the mapping of
774 /// all of the function's tracked calls to their new versions in the CallMap.
775 /// Assigns new clones to clone number CloneNo.
776 FuncInfo cloneFunctionForCallsite(
777 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
778 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
779 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
780 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
781 }
782
783 /// Gets a label to use in the dot graph for the given call clone in the given
784 /// function.
785 std::string getLabel(const FuncTy *Func, const CallTy Call,
786 unsigned CloneNo) const {
787 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
788 }
789
790 // Create and return a new ContextNode.
791 ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr,
792 CallInfo C = CallInfo()) {
793 NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C));
794 auto *NewNode = NodeOwner.back().get();
795 if (F)
796 NodeToCallingFunc[NewNode] = F;
797 NewNode->NodeId = NodeOwner.size();
798 return NewNode;
799 }
800
801 /// Helpers to find the node corresponding to the given call or stackid.
802 ContextNode *getNodeForInst(const CallInfo &C);
803 ContextNode *getNodeForAlloc(const CallInfo &C);
804 ContextNode *getNodeForStackId(uint64_t StackId);
805
806 /// Computes the alloc type corresponding to the given context ids, by
807 /// unioning their recorded alloc types.
808 uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds) const;
809
810 /// Returns the allocation type of the intersection of the contexts of two
811 /// nodes (based on their provided context id sets), optimized for the case
812 /// when Node1Ids is smaller than Node2Ids.
813 uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
814 const DenseSet<uint32_t> &Node2Ids) const;
815
816 /// Returns the allocation type of the intersection of the contexts of two
817 /// nodes (based on their provided context id sets).
818 uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
819 const DenseSet<uint32_t> &Node2Ids) const;
820
821 /// Create a clone of Edge's callee and move Edge to that new callee node,
822 /// performing the necessary context id and allocation type updates.
823 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
824 /// moved to an edge to the new callee.
825 ContextNode *
826 moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
827 DenseSet<uint32_t> ContextIdsToMove = {});
828
829 /// Change the callee of Edge to existing callee clone NewCallee, performing
830 /// the necessary context id and allocation type updates.
831 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
832 /// moved to an edge to the new callee.
833 void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
834 ContextNode *NewCallee,
835 bool NewClone = false,
836 DenseSet<uint32_t> ContextIdsToMove = {});
837
838 /// Change the caller of the edge at the given callee edge iterator to be
839 /// NewCaller, performing the necessary context id and allocation type
840 /// updates. This is similar to the above moveEdgeToExistingCalleeClone, but
841 /// a simplified version of it as we always move the given edge and all of its
842 /// context ids.
843 void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
844 ContextNode *NewCaller);
845
846 /// Recursive helper for marking backedges via DFS.
847 void markBackedges(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
848 DenseSet<const ContextNode *> &CurrentStack);
849
850 /// Recursive helper for merging clones.
851 void
852 mergeClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
853 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
854 /// Main worker for merging callee clones for a given node.
855 void mergeNodeCalleeClones(
856 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
857 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
858 /// Helper to find other callers of the given set of callee edges that can
859 /// share the same callee merge node.
860 void findOtherCallersToShareMerge(
861 ContextNode *Node, std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
862 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
863 DenseSet<ContextNode *> &OtherCallersToShareMerge);
864
865 /// Recursively perform cloning on the graph for the given Node and its
866 /// callers, in order to uniquely identify the allocation behavior of an
867 /// allocation given its context. The context ids of the allocation being
868 /// processed are given in AllocContextIds.
869 void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
870 const DenseSet<uint32_t> &AllocContextIds);
871
872 /// Map from each context ID to the AllocationType assigned to that context.
873 DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
874
875 /// Map from each contextID to the profiled full contexts and their total
876 /// sizes (there may be more than one due to context trimming),
877 /// optionally populated when requested (via MemProfReportHintedSizes or
878 /// MinClonedColdBytePercent).
879 DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;
880
881 /// Identifies the context node created for a stack id when adding the MIB
882 /// contexts to the graph. This is used to locate the context nodes when
883 /// trying to assign the corresponding callsites with those stack ids to these
884 /// nodes.
885 DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
886
887 /// Saves information for the contexts identified as important (the largest
888 /// cold contexts up to MemProfTopNImportant).
889 struct ImportantContextInfo {
890 // The original list of leaf first stack ids corresponding to this context.
891 std::vector<uint64_t> StackIds;
892 // Max length of stack ids corresponding to a single stack ContextNode for
893 // this context (i.e. the max length of a key in StackIdsToNode below).
894 unsigned MaxLength = 0;
895 // Mapping of slices of the stack ids to the corresponding ContextNode
896 // (there can be multiple stack ids due to inlining). Populated when
897 // updating stack nodes while matching them to the IR or summary.
898 std::map<std::vector<uint64_t>, ContextNode *> StackIdsToNode;
899 };
900
901 // Map of important full context ids to information about each.
902 DenseMap<uint32_t, ImportantContextInfo> ImportantContextIdInfo;
903
904 // For each important context id found in Node (if any), records the list of
905 // stack ids that corresponded to the given callsite Node. There can be more
906 // than one in the case of inlining.
907 void recordStackNode(std::vector<uint64_t> &StackIds, ContextNode *Node,
908 // We pass in the Node's context ids to avoid the
909 // overhead of computing them as the caller already has
910 // them in some cases.
911 const DenseSet<uint32_t> &NodeContextIds,
912 const DenseSet<uint32_t> &ImportantContextIds) {
913 if (!MemProfFixupImportant) {
914 assert(ImportantContextIds.empty());
915 return;
916 }
917 auto Ids =
918 set_intersection(NodeContextIds, ImportantContextIds);
919 if (Ids.empty())
920 return;
921 auto Size = StackIds.size();
922 for (auto Id : Ids) {
923 auto &Entry = ImportantContextIdInfo[Id];
924 Entry.StackIdsToNode[StackIds] = Node;
925 // Keep track of the max to simplify later analysis.
926 if (Size > Entry.MaxLength)
927 Entry.MaxLength = Size;
928 }
929 }
930
931 /// Maps to track the calls to their corresponding nodes in the graph.
932 MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
933 MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
934
935 /// Owner of all ContextNode unique_ptrs.
936 std::vector<std::unique_ptr<ContextNode>> NodeOwner;
937
938 /// Perform sanity checks on graph when requested.
939 void check() const;
940
941 /// Keeps track of the last unique context id assigned.
942 unsigned int LastContextId = 0;
943};
944
945template <typename DerivedCCG, typename FuncTy, typename CallTy>
946using ContextNode =
947 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
948template <typename DerivedCCG, typename FuncTy, typename CallTy>
949using ContextEdge =
950 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
951template <typename DerivedCCG, typename FuncTy, typename CallTy>
952using FuncInfo =
953 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
954template <typename DerivedCCG, typename FuncTy, typename CallTy>
955using CallInfo =
956 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
957
958/// CRTP derived class for graphs built from IR (regular LTO).
959class ModuleCallsiteContextGraph
960 : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
961 Instruction *> {
962public:
963 ModuleCallsiteContextGraph(
964 Module &M,
965 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
966
967private:
968 friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
969 Instruction *>;
970
971 uint64_t getStackId(uint64_t IdOrIndex) const;
972 const Function *getCalleeFunc(Instruction *Call);
973 bool calleeMatchesFunc(
974 Instruction *Call, const Function *Func, const Function *CallerFunc,
975 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
976 bool sameCallee(Instruction *Call1, Instruction *Call2);
977 bool findProfiledCalleeThroughTailCalls(
978 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
979 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
980 bool &FoundMultipleCalleeChains);
981 uint64_t getLastStackId(Instruction *Call);
982 std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
983 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
984 AllocationType getAllocationCallType(const CallInfo &Call) const;
985 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
986 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
987 Instruction *>::FuncInfo
988 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
989 DenseMap<CallInfo, CallInfo> &CallMap,
990 std::vector<CallInfo> &CallsWithMetadataInFunc,
991 unsigned CloneNo);
992 std::string getLabel(const Function *Func, const Instruction *Call,
993 unsigned CloneNo) const;
994
995 const Module &Mod;
996 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
997};
998
999/// Represents a call in the summary index graph, which can either be an
1000/// allocation or an interior callsite node in an allocation's context.
1001/// Holds a pointer to the corresponding data structure in the index.
1002struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
1003 IndexCall() : PointerUnion() {}
1004 IndexCall(std::nullptr_t) : IndexCall() {}
1005 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
1006 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
1007 IndexCall(PointerUnion PT) : PointerUnion(PT) {}
1008
1009 IndexCall *operator->() { return this; }
1010
1011 void print(raw_ostream &OS) const {
1012 PointerUnion<CallsiteInfo *, AllocInfo *> Base = *this;
1013 if (auto *AI = dyn_cast_if_present<AllocInfo *>(Base)) {
1014 OS << *AI;
1015 } else {
1016 auto *CI = dyn_cast_if_present<CallsiteInfo *>(Base);
1017 assert(CI);
1018 OS << *CI;
1019 }
1020 }
1021};
1022} // namespace
1023
1024namespace llvm {
1025template <> struct simplify_type<IndexCall> {
1026 using SimpleType = PointerUnion<CallsiteInfo *, AllocInfo *>;
1027 static SimpleType getSimplifiedValue(IndexCall &Val) { return Val; }
1028};
1029template <> struct simplify_type<const IndexCall> {
1030 using SimpleType = const PointerUnion<CallsiteInfo *, AllocInfo *>;
1031 static SimpleType getSimplifiedValue(const IndexCall &Val) { return Val; }
1032};
1033} // namespace llvm
1034
1035namespace {
1036/// CRTP derived class for graphs built from summary index (ThinLTO).
1037class IndexCallsiteContextGraph
1038 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1039 IndexCall> {
1040public:
1041 IndexCallsiteContextGraph(
1042 ModuleSummaryIndex &Index,
1043 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1044 isPrevailing);
1045
1046 ~IndexCallsiteContextGraph() {
1047 // Now that we are done with the graph it is safe to add the new
1048 // CallsiteInfo structs to the function summary vectors. The graph nodes
1049 // point into locations within these vectors, so we don't want to add them
1050 // any earlier.
1051 for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
1052 auto *FS = I.first;
1053 for (auto &Callsite : I.second)
1054 FS->addCallsite(*Callsite.second);
1055 }
1056 }
1057
1058private:
1059 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1060 IndexCall>;
1061
1062 uint64_t getStackId(uint64_t IdOrIndex) const;
1063 const FunctionSummary *getCalleeFunc(IndexCall &Call);
1064 bool calleeMatchesFunc(
1065 IndexCall &Call, const FunctionSummary *Func,
1066 const FunctionSummary *CallerFunc,
1067 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
1068 bool sameCallee(IndexCall &Call1, IndexCall &Call2);
1069 bool findProfiledCalleeThroughTailCalls(
1070 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
1071 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
1072 bool &FoundMultipleCalleeChains);
1073 uint64_t getLastStackId(IndexCall &Call);
1074 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
1075 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
1076 AllocationType getAllocationCallType(const CallInfo &Call) const;
1077 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
1078 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1079 IndexCall>::FuncInfo
1080 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
1081 DenseMap<CallInfo, CallInfo> &CallMap,
1082 std::vector<CallInfo> &CallsWithMetadataInFunc,
1083 unsigned CloneNo);
1084 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
1085 unsigned CloneNo) const;
1086
1087 // Saves mapping from function summaries containing memprof records back to
1088 // their VI, for use in checking and debugging.
1089 std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
1090
1091 const ModuleSummaryIndex &Index;
1092 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1093 isPrevailing;
1094
1095 // Saves/owns the callsite info structures synthesized for missing tail call
1096 // frames that we discover while building the graph.
1097 // It maps from the summary of the function making the tail call, to a map
1098 // of callee ValueInfo to corresponding synthesized callsite info.
1099 std::unordered_map<FunctionSummary *,
1100 std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
1101 FunctionCalleesToSynthesizedCallsiteInfos;
1102};
1103} // namespace
1104
1105template <>
1106struct llvm::DenseMapInfo<CallsiteContextGraph<
1107 ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
1108 : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
1109template <>
1110struct llvm::DenseMapInfo<CallsiteContextGraph<
1111 IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
1112 : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
1113template <>
1114struct llvm::DenseMapInfo<IndexCall>
1115 : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
1116
1117namespace {
1118
1119// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
1120// type we should actually use on the corresponding allocation.
1121// If we can't clone a node that has NotCold+Cold alloc type, we will fall
1122// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
1123// from NotCold.
1124AllocationType allocTypeToUse(uint8_t AllocTypes) {
1125 assert(AllocTypes != (uint8_t)AllocationType::None);
1126 if (AllocTypes ==
1127 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
1128 return AllocationType::NotCold;
1129 else
1130 return (AllocationType)AllocTypes;
1131}
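// For example (illustrative):
//   allocTypeToUse((uint8_t)AllocationType::Cold)     returns Cold
//   allocTypeToUse((uint8_t)AllocationType::NotCold |
//                  (uint8_t)AllocationType::Cold)     returns NotCold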
1132
1133// Helper to check if the alloc types for all edges recorded in the
1134// InAllocTypes vector match the alloc types for all edges in the Edges
1135// vector.
1136template <typename DerivedCCG, typename FuncTy, typename CallTy>
1137bool allocTypesMatch(
1138 const std::vector<uint8_t> &InAllocTypes,
1139 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
1140 &Edges) {
1141 // This should be called only when the InAllocTypes vector was computed for
1142 // this set of Edges. Make sure the sizes are the same.
1143 assert(InAllocTypes.size() == Edges.size());
1144 return std::equal(
1145 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(),
1146 [](const uint8_t &l,
1147 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
1148 // Can share if one of the edges is None type - don't
1149 // care about the type along that edge as it doesn't
1150 // exist for those context ids.
1151 if (l == (uint8_t)AllocationType::None ||
1152 r->AllocTypes == (uint8_t)AllocationType::None)
1153 return true;
1154 return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
1155 });
1156}
1157
1158// Helper to check if the alloc types for all edges recorded in the
1159// InAllocTypes vector match the alloc types for callee edges in the given
1160// clone. Because the InAllocTypes were computed from the original node's callee
1161// edges, and other cloning could have happened after this clone was created, we
1162// need to find the matching clone callee edge, which may or may not exist.
1163template <typename DerivedCCG, typename FuncTy, typename CallTy>
1164bool allocTypesMatchClone(
1165 const std::vector<uint8_t> &InAllocTypes,
1166 const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) {
1167 const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf;
1168 assert(Node);
1169 // InAllocTypes should have been computed for the original node's callee
1170 // edges.
1171 assert(InAllocTypes.size() == Node->CalleeEdges.size());
1172 // First create a map of the clone callee edge callees to the edge alloc type.
1173 DenseMap<const ContextNode<DerivedCCG, FuncTy, CallTy> *, uint8_t>
1174 EdgeCalleeMap;
1175 for (const auto &E : Clone->CalleeEdges) {
1176 assert(!EdgeCalleeMap.contains(E->Callee));
1177 EdgeCalleeMap[E->Callee] = E->AllocTypes;
1178 }
1179 // Next, walk the original node's callees, and look for the corresponding
1180 // clone edge to that callee.
1181 for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) {
1182 auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee);
1183 // Not found is ok, we will simply add an edge if we use this clone.
1184 if (Iter == EdgeCalleeMap.end())
1185 continue;
1186 // Can share if one of the edges is None type - don't
1187 // care about the type along that edge as it doesn't
1188 // exist for those context ids.
1189 if (InAllocTypes[I] == (uint8_t)AllocationType::None ||
1190 Iter->second == (uint8_t)AllocationType::None)
1191 continue;
1192 if (allocTypeToUse(Iter->second) != allocTypeToUse(InAllocTypes[I]))
1193 return false;
1194 }
1195 return true;
1196}
1197
1198} // end anonymous namespace
1199
1200template <typename DerivedCCG, typename FuncTy, typename CallTy>
1201typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1202CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
1203 const CallInfo &C) {
1204 ContextNode *Node = getNodeForAlloc(C);
1205 if (Node)
1206 return Node;
1207
1208 return NonAllocationCallToContextNodeMap.lookup(C);
1209}
1210
1211template <typename DerivedCCG, typename FuncTy, typename CallTy>
1212typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1213CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
1214 const CallInfo &C) {
1215 return AllocationCallToContextNodeMap.lookup(C);
1216}
1217
1218template <typename DerivedCCG, typename FuncTy, typename CallTy>
1219typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1220CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
1221 uint64_t StackId) {
1222 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
1223 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
1224 return StackEntryNode->second;
1225 return nullptr;
1226}
1227
1228template <typename DerivedCCG, typename FuncTy, typename CallTy>
1229void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1230 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
1231 unsigned int ContextId) {
1232 for (auto &Edge : CallerEdges) {
1233 if (Edge->Caller == Caller) {
1234 Edge->AllocTypes |= (uint8_t)AllocType;
1235 Edge->getContextIds().insert(ContextId);
1236 return;
1237 }
1238 }
1239 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
1240 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
1241 CallerEdges.push_back(Edge);
1242 Caller->CalleeEdges.push_back(Edge);
1243}
1244
1245template <typename DerivedCCG, typename FuncTy, typename CallTy>
1246void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph(
1247 ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) {
1248 assert(!EI || (*EI)->get() == Edge);
1249 assert(!Edge->isRemoved());
1250 // Save the Caller and Callee pointers so we can erase Edge from their edge
1251 // lists after clearing Edge below. We do the clearing first in case it is
1252 // destructed after removing from the edge lists (if those were the last
1253 // shared_ptr references to Edge).
1254 auto *Callee = Edge->Callee;
1255 auto *Caller = Edge->Caller;
1256
1257 // Make sure the edge fields are cleared out so we can properly detect
1258 // removed edges if Edge is not destructed because there is still a shared_ptr
1259 // reference.
1260 Edge->clear();
1261
1262#ifndef NDEBUG
1263 auto CalleeCallerCount = Callee->CallerEdges.size();
1264 auto CallerCalleeCount = Caller->CalleeEdges.size();
1265#endif
1266 if (!EI) {
1267 Callee->eraseCallerEdge(Edge);
1268 Caller->eraseCalleeEdge(Edge);
1269 } else if (CalleeIter) {
1270 Callee->eraseCallerEdge(Edge);
1271 *EI = Caller->CalleeEdges.erase(*EI);
1272 } else {
1273 Caller->eraseCalleeEdge(Edge);
1274 *EI = Callee->CallerEdges.erase(*EI);
1275 }
1276 assert(Callee->CallerEdges.size() < CalleeCallerCount);
1277 assert(Caller->CalleeEdges.size() < CallerCalleeCount);
1278}
1279
1280template <typename DerivedCCG, typename FuncTy, typename CallTy>
1281void CallsiteContextGraph<
1282 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
1283 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1284 auto Edge = *EI;
1285 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1286 assert(Edge->ContextIds.empty());
1287 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
1288 } else
1289 ++EI;
1290 }
1291}
1292
1293template <typename DerivedCCG, typename FuncTy, typename CallTy>
1294void CallsiteContextGraph<
1295 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) {
1296 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
1297 auto Edge = *EI;
1298 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1299 assert(Edge->ContextIds.empty());
1300 Edge->Caller->eraseCalleeEdge(Edge.get());
1301 EI = Node->CallerEdges.erase(EI);
1302 } else
1303 ++EI;
1304 }
1305}
1306
1307template <typename DerivedCCG, typename FuncTy, typename CallTy>
1308typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1309CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1310 findEdgeFromCallee(const ContextNode *Callee) {
1311 for (const auto &Edge : CalleeEdges)
1312 if (Edge->Callee == Callee)
1313 return Edge.get();
1314 return nullptr;
1315}
1316
1317template <typename DerivedCCG, typename FuncTy, typename CallTy>
1318typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1319CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1320 findEdgeFromCaller(const ContextNode *Caller) {
1321 for (const auto &Edge : CallerEdges)
1322 if (Edge->Caller == Caller)
1323 return Edge.get();
1324 return nullptr;
1325}
1326
1327template <typename DerivedCCG, typename FuncTy, typename CallTy>
1328void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1329 eraseCalleeEdge(const ContextEdge *Edge) {
1330 auto EI = llvm::find_if(
1331 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
1332 return CalleeEdge.get() == Edge;
1333 });
1334 assert(EI != CalleeEdges.end());
1335 CalleeEdges.erase(EI);
1336}
1337
1338template <typename DerivedCCG, typename FuncTy, typename CallTy>
1339void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1340 eraseCallerEdge(const ContextEdge *Edge) {
1341 auto EI = llvm::find_if(
1342 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
1343 return CallerEdge.get() == Edge;
1344 });
1345 assert(EI != CallerEdges.end());
1346 CallerEdges.erase(EI);
1347}
1348
1349template <typename DerivedCCG, typename FuncTy, typename CallTy>
1350uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
1351 DenseSet<uint32_t> &ContextIds) const {
1352 uint8_t BothTypes =
1353 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1354 uint8_t AllocType = (uint8_t)AllocationType::None;
1355 for (auto Id : ContextIds) {
1356 AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1357 // Bail early if alloc type reached both, no further refinement.
1358 if (AllocType == BothTypes)
1359 return AllocType;
1360 }
1361 return AllocType;
1362}
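// Illustrative sketch (not part of the pass, stand-in names only): the union
// of per-context allocation types above saturates once both Cold and NotCold
// have been seen, so the loop can bail out early because no further context
// can refine the answer. Plain std containers and a stand-in enum are used
// here instead of AllocationType and DenseSet.
#include <cstdint>
#include <vector>
namespace memprof_example {
enum ExAllocType : uint8_t { ExNone = 0, ExNotCold = 1, ExCold = 2 };
[[maybe_unused]] static uint8_t
unionAllocTypes(const std::vector<uint8_t> &PerContextTypes) {
  const uint8_t Both = ExNotCold | ExCold;
  uint8_t Result = ExNone;
  for (uint8_t T : PerContextTypes) {
    Result |= T;
    if (Result == Both) // Saturated; no further refinement possible.
      return Result;
  }
  return Result;
}
} // namespace memprof_example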
1363
1364template <typename DerivedCCG, typename FuncTy, typename CallTy>
1365uint8_t
1366CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
1367 const DenseSet<uint32_t> &Node1Ids,
1368 const DenseSet<uint32_t> &Node2Ids) const {
1369 uint8_t BothTypes =
1370 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1371 uint8_t AllocType = (uint8_t)AllocationType::None;
1372 for (auto Id : Node1Ids) {
1373 if (!Node2Ids.count(Id))
1374 continue;
1375 AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1376 // Bail early if alloc type reached both, no further refinement.
1377 if (AllocType == BothTypes)
1378 return AllocType;
1379 }
1380 return AllocType;
1381}
1382
1383template <typename DerivedCCG, typename FuncTy, typename CallTy>
1384uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
1385 const DenseSet<uint32_t> &Node1Ids,
1386 const DenseSet<uint32_t> &Node2Ids) const {
1387 if (Node1Ids.size() < Node2Ids.size())
1388 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
1389 else
1390 return intersectAllocTypesImpl(Node2Ids, Node1Ids);
1391}
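// Illustrative sketch (not part of the pass): intersectAllocTypes above always
// iterates the smaller id set and probes the larger one, bounding the loop by
// min(|A|, |B|). The same idea with std::unordered_set standing in for
// DenseSet:
#include <cstddef>
#include <cstdint>
#include <unordered_set>
namespace memprof_example {
[[maybe_unused]] static std::size_t
countCommonIds(const std::unordered_set<uint32_t> &A,
               const std::unordered_set<uint32_t> &B) {
  const auto &Small = A.size() < B.size() ? A : B;
  const auto &Large = A.size() < B.size() ? B : A;
  std::size_t Common = 0;
  for (uint32_t Id : Small)
    Common += Large.count(Id);
  return Common;
}
} // namespace memprof_example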
1392
1393template <typename DerivedCCG, typename FuncTy, typename CallTy>
1394typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1395CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
1396 CallInfo Call, const FuncTy *F) {
1397 assert(!getNodeForAlloc(Call));
1398 ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, Call);
1399 AllocationCallToContextNodeMap[Call] = AllocNode;
1400 // Use LastContextId as a unique id for MIB allocation nodes.
1401 AllocNode->OrigStackOrAllocId = LastContextId;
1402 // Alloc type should be updated as we add in the MIBs. We should assert
1403 // afterwards that it is not still None.
1404 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1405
1406 return AllocNode;
1407}
1408
1409static std::string getAllocTypeString(uint8_t AllocTypes) {
1410 if (!AllocTypes)
1411 return "None";
1412 std::string Str;
1413 if (AllocTypes & (uint8_t)AllocationType::NotCold)
1414 Str += "NotCold";
1415 if (AllocTypes & (uint8_t)AllocationType::Cold)
1416 Str += "Cold";
1417 return Str;
1418}
1419
1420template <typename DerivedCCG, typename FuncTy, typename CallTy>
1421template <class NodeT, class IteratorT>
1422void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
1423 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
1424 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
1425 ArrayRef<ContextTotalSize> ContextSizeInfo,
1426 std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold) {
1427 // Treat the hot alloc type as NotCold before the disambiguation for "hot"
1428 // is done.
1429 if (AllocType == AllocationType::Hot)
1430 AllocType = AllocationType::NotCold;
1431
1432 ContextIdToAllocationType[++LastContextId] = AllocType;
1433
1434 bool IsImportant = false;
1435 if (!ContextSizeInfo.empty()) {
1436 auto &Entry = ContextIdToContextSizeInfos[LastContextId];
1438 // If this is a cold allocation, and we are collecting a non-zero number
1439 // of the largest cold contexts, see if this one is a candidate.
1439 if (AllocType == AllocationType::Cold && MemProfTopNImportant > 0) {
1440 uint64_t TotalCold = 0;
1441 for (auto &CSI : ContextSizeInfo)
1442 TotalCold += CSI.TotalSize;
1444 // Record this context if either we haven't yet found the top-n largest
1445 // contexts, or if it is larger than the smallest already recorded.
1445 if (TotalSizeToContextIdTopNCold.size() < MemProfTopNImportant ||
1446 // Since TotalSizeToContextIdTopNCold is a std::map, it is implicitly
1447 // sorted in ascending size of its key which is the size.
1448 TotalCold > TotalSizeToContextIdTopNCold.begin()->first) {
1449 if (TotalSizeToContextIdTopNCold.size() == MemProfTopNImportant) {
1450 // Remove old one and its associated entries.
1451 auto IdToRemove = TotalSizeToContextIdTopNCold.begin()->second;
1452 TotalSizeToContextIdTopNCold.erase(
1453 TotalSizeToContextIdTopNCold.begin());
1454 assert(ImportantContextIdInfo.count(IdToRemove));
1455 ImportantContextIdInfo.erase(IdToRemove);
1456 }
1457 TotalSizeToContextIdTopNCold[TotalCold] = LastContextId;
1458 IsImportant = true;
1459 }
1460 }
1461 Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
1462 }
1463
1464 // Update alloc type and context ids for this MIB.
1465 AllocNode->AllocTypes |= (uint8_t)AllocType;
1466
1467 // Now add or update nodes for each stack id in alloc's context.
1468 // Later when processing the stack ids on non-alloc callsites we will adjust
1469 // for any inlining in the context.
1470 ContextNode *PrevNode = AllocNode;
1471 // Look for recursion (direct recursion should have been collapsed by
1472 // module summary analysis; here we should just be detecting mutual
1473 // recursion). Mark these nodes so we don't try to clone.
1474 SmallSet<uint64_t, 8> StackIdSet;
1475 // Skip any on the allocation call (inlining).
1476 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
1477 ContextIter != StackContext.end(); ++ContextIter) {
1478 auto StackId = getStackId(*ContextIter);
1479 if (IsImportant)
1480 ImportantContextIdInfo[LastContextId].StackIds.push_back(StackId);
1481 ContextNode *StackNode = getNodeForStackId(StackId);
1482 if (!StackNode) {
1483 StackNode = createNewNode(/*IsAllocation=*/false);
1484 StackEntryIdToContextNodeMap[StackId] = StackNode;
1485 StackNode->OrigStackOrAllocId = StackId;
1486 }
1487 // Marking a node recursive will prevent its cloning completely, even for
1488 // non-recursive contexts flowing through it.
1489 if (!AllowRecursiveCallsites) {
1490 auto Ins = StackIdSet.insert(StackId);
1491 if (!Ins.second)
1492 StackNode->Recursive = true;
1493 }
1494 StackNode->AllocTypes |= (uint8_t)AllocType;
1495 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1496 PrevNode = StackNode;
1497 }
1498}
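// Illustrative sketch (not part of the pass): the top-N bookkeeping above
// keeps the N largest cold contexts in a std::map keyed by total size, so
// begin() is always the smallest recorded entry and is the one evicted when a
// larger candidate arrives. The helper below mirrors that logic with stand-in
// names.
#include <cstddef>
#include <cstdint>
#include <map>
namespace memprof_example {
[[maybe_unused]] static void recordIfTopN(std::map<uint64_t, uint32_t> &TopN,
                                          uint64_t TotalSize,
                                          uint32_t ContextId, std::size_t N) {
  if (N == 0)
    return;
  // Record if we haven't filled N entries yet, or if this candidate is larger
  // than the smallest entry currently recorded.
  if (TopN.size() < N || TotalSize > TopN.begin()->first) {
    if (TopN.size() == N)
      TopN.erase(TopN.begin()); // Evict the smallest.
    TopN[TotalSize] = ContextId;
  }
}
} // namespace memprof_example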
1499
1500template <typename DerivedCCG, typename FuncTy, typename CallTy>
1501DenseSet<uint32_t>
1502CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1503 const DenseSet<uint32_t> &StackSequenceContextIds,
1504 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1505 DenseSet<uint32_t> NewContextIds;
1506 for (auto OldId : StackSequenceContextIds) {
1507 NewContextIds.insert(++LastContextId);
1508 OldToNewContextIds[OldId].insert(LastContextId);
1509 assert(ContextIdToAllocationType.count(OldId));
1510 // The new context has the same allocation type and size info as original.
1511 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1512 auto CSI = ContextIdToContextSizeInfos.find(OldId);
1513 if (CSI != ContextIdToContextSizeInfos.end())
1514 ContextIdToContextSizeInfos[LastContextId] = CSI->second;
1515 if (DotAllocContextIds.contains(OldId))
1516 DotAllocContextIds.insert(LastContextId);
1517 }
1518 return NewContextIds;
1519}
1520
1521template <typename DerivedCCG, typename FuncTy, typename CallTy>
1522void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1523 propagateDuplicateContextIds(
1524 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1525 // Build a set of duplicated context ids corresponding to the input id set.
1526 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1527 DenseSet<uint32_t> NewIds;
1528 for (auto Id : ContextIds)
1529 if (auto NewId = OldToNewContextIds.find(Id);
1530 NewId != OldToNewContextIds.end())
1531 NewIds.insert_range(NewId->second);
1532 return NewIds;
1533 };
1534
1535 // Recursively update context ids sets along caller edges.
1536 auto UpdateCallers = [&](ContextNode *Node,
1537 DenseSet<const ContextEdge *> &Visited,
1538 auto &&UpdateCallers) -> void {
1539 for (const auto &Edge : Node->CallerEdges) {
1540 auto Inserted = Visited.insert(Edge.get());
1541 if (!Inserted.second)
1542 continue;
1543 ContextNode *NextNode = Edge->Caller;
1544 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1545 // Only need to recursively iterate to NextNode via this caller edge if
1546 // it resulted in any added ids to NextNode.
1547 if (!NewIdsToAdd.empty()) {
1548 Edge->getContextIds().insert_range(NewIdsToAdd);
1549 UpdateCallers(NextNode, Visited, UpdateCallers);
1550 }
1551 }
1552 };
1553
1554 DenseSet<const ContextEdge *> Visited;
1555 for (auto &Entry : AllocationCallToContextNodeMap) {
1556 auto *Node = Entry.second;
1557 UpdateCallers(Node, Visited, UpdateCallers);
1558 }
1559}
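// Illustrative sketch (not part of the pass): propagateDuplicateContextIds
// above uses the recursive-lambda idiom, passing the lambda to itself as a
// trailing auto&& parameter so it can recurse without std::function. A minimal
// version of that idiom over a stand-in adjacency list:
#include <cstddef>
#include <unordered_set>
#include <vector>
namespace memprof_example {
[[maybe_unused]] static std::size_t
countReachable(const std::vector<std::vector<std::size_t>> &Adj,
               std::size_t Start) {
  std::unordered_set<std::size_t> Visited;
  auto Visit = [&](std::size_t N, auto &&Self) -> void {
    if (!Visited.insert(N).second)
      return;
    for (std::size_t Next : Adj[N])
      Self(Next, Self);
  };
  Visit(Start, Visit);
  return Visited.size();
}
} // namespace memprof_example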
1560
1561template <typename DerivedCCG, typename FuncTy, typename CallTy>
1562void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1563 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
1564 // This must be passed by value to make a copy since it will be adjusted
1565 // as ids are moved.
1566 DenseSet<uint32_t> RemainingContextIds) {
1567 auto &OrigEdges =
1568 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1569 DenseSet<uint32_t> RecursiveContextIds;
1570 DenseSet<uint32_t> AllCallerContextIds;
1571 if (AllowRecursiveCallsites) {
1572 // Identify which context ids are recursive, which is needed to properly
1573 // update the RemainingContextIds set. The relevant recursive context ids
1574 // are those that are in multiple edges.
1575 for (auto &CE : OrigEdges) {
1576 AllCallerContextIds.reserve(CE->getContextIds().size());
1577 for (auto Id : CE->getContextIds())
1578 if (!AllCallerContextIds.insert(Id).second)
1579 RecursiveContextIds.insert(Id);
1580 }
1581 }
1582 // Increment iterator in loop so that we can remove edges as needed.
1583 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1584 auto Edge = *EI;
1585 DenseSet<uint32_t> NewEdgeContextIds;
1586 DenseSet<uint32_t> NotFoundContextIds;
1587 // Remove any matching context ids from Edge, returning the set that were
1588 // found and removed; these are the new edge's context ids. Also update the
1589 // remaining (not found) ids.
1590 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1591 NotFoundContextIds);
1592 // Update the remaining context ids set for the later edges. This is a
1593 // compile time optimization.
1594 if (RecursiveContextIds.empty()) {
1595 // No recursive ids, so all of the previously remaining context ids that
1596 // were not seen on this edge are the new remaining set.
1597 RemainingContextIds.swap(NotFoundContextIds);
1598 } else {
1599 // Keep the recursive ids in the remaining set as we expect to see those
1600 // on another edge. We can remove the non-recursive remaining ids that
1601 // were seen on this edge, however. We already have the set of remaining
1602 // ids that were on this edge (in NewEdgeContextIds). Figure out which are
1603 // non-recursive and only remove those. Note that despite the higher
1604 // overhead of updating the remaining context ids set when recursion
1605 // handling is enabled, it was found to be at worst performance neutral
1606 // and in one case a clear win.
1607 DenseSet<uint32_t> NonRecursiveRemainingCurEdgeIds =
1608 set_difference(NewEdgeContextIds, RecursiveContextIds);
1609 set_subtract(RemainingContextIds, NonRecursiveRemainingCurEdgeIds);
1610 }
1611 // If no matching context ids for this edge, skip it.
1612 if (NewEdgeContextIds.empty()) {
1613 ++EI;
1614 continue;
1615 }
1616 if (TowardsCallee) {
1617 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1618 auto NewEdge = std::make_shared<ContextEdge>(
1619 Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
1620 NewNode->CalleeEdges.push_back(NewEdge);
1621 NewEdge->Callee->CallerEdges.push_back(NewEdge);
1622 } else {
1623 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1624 auto NewEdge = std::make_shared<ContextEdge>(
1625 NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
1626 NewNode->CallerEdges.push_back(NewEdge);
1627 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1628 }
1629 // Remove old edge if context ids empty.
1630 if (Edge->getContextIds().empty()) {
1631 removeEdgeFromGraph(Edge.get(), &EI, TowardsCallee);
1632 continue;
1633 }
1634 ++EI;
1635 }
1636}
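// Illustrative sketch (not part of the pass): the four-argument set_subtract
// call above partitions one id set against another. The helper below shows the
// same effect with std::set and is not the llvm::set_subtract signature: ids
// in ToRemove that are present in Src are erased from Src and collected in
// Found, while the rest of ToRemove lands in NotFound.
#include <cstdint>
#include <set>
namespace memprof_example {
[[maybe_unused]] static void
subtractAndPartition(std::set<uint32_t> &Src,
                     const std::set<uint32_t> &ToRemove,
                     std::set<uint32_t> &Found, std::set<uint32_t> &NotFound) {
  for (uint32_t Id : ToRemove) {
    if (Src.erase(Id))
      Found.insert(Id);
    else
      NotFound.insert(Id);
  }
}
} // namespace memprof_example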
1637
1638template <typename DerivedCCG, typename FuncTy, typename CallTy>
1639static void checkEdge(
1640 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1641 // Confirm that alloc type is not None and that we have at least one context
1642 // id.
1643 assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1644 assert(!Edge->ContextIds.empty());
1645}
1646
1647template <typename DerivedCCG, typename FuncTy, typename CallTy>
1648static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
1649 bool CheckEdges = true) {
1650 if (Node->isRemoved())
1651 return;
1652#ifndef NDEBUG
1653 // Compute node's context ids once for use in asserts.
1654 auto NodeContextIds = Node->getContextIds();
1655#endif
1656 // Node's context ids should be the union of both its callee and caller edge
1657 // context ids.
1658 if (Node->CallerEdges.size()) {
1659 DenseSet<uint32_t> CallerEdgeContextIds(
1660 Node->CallerEdges.front()->ContextIds);
1661 for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
1662 if (CheckEdges)
1663 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1664 set_union(CallerEdgeContextIds, Edge->ContextIds);
1665 }
1666 // Node can have more context ids than callers if some contexts terminate at
1667 // node and some are longer. If we are allowing recursive callsites and
1668 // contexts this will be violated for incompletely cloned recursive cycles,
1669 // so skip the checking in that case.
1670 assert(AllowRecursiveCallsites || AllowRecursiveContexts ||
1671 NodeContextIds == CallerEdgeContextIds ||
1672 set_is_subset(CallerEdgeContextIds, NodeContextIds));
1673 }
1674 if (Node->CalleeEdges.size()) {
1675 DenseSet<uint32_t> CalleeEdgeContextIds(
1676 Node->CalleeEdges.front()->ContextIds);
1677 for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
1678 if (CheckEdges)
1679 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1680 set_union(CalleeEdgeContextIds, Edge->getContextIds());
1681 }
1682 // If we are allowing recursive callsites and contexts this will be violated
1683 // for incompletely cloned recursive cycles, so skip the checking in that
1684 // case.
1685 assert(AllowRecursiveCallsites || AllowRecursiveContexts ||
1686 NodeContextIds == CalleeEdgeContextIds);
1687 }
1688 // FIXME: Since this checking is only invoked under an option, we should
1689 // change the error checking from using assert to something that will trigger
1690 // an error on a release build.
1691#ifndef NDEBUG
1692 // Make sure we don't end up with duplicate edges between the same caller and
1693 // callee.
1694 DenseSet<ContextNode<DerivedCCG, FuncTy, CallTy> *> NodeSet;
1695 for (const auto &E : Node->CalleeEdges)
1696 NodeSet.insert(E->Callee);
1697 assert(NodeSet.size() == Node->CalleeEdges.size());
1698#endif
1699}
1700
1701template <typename DerivedCCG, typename FuncTy, typename CallTy>
1702void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1703 assignStackNodesPostOrder(ContextNode *Node,
1704 DenseSet<const ContextNode *> &Visited,
1705 DenseMap<uint64_t, std::vector<CallContextInfo>>
1706 &StackIdToMatchingCalls,
1707 DenseMap<CallInfo, CallInfo> &CallToMatchingCall,
1708 const DenseSet<uint32_t> &ImportantContextIds) {
1709 auto Inserted = Visited.insert(Node);
1710 if (!Inserted.second)
1711 return;
1712 // Post order traversal. Iterate over a copy since we may add nodes and
1713 // therefore new callers during the recursive call, invalidating any
1714 // iterator over the original edge vector. We don't need to process these
1715 // new nodes as they were already processed on creation.
1716 auto CallerEdges = Node->CallerEdges;
1717 for (auto &Edge : CallerEdges) {
1718 // Skip any that have been removed during the recursion.
1719 if (Edge->isRemoved()) {
1720 assert(!is_contained(Node->CallerEdges, Edge));
1721 continue;
1722 }
1723 assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
1724 CallToMatchingCall, ImportantContextIds);
1725 }
1726
1727 // If this node's stack id is in the map, update the graph to contain new
1728 // nodes representing any inlining at interior callsites. Note we move the
1729 // associated context ids over to the new nodes.
1730
1731 // Ignore this node if it is for an allocation or we didn't record any
1732 // stack id lists ending at it.
1733 if (Node->IsAllocation ||
1734 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1735 return;
1736
1737 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1738 // Handle the simple case first. A single call with a single stack id.
1739 // In this case there is no need to create any new context nodes, simply
1740 // assign the context node for stack id to this Call.
1741 if (Calls.size() == 1) {
1742 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1743 if (Ids.size() == 1) {
1744 assert(SavedContextIds.empty());
1745 // It should be this Node
1746 assert(Node == getNodeForStackId(Ids[0]));
1747 if (Node->Recursive)
1748 return;
1749 Node->setCall(Call);
1750 NonAllocationCallToContextNodeMap[Call] = Node;
1751 NodeToCallingFunc[Node] = Func;
1752 recordStackNode(Ids, Node, Node->getContextIds(), ImportantContextIds);
1753 return;
1754 }
1755 }
1756
1757#ifndef NDEBUG
1758 // Find the node for the last stack id, which should be the same
1759 // across all calls recorded for this id, and is this node's id.
1760 uint64_t LastId = Node->OrigStackOrAllocId;
1761 ContextNode *LastNode = getNodeForStackId(LastId);
1762 // We should only have kept stack ids that had nodes.
1763 assert(LastNode);
1764 assert(LastNode == Node);
1765#else
1766 ContextNode *LastNode = Node;
1767#endif
1768
1769 // Compute the last node's context ids once, as it is shared by all calls in
1770 // this entry.
1771 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1772
1773 [[maybe_unused]] bool PrevIterCreatedNode = false;
1774 bool CreatedNode = false;
1775 for (unsigned I = 0; I < Calls.size();
1776 I++, PrevIterCreatedNode = CreatedNode) {
1777 CreatedNode = false;
1778 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1779 // Skip any for which we didn't assign any ids, these don't get a node in
1780 // the graph.
1781 if (SavedContextIds.empty()) {
1782 // If this call has a matching call (located in the same function and
1783 // having the same stack ids), simply add it to the context node created
1784 // for its matching call earlier. These can be treated the same through
1785 // cloning and get updated at the same time.
1786 if (!CallToMatchingCall.contains(Call))
1787 continue;
1788 auto MatchingCall = CallToMatchingCall[Call];
1789 if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
1790 // This should only happen if we had a prior iteration, and it didn't
1791 // create a node because of the below recomputation of context ids
1792 // finding none remaining and continuing early.
1793 assert(I > 0 && !PrevIterCreatedNode);
1794 continue;
1795 }
1796 NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
1797 Call);
1798 continue;
1799 }
1800
1801 assert(LastId == Ids.back());
1802
1803 // Recompute the context ids for this stack id sequence (the
1804 // intersection of the context ids of the corresponding nodes).
1805 // Start with the ids we saved in the map for this call, which could be
1806 // duplicated context ids. We have to recompute as we might have overlap
1807 // between the saved context ids for different last nodes, and
1808 // removed them already during the post order traversal.
1809 set_intersect(SavedContextIds, LastNodeContextIds);
1810 ContextNode *PrevNode = LastNode;
1811 bool Skip = false;
1812 // Iterate backwards through the stack Ids, starting after the last Id
1813 // in the list, which was handled once outside for all Calls.
1814 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1815 auto Id = *IdIter;
1816 ContextNode *CurNode = getNodeForStackId(Id);
1817 // We should only have kept stack ids that had nodes and weren't
1818 // recursive.
1819 assert(CurNode);
1820 assert(!CurNode->Recursive);
1821
1822 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1823 if (!Edge) {
1824 Skip = true;
1825 break;
1826 }
1827 PrevNode = CurNode;
1828
1829 // Update the context ids, which is the intersection of the ids along
1830 // all edges in the sequence.
1831 set_intersect(SavedContextIds, Edge->getContextIds());
1832
1833 // If we now have no context ids for clone, skip this call.
1834 if (SavedContextIds.empty()) {
1835 Skip = true;
1836 break;
1837 }
1838 }
1839 if (Skip)
1840 continue;
1841
1842 // Create new context node.
1843 ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, Func, Call);
1844 NonAllocationCallToContextNodeMap[Call] = NewNode;
1845 CreatedNode = true;
1846 NewNode->AllocTypes = computeAllocType(SavedContextIds);
1847
1848 ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1849 assert(FirstNode);
1850
1851 // Connect to callees of innermost stack frame in inlined call chain.
1852 // This updates context ids for FirstNode's callee's to reflect those
1853 // moved to NewNode.
1854 connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);
1855
1856 // Connect to callers of outermost stack frame in inlined call chain.
1857 // This updates context ids for FirstNode's caller's to reflect those
1858 // moved to NewNode.
1859 connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
1860
1861 // Now we need to remove context ids from edges/nodes between First and
1862 // Last Node.
1863 PrevNode = nullptr;
1864 for (auto Id : Ids) {
1865 ContextNode *CurNode = getNodeForStackId(Id);
1866 // We should only have kept stack ids that had nodes.
1867 assert(CurNode);
1868
1869 // Remove the context ids moved to NewNode from CurNode, and the
1870 // edge from the prior node.
1871 if (PrevNode) {
1872 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1873 // If the sequence contained recursion, we might have already removed
1874 // some edges during the connectNewNode calls above.
1875 if (!PrevEdge) {
1876 PrevNode = CurNode;
1877 continue;
1878 }
1879 set_subtract(PrevEdge->getContextIds(), SavedContextIds);
1880 if (PrevEdge->getContextIds().empty())
1881 removeEdgeFromGraph(PrevEdge);
1882 }
1883 // Since we update the edges from leaf to tail, only look at the callee
1884 // edges. This isn't an alloc node, so if there are no callee edges, the
1885 // alloc type is None.
1886 CurNode->AllocTypes = CurNode->CalleeEdges.empty()
1887 ? (uint8_t)AllocationType::None
1888 : CurNode->computeAllocType();
1889 PrevNode = CurNode;
1890 }
1891
1892 recordStackNode(Ids, NewNode, SavedContextIds, ImportantContextIds);
1893
1894 if (VerifyNodes) {
1895 checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
1896 for (auto Id : Ids) {
1897 ContextNode *CurNode = getNodeForStackId(Id);
1898 // We should only have kept stack ids that had nodes.
1899 assert(CurNode);
1900 checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
1901 }
1902 }
1903 }
1904}
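// Illustrative sketch (not part of the pass): the post-order walk above
// iterates over a copy of the caller edge list because the recursive call may
// create new caller edges on this node, invalidating iterators into the
// original vector. The same pattern on a stand-in node type:
#include <unordered_set>
#include <vector>
namespace memprof_example {
struct ExNode {
  std::vector<ExNode *> Callers;
};
[[maybe_unused]] static void
visitCallersPostOrder(ExNode *Node, std::unordered_set<ExNode *> &Visited,
                      std::vector<ExNode *> &PostOrder) {
  if (!Visited.insert(Node).second)
    return;
  // Copy first: processing a caller may append to Node->Callers, which would
  // invalidate iterators into the original vector.
  auto Callers = Node->Callers;
  for (ExNode *Caller : Callers)
    visitCallersPostOrder(Caller, Visited, PostOrder);
  PostOrder.push_back(Node);
}
} // namespace memprof_example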
1905
1906template <typename DerivedCCG, typename FuncTy, typename CallTy>
1907void CallsiteContextGraph<DerivedCCG, FuncTy,
1908 CallTy>::fixupImportantContexts() {
1909 if (ImportantContextIdInfo.empty())
1910 return;
1911
1912 // Update statistics as we are done building this map at this point.
1913 NumImportantContextIds = ImportantContextIdInfo.size();
1914
1916 return;
1917
1918 if (ExportToDot)
1919 exportToDot("beforestackfixup");
1920
1921 // For each context we identified as important, walk through the saved context
1922 // stack ids in order from leaf upwards, and make sure all edges are correct.
1923 // These can be difficult to get right when updating the graph while mapping
1924 // nodes onto summary or IR, especially when there is recursion. In
1925 // particular, when we have created new nodes to reflect inlining, it is
1926 // sometimes impossible to know exactly how to update the edges in the face of
1927 // recursion, as we have lost the original ordering of the stack ids in the
1928 // contexts.
1929 // TODO: Consider only doing this if we detect the context has recursive
1930 // cycles.
1931 //
1932 // I.e. assume we have a context with stack ids like: {A B A C A D E}
1933 // and let's say A was inlined into B, C, and D. The original graph will have
1934 // multiple recursive cycles through A. When we match the original context
1935 // nodes onto the IR or summary, we will merge {A B} into one context node,
1936 // {A C} onto another, and {A D} onto another. Looking at the stack sequence
1937 // above, we should end up with a non-cyclic set of edges like:
1938 // {AB} <- {AC} <- {AD} <- E. However, because we normally have lost the
1939 // original ordering, we won't get the edges correct initially (it's
1940 // impossible without the original ordering). Here we do the fixup (add and
1941 // removing edges where necessary) for this context. In the
1942 // ImportantContextInfo struct in this case we should have a MaxLength = 2,
1943 // and map entries for {A B}, {A C}, {A D}, and {E}.
1944 for (auto &[CurContextId, Info] : ImportantContextIdInfo) {
1945 if (Info.StackIdsToNode.empty())
1946 continue;
1947 bool Changed = false;
1948 ContextNode *PrevNode = nullptr;
1949 ContextNode *CurNode = nullptr;
1950 DenseSet<const ContextEdge *> VisitedEdges;
1951 ArrayRef<uint64_t> AllStackIds(Info.StackIds);
1952 // Try to identify what callsite ContextNode maps to which slice of the
1953 // context's ordered stack ids.
1954 for (unsigned I = 0; I < AllStackIds.size(); I++, PrevNode = CurNode) {
1955 // We will do this greedily, trying up to MaxLength stack ids in a row, to
1956 // see if we recorded a context node for that sequence.
1957 auto Len = Info.MaxLength;
1958 auto LenToEnd = AllStackIds.size() - I;
1959 if (Len > LenToEnd)
1960 Len = LenToEnd;
1961 CurNode = nullptr;
1962 // Try to find a recorded context node starting with the longest length
1963 // recorded, and on down until we check for just a single stack node.
1964 for (; Len > 0; Len--) {
1965 // Get the slice of the original stack id sequence to check.
1966 auto CheckStackIds = AllStackIds.slice(I, Len);
1967 auto EntryIt = Info.StackIdsToNode.find(CheckStackIds);
1968 if (EntryIt == Info.StackIdsToNode.end())
1969 continue;
1970 CurNode = EntryIt->second;
1971 // Skip forward so we don't try to look for the ones we just matched.
1972 // We increment by Len - 1, because the outer for loop will increment I.
1973 I += Len - 1;
1974 break;
1975 }
1976 // Give up if we couldn't find a node. Since we need to clone from the
1977 // leaf allocation upwards, no sense in doing any more fixup further up
1978 // the context if we couldn't match part of the original stack context
1979 // onto a callsite node.
1980 if (!CurNode)
1981 break;
1982 // No edges to fix up until we have a pair of nodes that should be
1983 // adjacent in the graph.
1984 if (!PrevNode)
1985 continue;
1986 // See if we already have a call edge from CurNode to PrevNode.
1987 auto *CurEdge = PrevNode->findEdgeFromCaller(CurNode);
1988 if (CurEdge) {
1989 // We already have an edge. Make sure it contains this context id.
1990 if (CurEdge->getContextIds().insert(CurContextId).second) {
1991 NumFixupEdgeIdsInserted++;
1992 Changed = true;
1993 }
1994 } else {
1995 // No edge exists - add one.
1996 NumFixupEdgesAdded++;
1997 DenseSet<uint32_t> ContextIds({CurContextId});
1998 auto AllocType = computeAllocType(ContextIds);
1999 auto NewEdge = std::make_shared<ContextEdge>(
2000 PrevNode, CurNode, AllocType, std::move(ContextIds));
2001 PrevNode->CallerEdges.push_back(NewEdge);
2002 CurNode->CalleeEdges.push_back(NewEdge);
2003 // Save the new edge for the below handling.
2004 CurEdge = NewEdge.get();
2005 Changed = true;
2006 }
2007 VisitedEdges.insert(CurEdge);
2008 // Now remove this context id from any other caller edges calling
2009 // PrevNode.
2010 for (auto &Edge : PrevNode->CallerEdges) {
2011 // Skip the edge updating/created above and edges we have already
2012 // visited (due to recursion).
2013 if (Edge.get() != CurEdge && !VisitedEdges.contains(Edge.get()))
2014 Edge->getContextIds().erase(CurContextId);
2015 }
2016 }
2017 if (Changed)
2018 NumFixedContexts++;
2019 }
2020}
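// Illustrative sketch (not part of the pass): the fixup loop above greedily
// matches the context's ordered stack ids against the recorded id sequences,
// trying the longest possible slice at each position before falling back to
// shorter ones, and gives up at the first position with no match. The same
// matching scheme with stand-in std containers (node handles are ints here):
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <map>
#include <vector>
namespace memprof_example {
[[maybe_unused]] static std::vector<int>
greedyMatchSlices(const std::vector<uint64_t> &StackIds,
                  const std::map<std::vector<uint64_t>, int> &SeqToNode,
                  std::size_t MaxLength) {
  std::vector<int> MatchedNodes;
  for (std::size_t I = 0; I < StackIds.size();) {
    std::size_t Len = std::min(MaxLength, StackIds.size() - I);
    int Node = -1;
    // Try the longest recorded sequence first, then shorter ones.
    for (; Len > 0; --Len) {
      std::vector<uint64_t> Slice(StackIds.begin() + I,
                                  StackIds.begin() + I + Len);
      auto It = SeqToNode.find(Slice);
      if (It != SeqToNode.end()) {
        Node = It->second;
        break;
      }
    }
    if (Node == -1)
      break; // No callsite node matches here; stop fixing up this context.
    MatchedNodes.push_back(Node);
    I += Len;
  }
  return MatchedNodes;
}
} // namespace memprof_example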
2021
2022template <typename DerivedCCG, typename FuncTy, typename CallTy>
2023void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
2024 // Map of stack id to all calls with that as the last (outermost caller)
2025 // callsite id that has a context node (some might not due to pruning
2026 // performed during matching of the allocation profile contexts).
2027 // The CallContextInfo contains the Call and a list of its stack ids with
2028 // ContextNodes, the function containing Call, and the set of context ids
2029 // the analysis will eventually identify for use in any new node created
2030 // for that callsite.
2031 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
2032 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
2033 for (auto &Call : CallsWithMetadata) {
2034 // Ignore allocations, already handled.
2035 if (AllocationCallToContextNodeMap.count(Call))
2036 continue;
2037 auto StackIdsWithContextNodes =
2038 getStackIdsWithContextNodesForCall(Call.call());
2039 // If there were no nodes created for MIBs on allocs (maybe this was in
2040 // the unambiguous part of the MIB stack that was pruned), ignore.
2041 if (StackIdsWithContextNodes.empty())
2042 continue;
2043 // Otherwise, record this Call along with the list of ids for the last
2044 // (outermost caller) stack id with a node.
2045 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
2046 {Call.call(), StackIdsWithContextNodes, Func, {}});
2047 }
2048 }
2049
2050 // First make a pass through all stack ids that correspond to a call,
2051 // as identified in the above loop. Compute the context ids corresponding to
2052 // each of these calls when they correspond to multiple stack ids due to
2053 // inlining. Perform any duplication of context ids required when
2054 // there is more than one call with the same stack ids. Their (possibly newly
2055 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
2056 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
2057 // Save a map from each call to any that are found to match it. I.e. located
2058 // in the same function and have the same (possibly pruned) stack ids. We use
2059 // this to avoid creating extra graph nodes as they can be treated the same.
2060 DenseMap<CallInfo, CallInfo> CallToMatchingCall;
2061 for (auto &It : StackIdToMatchingCalls) {
2062 auto &Calls = It.getSecond();
2063 // Skip single calls with a single stack id. These don't need a new node.
2064 if (Calls.size() == 1) {
2065 auto &Ids = Calls[0].StackIds;
2066 if (Ids.size() == 1)
2067 continue;
2068 }
2069 // In order to do the best and maximal matching of inlined calls to context
2070 // node sequences we will sort the vectors of stack ids in descending order
2071 // of length, and within each length, lexicographically by stack id. The
2072 // latter is so that we can specially handle calls that have identical stack
2073 // id sequences (either due to cloning or artificially because of the MIB
2074 // context pruning). Those with the same Ids are then sorted by function to
2075 // facilitate efficiently mapping them to the same context node.
2076 // Because the functions are pointers, to ensure a stable sort first assign
2077 // each function pointer to its first index in the Calls array, and then use
2078 // that to sort by.
2079 DenseMap<const FuncTy *, unsigned> FuncToIndex;
2080 for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
2081 FuncToIndex.insert({CallCtxInfo.Func, Idx});
2082 llvm::stable_sort(
2083 Calls,
2084 [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
2085 return A.StackIds.size() > B.StackIds.size() ||
2086 (A.StackIds.size() == B.StackIds.size() &&
2087 (A.StackIds < B.StackIds ||
2088 (A.StackIds == B.StackIds &&
2089 FuncToIndex[A.Func] < FuncToIndex[B.Func])));
2090 });
2091
2092 // Find the node for the last stack id, which should be the same
2093 // across all calls recorded for this id, and is the id for this
2094 // entry in the StackIdToMatchingCalls map.
2095 uint64_t LastId = It.getFirst();
2096 ContextNode *LastNode = getNodeForStackId(LastId);
2097 // We should only have kept stack ids that had nodes.
2098 assert(LastNode);
2099
2100 if (LastNode->Recursive)
2101 continue;
2102
2103 // Initialize the context ids with the last node's. We will subsequently
2104 // refine the context ids by computing the intersection along all edges.
2105 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
2106 assert(!LastNodeContextIds.empty());
2107
2108#ifndef NDEBUG
2109 // Save the set of functions seen for a particular set of the same stack
2110 // ids. This is used to ensure that they have been correctly sorted to be
2111 // adjacent in the Calls list, since we rely on that to efficiently place
2112 // all such matching calls onto the same context node.
2113 DenseSet<const FuncTy *> MatchingIdsFuncSet;
2114#endif
2115
2116 for (unsigned I = 0; I < Calls.size(); I++) {
2117 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
2118 assert(SavedContextIds.empty());
2119 assert(LastId == Ids.back());
2120
2121#ifndef NDEBUG
2122 // If this call has a different set of ids than the last one, clear the
2123 // set used to ensure they are sorted properly.
2124 if (I > 0 && Ids != Calls[I - 1].StackIds)
2125 MatchingIdsFuncSet.clear();
2126#endif
2127
2128 // First compute the context ids for this stack id sequence (the
2129 // intersection of the context ids of the corresponding nodes).
2130 // Start with the remaining saved ids for the last node.
2131 assert(!LastNodeContextIds.empty());
2132 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
2133
2134 ContextNode *PrevNode = LastNode;
2135 ContextNode *CurNode = LastNode;
2136 bool Skip = false;
2137
2138 // Iterate backwards through the stack Ids, starting after the last Id
2139 // in the list, which was handled once outside for all Calls.
2140 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
2141 auto Id = *IdIter;
2142 CurNode = getNodeForStackId(Id);
2143 // We should only have kept stack ids that had nodes.
2144 assert(CurNode);
2145
2146 if (CurNode->Recursive) {
2147 Skip = true;
2148 break;
2149 }
2150
2151 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
2152 // If there is no edge then the nodes belong to different MIB contexts,
2153 // and we should skip this inlined context sequence. For example, this
2154 // particular inlined context may include stack ids A->B, and we may
2155 // indeed have nodes for both A and B, but it is possible that they were
2156 // never profiled in sequence in a single MIB for any allocation (i.e.
2157 // we might have profiled an allocation that involves the callsite A,
2158 // but through a different one of its callee callsites, and we might
2159 // have profiled an allocation that involves callsite B, but reached
2160 // from a different caller callsite).
2161 if (!Edge) {
2162 Skip = true;
2163 break;
2164 }
2165 PrevNode = CurNode;
2166
2167 // Update the context ids, which is the intersection of the ids along
2168 // all edges in the sequence.
2169 set_intersect(StackSequenceContextIds, Edge->getContextIds());
2170
2171 // If we now have no context ids for clone, skip this call.
2172 if (StackSequenceContextIds.empty()) {
2173 Skip = true;
2174 break;
2175 }
2176 }
2177 if (Skip)
2178 continue;
2179
2180 // If some of this call's stack ids did not have corresponding nodes (due
2181 // to pruning), don't include any context ids for contexts that extend
2182 // beyond these nodes. Otherwise we would be matching part of unrelated /
2183 // not fully matching stack contexts. To do this, subtract any context ids
2184 // found in caller nodes of the last node found above.
2185 if (Ids.back() != getLastStackId(Call)) {
2186 for (const auto &PE : LastNode->CallerEdges) {
2187 set_subtract(StackSequenceContextIds, PE->getContextIds());
2188 if (StackSequenceContextIds.empty())
2189 break;
2190 }
2191 // If we now have no context ids for clone, skip this call.
2192 if (StackSequenceContextIds.empty())
2193 continue;
2194 }
2195
2196#ifndef NDEBUG
2197 // If the prior call had the same stack ids this set would not be empty.
2198 // Check if we already have a call that "matches" because it is located
2199 // in the same function. If the Calls list was sorted properly we should
2200 // not encounter this situation as all such entries should be adjacent
2201 // and processed in bulk further below.
2202 assert(!MatchingIdsFuncSet.contains(Func));
2203
2204 MatchingIdsFuncSet.insert(Func);
2205#endif
2206
2207 // Check if the next set of stack ids is the same (since the Calls vector
2208 // of tuples is sorted by the stack ids we can just look at the next one).
2209 // If so, save them in the CallToMatchingCall map so that they get
2210 // assigned to the same context node, and skip them.
2211 bool DuplicateContextIds = false;
2212 for (unsigned J = I + 1; J < Calls.size(); J++) {
2213 auto &CallCtxInfo = Calls[J];
2214 auto &NextIds = CallCtxInfo.StackIds;
2215 if (NextIds != Ids)
2216 break;
2217 auto *NextFunc = CallCtxInfo.Func;
2218 if (NextFunc != Func) {
2219 // We have another Call with the same ids but that cannot share this
2220 // node, must duplicate ids for it.
2221 DuplicateContextIds = true;
2222 break;
2223 }
2224 auto &NextCall = CallCtxInfo.Call;
2225 CallToMatchingCall[NextCall] = Call;
2226 // Update I so that it gets incremented correctly to skip this call.
2227 I = J;
2228 }
2229
2230 // If we don't have duplicate context ids, then we can assign all the
2231 // context ids computed for the original node sequence to this call.
2232 // If there are duplicate calls with the same stack ids then we synthesize
2233 // new context ids that are duplicates of the originals. These are
2234 // assigned to SavedContextIds, which is a reference into the map entry
2235 // for this call, allowing us to access these ids later on.
2236 OldToNewContextIds.reserve(OldToNewContextIds.size() +
2237 StackSequenceContextIds.size());
2238 SavedContextIds =
2239 DuplicateContextIds
2240 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
2241 : StackSequenceContextIds;
2242 assert(!SavedContextIds.empty());
2243
2244 if (!DuplicateContextIds) {
2245 // Update saved last node's context ids to remove those that are
2246 // assigned to other calls, so that it is ready for the next call at
2247 // this stack id.
2248 set_subtract(LastNodeContextIds, StackSequenceContextIds);
2249 if (LastNodeContextIds.empty())
2250 break;
2251 }
2252 }
2253 }
2254
2255 // Propagate the duplicate context ids over the graph.
2256 propagateDuplicateContextIds(OldToNewContextIds);
2257
2258 if (VerifyCCG)
2259 check();
2260
2261 // Now perform a post-order traversal over the graph, starting with the
2262 // allocation nodes, essentially processing nodes from callers to callees.
2263 // For any that contains an id in the map, update the graph to contain new
2264 // nodes representing any inlining at interior callsites. Note we move the
2265 // associated context ids over to the new nodes.
2266 DenseSet<const ContextNode *> Visited;
2267 DenseSet<uint32_t> ImportantContextIds(llvm::from_range,
2268 ImportantContextIdInfo.keys());
2269 for (auto &Entry : AllocationCallToContextNodeMap)
2270 assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
2271 CallToMatchingCall, ImportantContextIds);
2272
2273 fixupImportantContexts();
2274
2275 if (VerifyCCG)
2276 check();
2277}
2278
2279uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
2280 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2281 Call->getMetadata(LLVMContext::MD_callsite));
2282 return CallsiteContext.back();
2283}
2284
2285uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
2287 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2288 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
2289 // Need to convert index into stack id.
2290 return Index.getStackIdAtIndex(CallsiteContext.back());
2291}
2292
2293static const std::string MemProfCloneSuffix = ".memprof.";
2294
2295static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
2296 // We use CloneNo == 0 to refer to the original version, which doesn't get
2297 // renamed with a suffix.
2298 if (!CloneNo)
2299 return Base.str();
2300 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
2301}
2302
2303static bool isMemProfClone(const Function &F) {
2304 return F.getName().contains(MemProfCloneSuffix);
2305}
2306
2307// Return the clone number of the given function by extracting it from the
2308// memprof suffix. Assumes the caller has already confirmed it is a memprof
2309// clone.
2310static unsigned getMemProfCloneNum(const Function &F) {
2312 auto Pos = F.getName().find_last_of('.');
2313 assert(Pos > 0);
2314 unsigned CloneNo;
2315 bool Err = F.getName().drop_front(Pos + 1).getAsInteger(10, CloneNo);
2316 assert(!Err);
2317 (void)Err;
2318 return CloneNo;
2319}
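// Illustrative sketch (not part of the pass): round-tripping the clone naming
// scheme used above, where clone N > 0 of "foo" is named "foo.memprof.N" and
// clone 0 keeps the original name. std::string stands in for the
// Twine/StringRef-based helpers above.
#include <cassert>
#include <string>
namespace memprof_example {
[[maybe_unused]] static std::string makeCloneName(const std::string &Base,
                                                  unsigned CloneNo) {
  if (CloneNo == 0)
    return Base;
  return Base + ".memprof." + std::to_string(CloneNo);
}
[[maybe_unused]] static unsigned parseCloneNum(const std::string &Name) {
  // Assumes Name was produced by makeCloneName with CloneNo > 0.
  auto Pos = Name.find_last_of('.');
  assert(Pos != std::string::npos);
  return static_cast<unsigned>(std::stoul(Name.substr(Pos + 1)));
}
} // namespace memprof_example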
2320
2321std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
2322 const Instruction *Call,
2323 unsigned CloneNo) const {
2324 return (Twine(Call->getFunction()->getName()) + " -> " +
2325 cast<CallBase>(Call)->getCalledFunction()->getName())
2326 .str();
2327}
2328
2329std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
2330 const IndexCall &Call,
2331 unsigned CloneNo) const {
2332 auto VI = FSToVIMap.find(Func);
2333 assert(VI != FSToVIMap.end());
2334 std::string CallerName = getMemProfFuncName(VI->second.name(), CloneNo);
2335 if (isa<AllocInfo *>(Call))
2336 return CallerName + " -> alloc";
2337 else {
2338 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call);
2339 return CallerName + " -> " +
2340 getMemProfFuncName(Callsite->Callee.name(),
2341 Callsite->Clones[CloneNo]);
2342 }
2343}
2344
2345std::vector<uint64_t>
2346ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
2347 Instruction *Call) {
2348 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2349 Call->getMetadata(LLVMContext::MD_callsite));
2350 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
2351 CallsiteContext);
2352}
2353
2354std::vector<uint64_t>
2355IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
2357 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2358 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
2359 return getStackIdsWithContextNodes<CallsiteInfo,
2360 SmallVector<unsigned>::const_iterator>(
2361 CallsiteContext);
2362}
2363
2364template <typename DerivedCCG, typename FuncTy, typename CallTy>
2365template <class NodeT, class IteratorT>
2366std::vector<uint64_t>
2367CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
2368 CallStack<NodeT, IteratorT> &CallsiteContext) {
2369 std::vector<uint64_t> StackIds;
2370 for (auto IdOrIndex : CallsiteContext) {
2371 auto StackId = getStackId(IdOrIndex);
2372 ContextNode *Node = getNodeForStackId(StackId);
2373 if (!Node)
2374 break;
2375 StackIds.push_back(StackId);
2376 }
2377 return StackIds;
2378}
2379
2380ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
2381 Module &M,
2382 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
2383 : Mod(M), OREGetter(OREGetter) {
2384 // Map for keeping track of the largest cold contexts up to the number given
2385 // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys
2386 // must be sorted.
2387 std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold;
2388 for (auto &F : M) {
2389 std::vector<CallInfo> CallsWithMetadata;
2390 for (auto &BB : F) {
2391 for (auto &I : BB) {
2392 if (!isa<CallBase>(I))
2393 continue;
2394 if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
2395 CallsWithMetadata.push_back(&I);
2396 auto *AllocNode = addAllocNode(&I, &F);
2397 auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
2398 assert(CallsiteMD);
2399 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
2400 // Add all of the MIBs and their stack nodes.
2401 for (auto &MDOp : MemProfMD->operands()) {
2402 auto *MIBMD = cast<const MDNode>(MDOp);
2403 std::vector<ContextTotalSize> ContextSizeInfo;
2404 // Collect the context size information if it exists.
2405 if (MIBMD->getNumOperands() > 2) {
2406 for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
2407 MDNode *ContextSizePair =
2408 dyn_cast<MDNode>(MIBMD->getOperand(I));
2409 assert(ContextSizePair->getNumOperands() == 2);
2410 uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
2411 ContextSizePair->getOperand(0))
2412 ->getZExtValue();
2413 uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>(
2414 ContextSizePair->getOperand(1))
2415 ->getZExtValue();
2416 ContextSizeInfo.push_back({FullStackId, TotalSize});
2417 }
2418 }
2420 CallStack<MDNode, MDNode::op_iterator> StackContext(
2421 getMIBStackNode(MIBMD));
2422 addStackNodesForMIB<MDNode, MDNode::op_iterator>(
2423 AllocNode, StackContext, CallsiteContext,
2424 getMIBAllocType(MIBMD), ContextSizeInfo,
2425 TotalSizeToContextIdTopNCold);
2426 }
2427 // If exporting the graph to dot and an allocation id of interest was
2428 // specified, record all the context ids for this allocation node.
2429 if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2430 DotAllocContextIds = AllocNode->getContextIds();
2431 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2432 // Memprof and callsite metadata on memory allocations no longer
2433 // needed.
2434 I.setMetadata(LLVMContext::MD_memprof, nullptr);
2435 I.setMetadata(LLVMContext::MD_callsite, nullptr);
2436 }
2437 // For callsite metadata, add to list for this function for later use.
2438 else if (I.getMetadata(LLVMContext::MD_callsite)) {
2439 CallsWithMetadata.push_back(&I);
2440 }
2441 }
2442 }
2443 if (!CallsWithMetadata.empty())
2444 FuncToCallsWithMetadata[&F] = CallsWithMetadata;
2445 }
2446
2447 if (DumpCCG) {
2448 dbgs() << "CCG before updating call stack chains:\n";
2449 dbgs() << *this;
2450 }
2451
2452 if (ExportToDot)
2453 exportToDot("prestackupdate");
2454
2455 updateStackNodes();
2456
2457 if (ExportToDot)
2458 exportToDot("poststackupdate");
2459
2460 handleCallsitesWithMultipleTargets();
2461
2462 markBackedges();
2463
2464 // Strip off remaining callsite metadata, no longer needed.
2465 for (auto &FuncEntry : FuncToCallsWithMetadata)
2466 for (auto &Call : FuncEntry.second)
2467 Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
2468}
2469
2470IndexCallsiteContextGraph::IndexCallsiteContextGraph(
2471 ModuleSummaryIndex &Index,
2472 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
2473 isPrevailing)
2474 : Index(Index), isPrevailing(isPrevailing) {
2475 // Map for keeping track of the largest cold contexts up to the number given
2476 // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys
2477 // must be sorted.
2478 std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold;
2479 for (auto &I : Index) {
2480 auto VI = Index.getValueInfo(I);
2481 for (auto &S : VI.getSummaryList()) {
2482 // We should only add the prevailing nodes. Otherwise we may try to clone
2483 // in a weak copy that won't be linked (and may be different than the
2484 // prevailing version).
2485 // We only keep the memprof summary on the prevailing copy now when
2486 // building the combined index, as a space optimization, however don't
2487 // rely on this optimization. The linker doesn't resolve local linkage
2488 // values so don't check whether those are prevailing.
2489 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2490 !isPrevailing(VI.getGUID(), S.get()))
2491 continue;
2492 auto *FS = dyn_cast<FunctionSummary>(S.get());
2493 if (!FS)
2494 continue;
2495 std::vector<CallInfo> CallsWithMetadata;
2496 if (!FS->allocs().empty()) {
2497 for (auto &AN : FS->mutableAllocs()) {
2498 // This can happen because of recursion elimination handling that
2499 // currently exists in ModuleSummaryAnalysis. Skip these for now.
2500 // We still added them to the summary because we need to be able to
2501 // correlate properly in applyImport in the backends.
2502 if (AN.MIBs.empty())
2503 continue;
2504 IndexCall AllocCall(&AN);
2505 CallsWithMetadata.push_back(AllocCall);
2506 auto *AllocNode = addAllocNode(AllocCall, FS);
2507 // Pass an empty CallStack to the CallsiteContext (second)
2508 // parameter, since for ThinLTO we already collapsed out the inlined
2509 // stack ids on the allocation call during ModuleSummaryAnalysis.
2510 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2511 EmptyContext;
2512 unsigned I = 0;
2513 assert(AN.ContextSizeInfos.empty() ||
2514 AN.ContextSizeInfos.size() == AN.MIBs.size());
2515 // Now add all of the MIBs and their stack nodes.
2516 for (auto &MIB : AN.MIBs) {
2517 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2518 StackContext(&MIB);
2519 std::vector<ContextTotalSize> ContextSizeInfo;
2520 if (!AN.ContextSizeInfos.empty()) {
2521 for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
2522 ContextSizeInfo.push_back({FullStackId, TotalSize});
2523 }
2524 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
2525 AllocNode, StackContext, EmptyContext, MIB.AllocType,
2526 ContextSizeInfo, TotalSizeToContextIdTopNCold);
2527 I++;
2528 }
2529 // If exporting the graph to dot and an allocation id of interest was
2530 // specified, record all the context ids for this allocation node.
2531 if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2532 DotAllocContextIds = AllocNode->getContextIds();
2533 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2534 // Initialize version 0 on the summary alloc node to the current alloc
2535 // type, unless it has both types in which case make it default, so
2536 // that, in the case where we aren't able to clone, the original version
2537 // always ends up with the default allocation behavior.
2538 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
2539 }
2540 }
2541 // For callsite metadata, add to list for this function for later use.
2542 if (!FS->callsites().empty())
2543 for (auto &SN : FS->mutableCallsites()) {
2544 IndexCall StackNodeCall(&SN);
2545 CallsWithMetadata.push_back(StackNodeCall);
2546 }
2547
2548 if (!CallsWithMetadata.empty())
2549 FuncToCallsWithMetadata[FS] = CallsWithMetadata;
2550
2551 if (!FS->allocs().empty() || !FS->callsites().empty())
2552 FSToVIMap[FS] = VI;
2553 }
2554 }
2555
2556 if (DumpCCG) {
2557 dbgs() << "CCG before updating call stack chains:\n";
2558 dbgs() << *this;
2559 }
2560
2561 if (ExportToDot)
2562 exportToDot("prestackupdate");
2563
2564 updateStackNodes();
2565
2566 if (ExportToDot)
2567 exportToDot("poststackupdate");
2568
2569 handleCallsitesWithMultipleTargets();
2570
2571 markBackedges();
2572}
2573
2574template <typename DerivedCCG, typename FuncTy, typename CallTy>
2575void CallsiteContextGraph<DerivedCCG, FuncTy,
2576 CallTy>::handleCallsitesWithMultipleTargets() {
2577 // Look for and workaround callsites that call multiple functions.
2578 // This can happen for indirect calls, which need better handling, and in
2579 // more rare cases (e.g. macro expansion).
2580 // TODO: To fix this for indirect calls we will want to perform speculative
2581 // devirtualization using either the normal PGO info with ICP, or using the
2582 // information in the profiled MemProf contexts. We can do this prior to
2583 // this transformation for regular LTO, and for ThinLTO we can simulate that
2584 // effect in the summary and perform the actual speculative devirtualization
2585 // while cloning in the ThinLTO backend.
2586
2587 // Keep track of the new nodes synthesized for discovered tail calls missing
2588 // from the profiled contexts.
2589 MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
2590
2591 std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode;
2592 for (auto &Entry : NonAllocationCallToContextNodeMap) {
2593 auto *Node = Entry.second;
2594 assert(Node->Clones.empty());
2595 // Check all node callees and see if in the same function.
2596 // We need to check all of the calls recorded in this Node, because in some
2597 // cases we may have had multiple calls with the same debug info calling
2598 // different callees. This can happen, for example, when an object is
2599 // constructed in the parameter list - the destructor call of the object has
2600 // the same debug info (line/col) as the call the object was passed to.
2601 // Here we will prune any that don't match all callee nodes.
2602 std::vector<CallInfo> AllCalls;
2603 AllCalls.reserve(Node->MatchingCalls.size() + 1);
2604 AllCalls.push_back(Node->Call);
2605 llvm::append_range(AllCalls, Node->MatchingCalls);
2606
2607 // First see if we can partition the calls by callee function, creating new
2608 // nodes to host each set of calls calling the same callees. This is
2609 // necessary to support indirect calls with ThinLTO, for which we
2610 // synthesized CallsiteInfo records for each target. They will all have the
2611 // same callsite stack ids and would be sharing a context node at this
2612 // point. We need to perform separate cloning for each, which will be
2613 // applied along with speculative devirtualization in the ThinLTO backends
2614 // as needed. Note this does not currently support looking through tail
2615 // calls, it is unclear if we need that for indirect call targets.
2616 // First partition calls by callee func. Map indexed by func, value is
2617 // struct with list of matching calls, assigned node.
2618 if (partitionCallsByCallee(Node, AllCalls, NewCallToNode))
2619 continue;
2620
2621 auto It = AllCalls.begin();
2622 // Iterate through the calls until we find the first that matches.
2623 for (; It != AllCalls.end(); ++It) {
2624 auto ThisCall = *It;
2625 bool Match = true;
2626 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
2627 ++EI) {
2628 auto Edge = *EI;
2629 if (!Edge->Callee->hasCall())
2630 continue;
2631 assert(NodeToCallingFunc.count(Edge->Callee));
2632 // Check if the called function matches that of the callee node.
2633 if (!calleesMatch(ThisCall.call(), EI, TailCallToContextNodeMap)) {
2634 Match = false;
2635 break;
2636 }
2637 }
2638 // Found a call that matches the callee nodes, we can quit now.
2639 if (Match) {
2640 // If the first match is not the primary call on the Node, update it
2641 // now. We will update the list of matching calls further below.
2642 if (Node->Call != ThisCall) {
2643 Node->setCall(ThisCall);
2644 // We need to update the NonAllocationCallToContextNodeMap, but don't
2645 // want to do this during iteration over that map, so save the calls
2646 // that need updated entries.
2647 NewCallToNode.push_back({ThisCall, Node});
2648 }
2649 break;
2650 }
2651 }
2652 // We will update this list below (or leave it cleared if there was no
2653 // match found above).
2654 Node->MatchingCalls.clear();
2655 // If we hit the end of the AllCalls vector, no call matching the callee
2656 // nodes was found, clear the call information in the node.
2657 if (It == AllCalls.end()) {
2658 RemovedEdgesWithMismatchedCallees++;
2659 // Work around by setting Node to have a null call, so it gets
2660 // skipped during cloning. Otherwise assignFunctions will assert
2661 // because its data structures are not designed to handle this case.
2662 Node->setCall(CallInfo());
2663 continue;
2664 }
2665 // Now add back any matching calls that call the same function as the
2666 // matching primary call on Node.
2667 for (++It; It != AllCalls.end(); ++It) {
2668 auto ThisCall = *It;
2669 if (!sameCallee(Node->Call.call(), ThisCall.call()))
2670 continue;
2671 Node->MatchingCalls.push_back(ThisCall);
2672 }
2673 }
2674
2675 // Remove all mismatched nodes identified in the above loop from the node map
2676 // (checking whether they have a null call which is set above). For a
2677 // MapVector like NonAllocationCallToContextNodeMap it is much more efficient
2678 // to do the removal via remove_if than by individually erasing entries above.
2679 // Also remove any entries if we updated the node's primary call above.
2680 NonAllocationCallToContextNodeMap.remove_if([](const auto &it) {
2681 return !it.second->hasCall() || it.second->Call != it.first;
2682 });
2683
2684 // Add entries for any new primary calls recorded above.
2685 for (auto &[Call, Node] : NewCallToNode)
2686 NonAllocationCallToContextNodeMap[Call] = Node;
2687
2688 // Add the new nodes after the above loop so that the iteration is not
2689 // invalidated.
2690 for (auto &[Call, Node] : TailCallToContextNodeMap)
2691 NonAllocationCallToContextNodeMap[Call] = Node;
2692}
2693
2694template <typename DerivedCCG, typename FuncTy, typename CallTy>
2695bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee(
2696 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
2697 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) {
2698 // Struct to keep track of all the calls having the same callee function,
2699 // and the context node we eventually assign to this group of calls once
2700 // that node is known.
2701 struct CallsWithSameCallee {
2702 std::vector<CallInfo> Calls;
2703 ContextNode *Node = nullptr;
2704 };
2705
2706 // First partition calls by callee function. Build map from each function
2707 // to the list of matching calls.
2708 DenseMap<const FuncTy *, CallsWithSameCallee> CalleeFuncToCallInfo;
2709 for (auto ThisCall : AllCalls) {
2710 auto *F = getCalleeFunc(ThisCall.call());
2711 if (F)
2712 CalleeFuncToCallInfo[F].Calls.push_back(ThisCall);
2713 }
2714
2715 // Next, walk through all callee edges. For each callee node, get its
2716 // containing function and see if it was recorded in the above map (meaning we
2717 // have at least one matching call). Build another map from each callee node
2718 // with a matching call to the structure instance created above containing all
2719 // the calls.
2720 DenseMap<ContextNode *, CallsWithSameCallee *> CalleeNodeToCallInfo;
2721 for (const auto &Edge : Node->CalleeEdges) {
2722 if (!Edge->Callee->hasCall())
2723 continue;
2724 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2725 if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc))
2726 CalleeNodeToCallInfo[Edge->Callee] =
2727 &CalleeFuncToCallInfo[ProfiledCalleeFunc];
2728 }
2729
2730 // If there are no entries in the second map, then there were no matching
2731 // calls/callees and nothing to do here. Return so we can go to the handling that
2732 // looks through tail calls.
2733 if (CalleeNodeToCallInfo.empty())
2734 return false;
2735
2736 // Walk through all callee edges again. Any and all callee edges that didn't
2737 // match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a
2738 // new caller node (UnmatchedCalleesNode) which gets a null call so that it is
2739 // ignored during cloning. If it is in the map, then we use the node recorded
2740 // in that entry (creating it if needed), and move the callee edge to it.
2741 // The first callee will use the original node instead of creating a new one.
2742 // Note that any of the original calls on this node (in AllCalls) that didn't
2743 // have a callee function automatically get dropped from the node as part of
2744 // this process.
2745 ContextNode *UnmatchedCalleesNode = nullptr;
2746 // Track whether we have already assigned the original node to a callee.
2747 bool UsedOrigNode = false;
2748 assert(NodeToCallingFunc[Node]);
2749 // Iterate over a copy of Node's callee edges, since we may need to remove
2750 // edges in moveCalleeEdgeToNewCaller, and this simplifies the handling and
2751 // makes it less error-prone.
2752 auto CalleeEdges = Node->CalleeEdges;
2753 for (auto &Edge : CalleeEdges) {
2754 if (!Edge->Callee->hasCall())
2755 continue;
2756
2757 // Will be updated below to point to whatever (caller) node this callee edge
2758 // should be moved to.
2759 ContextNode *CallerNodeToUse = nullptr;
2760
2761 // Handle the case where there were no matching calls first. Move this
2762 // callee edge to the UnmatchedCalleesNode, creating it if needed.
2763 if (!CalleeNodeToCallInfo.contains(Edge->Callee)) {
2764 if (!UnmatchedCalleesNode)
2765 UnmatchedCalleesNode =
2766 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2767 CallerNodeToUse = UnmatchedCalleesNode;
2768 } else {
2769 // Look up the information recorded for this callee node, and use the
2770 // recorded caller node (creating it if needed).
2771 auto *Info = CalleeNodeToCallInfo[Edge->Callee];
2772 if (!Info->Node) {
2773 // If we haven't assigned any callees to the original node, use it.
2774 if (!UsedOrigNode) {
2775 Info->Node = Node;
2776 // Clear the set of matching calls which will be updated below.
2777 Node->MatchingCalls.clear();
2778 UsedOrigNode = true;
2779 } else
2780 Info->Node =
2781 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2782 assert(!Info->Calls.empty());
2783 // The first call becomes the primary call for this caller node, and the
2784 // rest go in the matching calls list.
2785 Info->Node->setCall(Info->Calls.front());
2786 llvm::append_range(Info->Node->MatchingCalls,
2787 llvm::drop_begin(Info->Calls));
2788 // Save the primary call to node correspondence so that we can update
2789 // the NonAllocationCallToContextNodeMap, which is being iterated in the
2790 // caller of this function.
2791 NewCallToNode.push_back({Info->Node->Call, Info->Node});
2792 }
2793 CallerNodeToUse = Info->Node;
2794 }
2795
2796 // Don't need to move the edge if we are using the original node.
2797 if (CallerNodeToUse == Node)
2798 continue;
2799
2800 moveCalleeEdgeToNewCaller(Edge, CallerNodeToUse);
2801 }
2802 // Now that we are done moving edges, clean up any caller edges that ended
2803 // up with no type or context ids. During moveCalleeEdgeToNewCaller all
2804 // caller edges from Node are replicated onto the new callers, and it
2805 // simplifies the handling to leave them until we have moved all
2806 // edges/context ids.
2807 for (auto &I : CalleeNodeToCallInfo)
2808 removeNoneTypeCallerEdges(I.second->Node);
2809 if (UnmatchedCalleesNode)
2810 removeNoneTypeCallerEdges(UnmatchedCalleesNode);
2811 removeNoneTypeCallerEdges(Node);
2812
2813 return true;
2814}
2815
2816uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2817 // In the Module (IR) case this is already the Id.
2818 return IdOrIndex;
2819}
2820
2821uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2822 // In the Index case this is an index into the stack id list in the summary
2823 // index, convert it to an Id.
2824 return Index.getStackIdAtIndex(IdOrIndex);
2825}
2826
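// calleesMatch checks whether the IR/summary callee of Call matches the
// function containing the profiled callee node on edge EI. When the two only
// match through a chain of tail calls, it synthesizes a context node per
// discovered tail call (reusing entries in TailCallToContextNodeMap), wires
// those nodes in between the profiled caller and callee, and removes the
// original edge.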
2827template <typename DerivedCCG, typename FuncTy, typename CallTy>
2828bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
2829 CallTy Call, EdgeIter &EI,
2830 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
2831 auto Edge = *EI;
2832 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2833 const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
2834 // Will be populated in order of callee to caller if we find a chain of tail
2835 // calls between the profiled caller and callee.
2836 std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
2837 if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
2838 FoundCalleeChain))
2839 return false;
2840
2841 // The usual case where the profiled callee matches that of the IR/summary.
2842 if (FoundCalleeChain.empty())
2843 return true;
2844
2845 auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
2846 auto *CurEdge = Callee->findEdgeFromCaller(Caller);
2847 // If there is already an edge between these nodes, simply update it and
2848 // return.
2849 if (CurEdge) {
2850 CurEdge->ContextIds.insert_range(Edge->ContextIds);
2851 CurEdge->AllocTypes |= Edge->AllocTypes;
2852 return;
2853 }
2854 // Otherwise, create a new edge and insert it into the caller and callee
2855 // lists.
2856 auto NewEdge = std::make_shared<ContextEdge>(
2857 Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
2858 Callee->CallerEdges.push_back(NewEdge);
2859 if (Caller == Edge->Caller) {
2860 // If we are inserting the new edge into the current edge's caller, insert
2861 // the new edge before the current iterator position, and then increment
2862 // back to the current edge.
2863 EI = Caller->CalleeEdges.insert(EI, NewEdge);
2864 ++EI;
2865 assert(*EI == Edge &&
2866 "Iterator position not restored after insert and increment");
2867 } else
2868 Caller->CalleeEdges.push_back(NewEdge);
2869 };
2870
2871 // Create new nodes for each found callee and connect in between the profiled
2872 // caller and callee.
2873 auto *CurCalleeNode = Edge->Callee;
2874 for (auto &[NewCall, Func] : FoundCalleeChain) {
2875 ContextNode *NewNode = nullptr;
2876 // First check if we have already synthesized a node for this tail call.
2877 if (TailCallToContextNodeMap.count(NewCall)) {
2878 NewNode = TailCallToContextNodeMap[NewCall];
2879 NewNode->AllocTypes |= Edge->AllocTypes;
2880 } else {
2881 FuncToCallsWithMetadata[Func].push_back({NewCall});
2882 // Create Node and record node info.
2883 NewNode = createNewNode(/*IsAllocation=*/false, Func, NewCall);
2884 TailCallToContextNodeMap[NewCall] = NewNode;
2885 NewNode->AllocTypes = Edge->AllocTypes;
2886 }
2887
2888 // Hook up node to its callee node
2889 AddEdge(NewNode, CurCalleeNode);
2890
2891 CurCalleeNode = NewNode;
2892 }
2893
2894 // Hook up edge's original caller to new callee node.
2895 AddEdge(Edge->Caller, CurCalleeNode);
2896
2897#ifndef NDEBUG
2898 // Save this because Edge's fields get cleared below when removed.
2899 auto *Caller = Edge->Caller;
2900#endif
2901
2902 // Remove old edge
2903 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
2904
2905 // To simplify the increment of EI in the caller, subtract one from EI.
2906 // In the final AddEdge call we would have either added a new callee edge
2907 // to Edge->Caller, or found an existing one. Either way we are guaranteed
2908 // that there is at least one callee edge.
2909 assert(!Caller->CalleeEdges.empty());
2910 --EI;
2911
2912 return true;
2913}
2914
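// Illustrative (hypothetical) scenario for the search below: the profile
// records an edge A -> C, but in the IR A calls B and B tail-calls C. The
// search starting at B records B's tail call to C in FoundCalleeChain so a
// callsite node can later be inserted between A and C. If more than one
// distinct tail-call path reaches the profiled callee, FoundMultipleCalleeChains
// is set and the match is rejected, since cloning along an ambiguous chain
// could be incorrect.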
2915bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2916 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
2917 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
2918 bool &FoundMultipleCalleeChains) {
2919 // Stop recursive search if we have already explored the maximum specified
2920 // depth.
2921 if (Depth > TailCallSearchDepth)
2922 return false;
2923
2924 auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
2925 FoundCalleeChain.push_back({Callsite, F});
2926 };
2927
2928 auto *CalleeFunc = dyn_cast<Function>(CurCallee);
2929 if (!CalleeFunc) {
2930 auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
2931 assert(Alias);
2932 CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
2933 assert(CalleeFunc);
2934 }
2935
2936 // Look for tail calls in this function, and check if they either call the
2937 // profiled callee directly, or indirectly (via a recursive search).
2938 // Only succeed if there is a single unique tail call chain found between the
2939 // profiled caller and callee, otherwise we could perform incorrect cloning.
2940 bool FoundSingleCalleeChain = false;
2941 for (auto &BB : *CalleeFunc) {
2942 for (auto &I : BB) {
2943 auto *CB = dyn_cast<CallBase>(&I);
2944 if (!CB || !CB->isTailCall())
2945 continue;
2946 auto *CalledValue = CB->getCalledOperand();
2947 auto *CalledFunction = CB->getCalledFunction();
2948 if (CalledValue && !CalledFunction) {
2949 CalledValue = CalledValue->stripPointerCasts();
2950 // Stripping pointer casts can reveal a called function.
2951 CalledFunction = dyn_cast<Function>(CalledValue);
2952 }
2953 // Check if this is an alias to a function. If so, get the
2954 // called aliasee for the checks below.
2955 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
2956 assert(!CalledFunction &&
2957 "Expected null called function in callsite for alias");
2958 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
2959 }
2960 if (!CalledFunction)
2961 continue;
2962 if (CalledFunction == ProfiledCallee) {
2963 if (FoundSingleCalleeChain) {
2964 FoundMultipleCalleeChains = true;
2965 return false;
2966 }
2967 FoundSingleCalleeChain = true;
2968 FoundProfiledCalleeCount++;
2969 FoundProfiledCalleeDepth += Depth;
2970 if (Depth > FoundProfiledCalleeMaxDepth)
2971 FoundProfiledCalleeMaxDepth = Depth;
2972 SaveCallsiteInfo(&I, CalleeFunc);
2973 } else if (findProfiledCalleeThroughTailCalls(
2974 ProfiledCallee, CalledFunction, Depth + 1,
2975 FoundCalleeChain, FoundMultipleCalleeChains)) {
2976 // findProfiledCalleeThroughTailCalls should not have returned
2977 // true if FoundMultipleCalleeChains.
2978 assert(!FoundMultipleCalleeChains);
2979 if (FoundSingleCalleeChain) {
2980 FoundMultipleCalleeChains = true;
2981 return false;
2982 }
2983 FoundSingleCalleeChain = true;
2984 SaveCallsiteInfo(&I, CalleeFunc);
2985 } else if (FoundMultipleCalleeChains)
2986 return false;
2987 }
2988 }
2989
2990 return FoundSingleCalleeChain;
2991}
2992
2993const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) {
2994 auto *CB = dyn_cast<CallBase>(Call);
2995 if (!CB->getCalledOperand() || CB->isIndirectCall())
2996 return nullptr;
2997 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2998 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2999 if (Alias)
3000 return dyn_cast<Function>(Alias->getAliasee());
3001 return dyn_cast<Function>(CalleeVal);
3002}
3003
3004bool ModuleCallsiteContextGraph::calleeMatchesFunc(
3005 Instruction *Call, const Function *Func, const Function *CallerFunc,
3006 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
3007 auto *CB = dyn_cast<CallBase>(Call);
3008 if (!CB->getCalledOperand() || CB->isIndirectCall())
3009 return false;
3010 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
3011 auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
3012 if (CalleeFunc == Func)
3013 return true;
3014 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
3015 if (Alias && Alias->getAliasee() == Func)
3016 return true;
3017
3018 // Recursively search for the profiled callee through tail calls starting with
3019 // the actual Callee. The discovered tail call chain is saved in
3020 // FoundCalleeChain, and we will fixup the graph to include these callsites
3021 // after returning.
3022 // FIXME: We will currently redo the same recursive walk if we find the same
3023 // mismatched callee from another callsite. We can improve this with more
3024 // bookkeeping of the created chain of new nodes for each mismatch.
3025 unsigned Depth = 1;
3026 bool FoundMultipleCalleeChains = false;
3027 if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
3028 FoundCalleeChain,
3029 FoundMultipleCalleeChains)) {
3030 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
3031 << Func->getName() << " from " << CallerFunc->getName()
3032 << " that actually called " << CalleeVal->getName()
3033 << (FoundMultipleCalleeChains
3034 ? " (found multiple possible chains)"
3035 : "")
3036 << "\n");
3037 if (FoundMultipleCalleeChains)
3038 FoundProfiledCalleeNonUniquelyCount++;
3039 return false;
3040 }
3041
3042 return true;
3043}
3044
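// sameCallee is used above when re-populating Node->MatchingCalls: only calls
// that resolve to the same callee function as the node's new primary call are
// kept on the node.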
3045bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1,
3046 Instruction *Call2) {
3047 auto *CB1 = cast<CallBase>(Call1);
3048 if (!CB1->getCalledOperand() || CB1->isIndirectCall())
3049 return false;
3050 auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts();
3051 auto *CalleeFunc1 = dyn_cast<Function>(CalleeVal1);
3052 auto *CB2 = cast<CallBase>(Call2);
3053 if (!CB2->getCalledOperand() || CB2->isIndirectCall())
3054 return false;
3055 auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts();
3056 auto *CalleeFunc2 = dyn_cast<Function>(CalleeVal2);
3057 return CalleeFunc1 == CalleeFunc2;
3058}
3059
3060bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
3061 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
3062 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
3063 bool &FoundMultipleCalleeChains) {
3064 // Stop recursive search if we have already explored the maximum specified
3065 // depth.
3066 if (Depth > TailCallSearchDepth)
3067 return false;
3068
3069 auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
3070 // Make a CallsiteInfo for each discovered callee, if one hasn't already
3071 // been synthesized.
3072 if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
3073 !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
3074 // StackIds is empty (we don't have debug info available in the index for
3075 // these callsites)
3076 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
3077 std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
3078 CallsiteInfo *NewCallsiteInfo =
3079 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
3080 FoundCalleeChain.push_back({NewCallsiteInfo, FS});
3081 };
3082
3083 // Look for tail calls in this function, and check if they either call the
3084 // profiled callee directly, or indirectly (via a recursive search).
3085 // Only succeed if there is a single unique tail call chain found between the
3086 // profiled caller and callee, otherwise we could perform incorrect cloning.
3087 bool FoundSingleCalleeChain = false;
3088 for (auto &S : CurCallee.getSummaryList()) {
3089 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
3090 !isPrevailing(CurCallee.getGUID(), S.get()))
3091 continue;
3092 auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
3093 if (!FS)
3094 continue;
3095 auto FSVI = CurCallee;
3096 auto *AS = dyn_cast<AliasSummary>(S.get());
3097 if (AS)
3098 FSVI = AS->getAliaseeVI();
3099 for (auto &CallEdge : FS->calls()) {
3100 if (!CallEdge.second.hasTailCall())
3101 continue;
3102 if (CallEdge.first == ProfiledCallee) {
3103 if (FoundSingleCalleeChain) {
3104 FoundMultipleCalleeChains = true;
3105 return false;
3106 }
3107 FoundSingleCalleeChain = true;
3108 FoundProfiledCalleeCount++;
3109 FoundProfiledCalleeDepth += Depth;
3110 if (Depth > FoundProfiledCalleeMaxDepth)
3111 FoundProfiledCalleeMaxDepth = Depth;
3112 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
3113 // Add FS to FSToVIMap in case it isn't already there.
3114 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
3115 FSToVIMap[FS] = FSVI;
3116 } else if (findProfiledCalleeThroughTailCalls(
3117 ProfiledCallee, CallEdge.first, Depth + 1,
3118 FoundCalleeChain, FoundMultipleCalleeChains)) {
3119 // findProfiledCalleeThroughTailCalls should not have returned
3120 // true if FoundMultipleCalleeChains.
3121 assert(!FoundMultipleCalleeChains);
3122 if (FoundSingleCalleeChain) {
3123 FoundMultipleCalleeChains = true;
3124 return false;
3125 }
3126 FoundSingleCalleeChain = true;
3127 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
3128 // Add FS to FSToVIMap in case it isn't already there.
3129 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
3130 FSToVIMap[FS] = FSVI;
3131 } else if (FoundMultipleCalleeChains)
3132 return false;
3133 }
3134 }
3135
3136 return FoundSingleCalleeChain;
3137}
3138
3139const FunctionSummary *
3140IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) {
3141 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
3142 if (Callee.getSummaryList().empty())
3143 return nullptr;
3144 return dyn_cast<FunctionSummary>(Callee.getSummaryList()[0]->getBaseObject());
3145}
3146
3147bool IndexCallsiteContextGraph::calleeMatchesFunc(
3148 IndexCall &Call, const FunctionSummary *Func,
3149 const FunctionSummary *CallerFunc,
3150 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
3151 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
3152 // If there is no summary list then this is a call to an externally defined
3153 // symbol.
3154 AliasSummary *Alias =
3155 Callee.getSummaryList().empty()
3156 ? nullptr
3157 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
3158 assert(FSToVIMap.count(Func));
3159 auto FuncVI = FSToVIMap[Func];
3160 if (Callee == FuncVI ||
3161 // If callee is an alias, check the aliasee, since only function
3162 // summary base objects will contain the stack node summaries and thus
3163 // get a context node.
3164 (Alias && Alias->getAliaseeVI() == FuncVI))
3165 return true;
3166
3167 // Recursively search for the profiled callee through tail calls starting with
3168 // the actual Callee. The discovered tail call chain is saved in
3169 // FoundCalleeChain, and we will fixup the graph to include these callsites
3170 // after returning.
3171 // FIXME: We will currently redo the same recursive walk if we find the same
3172 // mismatched callee from another callsite. We can improve this with more
3173 // bookkeeping of the created chain of new nodes for each mismatch.
3174 unsigned Depth = 1;
3175 bool FoundMultipleCalleeChains = false;
3176 if (!findProfiledCalleeThroughTailCalls(
3177 FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
3178 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
3179 << " from " << FSToVIMap[CallerFunc]
3180 << " that actually called " << Callee
3181 << (FoundMultipleCalleeChains
3182 ? " (found multiple possible chains)"
3183 : "")
3184 << "\n");
3185 if (FoundMultipleCalleeChains)
3186 FoundProfiledCalleeNonUniquelyCount++;
3187 return false;
3188 }
3189
3190 return true;
3191}
3192
3193bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) {
3194 ValueInfo Callee1 = dyn_cast_if_present<CallsiteInfo *>(Call1)->Callee;
3195 ValueInfo Callee2 = dyn_cast_if_present<CallsiteInfo *>(Call2)->Callee;
3196 return Callee1 == Callee2;
3197}
3198
3199template <typename DerivedCCG, typename FuncTy, typename CallTy>
3200void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
3201 const {
3202 print(dbgs());
3203 dbgs() << "\n";
3204}
3205
3206template <typename DerivedCCG, typename FuncTy, typename CallTy>
3207void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
3208 raw_ostream &OS) const {
3209 OS << "Node " << this << "\n";
3210 OS << "\t";
3211 printCall(OS);
3212 if (Recursive)
3213 OS << " (recursive)";
3214 OS << "\n";
3215 if (!MatchingCalls.empty()) {
3216 OS << "\tMatchingCalls:\n";
3217 for (auto &MatchingCall : MatchingCalls) {
3218 OS << "\t";
3219 MatchingCall.print(OS);
3220 OS << "\n";
3221 }
3222 }
3223 OS << "\tNodeId: " << NodeId << "\n";
3224 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
3225 OS << "\tContextIds:";
3226 // Make a copy of the computed context ids that we can sort for stability.
3227 auto ContextIds = getContextIds();
3228 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3229 std::sort(SortedIds.begin(), SortedIds.end());
3230 for (auto Id : SortedIds)
3231 OS << " " << Id;
3232 OS << "\n";
3233 OS << "\tCalleeEdges:\n";
3234 for (auto &Edge : CalleeEdges)
3235 OS << "\t\t" << *Edge << " (Callee NodeId: " << Edge->Callee->NodeId
3236 << ")\n";
3237 OS << "\tCallerEdges:\n";
3238 for (auto &Edge : CallerEdges)
3239 OS << "\t\t" << *Edge << " (Caller NodeId: " << Edge->Caller->NodeId
3240 << ")\n";
3241 if (!Clones.empty()) {
3242 OS << "\tClones: ";
3243 bool First = true;
3244 for (auto *C : Clones) {
3245 if (!First)
3246 OS << ", ";
3247 First = false;
3248 OS << C << " NodeId: " << C->NodeId;
3249 }
3250 OS << "\n";
3251 } else if (CloneOf) {
3252 OS << "\tClone of " << CloneOf << " NodeId: " << CloneOf->NodeId << "\n";
3253 }
3254}
3255
3256template <typename DerivedCCG, typename FuncTy, typename CallTy>
3257void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
3258 const {
3259 print(dbgs());
3260 dbgs() << "\n";
3261}
3262
3263template <typename DerivedCCG, typename FuncTy, typename CallTy>
3264void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
3265 raw_ostream &OS) const {
3266 OS << "Edge from Callee " << Callee << " to Caller: " << Caller
3267 << (IsBackedge ? " (BE)" : "")
3268 << " AllocTypes: " << getAllocTypeString(AllocTypes);
3269 OS << " ContextIds:";
3270 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3271 std::sort(SortedIds.begin(), SortedIds.end());
3272 for (auto Id : SortedIds)
3273 OS << " " << Id;
3274}
3275
3276template <typename DerivedCCG, typename FuncTy, typename CallTy>
3277void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
3278 print(dbgs());
3279}
3280
3281template <typename DerivedCCG, typename FuncTy, typename CallTy>
3282void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
3283 raw_ostream &OS) const {
3284 OS << "Callsite Context Graph:\n";
3285 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3286 for (const auto Node : nodes<GraphType>(this)) {
3287 if (Node->isRemoved())
3288 continue;
3289 Node->print(OS);
3290 OS << "\n";
3291 }
3292}
3293
3294template <typename DerivedCCG, typename FuncTy, typename CallTy>
3295void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
3296 raw_ostream &OS) const {
3297 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3298 for (const auto Node : nodes<GraphType>(this)) {
3299 if (Node->isRemoved())
3300 continue;
3301 if (!Node->IsAllocation)
3302 continue;
3303 DenseSet<uint32_t> ContextIds = Node->getContextIds();
3304 auto AllocTypeFromCall = getAllocationCallType(Node->Call);
3305 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3306 std::sort(SortedIds.begin(), SortedIds.end());
3307 for (auto Id : SortedIds) {
3308 auto TypeI = ContextIdToAllocationType.find(Id);
3309 assert(TypeI != ContextIdToAllocationType.end());
3310 auto CSI = ContextIdToContextSizeInfos.find(Id);
3311 if (CSI != ContextIdToContextSizeInfos.end()) {
3312 for (auto &Info : CSI->second) {
3313 OS << "MemProf hinting: "
3314 << getAllocTypeString((uint8_t)TypeI->second)
3315 << " full allocation context " << Info.FullStackId
3316 << " with total size " << Info.TotalSize << " is "
3317 << getAllocTypeString(Node->AllocTypes) << " after cloning";
3318 if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
3319 OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall)
3320 << " due to cold byte percent";
3321 // Print the internal context id to aid debugging and visualization.
3322 OS << " (context id " << Id << ")";
3323 OS << "\n";
3324 }
3325 }
3326 }
3327 }
3328}
3329
3330template <typename DerivedCCG, typename FuncTy, typename CallTy>
3331void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
3332 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3333 for (const auto Node : nodes<GraphType>(this)) {
3334 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3335 for (auto &Edge : Node->CallerEdges)
3336 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
3337 }
3338}
3339
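// GraphTraits specialization so that generic graph utilities (the
// nodes<GraphType>() ranges used in print/check above and the WriteGraph call
// in exportToDot below) can walk the context graph: the node list comes from
// NodeOwner, and a node's children are reached through its CalleeEdges.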
3340template <typename DerivedCCG, typename FuncTy, typename CallTy>
3341struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
3342 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3343 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
3344
3345 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
3346 static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
3347
3348 using nodes_iterator =
3349 mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
3350 decltype(&getNode)>;
3351
3352 static nodes_iterator nodes_begin(GraphType G) {
3353 return nodes_iterator(G->NodeOwner.begin(), &getNode);
3354 }
3355
3356 static nodes_iterator nodes_end(GraphType G) {
3357 return nodes_iterator(G->NodeOwner.end(), &getNode);
3358 }
3359
3360 static NodeRef getEntryNode(GraphType G) {
3361 return G->NodeOwner.begin()->get();
3362 }
3363
3364 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
3365 static const ContextNode<DerivedCCG, FuncTy, CallTy> *
3366 GetCallee(const EdgePtrTy &P) {
3367 return P->Callee;
3368 }
3369
3370 using ChildIteratorType =
3371 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
3372 DerivedCCG, FuncTy, CallTy>>>::const_iterator,
3373 decltype(&GetCallee)>;
3374
3375 static ChildIteratorType child_begin(NodeRef N) {
3376 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
3377 }
3378
3379 static ChildIteratorType child_end(NodeRef N) {
3380 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
3381 }
3382};
3383
3384template <typename DerivedCCG, typename FuncTy, typename CallTy>
3385struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
3386 : public DefaultDOTGraphTraits {
3387 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {
3388 // If the user requested the full graph to be exported, but provided an
3389 // allocation id, or if the user gave a context id and requested more than
3390 // just a specific context to be exported, note that highlighting is
3391 // enabled.
3392 DoHighlight =
3393 (AllocIdForDot.getNumOccurrences() && DotGraphScope == DotScope::All) ||
3394 (ContextIdForDot.getNumOccurrences() &&
3395 DotGraphScope != DotScope::Context);
3396 }
3397
3398 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3399 using GTraits = GraphTraits<GraphType>;
3400 using NodeRef = typename GTraits::NodeRef;
3401 using ChildIteratorType = typename GTraits::ChildIteratorType;
3402
3403 static std::string getNodeLabel(NodeRef Node, GraphType G) {
3404 std::string LabelString =
3405 (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
3406 Twine(Node->OrigStackOrAllocId) + " NodeId: " + Twine(Node->NodeId))
3407 .str();
3408 LabelString += "\n";
3409 if (Node->hasCall()) {
3410 auto Func = G->NodeToCallingFunc.find(Node);
3411 assert(Func != G->NodeToCallingFunc.end());
3412 LabelString +=
3413 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
3414 for (auto &MatchingCall : Node->MatchingCalls) {
3415 LabelString += "\n";
3416 LabelString += G->getLabel(Func->second, MatchingCall.call(),
3417 MatchingCall.cloneNo());
3418 }
3419 } else {
3420 LabelString += "null call";
3421 if (Node->Recursive)
3422 LabelString += " (recursive)";
3423 else
3424 LabelString += " (external)";
3425 }
3426 return LabelString;
3427 }
3428
3429 static std::string getNodeAttributes(NodeRef Node, GraphType G) {
3430 auto ContextIds = Node->getContextIds();
3431 // If highlighting enabled, see if this node contains any of the context ids
3432 // of interest. If so, it will use a different color and a larger fontsize
3433 // (which makes the node larger as well).
3434 bool Highlight = false;
3435 if (DoHighlight) {
3436 assert(ContextIdForDot.getNumOccurrences() ||
3437 AllocIdForDot.getNumOccurrences());
3438 if (ContextIdForDot.getNumOccurrences())
3439 Highlight = ContextIds.contains(ContextIdForDot);
3440 else
3441 Highlight = set_intersects(ContextIds, G->DotAllocContextIds);
3442 }
3443 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
3444 getContextIds(ContextIds) + "\"")
3445 .str();
3446 // Default fontsize is 14
3447 if (Highlight)
3448 AttributeString += ",fontsize=\"30\"";
3449 AttributeString +=
3450 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes, Highlight) + "\"")
3451 .str();
3452 if (Node->CloneOf) {
3453 AttributeString += ",color=\"blue\"";
3454 AttributeString += ",style=\"filled,bold,dashed\"";
3455 } else
3456 AttributeString += ",style=\"filled\"";
3457 return AttributeString;
3458 }
3459
3460 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
3461 GraphType G) {
3462 auto &Edge = *(ChildIter.getCurrent());
3463 // If highlighting enabled, see if this edge contains any of the context ids
3464 // of interest. If so, it will use a different color and a heavier arrow
3465 // size and weight (the larger weight makes the highlighted path
3466 // straighter).
3467 bool Highlight = false;
3468 if (DoHighlight) {
3469 assert(ContextIdForDot.getNumOccurrences() ||
3470 AllocIdForDot.getNumOccurrences());
3471 if (ContextIdForDot.getNumOccurrences())
3472 Highlight = Edge->ContextIds.contains(ContextIdForDot);
3473 else
3474 Highlight = set_intersects(Edge->ContextIds, G->DotAllocContextIds);
3475 }
3476 auto Color = getColor(Edge->AllocTypes, Highlight);
3477 std::string AttributeString =
3478 (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
3479 // fillcolor is the arrow head and color is the line
3480 Twine(",fillcolor=\"") + Color + "\"" + Twine(",color=\"") + Color +
3481 "\"")
3482 .str();
3483 if (Edge->IsBackedge)
3484 AttributeString += ",style=\"dotted\"";
3485 // Default penwidth and weight are both 1.
3486 if (Highlight)
3487 AttributeString += ",penwidth=\"2.0\",weight=\"2\"";
3488 return AttributeString;
3489 }
3490
3491 // Since the NodeOwner list includes nodes that are no longer connected to
3492 // the graph, skip them here.
3493 static bool isNodeHidden(NodeRef Node, GraphType G) {
3494 if (Node->isRemoved())
3495 return true;
3496 // If a scope smaller than the full graph was requested, see if this node
3497 // contains any of the context ids of interest.
3498 if (DotGraphScope == DotScope::Alloc)
3499 return !set_intersects(Node->getContextIds(), G->DotAllocContextIds);
3500 if (DotGraphScope == DotScope::Context)
3501 return !Node->getContextIds().contains(ContextIdForDot);
3502 return false;
3503 }
3504
3505private:
3506 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
3507 std::string IdString = "ContextIds:";
3508 if (ContextIds.size() < 100) {
3509 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3510 std::sort(SortedIds.begin(), SortedIds.end());
3511 for (auto Id : SortedIds)
3512 IdString += (" " + Twine(Id)).str();
3513 } else {
3514 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
3515 }
3516 return IdString;
3517 }
3518
3519 static std::string getColor(uint8_t AllocTypes, bool Highlight) {
3520 // If DoHighlight is not enabled, we want to use the highlight colors for
3521 // NotCold and Cold, and the non-highlight color for NotCold+Cold. This is
3522 // both compatible with the color scheme before highlighting was supported,
3523 // and for the NotCold+Cold color the non-highlight color is a bit more
3524 // readable.
3525 if (AllocTypes == (uint8_t)AllocationType::NotCold)
3526 // Color "brown1" actually looks like a lighter red.
3527 return !DoHighlight || Highlight ? "brown1" : "lightpink";
3528 if (AllocTypes == (uint8_t)AllocationType::Cold)
3529 return !DoHighlight || Highlight ? "cyan" : "lightskyblue";
3530 if (AllocTypes ==
3531 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
3532 return Highlight ? "magenta" : "mediumorchid1";
3533 return "gray";
3534 }
3535
3536 static std::string getNodeId(NodeRef Node) {
3537 std::stringstream SStream;
3538 SStream << std::hex << "N0x" << (unsigned long long)Node;
3539 std::string Result = SStream.str();
3540 return Result;
3541 }
3542
3543 // True if we should highlight a specific context or allocation's contexts in
3544 // the emitted graph.
3545 static bool DoHighlight;
3546};
3547
3548template <typename DerivedCCG, typename FuncTy, typename CallTy>
3549bool DOTGraphTraits<
3550 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>::DoHighlight =
3551 false;
3552
3553template <typename DerivedCCG, typename FuncTy, typename CallTy>
3554void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
3555 std::string Label) const {
3556 WriteGraph(this, "", false, Label,
3557 DotFilePathPrefix + "ccg." + Label + ".dot");
3558}
3559
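// moveEdgeToNewCalleeClone creates a fresh clone of Edge's callee (recorded
// via addClone and inheriting the MatchingCalls list) and then delegates to
// moveEdgeToExistingCalleeClone to transfer the requested context ids.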
3560template <typename DerivedCCG, typename FuncTy, typename CallTy>
3561typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
3562CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
3563 const std::shared_ptr<ContextEdge> &Edge,
3564 DenseSet<uint32_t> ContextIdsToMove) {
3565 ContextNode *Node = Edge->Callee;
3566 assert(NodeToCallingFunc.count(Node));
3567 ContextNode *Clone =
3568 createNewNode(Node->IsAllocation, NodeToCallingFunc[Node], Node->Call);
3569 Node->addClone(Clone);
3570 Clone->MatchingCalls = Node->MatchingCalls;
3571 moveEdgeToExistingCalleeClone(Edge, Clone, /*NewClone=*/true,
3572 ContextIdsToMove);
3573 return Clone;
3574}
3575
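// A rough sketch of the two cases handled below, using hypothetical context
// ids: if Edge carries ids {1,2,3} and ContextIdsToMove is {1,2,3} (or empty,
// which means "move everything"), the whole edge is re-pointed at NewCallee,
// or merged into an existing Caller->NewCallee edge. If ContextIdsToMove is
// only {1}, a Caller->NewCallee edge gets {1} while the original edge keeps
// {2,3}, and the callee edges below the old callee are split the same way.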
3576template <typename DerivedCCG, typename FuncTy, typename CallTy>
3577void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3578 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
3579 ContextNode *NewCallee, bool NewClone,
3580 DenseSet<uint32_t> ContextIdsToMove) {
3581 // NewCallee and Edge's current callee must be clones of the same original
3582 // node (Edge's current callee may be the original node too).
3583 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
3584
3585 bool EdgeIsRecursive = Edge->Callee == Edge->Caller;
3586
3587 ContextNode *OldCallee = Edge->Callee;
3588
3589 // We might already have an edge to the new callee from earlier cloning for a
3590 // different allocation. If one exists we will reuse it.
3591 auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
3592
3593 // Callers will pass an empty ContextIdsToMove set when they want to move the
3594 // edge. Copy in Edge's ids for simplicity.
3595 if (ContextIdsToMove.empty())
3596 ContextIdsToMove = Edge->getContextIds();
3597
3598 // If we are moving all of Edge's ids, then just move the whole Edge.
3599 // Otherwise only move the specified subset, to a new edge if needed.
3600 if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
3601 // First, update the alloc types on New Callee from Edge.
3602 // Do this before we potentially clear Edge's fields below!
3603 NewCallee->AllocTypes |= Edge->AllocTypes;
3604 // Moving the whole Edge.
3605 if (ExistingEdgeToNewCallee) {
3606 // Since we already have an edge to NewCallee, simply move the ids
3607 // onto it, and remove the existing Edge.
3608 ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3609 ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
3610 assert(Edge->ContextIds == ContextIdsToMove);
3611 removeEdgeFromGraph(Edge.get());
3612 } else {
3613 // Otherwise just reconnect Edge to NewCallee.
3614 Edge->Callee = NewCallee;
3615 NewCallee->CallerEdges.push_back(Edge);
3616 // Remove it from callee where it was previously connected.
3617 OldCallee->eraseCallerEdge(Edge.get());
3618 // Don't need to update Edge's context ids since we are simply
3619 // reconnecting it.
3620 }
3621 } else {
3622 // Only moving a subset of Edge's ids.
3623 // Compute the alloc type of the subset of ids being moved.
3624 auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
3625 if (ExistingEdgeToNewCallee) {
3626 // Since we already have an edge to NewCallee, simply move the ids
3627 // onto it.
3628 ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3629 ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
3630 } else {
3631 // Otherwise, create a new edge to NewCallee for the ids being moved.
3632 auto NewEdge = std::make_shared<ContextEdge>(
3633 NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
3634 Edge->Caller->CalleeEdges.push_back(NewEdge);
3635 NewCallee->CallerEdges.push_back(NewEdge);
3636 }
3637 // In either case, need to update the alloc types on NewCallee, and remove
3638 // those ids and update the alloc type on the original Edge.
3639 NewCallee->AllocTypes |= CallerEdgeAllocType;
3640 set_subtract(Edge->ContextIds, ContextIdsToMove);
3641 Edge->AllocTypes = computeAllocType(Edge->ContextIds);
3642 }
3643 // Now walk the old callee node's callee edges and move Edge's context ids
3644 // over to the corresponding edge into the clone (which is created here if
3645 // this is a newly created clone).
3646 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
3647 ContextNode *CalleeToUse = OldCalleeEdge->Callee;
3648 // If this is a direct recursion edge, use NewCallee (the clone) as the
3649 // callee as well, so that any edge updated/created here is also direct
3650 // recursive.
3651 if (CalleeToUse == OldCallee) {
3652 // If this is a recursive edge, see if we already moved a recursive edge
3653 // (which would have to have been this one) - if we were only moving a
3654 // subset of context ids it would still be on OldCallee.
3655 if (EdgeIsRecursive) {
3656 assert(OldCalleeEdge == Edge);
3657 continue;
3658 }
3659 CalleeToUse = NewCallee;
3660 }
3661 // The context ids moving to the new callee are the subset of this edge's
3662 // context ids and the context ids on the caller edge being moved.
3663 DenseSet<uint32_t> EdgeContextIdsToMove =
3664 set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
3665 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
3666 OldCalleeEdge->AllocTypes =
3667 computeAllocType(OldCalleeEdge->getContextIds());
3668 if (!NewClone) {
3669 // Update context ids / alloc type on corresponding edge to NewCallee.
3670 // There is a chance this may not exist if we are reusing an existing
3671 // clone, specifically during function assignment, where we would have
3672 // removed none type edges after creating the clone. If we can't find
3673 // a corresponding edge there, fall through to the cloning below.
3674 if (auto *NewCalleeEdge = NewCallee->findEdgeFromCallee(CalleeToUse)) {
3675 NewCalleeEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3676 NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
3677 continue;
3678 }
3679 }
3680 auto NewEdge = std::make_shared<ContextEdge>(
3681 CalleeToUse, NewCallee, computeAllocType(EdgeContextIdsToMove),
3682 EdgeContextIdsToMove);
3683 NewCallee->CalleeEdges.push_back(NewEdge);
3684 NewEdge->Callee->CallerEdges.push_back(NewEdge);
3685 }
3686 // Recompute the node alloc type now that its callee edges have been
3687 // updated (since we will compute from those edges).
3688 OldCallee->AllocTypes = OldCallee->computeAllocType();
3689 // OldCallee alloc type should be None iff its context id set is now empty.
3690 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
3691 OldCallee->emptyContextIds());
3692 if (VerifyCCG) {
3693 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
3694 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
3695 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
3696 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
3697 /*CheckEdges=*/false);
3698 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
3699 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
3700 /*CheckEdges=*/false);
3701 }
3702}
3703
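// moveCalleeEdgeToNewCaller re-parents callee Edge from its current caller to
// NewCaller (keeping a direct recursive edge recursive by also retargeting the
// callee), merging into an existing NewCaller edge when one exists, and then
// replicates the relevant subsets of the old caller's caller-edge context ids
// onto NewCaller.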
3704template <typename DerivedCCG, typename FuncTy, typename CallTy>
3705void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3706 moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
3707 ContextNode *NewCaller) {
3708 auto *OldCallee = Edge->Callee;
3709 auto *NewCallee = OldCallee;
3710 // If this edge was direct recursive, make any new/updated edge also direct
3711 // recursive to NewCaller.
3712 bool Recursive = Edge->Caller == Edge->Callee;
3713 if (Recursive)
3714 NewCallee = NewCaller;
3715
3716 ContextNode *OldCaller = Edge->Caller;
3717 OldCaller->eraseCalleeEdge(Edge.get());
3718
3719 // We might already have an edge to the new caller. If one exists we will
3720 // reuse it.
3721 auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(NewCallee);
3722
3723 if (ExistingEdgeToNewCaller) {
3724 // Since we already have an edge to NewCaller, simply move the ids
3725 // onto it, and remove the existing Edge.
3726 ExistingEdgeToNewCaller->getContextIds().insert_range(
3727 Edge->getContextIds());
3728 ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes;
3729 Edge->ContextIds.clear();
3730 Edge->AllocTypes = (uint8_t)AllocationType::None;
3731 OldCallee->eraseCallerEdge(Edge.get());
3732 } else {
3733 // Otherwise just reconnect Edge to NewCaller.
3734 Edge->Caller = NewCaller;
3735 NewCaller->CalleeEdges.push_back(Edge);
3736 if (Recursive) {
3737 assert(NewCallee == NewCaller);
3738 // In the case of (direct) recursive edges, we update the callee as well
3739 // so that it becomes recursive on the new caller.
3740 Edge->Callee = NewCallee;
3741 NewCallee->CallerEdges.push_back(Edge);
3742 OldCallee->eraseCallerEdge(Edge.get());
3743 }
3744 // Don't need to update Edge's context ids since we are simply
3745 // reconnecting it.
3746 }
3747 // In either case, need to update the alloc types on New Caller.
3748 NewCaller->AllocTypes |= Edge->AllocTypes;
3749
3750 // Now walk the old caller node's caller edges and move Edge's context ids
3751 // over to the corresponding edge into the node (which is created here if
3752 // this is a newly created node). We can tell whether this is a newly created
3753 // node by seeing if it has any caller edges yet.
3754#ifndef NDEBUG
3755 bool IsNewNode = NewCaller->CallerEdges.empty();
3756#endif
3757 // If we just moved a direct recursive edge, presumably its context ids should
3758 // also flow out of OldCaller via some other non-recursive callee edge. We
3759 // don't want to remove the recursive context ids from other caller edges yet,
3760 // otherwise the context ids get into an inconsistent state on OldCaller.
3761 // We will update these context ids on the non-recursive caller edge when and
3762 // if they are updated on the non-recursive callee.
3763 if (!Recursive) {
3764 for (auto &OldCallerEdge : OldCaller->CallerEdges) {
3765 auto OldCallerCaller = OldCallerEdge->Caller;
3766 // The context ids moving to the new caller are the subset of this edge's
3767 // context ids and the context ids on the callee edge being moved.
3768 DenseSet<uint32_t> EdgeContextIdsToMove = set_intersection(
3769 OldCallerEdge->getContextIds(), Edge->getContextIds());
3770 if (OldCaller == OldCallerCaller) {
3771 OldCallerCaller = NewCaller;
3772 // Don't actually move this one. The caller will move it directly via a
3773 // call to this function with this as the Edge if it is appropriate to
3774 // move to a different node that has a matching callee (itself).
3775 continue;
3776 }
3777 set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
3778 OldCallerEdge->AllocTypes =
3779 computeAllocType(OldCallerEdge->getContextIds());
3780 // In this function we expect that any pre-existing node already has edges
3781 // from the same callers as the old node. That should be true in the
3782 // current use case, where we will remove None-type edges after copying
3783 // over all caller edges from the callee.
3784 auto *ExistingCallerEdge = NewCaller->findEdgeFromCaller(OldCallerCaller);
3785 // Since we would have skipped caller edges when moving a direct recursive
3786 // edge, this may not hold true when recursive handling enabled.
3787 assert(IsNewNode || ExistingCallerEdge || AllowRecursiveCallsites);
3788 if (ExistingCallerEdge) {
3789 ExistingCallerEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3790 ExistingCallerEdge->AllocTypes |=
3791 computeAllocType(EdgeContextIdsToMove);
3792 continue;
3793 }
3794 auto NewEdge = std::make_shared<ContextEdge>(
3795 NewCaller, OldCallerCaller, computeAllocType(EdgeContextIdsToMove),
3796 EdgeContextIdsToMove);
3797 NewCaller->CallerEdges.push_back(NewEdge);
3798 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
3799 }
3800 }
3801 // Recompute the node alloc type now that its caller edges have been
3802 // updated (since we will compute from those edges).
3803 OldCaller->AllocTypes = OldCaller->computeAllocType();
3804 // OldCaller alloc type should be None iff its context id set is now empty.
3805 assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) ==
3806 OldCaller->emptyContextIds());
3807 if (VerifyCCG) {
3808 checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false);
3809 checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false);
3810 for (const auto &OldCallerEdge : OldCaller->CallerEdges)
3811 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller,
3812 /*CheckEdges=*/false);
3813 for (const auto &NewCallerEdge : NewCaller->CallerEdges)
3814 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller,
3815 /*CheckEdges=*/false);
3816 }
3817}
3818
3819template <typename DerivedCCG, typename FuncTy, typename CallTy>
3820void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3821 recursivelyRemoveNoneTypeCalleeEdges(
3822 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
3823 auto Inserted = Visited.insert(Node);
3824 if (!Inserted.second)
3825 return;
3826
3827 removeNoneTypeCalleeEdges(Node);
3828
3829 for (auto *Clone : Node->Clones)
3830 recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
3831
3832 // The recursive call may remove some of this Node's caller edges.
3833 // Iterate over a copy and skip any that were removed.
3834 auto CallerEdges = Node->CallerEdges;
3835 for (auto &Edge : CallerEdges) {
3836 // Skip any that have been removed by an earlier recursive call.
3837 if (Edge->isRemoved()) {
3838 assert(!is_contained(Node->CallerEdges, Edge));
3839 continue;
3840 }
3841 recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
3842 }
3843}
3844
3845// This is the standard DFS based backedge discovery algorithm.
3846template <typename DerivedCCG, typename FuncTy, typename CallTy>
3847void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges() {
3848 // If we are cloning recursive contexts, find and mark backedges from all root
3849 // callers, using the typical DFS based backedge analysis.
3850 if (!CloneRecursiveContexts)
3851 return;
3852 DenseSet<const ContextNode *> Visited;
3853 DenseSet<const ContextNode *> CurrentStack;
3854 for (auto &Entry : NonAllocationCallToContextNodeMap) {
3855 auto *Node = Entry.second;
3856 if (Node->isRemoved())
3857 continue;
3858 // It is a root if it doesn't have callers.
3859 if (!Node->CallerEdges.empty())
3860 continue;
3861 markBackedges(Node, Visited, CurrentStack);
3862 assert(CurrentStack.empty());
3863 }
3864}
3865
3866// Recursive helper for above markBackedges method.
3867template <typename DerivedCCG, typename FuncTy, typename CallTy>
3868void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges(
3869 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3870 DenseSet<const ContextNode *> &CurrentStack) {
3871 auto I = Visited.insert(Node);
3872 // We should only call this for unvisited nodes.
3873 assert(I.second);
3874 (void)I;
3875 for (auto &CalleeEdge : Node->CalleeEdges) {
3876 auto *Callee = CalleeEdge->Callee;
3877 if (Visited.count(Callee)) {
3878 // Since this was already visited we need to check if it is currently on
3879 // the recursive stack in which case it is a backedge.
3880 if (CurrentStack.count(Callee))
3881 CalleeEdge->IsBackedge = true;
3882 continue;
3883 }
3884 CurrentStack.insert(Callee);
3885 markBackedges(Callee, Visited, CurrentStack);
3886 CurrentStack.erase(Callee);
3887 }
3888}
3889
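// Driver for cloning: starting from each allocation node, recurse up the
// caller edges and clone nodes whose caller edges would otherwise mix cold and
// notcold contexts; afterwards remove any callee edges left with no context
// ids.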
3890template <typename DerivedCCG, typename FuncTy, typename CallTy>
3891void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
3892 DenseSet<const ContextNode *> Visited;
3893 for (auto &Entry : AllocationCallToContextNodeMap) {
3894 Visited.clear();
3895 identifyClones(Entry.second, Visited, Entry.second->getContextIds());
3896 }
3897 Visited.clear();
3898 for (auto &Entry : AllocationCallToContextNodeMap)
3899 recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
3900 if (VerifyCCG)
3901 check();
3902}
3903
3904// helper function to check an AllocType is cold or notcold or both.
3905bool checkColdOrNotCold(uint8_t AllocType) {
3906 return (AllocType == (uint8_t)AllocationType::Cold) ||
3907 (AllocType == (uint8_t)AllocationType::NotCold) ||
3908 (AllocType ==
3909 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
3910}
3911
3912template <typename DerivedCCG, typename FuncTy, typename CallTy>
3913void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
3914 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3915 const DenseSet<uint32_t> &AllocContextIds) {
3916 if (VerifyNodes)
3917 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3918 assert(!Node->CloneOf);
3919
3920 // If Node has a null call, then either it wasn't found in the module (regular
3921 // LTO) or summary index (ThinLTO), or there were other conditions blocking
3922 // cloning (e.g. recursion, calls multiple targets, etc).
3923 // Do this here so that we don't try to recursively clone callers below, which
3924 // isn't useful at least for this node.
3925 if (!Node->hasCall())
3926 return;
3927
3928 // No need to look at any callers if allocation type already unambiguous.
3929 if (hasSingleAllocType(Node->AllocTypes))
3930 return;
3931
3932#ifndef NDEBUG
3933 auto Insert =
3934#endif
3935 Visited.insert(Node);
3936 // We should not have visited this node yet.
3937 assert(Insert.second);
3938 // The recursive call to identifyClones may delete the current edge from the
3939 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
3940 // in an iterator and having recursive call erase from it. Other edges may
3941 // also get removed during the recursion, which will have null Callee and
3942 // Caller pointers (and are deleted later), so we skip those below.
3943 {
3944 auto CallerEdges = Node->CallerEdges;
3945 for (auto &Edge : CallerEdges) {
3946 // Skip any that have been removed by an earlier recursive call.
3947 if (Edge->isRemoved()) {
3948 assert(!is_contained(Node->CallerEdges, Edge));
3949 continue;
3950 }
3951 // Defer backedges. See comments further below where these edges are
3952 // handled during the cloning of this Node.
3953 if (Edge->IsBackedge) {
3954 // We should only mark these if cloning recursive contexts, where we
3955 // need to do this deferral.
3956 assert(CloneRecursiveContexts);
3957 continue;
3958 }
3959 // Ignore any caller we previously visited via another edge.
3960 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
3961 identifyClones(Edge->Caller, Visited, AllocContextIds);
3962 }
3963 }
3964 }
3965
3966 // Check if we reached an unambiguous call or have only a single caller.
3967 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3968 return;
3969
3970 // We need to clone.
3971
3972 // Try to keep the original version as alloc type NotCold. This will make
3973 // cases with indirect calls or any other situation with an unknown call to
3974 // the original function get the default behavior. We do this by sorting the
3975 // CallerEdges of the Node we will clone by alloc type.
3976 //
3977 // Give NotCold edge the lowest sort priority so those edges are at the end of
3978 // the caller edges vector, and stay on the original version (since the below
3979 // code clones greedily until it finds all remaining edges have the same type
3980 // and leaves the remaining ones on the original Node).
3981 //
3982 // We shouldn't actually have any None type edges, so the sorting priority for
3983 // that is arbitrary, and we assert in that case below.
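// Net effect of the priorities below: Cold-only caller edges sort first and
// are peeled off into clones first, while NotCold edges sort last and tend to
// remain on the original node.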
3984 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
3985 /*Cold*/ 1,
3986 /*NotColdCold*/ 2};
3987 llvm::stable_sort(Node->CallerEdges,
3988 [&](const std::shared_ptr<ContextEdge> &A,
3989 const std::shared_ptr<ContextEdge> &B) {
3990 // Nodes with non-empty context ids should be sorted
3991 // before those with empty context ids.
3992 if (A->ContextIds.empty())
3993 // Either B ContextIds are non-empty (in which case we
3994 // should return false because B < A), or B ContextIds
3995 // are empty, in which case they are equal, and we
3996 // should maintain the original relative ordering.
3997 return false;
3998 if (B->ContextIds.empty())
3999 return true;
4000
4001 if (A->AllocTypes == B->AllocTypes)
4002 // Use the first context id for each edge as a
4003 // tie-breaker.
4004 return *A->ContextIds.begin() < *B->ContextIds.begin();
4005 return AllocTypeCloningPriority[A->AllocTypes] <
4006 AllocTypeCloningPriority[B->AllocTypes];
4007 });
4008
4009 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
4010
4011 DenseSet<uint32_t> RecursiveContextIds;
4013 // If we are allowing recursive callsites, but have also disabled recursive
4014 // contexts, look for context ids that show up in multiple caller edges.
4015 if (AllowRecursiveCallsites && !AllowRecursiveContexts) {
4016 DenseSet<uint32_t> AllCallerContextIds;
4017 for (auto &CE : Node->CallerEdges) {
4018 // Resize to the largest set of caller context ids, since we know the
4019 // final set will be at least that large.
4020 AllCallerContextIds.reserve(CE->getContextIds().size());
4021 for (auto Id : CE->getContextIds())
4022 if (!AllCallerContextIds.insert(Id).second)
4023 RecursiveContextIds.insert(Id);
4024 }
4025 }
4026
4027 // Iterate until we find no more opportunities for disambiguating the alloc
4028 // types via cloning. In most cases this loop will terminate once the Node
4029 // has a single allocation type, in which case no more cloning is needed.
4030 // Iterate over a copy of Node's caller edges, since we may need to remove
4031 // edges in the moveEdgeTo* methods, and this simplifies the handling and
4032 // makes it less error-prone.
4033 auto CallerEdges = Node->CallerEdges;
4034 for (auto &CallerEdge : CallerEdges) {
4035 // Skip any that have been removed by an earlier recursive call.
4036 if (CallerEdge->isRemoved()) {
4037 assert(!is_contained(Node->CallerEdges, CallerEdge));
4038 continue;
4039 }
4040 assert(CallerEdge->Callee == Node);
4041
4042 // See if cloning the prior caller edge left this node with a single alloc
4043 // type or a single caller. In that case no more cloning of Node is needed.
4044 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
4045 break;
4046
4047 // If the caller was not successfully matched to a call in the IR/summary,
4048 // there is no point in trying to clone for it as we can't update that call.
4049 if (!CallerEdge->Caller->hasCall())
4050 continue;
4051
4052 // Only need to process the ids along this edge pertaining to the given
4053 // allocation.
4054 auto CallerEdgeContextsForAlloc =
4055 set_intersection(CallerEdge->getContextIds(), AllocContextIds);
4056 if (!RecursiveContextIds.empty())
4057 CallerEdgeContextsForAlloc =
4058 set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds);
4059 if (CallerEdgeContextsForAlloc.empty())
4060 continue;
4061
4062 auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
4063
4064 // Compute the node callee edge alloc types corresponding to the context ids
4065 // for this caller edge.
4066 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
4067 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
4068 for (auto &CalleeEdge : Node->CalleeEdges)
4069 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
4070 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
4071
4072 // Don't clone if doing so will not disambiguate any alloc types amongst
4073 // caller edges (including the callee edges that would be cloned).
4074 // Otherwise we will simply move all edges to the clone.
4075 //
4076 // First check if by cloning we will disambiguate the caller allocation
4077 // type from node's allocation type. Query allocTypeToUse so that we don't
4078 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
4079 // neither of these should be None type.
4080 //
4081 // Then check if by cloning node at least one of the callee edges will be
4082 // disambiguated by splitting out different context ids.
4083 //
4084 // However, always do the cloning if this is a backedge, in which case we
4085 // have not yet cloned along this caller edge.
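// Illustrative example (hypothetical values): if Node->AllocTypes is
// NotCold|Cold and this caller edge's contexts for the allocation are all
// Cold, then allocTypeToUse yields Cold for the caller but (per the note
// above) the ambiguous NotCold|Cold on Node is treated like NotCold, so the
// types differ and we clone, splitting the Cold contexts onto a clone. If the
// caller's contexts were instead all NotCold, both sides resolve to NotCold
// and we only clone if doing so would split the callee edge alloc types, or
// if this is a not-yet-cloned backedge.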
4086 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
4087 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
4088 if (!CallerEdge->IsBackedge &&
4089 allocTypeToUse(CallerAllocTypeForAlloc) ==
4090 allocTypeToUse(Node->AllocTypes) &&
4091 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
4092 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
4093 continue;
4094 }
4095
4096 if (CallerEdge->IsBackedge) {
4097 // We should only mark these if cloning recursive contexts, where we
4098 // need to do this deferral.
4100 DeferredBackedges++;
4101 }
4102
4103 // If this is a backedge, we now do recursive cloning starting from its
4104 // caller since we may have moved unambiguous caller contexts to a clone
4105 // of this Node in a previous iteration of the current loop, giving more
4106 // opportunity for cloning through the backedge. Because we sorted the
4107 // caller edges earlier so that cold caller edges are first, we would have
4108 // visited and cloned this node for any unambiguously cold non-recursive
4109 // callers before any ambiguous backedge callers. Note that we don't do this
4110 // if the caller is already cloned or visited during cloning (e.g. via a
4111 // different context path from the allocation).
4112 // TODO: Can we do better in the case where the caller was already visited?
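// Illustrative sketch (hypothetical recursion A -> B -> A -> alloc): the
// caller edge from A back into B is a backedge. Because cold caller edges
// were sorted first, B's unambiguous non-recursive callers were cloned for
// earlier; the recursive call below then gives the backedge's caller a chance
// to split its contexts onto a caller clone, after which this loop can clone
// Node for the resulting new caller edge as well.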
4113 if (CallerEdge->IsBackedge && !CallerEdge->Caller->CloneOf &&
4114 !Visited.count(CallerEdge->Caller)) {
4115 const auto OrigIdCount = CallerEdge->getContextIds().size();
4116 // Now do the recursive cloning of this backedge's caller, which was
4117 // deferred earlier.
4118 identifyClones(CallerEdge->Caller, Visited, CallerEdgeContextsForAlloc);
4119 removeNoneTypeCalleeEdges(CallerEdge->Caller);
4120 // See if the recursive call to identifyClones moved the context ids to a
4121 // new edge from this node to a clone of caller, and switch to looking at
4122 // that new edge so that we clone Node for the new caller clone.
4123 bool UpdatedEdge = false;
4124 if (OrigIdCount > CallerEdge->getContextIds().size()) {
4125 for (auto E : Node->CallerEdges) {
4126 // Only interested in clones of the current edge's caller.
4127 if (E->Caller->CloneOf != CallerEdge->Caller)
4128 continue;
4129 // See if this edge contains any of the context ids originally on the
4130 // current caller edge.
4131 auto CallerEdgeContextsForAllocNew =
4132 set_intersection(CallerEdgeContextsForAlloc, E->getContextIds());
4133 if (CallerEdgeContextsForAllocNew.empty())
4134 continue;
4135 // Make sure we don't pick a previously existing caller edge of this
4136 // Node, which would be processed on a different iteration of the
4137 // outer loop over the saved CallerEdges.
4138 if (llvm::is_contained(CallerEdges, E))
4139 continue;
4140 // The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge
4141 // are updated further below for all cases where we just invoked
4142 // identifyClones recursively.
4143 CallerEdgeContextsForAlloc.swap(CallerEdgeContextsForAllocNew);
4144 CallerEdge = E;
4145 UpdatedEdge = true;
4146 break;
4147 }
4148 }
4149 // If cloning removed this edge (and we didn't update it to a new edge
4150 // above), we're done with this edge. It's possible we moved all of the
4151 // context ids to an existing clone, in which case there's no need to do
4152 // further processing for them.
4153 if (CallerEdge->isRemoved())
4154 continue;
4155
4156 // Now we need to update the information used for the cloning decisions
4157 // further below, as we may have modified edges and their context ids.
4158
4159 // Note if we changed the CallerEdge above we would have already updated
4160 // the context ids.
4161 if (!UpdatedEdge) {
4162 CallerEdgeContextsForAlloc = set_intersection(
4163 CallerEdgeContextsForAlloc, CallerEdge->getContextIds());
4164 if (CallerEdgeContextsForAlloc.empty())
4165 continue;
4166 }
4167 // Update the other information that depends on the edges and on the now
4168 // updated CallerEdgeContextsForAlloc.
4169 CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
4170 CalleeEdgeAllocTypesForCallerEdge.clear();
4171 for (auto &CalleeEdge : Node->CalleeEdges) {
4172 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
4173 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
4174 }
4175 }
4176
4177 // First see if we can use an existing clone. Check each clone and its
4178 // callee edges for matching alloc types.
4179 ContextNode *Clone = nullptr;
4180 for (auto *CurClone : Node->Clones) {
4181 if (allocTypeToUse(CurClone->AllocTypes) !=
4182 allocTypeToUse(CallerAllocTypeForAlloc))
4183 continue;
4184
4185 bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) &&
4186 hasSingleAllocType(CallerAllocTypeForAlloc);
4187 // The above check should mean that if both have single alloc types, they
4188 // are equal.
4189 assert(!BothSingleAlloc ||
4190 CurClone->AllocTypes == CallerAllocTypeForAlloc);
4191
4192 // If both have a single alloc type (which must then be the same), or if the
4193 // clone's callee edges have the same alloc types as those for the current
4194 // allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge),
4195 // then we can reuse this clone.
4196 if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>(
4197 CalleeEdgeAllocTypesForCallerEdge, CurClone)) {
4198 Clone = CurClone;
4199 break;
4200 }
4201 }
4202
4203 // The edge iterator is adjusted when we move the CallerEdge to the clone.
4204 if (Clone)
4205 moveEdgeToExistingCalleeClone(CallerEdge, Clone, /*NewClone=*/false,
4206 CallerEdgeContextsForAlloc);
4207 else
4208 Clone = moveEdgeToNewCalleeClone(CallerEdge, CallerEdgeContextsForAlloc);
4209
4210 // Sanity check that no alloc types on clone or its edges are None.
4211 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
4212 }
4213
4214 // We should still have some context ids on the original Node.
4215 assert(!Node->emptyContextIds());
4216
4217 // Sanity check that no alloc types on node or edges are None.
4218 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
4219
4220 if (VerifyNodes)
4221 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
4222}
4223
4224void ModuleCallsiteContextGraph::updateAllocationCall(
4225 CallInfo &Call, AllocationType AllocType) {
4226 std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
4228 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
4229 "memprof", AllocTypeString);
4230 cast<CallBase>(Call.call())->addFnAttr(A);
4231 OREGetter(Call.call()->getFunction())
4232 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
4233 << ore::NV("AllocationCall", Call.call()) << " in clone "
4234 << ore::NV("Caller", Call.call()->getFunction())
4235 << " marked with memprof allocation attribute "
4236 << ore::NV("Attribute", AllocTypeString));
4237}
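// Illustrative sketch (hypothetical IR, not produced by this function
// verbatim): after the update above, a cold allocation call site looks
// roughly like
//   %call = call noalias ptr @_Znwm(i64 32) #1
// where attribute group #1 includes "memprof"="cold". Later transformations
// can use this string attribute to select a cold allocation path.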
4238
4239void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
4240 AllocationType AllocType) {
4241 auto *AI = cast<AllocInfo *>(Call.call());
4242 assert(AI);
4243 assert(AI->Versions.size() > Call.cloneNo());
4244 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
4245}
4246
4247AllocationType
4248ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
4249 const auto *CB = cast<CallBase>(Call.call());
4250 if (!CB->getAttributes().hasFnAttr("memprof"))
4251 return AllocationType::None;
4252 return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
4253 ? AllocationType::Cold
4254 : AllocationType::NotCold;
4255}
4256
4257AllocationType
4258IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
4259 const auto *AI = cast<AllocInfo *>(Call.call());
4260 assert(AI->Versions.size() > Call.cloneNo());
4261 return (AllocationType)AI->Versions[Call.cloneNo()];
4262}
4263
4264void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
4265 FuncInfo CalleeFunc) {
4266 auto *CurF = getCalleeFunc(CallerCall.call());
4267 auto NewCalleeCloneNo = CalleeFunc.cloneNo();
4268 if (isMemProfClone(*CurF)) {
4269 // If we already assigned this callsite to call a specific non-default
4270 // clone (i.e. not the original function which is clone 0), ensure that we
4271 // aren't trying to now update it to call a different clone, which is
4272 // indicative of a bug in the graph or function assignment.
4273 auto CurCalleeCloneNo = getMemProfCloneNum(*CurF);
4274 if (CurCalleeCloneNo != NewCalleeCloneNo) {
4275 LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
4276 << CurCalleeCloneNo << " now " << NewCalleeCloneNo
4277 << "\n");
4278 MismatchedCloneAssignments++;
4279 }
4280 }
4281 if (NewCalleeCloneNo > 0)
4282 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
4283 OREGetter(CallerCall.call()->getFunction())
4284 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
4285 << ore::NV("Call", CallerCall.call()) << " in clone "
4286 << ore::NV("Caller", CallerCall.call()->getFunction())
4287 << " assigned to call function clone "
4288 << ore::NV("Callee", CalleeFunc.func()));
4289}
4290
4291void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
4292 FuncInfo CalleeFunc) {
4293 auto *CI = cast<CallsiteInfo *>(CallerCall.call());
4294 assert(CI &&
4295 "Caller cannot be an allocation which should not have profiled calls");
4296 assert(CI->Clones.size() > CallerCall.cloneNo());
4297 auto NewCalleeCloneNo = CalleeFunc.cloneNo();
4298 auto &CurCalleeCloneNo = CI->Clones[CallerCall.cloneNo()];
4299 // If we already assigned this callsite to call a specific non-default
4300 // clone (i.e. not the original function which is clone 0), ensure that we
4301 // aren't trying to now update it to call a different clone, which is
4302 // indicative of a bug in the graph or function assignment.
4303 if (CurCalleeCloneNo != 0 && CurCalleeCloneNo != NewCalleeCloneNo) {
4304 LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
4305 << CurCalleeCloneNo << " now " << NewCalleeCloneNo
4306 << "\n");
4307 MismatchedCloneAssignments++;
4308 }
4309 CurCalleeCloneNo = NewCalleeCloneNo;
4310}
4311
4312// Update the debug information attached to NewFunc to use the clone Name. Note
4313// this needs to be done both for any existing DISubprogram for the definition
4314// and for any separate declaration DISubprogram.
4315static void updateSubprogramLinkageName(Function *NewFunc, StringRef Name) {
4316 assert(Name == NewFunc->getName());
4317 auto *SP = NewFunc->getSubprogram();
4318 if (!SP)
4319 return;
4320 auto *MDName = MDString::get(NewFunc->getParent()->getContext(), Name);
4321 SP->replaceLinkageName(MDName);
4322 DISubprogram *Decl = SP->getDeclaration();
4323 if (!Decl)
4324 return;
4325 TempDISubprogram NewDecl = Decl->clone();
4326 NewDecl->replaceLinkageName(MDName);
4327 SP->replaceDeclaration(MDNode::replaceWithUniqued(std::move(NewDecl)));
4328}
4329
4330CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
4331 Instruction *>::FuncInfo
4332ModuleCallsiteContextGraph::cloneFunctionForCallsite(
4333 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
4334 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4335 // Use existing LLVM facilities for cloning and obtaining Call in clone
4336 ValueToValueMapTy VMap;
4337 auto *NewFunc = CloneFunction(Func.func(), VMap);
4338 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
4339 assert(!Func.func()->getParent()->getFunction(Name));
4340 NewFunc->setName(Name);
4341 updateSubprogramLinkageName(NewFunc, Name);
4342 for (auto &Inst : CallsWithMetadataInFunc) {
4343 // This map always has the initial version in it.
4344 assert(Inst.cloneNo() == 0);
4345 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
4346 }
4347 OREGetter(Func.func())
4348 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
4349 << "created clone " << ore::NV("NewFunction", NewFunc));
4350 return {NewFunc, CloneNo};
4351}
4352
4353CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
4354 IndexCall>::FuncInfo
4355IndexCallsiteContextGraph::cloneFunctionForCallsite(
4356 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
4357 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4358 // Check how many clones we have of Call (and therefore function).
4359 // The next clone number is the current size of versions array.
4360 // Confirm this matches the CloneNo provided by the caller, which is based on
4361 // the number of function clones we have.
4362 assert(CloneNo == (isa<AllocInfo *>(Call.call())
4363 ? cast<AllocInfo *>(Call.call())->Versions.size()
4364 : cast<CallsiteInfo *>(Call.call())->Clones.size()));
4365 // Walk all the calls with metadata in this function. Create a new version for
4366 // each (by adding an entry to the Versions/Clones summary array), and copy
4367 // over the version being called for the function clone being cloned here.
4368 // Additionally, add an entry to the CallMap for the new function clone,
4369 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
4370 // to the new call clone.
4371 for (auto &Inst : CallsWithMetadataInFunc) {
4372 // This map always has the initial version in it.
4373 assert(Inst.cloneNo() == 0);
4374 if (auto *AI = dyn_cast<AllocInfo *>(Inst.call())) {
4375 assert(AI->Versions.size() == CloneNo);
4376 // We assign the allocation type later (in updateAllocationCall), just add
4377 // an entry for it here.
4378 AI->Versions.push_back(0);
4379 } else {
4380 auto *CI = cast<CallsiteInfo *>(Inst.call());
4381 assert(CI && CI->Clones.size() == CloneNo);
4382 // We assign the clone number later (in updateCall), just add an entry for
4383 // it here.
4384 CI->Clones.push_back(0);
4385 }
4386 CallMap[Inst] = {Inst.call(), CloneNo};
4387 }
4388 return {Func.func(), CloneNo};
4389}
4390
4391// We perform cloning for each allocation node separately. However, this
4392// sometimes results in a situation where the same node calls multiple
4393// clones of the same callee, created for different allocations. This
4394// causes issues when assigning functions to these clones, as each node can
4395// in reality only call a single callee clone.
4396//
4397// To address this, before assigning functions, merge callee clone nodes as
4398// needed using a post order traversal from the allocations. We attempt to
4399// use existing clones as the merge node when legal, and to share them
4400// among callers with the same properties (callers calling the same set of
4401// callee clone nodes for the same allocations).
4402//
4403// Without this fix, in some cases incorrect function assignment will lead
4404// to calling the wrong allocation clone.
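// Illustrative example (hypothetical graph): callsite C in foo() calls bar(),
// and bar() reaches two allocations A1 and A2. Cloning for A1 can leave C with
// a callee edge to one clone of bar, while cloning for A2 adds a second callee
// edge from C to a different clone of bar. Since the call in C's IR can only
// target a single callee, the traversal below merges those callee clones
// (reusing an existing clone as the merge node when legal, or creating a new
// one) so that C again calls exactly one clone of bar.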
4405template <typename DerivedCCG, typename FuncTy, typename CallTy>
4406void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones() {
4407 if (!MergeClones)
4408 return;
4409
4410 // Generate a map from context id to the associated allocation node for use
4411 // when merging clones.
4412 DenseMap<uint32_t, ContextNode *> ContextIdToAllocationNode;
4413 for (auto &Entry : AllocationCallToContextNodeMap) {
4414 auto *Node = Entry.second;
4415 for (auto Id : Node->getContextIds())
4416 ContextIdToAllocationNode[Id] = Node->getOrigNode();
4417 for (auto *Clone : Node->Clones) {
4418 for (auto Id : Clone->getContextIds())
4419 ContextIdToAllocationNode[Id] = Clone->getOrigNode();
4420 }
4421 }
4422
4423 // Post order traversal starting from allocations to ensure each callsite
4424 // calls a single clone of its callee. Callee nodes that are clones of each
4425 // other are merged (via new merge nodes if needed) to achieve this.
4426 DenseSet<const ContextNode *> Visited;
4427 for (auto &Entry : AllocationCallToContextNodeMap) {
4428 auto *Node = Entry.second;
4429
4430 mergeClones(Node, Visited, ContextIdToAllocationNode);
4431
4432 // Make a copy so the recursive post order traversal that may create new
4433 // clones doesn't mess up iteration. Note that the recursive traversal
4434 // itself does not call mergeClones on any of these nodes, which are all
4435 // (clones of) allocations.
4436 auto Clones = Node->Clones;
4437 for (auto *Clone : Clones)
4438 mergeClones(Clone, Visited, ContextIdToAllocationNode);
4439 }
4440
4441 if (DumpCCG) {
4442 dbgs() << "CCG after merging:\n";
4443 dbgs() << *this;
4444 }
4445 if (ExportToDot)
4446 exportToDot("aftermerge");
4447
4448 if (VerifyCCG) {
4449 check();
4450 }
4451}
4452
4453// Recursive helper for above mergeClones method.
4454template <typename DerivedCCG, typename FuncTy, typename CallTy>
4455void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones(
4456 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4457 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4458 auto Inserted = Visited.insert(Node);
4459 if (!Inserted.second)
4460 return;
4461
4462 // Iteratively perform merging on this node to handle new caller nodes created
4463 // during the recursive traversal. We could do something more elegant such as
4464 // maintaining a worklist, but this is a simple approach that doesn't cause a
4465 // measurable compile time effect, as most nodes don't have many caller
4466 // edges to check.
4467 bool FoundUnvisited = true;
4468 unsigned Iters = 0;
4469 while (FoundUnvisited) {
4470 Iters++;
4471 FoundUnvisited = false;
4472 // Make a copy since the recursive call may move a caller edge to a new
4473 // callee, messing up the iterator.
4474 auto CallerEdges = Node->CallerEdges;
4475 for (auto CallerEdge : CallerEdges) {
4476 // Skip any caller edge moved onto a different callee during recursion.
4477 if (CallerEdge->Callee != Node)
4478 continue;
4479 // If we found an unvisited caller, note that we should check the caller
4480 // edges again as mergeClones may add or change caller nodes.
4481 if (DoMergeIteration && !Visited.contains(CallerEdge->Caller))
4482 FoundUnvisited = true;
4483 mergeClones(CallerEdge->Caller, Visited, ContextIdToAllocationNode);
4484 }
4485 }
4486
4487 TotalMergeInvokes++;
4488 TotalMergeIters += Iters;
4489 if (Iters > MaxMergeIters)
4490 MaxMergeIters = Iters;
4491
4492 // Merge for this node after we handle its callers.
4493 mergeNodeCalleeClones(Node, Visited, ContextIdToAllocationNode);
4494}
4495
4496template <typename DerivedCCG, typename FuncTy, typename CallTy>
4497void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeNodeCalleeClones(
4498 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4499 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4500 // Ignore Node if we moved all of its contexts to clones.
4501 if (Node->emptyContextIds())
4502 return;
4503
4504 // First identify groups of clones among Node's callee edges, by building
4505 // a map from each callee base node to the associated callee edges from Node.
4506 MapVector<ContextNode *, std::vector<std::shared_ptr<ContextEdge>>>
4507 OrigNodeToCloneEdges;
4508 for (const auto &E : Node->CalleeEdges) {
4509 auto *Callee = E->Callee;
4510 if (!Callee->CloneOf && Callee->Clones.empty())
4511 continue;
4512 ContextNode *Base = Callee->getOrigNode();
4513 OrigNodeToCloneEdges[Base].push_back(E);
4514 }
4515
4516 // Helper for callee edge sorting below. Return true if A's callee has fewer
4517 // caller edges than B's; ties are broken first by putting clones before the
4518 // original (non-clone) node, then by the first context id.
4519 auto CalleeCallerEdgeLessThan = [](const std::shared_ptr<ContextEdge> &A,
4520 const std::shared_ptr<ContextEdge> &B) {
4521 if (A->Callee->CallerEdges.size() != B->Callee->CallerEdges.size())
4522 return A->Callee->CallerEdges.size() < B->Callee->CallerEdges.size();
4523 if (A->Callee->CloneOf && !B->Callee->CloneOf)
4524 return true;
4525 else if (!A->Callee->CloneOf && B->Callee->CloneOf)
4526 return false;
4527 // Use the first context id for each edge as a
4528 // tie-breaker.
4529 return *A->ContextIds.begin() < *B->ContextIds.begin();
4530 };
4531
4532 // Process each set of callee clones called by Node, performing the needed
4533 // merging.
4534 for (auto Entry : OrigNodeToCloneEdges) {
4535 // CalleeEdges is the set of edges from Node reaching callees that are
4536 // mutual clones of each other.
4537 auto &CalleeEdges = Entry.second;
4538 auto NumCalleeClones = CalleeEdges.size();
4539 // A single edge means there is no merging needed.
4540 if (NumCalleeClones == 1)
4541 continue;
4542 // Sort the CalleeEdges calling this group of clones in ascending order of
4543 // their caller edge counts, putting clones before the original non-clone node
4544 // in cases of a tie. This simplifies finding an existing node to use as the
4545 // merge node.
4546 llvm::stable_sort(CalleeEdges, CalleeCallerEdgeLessThan);
4547
4548 /// Find other callers of the given set of callee edges that can
4549 /// share the same callee merge node. See the comments at this method
4550 /// definition for details.
4551 DenseSet<ContextNode *> OtherCallersToShareMerge;
4552 findOtherCallersToShareMerge(Node, CalleeEdges, ContextIdToAllocationNode,
4553 OtherCallersToShareMerge);
4554
4555 // Now do the actual merging. Identify existing or create a new MergeNode
4556 // during the first iteration. Move each callee over, along with edges from
4557 // other callers we've determined above can share the same merge node.
4558 ContextNode *MergeNode = nullptr;
4559 DenseMap<ContextNode *, unsigned> CallerToMoveCount;
4560 for (auto CalleeEdge : CalleeEdges) {
4561 auto *OrigCallee = CalleeEdge->Callee;
4562 // If we don't have a MergeNode yet (only happens on the first iteration,
4563 // as a new one will be created when we go to move the first callee edge
4564 // over as needed), see if we can use this callee.
4565 if (!MergeNode) {
4566 // If there are no other callers, simply use this callee.
4567 if (CalleeEdge->Callee->CallerEdges.size() == 1) {
4568 MergeNode = OrigCallee;
4569 NonNewMergedNodes++;
4570 continue;
4571 }
4572 // Otherwise, if we have identified other caller nodes that can share
4573 // the merge node with Node, see if all of OrigCallee's callers are
4574 // going to share the same merge node. In that case we can use callee
4575 // (since all of its callers would move to the new merge node).
4576 if (!OtherCallersToShareMerge.empty()) {
4577 bool MoveAllCallerEdges = true;
4578 for (auto CalleeCallerE : OrigCallee->CallerEdges) {
4579 if (CalleeCallerE == CalleeEdge)
4580 continue;
4581 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller)) {
4582 MoveAllCallerEdges = false;
4583 break;
4584 }
4585 }
4586 // If we are going to move all callers over, we can use this callee as
4587 // the MergeNode.
4588 if (MoveAllCallerEdges) {
4589 MergeNode = OrigCallee;
4590 NonNewMergedNodes++;
4591 continue;
4592 }
4593 }
4594 }
4595 // Move this callee edge, creating a new merge node if necessary.
4596 if (MergeNode) {
4597 assert(MergeNode != OrigCallee);
4598 moveEdgeToExistingCalleeClone(CalleeEdge, MergeNode,
4599 /*NewClone*/ false);
4600 } else {
4601 MergeNode = moveEdgeToNewCalleeClone(CalleeEdge);
4602 NewMergedNodes++;
4603 }
4604 // Now move all identified edges from other callers over to the merge node
4605 // as well.
4606 if (!OtherCallersToShareMerge.empty()) {
4607 // Make and iterate over a copy of OrigCallee's caller edges because
4608 // some of these will be moved off of the OrigCallee and that would mess
4609 // up the iteration from OrigCallee.
4610 auto OrigCalleeCallerEdges = OrigCallee->CallerEdges;
4611 for (auto &CalleeCallerE : OrigCalleeCallerEdges) {
4612 if (CalleeCallerE == CalleeEdge)
4613 continue;
4614 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller))
4615 continue;
4616 CallerToMoveCount[CalleeCallerE->Caller]++;
4617 moveEdgeToExistingCalleeClone(CalleeCallerE, MergeNode,
4618 /*NewClone*/ false);
4619 }
4620 }
4621 removeNoneTypeCalleeEdges(OrigCallee);
4622 removeNoneTypeCalleeEdges(MergeNode);
4623 }
4624 }
4625}
4626
4627// Look for other nodes that have edges to the same set of callee
4628// clones as the current Node. Those can share the eventual merge node
4629// (reducing cloning and binary size overhead) iff:
4630// - they have edges to the same set of callee clones
4631// - each callee edge reaches a subset of the same allocations as Node's
4632// corresponding edge to the same callee clone.
4633// The second requirement is to ensure that we don't undo any of the
4634// necessary cloning to distinguish contexts with different allocation
4635// behavior.
4636// FIXME: This is somewhat conservative, as we really just need to ensure
4637// that they don't reach the same allocations as contexts on edges from Node
4638// going to any of the *other* callee clones being merged. However, that
4639// requires more tracking and checking to get right.
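// Illustrative example (hypothetical): caller N1's callee edges to clones
// bar' and bar'' are about to be merged. Another caller N2 also has edges to
// exactly {bar', bar''}, and every context on N2's edges reaches a subset of
// the allocations reached by N1's corresponding edge to the same clone. Then
// N2 can redirect its edges to the same merge node, avoiding an additional
// clone. If one of N2's contexts instead reached a different allocation,
// sharing the merge node could re-mix contexts that cloning deliberately
// separated, so N2 would be excluded.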
4640template <typename DerivedCCG, typename FuncTy, typename CallTy>
4641void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
4642 findOtherCallersToShareMerge(
4643 ContextNode *Node,
4644 std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
4645 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
4646 DenseSet<ContextNode *> &OtherCallersToShareMerge) {
4647 auto NumCalleeClones = CalleeEdges.size();
4648 // For each other caller node of these callees, this map counts how many of
4649 // the callee clones (reached by CalleeEdges) it also has an edge to.
4650 DenseMap<ContextNode *, unsigned> OtherCallersToSharedCalleeEdgeCount;
4651 // Counts the number of other caller nodes that have edges to all callee
4652 // clones that don't violate the allocation context checking.
4653 unsigned PossibleOtherCallerNodes = 0;
4654
4655 // We only need to look at other Caller nodes if the first callee edge has
4656 // multiple callers (recall they are sorted in ascending order above).
4657 if (CalleeEdges[0]->Callee->CallerEdges.size() < 2)
4658 return;
4659
4660 // For each callee edge:
4661 // - Collect the count of other caller nodes calling the same callees.
4662 // - Collect the alloc nodes reached by contexts on each callee edge.
4663 DenseMap<ContextEdge *, DenseSet<ContextNode *>> CalleeEdgeToAllocNodes;
4664 for (auto CalleeEdge : CalleeEdges) {
4665 assert(CalleeEdge->Callee->CallerEdges.size() > 1);
4666 // For each other caller of the same callee, increment the count of
4667 // edges reaching the same callee clone.
4668 for (auto CalleeCallerEdges : CalleeEdge->Callee->CallerEdges) {
4669 if (CalleeCallerEdges->Caller == Node) {
4670 assert(CalleeCallerEdges == CalleeEdge);
4671 continue;
4672 }
4673 OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller]++;
4674 // If this caller edge now reaches all of the same callee clones,
4675 // increment the count of candidate other caller nodes.
4676 if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller] ==
4677 NumCalleeClones)
4678 PossibleOtherCallerNodes++;
4679 }
4680 // Collect the alloc nodes reached by contexts on each callee edge, for
4681 // later analysis.
4682 for (auto Id : CalleeEdge->getContextIds()) {
4683 auto *Alloc = ContextIdToAllocationNode.lookup(Id);
4684 if (!Alloc) {
4685 // FIXME: unclear why this happens occasionally, presumably
4686 // imperfect graph updates possibly with recursion.
4687 MissingAllocForContextId++;
4688 continue;
4689 }
4690 CalleeEdgeToAllocNodes[CalleeEdge.get()].insert(Alloc);
4691 }
4692 }
4693
4694 // Now walk the callee edges again, and make sure that for each candidate
4695 // caller node all of its edges to the callees reach the same allocs (or
4696 // a subset) as those along the corresponding callee edge from Node.
4697 for (auto CalleeEdge : CalleeEdges) {
4698 assert(CalleeEdge->Callee->CallerEdges.size() > 1);
4699 // Stop if we do not have any (more) candidate other caller nodes.
4700 if (!PossibleOtherCallerNodes)
4701 break;
4702 auto &CurCalleeAllocNodes = CalleeEdgeToAllocNodes[CalleeEdge.get()];
4703 // Check each other caller of this callee clone.
4704 for (auto &CalleeCallerE : CalleeEdge->Callee->CallerEdges) {
4705 // Not interested in the callee edge from Node itself.
4706 if (CalleeCallerE == CalleeEdge)
4707 continue;
4708 // Skip any callers that didn't have callee edges to all the same
4709 // callee clones.
4710 if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] !=
4711 NumCalleeClones)
4712 continue;
4713 // Make sure that each context along the edge from the candidate caller
4714 // node reaches an allocation also reached by this callee edge from Node.
4715 for (auto Id : CalleeCallerE->getContextIds()) {
4716 auto *Alloc = ContextIdToAllocationNode.lookup(Id);
4717 if (!Alloc)
4718 continue;
4719 // If not, simply reset the map entry to 0 so caller is ignored, and
4720 // reduce the count of candidate other caller nodes.
4721 if (!CurCalleeAllocNodes.contains(Alloc)) {
4722 OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] = 0;
4723 PossibleOtherCallerNodes--;
4724 break;
4725 }
4726 }
4727 }
4728 }
4729
4730 if (!PossibleOtherCallerNodes)
4731 return;
4732
4733 // Build the set of other caller nodes that can use the same callee merge
4734 // node.
4735 for (auto &[OtherCaller, Count] : OtherCallersToSharedCalleeEdgeCount) {
4736 if (Count != NumCalleeClones)
4737 continue;
4738 OtherCallersToShareMerge.insert(OtherCaller);
4739 }
4740}
4741
4742// This method assigns cloned callsites to functions, cloning the functions as
4743// needed. The assignment is greedy and proceeds roughly as follows:
4744//
4745// For each function Func:
4746// For each call with graph Node having clones:
4747// Initialize ClonesWorklist to Node and its clones
4748// Initialize NodeCloneCount to 0
4749// While ClonesWorklist is not empty:
4750// Clone = pop front ClonesWorklist
4751// NodeCloneCount++
4752// If Func has been cloned less than NodeCloneCount times:
4753// If NodeCloneCount is 1:
4754// Assign Clone to original Func
4755// Continue
4756// Create a new function clone
4757// If other callers not assigned to call a function clone yet:
4758// Assign them to call new function clone
4759// Continue
4760// Assign any other caller calling the cloned version to new clone
4761//
4762// For each caller of Clone:
4763// If caller is assigned to call a specific function clone:
4764// If we cannot assign Clone to that function clone:
4765// Create new callsite Clone NewClone
4766// Add NewClone to ClonesWorklist
4767// Continue
4768// Assign Clone to existing caller's called function clone
4769// Else:
4770// If Clone not already assigned to a function clone:
4771// Assign to first function clone without assignment
4772// Assign caller to selected function clone
4773// For each call with graph Node having clones:
4774// If number func clones > number call's callsite Node clones:
4775// Record func CallInfo clones without Node clone in UnassignedCallClones
4776// For callsite Nodes in DFS order from allocations:
4777// If IsAllocation:
4778// Update allocation with alloc type
4779// Else:
4780 // For Call, all MatchingCalls, and associated UnassignedCallClones:
4781// Update call to call recorded callee clone
4782//
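// Illustrative example (hypothetical): if callsite node N in foo() has one
// clone N' (say N kept the NotCold contexts and N' the Cold ones), the loop
// below assigns N to the original foo() and creates one function clone for N'
// (named via getMemProfFuncName, roughly foo.memprof.1), with the call of N'
// remapped to the corresponding cloned call instruction. Callers reaching N'
// are recorded in CallsiteToCalleeFuncCloneMap as calling the new function
// clone and are rewritten during the final call update walk.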
4783template <typename DerivedCCG, typename FuncTy, typename CallTy>
4784bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
4785 bool Changed = false;
4786
4787 mergeClones();
4788
4789 // Keep track of the assignment of nodes (callsites) to function clones they
4790 // call.
4791 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
4792
4793 // Update caller node to call function version CalleeFunc, by recording the
4794 // assignment in CallsiteToCalleeFuncCloneMap.
4795 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
4796 const FuncInfo &CalleeFunc) {
4797 assert(Caller->hasCall());
4798 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
4799 };
4800
4801 // Information for a single clone of this Func.
4802 struct FuncCloneInfo {
4803 // The function clone.
4804 FuncInfo FuncClone;
4805 // Remappings of each call of interest (from original uncloned call to the
4806 // corresponding cloned call in this function clone).
4807 DenseMap<CallInfo, CallInfo> CallMap;
4808 };
4809
4810 // Map to keep track of information needed to update calls in function clones
4811 // when their corresponding callsite node was not itself cloned for that
4812 // function clone. Because of call context pruning (i.e. we only keep as much
4813 // caller information as needed to distinguish hot vs cold), we may not have
4814 // caller edges coming to each callsite node from all possible function
4815 // callers. A function clone may get created for other callsites in the
4816 // function for which there are caller edges that were not pruned. Any other
4817 // callsites in that function clone, which were not themselves cloned for
4818 // that function clone, should get updated the same way as the corresponding
4819 // callsite in the original function (which may call a clone of its callee).
4820 //
4821 // We build this map after completing function cloning for each function, so
4822 // that we can record the information from its call maps before they are
4823 // destructed. The map will be used as we update calls to update any still
4824 // unassigned call clones. Note that we may create new node clones as we clone
4825 // other functions, so later on we check which node clones were still not
4826 // created. To this end, the inner map is a map from function clone number to
4827 // the list of calls cloned for that function (can be more than one due to the
4828 // Node's MatchingCalls array).
4829 //
4830 // The alternative is creating new callsite clone nodes below as we clone the
4831 // function, but that is trickier to get right and likely adds more overhead.
4832 //
4833 // Inner map is a std::map so sorted by key (clone number), in order to get
4834 // ordered remarks in the full LTO case.
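// Illustrative example (hypothetical): foo() contains callsites C1 and C2,
// and cloning for C1 forces creation of a clone of foo(), while C2 itself was
// never cloned. The copy of C2's call inside the new function clone still
// exists and must be updated the same way as the original C2 (e.g. to call a
// clone of its callee). The map below records that cloned call, keyed by C2's
// node and the function clone number, so the final update walk can fix it up.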
4835 DenseMap<const ContextNode *, std::map<unsigned, SmallVector<CallInfo, 0>>>
4836 UnassignedCallClones;
4837
4838 // Walk all functions for which we saw calls with memprof metadata, and handle
4839 // cloning for each of their calls.
4840 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
4841 FuncInfo OrigFunc(Func);
4842 // Map from each clone number of OrigFunc to information about that function
4843 // clone (the function clone FuncInfo and call remappings). The index into
4844 // the vector is the clone number, as function clones are created and
4845 // numbered sequentially.
4846 std::vector<FuncCloneInfo> FuncCloneInfos;
4847 for (auto &Call : CallsWithMetadata) {
4848 ContextNode *Node = getNodeForInst(Call);
4849 // Skip call if we do not have a node for it (all uses of its stack ids
4850 // were either on inlined chains or pruned from the MIBs), or if we did
4851 // not create any clones for it.
4852 if (!Node || Node->Clones.empty())
4853 continue;
4854 assert(Node->hasCall() &&
4855 "Not having a call should have prevented cloning");
4856
4857 // Track the assignment of function clones to clones of the current
4858 // callsite Node being handled.
4859 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
4860
4861 // Assign callsite version CallsiteClone to function version FuncClone,
4862 // and also assign (possibly cloned) Call to CallsiteClone.
4863 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
4864 CallInfo &Call,
4865 ContextNode *CallsiteClone,
4866 bool IsAlloc) {
4867 // Record the clone of callsite node assigned to this function clone.
4868 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
4869
4870 assert(FuncCloneInfos.size() > FuncClone.cloneNo());
4871 DenseMap<CallInfo, CallInfo> &CallMap =
4872 FuncCloneInfos[FuncClone.cloneNo()].CallMap;
4873 CallInfo CallClone(Call);
4874 if (auto It = CallMap.find(Call); It != CallMap.end())
4875 CallClone = It->second;
4876 CallsiteClone->setCall(CallClone);
4877 // Need to do the same for all matching calls.
4878 for (auto &MatchingCall : Node->MatchingCalls) {
4879 CallInfo CallClone(MatchingCall);
4880 if (auto It = CallMap.find(MatchingCall); It != CallMap.end())
4881 CallClone = It->second;
4882 // Updates the call in the list.
4883 MatchingCall = CallClone;
4884 }
4885 };
4886
4887 // Invokes moveEdgeToNewCalleeClone which creates a new clone, and then
4888 // performs the necessary fixups (removing none type edges, and
4889 // importantly, propagating any function call assignment of the original
4890 // node to the new clone).
4891 auto MoveEdgeToNewCalleeCloneAndSetUp =
4892 [&](const std::shared_ptr<ContextEdge> &Edge) {
4893 ContextNode *OrigCallee = Edge->Callee;
4894 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge);
4895 removeNoneTypeCalleeEdges(NewClone);
4896 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
4897 // If the original Callee was already assigned to call a specific
4898 // function version, make sure its new clone is assigned to call
4899 // that same function clone.
4900 if (CallsiteToCalleeFuncCloneMap.count(OrigCallee))
4901 RecordCalleeFuncOfCallsite(
4902 NewClone, CallsiteToCalleeFuncCloneMap[OrigCallee]);
4903 return NewClone;
4904 };
4905
4906 // Keep track of the clones of callsite Node that need to be assigned to
4907 // function clones. This list may be expanded in the loop body below if we
4908 // find additional cloning is required.
4909 std::deque<ContextNode *> ClonesWorklist;
4910 // Ignore original Node if we moved all of its contexts to clones.
4911 if (!Node->emptyContextIds())
4912 ClonesWorklist.push_back(Node);
4913 llvm::append_range(ClonesWorklist, Node->Clones);
4914
4915 // Now walk through all of the clones of this callsite Node that we need,
4916 // and determine the assignment to a corresponding clone of the current
4917 // function (creating new function clones as needed).
4918 unsigned NodeCloneCount = 0;
4919 while (!ClonesWorklist.empty()) {
4920 ContextNode *Clone = ClonesWorklist.front();
4921 ClonesWorklist.pop_front();
4922 NodeCloneCount++;
4923 if (VerifyNodes)
4924 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
4925
4926 // Need to create a new function clone if we have more callsite clones
4927 // than existing function clones, which would have been assigned to an
4928 // earlier clone in the list (we assign callsite clones to function
4929 // clones greedily).
4930 if (FuncCloneInfos.size() < NodeCloneCount) {
4931 // If this is the first callsite copy, assign to original function.
4932 if (NodeCloneCount == 1) {
4933 // Since FuncCloneInfos is empty in this case, no clones have
4934 // been created for this function yet, and no callers should have
4935 // been assigned a function clone for this callee node yet.
4936 assert(llvm::none_of(
4937 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
4938 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
4939 }));
4940 // Initialize with empty call map, assign Clone to original function
4941 // and its callers, and skip to the next clone.
4942 FuncCloneInfos.push_back(
4943 {OrigFunc, DenseMap<CallInfo, CallInfo>()});
4944 AssignCallsiteCloneToFuncClone(
4945 OrigFunc, Call, Clone,
4946 AllocationCallToContextNodeMap.count(Call));
4947 for (auto &CE : Clone->CallerEdges) {
4948 // Ignore any caller that does not have a recorded callsite Call.
4949 if (!CE->Caller->hasCall())
4950 continue;
4951 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
4952 }
4953 continue;
4954 }
4955
4956 // First locate which copy of OrigFunc to clone again. If a caller
4957 // of this callsite clone was already assigned to call a particular
4958 // function clone, we need to redirect all of those callers to the
4959 // new function clone, and update their other callees within this
4960 // function.
4961 FuncInfo PreviousAssignedFuncClone;
4962 auto EI = llvm::find_if(
4963 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
4964 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
4965 });
4966 bool CallerAssignedToCloneOfFunc = false;
4967 if (EI != Clone->CallerEdges.end()) {
4968 const std::shared_ptr<ContextEdge> &Edge = *EI;
4969 PreviousAssignedFuncClone =
4970 CallsiteToCalleeFuncCloneMap[Edge->Caller];
4971 CallerAssignedToCloneOfFunc = true;
4972 }
4973
4974 // Clone function and save it along with the CallInfo map created
4975 // during cloning in the FuncCloneInfos.
4976 DenseMap<CallInfo, CallInfo> NewCallMap;
4977 unsigned CloneNo = FuncCloneInfos.size();
4978 assert(CloneNo > 0 && "Clone 0 is the original function, which "
4979 "should already exist in the map");
4980 FuncInfo NewFuncClone = cloneFunctionForCallsite(
4981 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
4982 FuncCloneInfos.push_back({NewFuncClone, std::move(NewCallMap)});
4983 FunctionClonesAnalysis++;
4984 Changed = true;
4985
4986 // If no caller callsites were already assigned to a clone of this
4987 // function, we can simply assign this clone to the new func clone
4988 // and update all callers to it, then skip to the next clone.
4989 if (!CallerAssignedToCloneOfFunc) {
4990 AssignCallsiteCloneToFuncClone(
4991 NewFuncClone, Call, Clone,
4992 AllocationCallToContextNodeMap.count(Call));
4993 for (auto &CE : Clone->CallerEdges) {
4994 // Ignore any caller that does not have a recorded callsite Call.
4995 if (!CE->Caller->hasCall())
4996 continue;
4997 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
4998 }
4999 continue;
5000 }
5001
5002 // We may need to do additional node cloning in this case.
5003 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
5004 // that were previously assigned to call PreviousAssignedFuncClone,
5005 // to record that they now call NewFuncClone.
5006 // The none type edge removal may remove some of this Clone's caller
5007 // edges, if it is reached via another of its caller's callees.
5008 // Iterate over a copy and skip any that were removed.
5009 auto CallerEdges = Clone->CallerEdges;
5010 for (auto CE : CallerEdges) {
5011 // Skip any that have been removed on an earlier iteration.
5012 if (CE->isRemoved()) {
5013 assert(!is_contained(Clone->CallerEdges, CE));
5014 continue;
5015 }
5016 assert(CE);
5017 // Ignore any caller that does not have a recorded callsite Call.
5018 if (!CE->Caller->hasCall())
5019 continue;
5020
5021 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
5022 // We subsequently fall through to later handling that
5023 // will perform any additional cloning required for
5024 // callers that were calling other function clones.
5025 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
5026 PreviousAssignedFuncClone)
5027 continue;
5028
5029 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
5030
5031 // If we are cloning a function that was already assigned to some
5032 // callers, then essentially we are creating new callsite clones
5033 // of the other callsites in that function that are reached by those
5034 // callers. Clone the other callees of the current callsite's caller
5035 // that were already assigned to PreviousAssignedFuncClone
5036 // accordingly. This is important since we subsequently update the
5037 // calls from the nodes in the graph and their assignments to callee
5038 // functions recorded in CallsiteToCalleeFuncCloneMap.
5039 // The none type edge removal may remove some of this caller's
5040 // callee edges, if it is reached via another of its callees.
5041 // Iterate over a copy and skip any that were removed.
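// Illustrative example (hypothetical): caller X previously called the
// function clone containing this callsite and also reaches a sibling
// callsite S in the same function. Now that X is redirected to NewFuncClone,
// S must be represented by a callsite clone whose call points at the copy of
// S inside NewFuncClone; the loop below creates those clones and remaps
// their calls via the CallMap recorded for NewFuncClone.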
5042 auto CalleeEdges = CE->Caller->CalleeEdges;
5043 for (auto CalleeEdge : CalleeEdges) {
5044 // Skip any that have been removed on an earlier iteration when
5045 // cleaning up newly None type callee edges.
5046 if (CalleeEdge->isRemoved()) {
5047 assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge));
5048 continue;
5049 }
5050 assert(CalleeEdge);
5051 ContextNode *Callee = CalleeEdge->Callee;
5052 // Skip the current callsite; we are looking for other
5053 // callsites Caller calls, as well as any that does not have a
5054 // recorded callsite Call.
5055 if (Callee == Clone || !Callee->hasCall())
5056 continue;
5057 // Skip direct recursive calls. We don't need/want to clone the
5058 // caller node again, and this loop will not behave as expected if
5059 // we tried.
5060 if (Callee == CalleeEdge->Caller)
5061 continue;
5062 ContextNode *NewClone =
5063 MoveEdgeToNewCalleeCloneAndSetUp(CalleeEdge);
5064 // Moving the edge may have resulted in some none type
5065 // callee edges on the original Callee.
5066 removeNoneTypeCalleeEdges(Callee);
5067 // Update NewClone with the new Call clone of this callsite's Call
5068 // created for the new function clone created earlier.
5069 // Recall that we have already ensured when building the graph
5070 // that each caller can only call callsites within the same
5071 // function, so we are guaranteed that Callee Call is in the
5072 // current OrigFunc.
5073 // CallMap is set up as indexed by original Call at clone 0.
5074 CallInfo OrigCall(Callee->getOrigNode()->Call);
5075 OrigCall.setCloneNo(0);
5076 DenseMap<CallInfo, CallInfo> &CallMap =
5077 FuncCloneInfos[NewFuncClone.cloneNo()].CallMap;
5078 assert(CallMap.count(OrigCall));
5079 CallInfo NewCall(CallMap[OrigCall]);
5080 assert(NewCall);
5081 NewClone->setCall(NewCall);
5082 // Need to do the same for all matching calls.
5083 for (auto &MatchingCall : NewClone->MatchingCalls) {
5084 CallInfo OrigMatchingCall(MatchingCall);
5085 OrigMatchingCall.setCloneNo(0);
5086 assert(CallMap.count(OrigMatchingCall));
5087 CallInfo NewCall(CallMap[OrigMatchingCall]);
5088 assert(NewCall);
5089 // Updates the call in the list.
5090 MatchingCall = NewCall;
5091 }
5092 }
5093 }
5094 // Fall through to handling below to perform the recording of the
5095 // function for this callsite clone. This enables handling of cases
5096 // where the callers were assigned to different clones of a function.
5097 }
5098
5099 auto FindFirstAvailFuncClone = [&]() {
5100 // Find first function in FuncCloneInfos without an assigned
5101 // clone of this callsite Node. We should always have one
5102 // available at this point due to the earlier cloning when the
5103 // FuncCloneInfos size was smaller than the clone number.
5104 for (auto &CF : FuncCloneInfos) {
5105 if (!FuncCloneToCurNodeCloneMap.count(CF.FuncClone))
5106 return CF.FuncClone;
5107 }
5108 llvm_unreachable(
5109 "Expected an available func clone for this callsite clone");
5110 };
5111
5112 // See if we can use existing function clone. Walk through
5113 // all caller edges to see if any have already been assigned to
5114 // a clone of this callsite's function. If we can use it, do so. If not,
5115 // because that function clone is already assigned to a different clone
5116 // of this callsite, then we need to clone again.
5117 // Basically, this checking is needed to handle the case where different
5118 // caller functions/callsites may need versions of this function
5119 // containing different mixes of callsite clones across the different
5120 // callsites within the function. If that happens, we need to create
5121 // additional function clones to handle the various combinations.
5122 //
5123 // Keep track of any new clones of this callsite created by the
5124 // following loop, as well as any existing clone that we decided to
5125 // assign this clone to.
5126 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
5127 FuncInfo FuncCloneAssignedToCurCallsiteClone;
5128 // Iterate over a copy of Clone's caller edges, since we may need to
5129 // remove edges in the moveEdgeTo* methods, and this simplifies the
5130 // handling and makes it less error-prone.
5131 auto CloneCallerEdges = Clone->CallerEdges;
5132 for (auto &Edge : CloneCallerEdges) {
5133 // Skip removed edges (due to direct recursive edges updated when
5134 // updating callee edges when moving an edge and subsequently
5135 // removed by call to removeNoneTypeCalleeEdges on the Clone).
5136 if (Edge->isRemoved())
5137 continue;
5138 // Ignore any caller that does not have a recorded callsite Call.
5139 if (!Edge->Caller->hasCall())
5140 continue;
5141 // If this caller already assigned to call a version of OrigFunc, need
5142 // to ensure we can assign this callsite clone to that function clone.
5143 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
5144 FuncInfo FuncCloneCalledByCaller =
5145 CallsiteToCalleeFuncCloneMap[Edge->Caller];
5146 // First we need to confirm that this function clone is available
5147 // for use by this callsite node clone.
5148 //
5149 // While FuncCloneToCurNodeCloneMap is built only for this Node and
5150 // its callsite clones, one of those callsite clones X could have
5151 // been assigned to the same function clone called by Edge's caller
5152 // - if Edge's caller calls another callsite within Node's original
5153 // function, and that callsite has another caller reaching clone X.
5154 // We need to clone Node again in this case.
5155 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
5156 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
5157 Clone) ||
5158 // Detect when we have multiple callers of this callsite that
5159 // have already been assigned to specific, and different, clones
5160 // of OrigFunc (due to other unrelated callsites in Func they
5161 // reach via call contexts). Is this Clone of callsite Node
5162 // assigned to a different clone of OrigFunc? If so, clone Node
5163 // again.
5164 (FuncCloneAssignedToCurCallsiteClone &&
5165 FuncCloneAssignedToCurCallsiteClone !=
5166 FuncCloneCalledByCaller)) {
5167 // We need to use a different newly created callsite clone, in
5168 // order to assign it to another new function clone on a
5169 // subsequent iteration over the Clones array (adjusted below).
5170 // Note we specifically do not reset the
5171 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
5172 // when this new clone is processed later we know which version of
5173 // the function to copy (so that other callsite clones we have
5174 // assigned to that function clone are properly cloned over). See
5175 // comments in the function cloning handling earlier.
5176
5177 // Check if we already have cloned this callsite again while
5178 // walking through caller edges, for a caller calling the same
5179 // function clone. If so, we can move this edge to that new clone
5180 // rather than creating yet another new clone.
5181 if (FuncCloneToNewCallsiteCloneMap.count(
5182 FuncCloneCalledByCaller)) {
5183 ContextNode *NewClone =
5184 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
5185 moveEdgeToExistingCalleeClone(Edge, NewClone);
5186 // Cleanup any none type edges cloned over.
5187 removeNoneTypeCalleeEdges(NewClone);
5188 } else {
5189 // Create a new callsite clone.
5190 ContextNode *NewClone = MoveEdgeToNewCalleeCloneAndSetUp(Edge);
5191 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
5192 NewClone;
5193 // Add to list of clones and process later.
5194 ClonesWorklist.push_back(NewClone);
5195 }
5196 // Moving the caller edge may have resulted in some none type
5197 // callee edges.
5198 removeNoneTypeCalleeEdges(Clone);
5199 // We will handle the newly created callsite clone in a subsequent
5200 // iteration over this Node's Clones.
5201 continue;
5202 }
5203
5204 // Otherwise, we can use the function clone already assigned to this
5205 // caller.
5206 if (!FuncCloneAssignedToCurCallsiteClone) {
5207 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
5208 // Assign Clone to FuncCloneCalledByCaller
5209 AssignCallsiteCloneToFuncClone(
5210 FuncCloneCalledByCaller, Call, Clone,
5211 AllocationCallToContextNodeMap.count(Call));
5212 } else
5213 // Don't need to do anything - callsite is already calling this
5214 // function clone.
5215 assert(FuncCloneAssignedToCurCallsiteClone ==
5216 FuncCloneCalledByCaller);
5217
5218 } else {
5219 // We have not already assigned this caller to a version of
5220 // OrigFunc. Do the assignment now.
5221
5222 // First check if we have already assigned this callsite clone to a
5223 // clone of OrigFunc for another caller during this iteration over
5224 // its caller edges.
5225 if (!FuncCloneAssignedToCurCallsiteClone) {
5226 FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
5227 assert(FuncCloneAssignedToCurCallsiteClone);
5228 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
5229 AssignCallsiteCloneToFuncClone(
5230 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
5231 AllocationCallToContextNodeMap.count(Call));
5232 } else
5233 assert(FuncCloneToCurNodeCloneMap
5234 [FuncCloneAssignedToCurCallsiteClone] == Clone);
5235 // Update callers to record function version called.
5236 RecordCalleeFuncOfCallsite(Edge->Caller,
5237 FuncCloneAssignedToCurCallsiteClone);
5238 }
5239 }
5240 // If we didn't assign a function clone to this callsite clone yet, e.g.
5241 // none of its callers has a non-null call, do the assignment here.
5242 // We want to ensure that every callsite clone is assigned to some
5243 // function clone, so that the call updates below work as expected.
5244 // In particular if this is the original callsite, we want to ensure it
5245 // is assigned to the original function, otherwise the original function
5246 // will appear available for assignment to other callsite clones,
5247 // leading to unintended effects. For one, the unknown and not updated
5248 // callers will call into cloned paths leading to the wrong hints,
5249 // because they still call the original function (clone 0). Also,
5250 // because all callsites start out as being clone 0 by default, we can't
5251 // easily distinguish between callsites explicitly assigned to clone 0
5252 // vs those never assigned, which can lead to multiple updates of the
5253 // calls when invoking updateCall below, with mismatched clone values.
5254 // TODO: Add a flag to the callsite nodes or some other mechanism to
5255 // better distinguish and identify callsite clones that are not getting
5256 // assigned to function clones as expected.
5257 if (!FuncCloneAssignedToCurCallsiteClone) {
5258 FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
5259 assert(FuncCloneAssignedToCurCallsiteClone &&
5260 "No available func clone for this callsite clone");
5261 AssignCallsiteCloneToFuncClone(
5262 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
5263 /*IsAlloc=*/AllocationCallToContextNodeMap.contains(Call));
5264 }
5265 }
5266 if (VerifyCCG) {
5267 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
5268 for (const auto &PE : Node->CalleeEdges)
5269 checkEdge<DerivedCCG, FuncTy, CallTy>(PE);
5270 for (const auto &CE : Node->CallerEdges)
5271 checkEdge<DerivedCCG, FuncTy, CallTy>(CE);
5272 for (auto *Clone : Node->Clones) {
5273 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
5274 for (const auto &PE : Clone->CalleeEdges)
5275 checkEdge<DerivedCCG, FuncTy, CallTy>(PE);
5276 for (const auto &CE : Clone->CallerEdges)
5277 checkEdge<DerivedCCG, FuncTy, CallTy>(CE);
5278 }
5279 }
5280 }
5281
5282 if (FuncCloneInfos.size() < 2)
5283 continue;
5284
5285 // In this case there is more than just the original function copy.
5286 // Record call clones of any callsite nodes in the function that did not
5287 // themselves get cloned for all of the function clones.
5288 for (auto &Call : CallsWithMetadata) {
5289 ContextNode *Node = getNodeForInst(Call);
5290 if (!Node || !Node->hasCall() || Node->emptyContextIds())
5291 continue;
5292 // If Node has enough clones already to cover all function clones, we can
5293 // skip it (adding one to account for the original copy).
5294 // Use >= in case there were clones that were skipped due to having empty
5295 // context ids.
5296 if (Node->Clones.size() + 1 >= FuncCloneInfos.size())
5297 continue;
5298 // First collect all function clones we cloned this callsite node for.
5299 // They may not be sequential, e.g. due to clones skipped for having empty
// context ids.
5300 DenseSet<unsigned> NodeCallClones;
5301 for (auto *C : Node->Clones)
5302 NodeCallClones.insert(C->Call.cloneNo());
5303 unsigned I = 0;
5304 // Now check all the function clones.
5305 for (auto &FC : FuncCloneInfos) {
5306 // Function clones should be sequential.
5307 assert(FC.FuncClone.cloneNo() == I);
5308 // Skip the first clone which got the original call.
5309 // Also skip any other clones created for this Node.
5310 if (++I == 1 || NodeCallClones.contains(I)) {
5311 continue;
5312 }
5313 // Record the call clones created for this callsite in this function
5314 // clone.
5315 auto &CallVector = UnassignedCallClones[Node][I];
5316 DenseMap<CallInfo, CallInfo> &CallMap = FC.CallMap;
5317 if (auto It = CallMap.find(Call); It != CallMap.end()) {
5318 CallInfo CallClone = It->second;
5319 CallVector.push_back(CallClone);
5320 } else {
5321 // All but the original clone (skipped earlier) should have an entry
5322 // for all calls.
5323 assert(false && "Expected to find call in CallMap");
5324 }
5325 // Need to do the same for all matching calls.
5326 for (auto &MatchingCall : Node->MatchingCalls) {
5327 if (auto It = CallMap.find(MatchingCall); It != CallMap.end()) {
5328 CallInfo CallClone = It->second;
5329 CallVector.push_back(CallClone);
5330 } else {
5331 // All but the original clone (skipped earlier) should have an entry
5332 // for all calls.
5333 assert(false && "Expected to find call in CallMap");
5334 }
5335 }
5336 }
5337 }
5338 }
5339
5340 uint8_t BothTypes =
5341 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
5342
5343 auto UpdateCalls = [&](ContextNode *Node,
5344 DenseSet<const ContextNode *> &Visited,
5345 auto &&UpdateCalls) {
5346 auto Inserted = Visited.insert(Node);
5347 if (!Inserted.second)
5348 return;
5349
5350 for (auto *Clone : Node->Clones)
5351 UpdateCalls(Clone, Visited, UpdateCalls);
5352
5353 for (auto &Edge : Node->CallerEdges)
5354 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
5355
5356 // Skip if there is either no call to update, or if we ended up with no
5357 // context ids (we moved all edges onto other clones).
5358 if (!Node->hasCall() || Node->emptyContextIds())
5359 return;
5360
5361 if (Node->IsAllocation) {
5362 auto AT = allocTypeToUse(Node->AllocTypes);
5363 // If the allocation type is ambiguous, and more aggressive hinting
5364 // has been enabled via the MinClonedColdBytePercent flag, see if this
5365 // allocation should be hinted cold anyway because its fraction of cold
5366 // bytes allocated is at least the given threshold.
5367 if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
5368 !ContextIdToContextSizeInfos.empty()) {
5369 uint64_t TotalCold = 0;
5370 uint64_t Total = 0;
5371 for (auto Id : Node->getContextIds()) {
5372 auto TypeI = ContextIdToAllocationType.find(Id);
5373 assert(TypeI != ContextIdToAllocationType.end());
5374 auto CSI = ContextIdToContextSizeInfos.find(Id);
5375 if (CSI != ContextIdToContextSizeInfos.end()) {
5376 for (auto &Info : CSI->second) {
5377 Total += Info.TotalSize;
5378 if (TypeI->second == AllocationType::Cold)
5379 TotalCold += Info.TotalSize;
5380 }
5381 }
5382 }
5383 if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
5384 AT = AllocationType::Cold;
5385 }
5386 updateAllocationCall(Node->Call, AT);
5387 assert(Node->MatchingCalls.empty());
5388 return;
5389 }
5390
5391 if (!CallsiteToCalleeFuncCloneMap.count(Node))
5392 return;
5393
5394 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
5395 updateCall(Node->Call, CalleeFunc);
5396 // Update all the matching calls as well.
5397 for (auto &Call : Node->MatchingCalls)
5398 updateCall(Call, CalleeFunc);
5399
5400 // Now update all calls recorded earlier that are still in function clones
5401 // which don't have a clone of this callsite node.
5402 if (!UnassignedCallClones.contains(Node))
5403 return;
5404 DenseSet<unsigned> NodeCallClones;
5405 for (auto *C : Node->Clones)
5406 NodeCallClones.insert(C->Call.cloneNo());
5407 // Note that we already confirmed Node is in this map a few lines above.
5408 auto &ClonedCalls = UnassignedCallClones[Node];
5409 for (auto &[CloneNo, CallVector] : ClonedCalls) {
5410 // Should start at 1, as we never create an entry for the original node.
5411 assert(CloneNo > 0);
5412 // If we subsequently created a clone, skip this one.
5413 if (NodeCallClones.contains(CloneNo))
5414 continue;
5415 // Use the original Node's CalleeFunc.
5416 for (auto &Call : CallVector)
5417 updateCall(Call, CalleeFunc);
5418 }
5419 };
5420
5421 // Performs DFS traversal starting from allocation nodes to update calls to
5422 // reflect cloning decisions recorded earlier. For regular LTO this will
5423 // update the actual calls in the IR to call the appropriate function clone
5424 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
5425 // are recorded in the summary entries.
5426 DenseSet<const ContextNode *> Visited;
5427 for (auto &Entry : AllocationCallToContextNodeMap)
5428 UpdateCalls(Entry.second, Visited, UpdateCalls);
5429
5430 return Changed;
5431}
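// For illustration, the MinClonedColdBytePercent decision made in UpdateCalls
// above reduces to an integer comparison that avoids floating point. A minimal
// standalone sketch (the helper name and parameters are assumptions for the
// example, not part of this pass):
static bool shouldHintColdDespiteAmbiguity(uint64_t ColdBytes,
                                           uint64_t TotalBytes,
                                           unsigned MinColdBytePercent) {
  // E.g. ColdBytes = 90, TotalBytes = 100, MinColdBytePercent = 80:
  // 90 * 100 >= 100 * 80 holds, so the ambiguous allocation is hinted cold.
  return ColdBytes * 100 >= TotalBytes * MinColdBytePercent;
}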
5432
5433// Compute a SHA1 hash of the callsite and alloc version information of clone I
5434// in the summary, to use in detection of duplicate clones.
5435 uint64_t ComputeHash(const FunctionSummary *FS, unsigned I) {
5436 SHA1 Hasher;
5437 // Update hash with any callsites that call non-default (non-zero) callee
5438 // versions.
5439 for (auto &SN : FS->callsites()) {
5440 // In theory all callsites and allocs in this function should have the same
5441 // number of clone entries, but handle any discrepancies gracefully below
5442 // for NDEBUG builds.
5443 assert(
5444 SN.Clones.size() > I &&
5445 "Callsite summary has fewer entries than other summaries in function");
5446 if (SN.Clones.size() <= I || !SN.Clones[I])
5447 continue;
5448 uint8_t Data[sizeof(SN.Clones[I])];
5449 support::endian::write32le(Data, SN.Clones[I]);
5450 Hasher.update(Data);
5451 }
5452 // Update hash with any allocs that have non-default (non-None) hints.
5453 for (auto &AN : FS->allocs()) {
5454 // In theory all callsites and allocs in this function should have the same
5455 // number of clone entries, but handle any discrepancies gracefully below
5456 // for NDEBUG builds.
5457 assert(AN.Versions.size() > I &&
5458 "Alloc summary has fewer entries than other summaries in function");
5459 if (AN.Versions.size() <= I ||
5460 (AllocationType)AN.Versions[I] == AllocationType::None)
5461 continue;
5462 Hasher.update(ArrayRef<uint8_t>(&AN.Versions[I], 1));
5463 }
5464 return support::endian::read64le(Hasher.result().data());
5465}
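// For illustration, the duplicate detection that the hash above enables can be
// exercised in isolation: two clones whose per-callsite callee versions and
// per-alloc hints match produce the same digest, so the second can be emitted
// as an alias. The flattened vector inputs and helper name are assumptions for
// the example, not the types used by this pass:
static uint64_t hashCloneShape(ArrayRef<uint32_t> CalleeVersions,
                               ArrayRef<uint8_t> AllocHints) {
  SHA1 Hasher;
  for (uint32_t V : CalleeVersions) {
    if (!V)
      continue; // Calls to the original (version 0) callee don't affect it.
    uint8_t Data[4];
    support::endian::write32le(Data, V);
    Hasher.update(Data);
  }
  for (const uint8_t &H : AllocHints)
    if (H) // Skip AllocationType::None entries.
      Hasher.update(ArrayRef<uint8_t>(&H, 1));
  // Truncate the 160-bit SHA1 digest to 64 bits, as ComputeHash does.
  return support::endian::read64le(Hasher.result().data());
}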
5466
5467 static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
5468 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
5469 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
5470 &FuncToAliasMap,
5471 FunctionSummary *FS) {
5472 auto TakeDeclNameAndReplace = [](GlobalValue *DeclGV, GlobalValue *NewGV) {
5473 // We might have created this when adjusting a callsite in another
5474 // function. It should be a declaration.
5475 assert(DeclGV->isDeclaration());
5476 NewGV->takeName(DeclGV);
5477 DeclGV->replaceAllUsesWith(NewGV);
5478 DeclGV->eraseFromParent();
5479 };
5480
5481 // Handle aliases to this function, and create analogous alias clones to the
5482 // provided clone of this function.
5483 auto CloneFuncAliases = [&](Function *NewF, unsigned I) {
5484 if (!FuncToAliasMap.count(&F))
5485 return;
5486 for (auto *A : FuncToAliasMap[&F]) {
5487 std::string AliasName = getMemProfFuncName(A->getName(), I);
5488 auto *PrevA = M.getNamedAlias(AliasName);
5489 auto *NewA = GlobalAlias::create(A->getValueType(),
5490 A->getType()->getPointerAddressSpace(),
5491 A->getLinkage(), AliasName, NewF);
5492 NewA->copyAttributesFrom(A);
5493 if (PrevA)
5494 TakeDeclNameAndReplace(PrevA, NewA);
5495 }
5496 };
5497
5498 // The first "clone" is the original copy; we should only call this if we
5499 // needed to create new clones.
5500 assert(NumClones > 1);
5501 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
5502 VMaps.reserve(NumClones - 1);
5503 FunctionsClonedThinBackend++;
5504
5505 // Map of hash of callsite/alloc versions to the instantiated function clone
5506 // (possibly the original) implementing those calls. Used to avoid
5507 // instantiating duplicate function clones.
5508 // FIXME: Ideally the thin link would not generate such duplicate clones to
5509 // start with, but right now it happens due to phase ordering in the function
5510 // assignment and the possible new clones it produces. We simply make each
5511 // duplicate an alias to the matching instantiated clone recorded in the map
5512 // (except for available_externally which are made declarations as they would
5513 // be aliases in the prevailing module, and available_externally aliases are
5514 // not well supported right now).
5515 DenseMap<uint64_t, Function *> HashToFunc;
5516
5517 // Save the hash of the original function version.
5518 HashToFunc[ComputeHash(FS, 0)] = &F;
5519
5520 for (unsigned I = 1; I < NumClones; I++) {
5521 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
5522 std::string Name = getMemProfFuncName(F.getName(), I);
5523 auto Hash = ComputeHash(FS, I);
5524 // If this clone would duplicate a previously seen clone, don't generate the
5525 // duplicate clone body, just make an alias to satisfy any (potentially
5526 // cross-module) references.
5527 if (HashToFunc.contains(Hash)) {
5528 FunctionCloneDuplicatesThinBackend++;
5529 auto *Func = HashToFunc[Hash];
5530 if (Func->hasAvailableExternallyLinkage()) {
5531 // Skip these as EliminateAvailableExternallyPass does not handle
5532 // available_externally aliases correctly and we end up with an
5533 // available_externally alias to a declaration. Just create a
5534 // declaration for now as we know we will have a definition in another
5535 // module.
5536 auto Decl = M.getOrInsertFunction(Name, Func->getFunctionType());
5537 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5538 << "created clone decl " << ore::NV("Decl", Decl.getCallee()));
5539 continue;
5540 }
5541 auto *PrevF = M.getFunction(Name);
5542 auto *Alias = GlobalAlias::create(Name, Func);
5543 if (PrevF)
5544 TakeDeclNameAndReplace(PrevF, Alias);
5545 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5546 << "created clone alias " << ore::NV("Alias", Alias));
5547
5548 // Now handle aliases to this function, and clone those as well.
5549 CloneFuncAliases(Func, I);
5550 continue;
5551 }
5552 auto *NewF = CloneFunction(&F, *VMaps.back());
5553 HashToFunc[Hash] = NewF;
5554 FunctionClonesThinBackend++;
5555 // Strip memprof and callsite metadata from clone as they are no longer
5556 // needed.
5557 for (auto &BB : *NewF) {
5558 for (auto &Inst : BB) {
5559 Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
5560 Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
5561 }
5562 }
5563 auto *PrevF = M.getFunction(Name);
5564 if (PrevF)
5565 TakeDeclNameAndReplace(PrevF, NewF);
5566 else
5567 NewF->setName(Name);
5568 updateSubprogramLinkageName(NewF, Name);
5569 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5570 << "created clone " << ore::NV("NewFunction", NewF));
5571
5572 // Now handle aliases to this function, and clone those as well.
5573 CloneFuncAliases(NewF, I);
5574 }
5575 return VMaps;
5576}
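// For illustration, the duplicate handling above follows a small decision
// rule; a minimal sketch (the enum and helper are assumptions for the example,
// not part of this pass):
enum class DuplicateCloneAction { EmitBody, EmitAlias, EmitDeclaration };
static DuplicateCloneAction classifyClone(bool HashSeenBefore,
                                          bool MatchIsAvailableExternally) {
  if (!HashSeenBefore)
    return DuplicateCloneAction::EmitBody; // First clone with this shape.
  // available_externally aliases are not well supported, so fall back to a
  // declaration; the prevailing copy of the clone provides the definition.
  if (MatchIsAvailableExternally)
    return DuplicateCloneAction::EmitDeclaration;
  return DuplicateCloneAction::EmitAlias; // Satisfy references via an alias.
}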
5577
5578// Locate the summary for F. This is complicated by the fact that it might
5579// have been internalized or promoted.
5580 static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
5581 const ModuleSummaryIndex *ImportSummary,
5582 const Function *CallingFunc = nullptr) {
5583 // FIXME: Ideally we would retain the original GUID in some fashion on the
5584 // function (e.g. as metadata), but for now do our best to locate the
5585 // summary without that information.
5586 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
5587 if (!TheFnVI)
5588 // See if the function was internalized, by checking the index directly
5589 // with its original name (this avoids the name adjustment done by
5590 // getGUID() for internal symbols).
5591 TheFnVI = ImportSummary->getValueInfo(
5592 GlobalValue::getGUIDAssumingExternalLinkage(F.getName()));
5593 if (TheFnVI)
5594 return TheFnVI;
5595 // Now query with the original name before any promotion was performed.
5596 StringRef OrigName =
5597 ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
5598 // When this pass is enabled, we always add thinlto_src_file provenance
5599 // metadata to imported function definitions, which allows us to recreate the
5600 // original internal symbol's GUID.
5601 auto SrcFileMD = F.getMetadata("thinlto_src_file");
5602 // If this is a call to an imported/promoted local for which we didn't import
5603 // the definition, the metadata will not exist on the declaration. However,
5604 // since we are doing this early, before any inlining in the LTO backend, we
5605 // can simply look at the metadata on the calling function which must have
5606 // been from the same module if F was an internal symbol originally.
5607 if (!SrcFileMD && F.isDeclaration()) {
5608 // We would only call this for a declaration for a direct callsite, in which
5609 // case the caller would have provided the calling function pointer.
5610 assert(CallingFunc);
5611 SrcFileMD = CallingFunc->getMetadata("thinlto_src_file");
5612 // If this is a promoted local (OrigName != F.getName()), since this is a
5613 // declaration, it must be imported from a different module and therefore we
5614 // should always find the metadata on its calling function. Any call to a
5615 // promoted local that came from this module should still be a definition.
5616 assert(SrcFileMD || OrigName == F.getName());
5617 }
5618 StringRef SrcFile = M.getSourceFileName();
5619 if (SrcFileMD)
5620 SrcFile = dyn_cast<MDString>(SrcFileMD->getOperand(0))->getString();
5621 std::string OrigId = GlobalValue::getGlobalIdentifier(
5622 OrigName, GlobalValue::InternalLinkage, SrcFile);
5623 TheFnVI = ImportSummary->getValueInfo(
5624 GlobalValue::getGUIDAssumingExternalLinkage(OrigId));
5625 // Internal func in original module may have gotten a numbered suffix if we
5626 // imported an external function with the same name. This happens
5627 // automatically during IR linking for naming conflicts. It would have to
5628 // still be internal in that case (otherwise it would have been renamed on
5629 // promotion in which case we wouldn't have a naming conflict).
5630 if (!TheFnVI && OrigName == F.getName() && F.hasLocalLinkage() &&
5631 F.getName().contains('.')) {
5632 OrigName = F.getName().rsplit('.').first;
5633 OrigId = GlobalValue::getGlobalIdentifier(
5634 OrigName, GlobalValue::InternalLinkage, SrcFile);
5635 TheFnVI = ImportSummary->getValueInfo(
5636 GlobalValue::getGUIDAssumingExternalLinkage(OrigId));
5637 }
5638 // The only way we may not have a VI is if this is a declaration created for
5639 // an imported reference. For distributed ThinLTO we may not have a VI for
5640 // such declarations in the distributed summary.
5641 assert(TheFnVI || F.isDeclaration());
5642 return TheFnVI;
5643}
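// For illustration, recreating the pre-promotion GUID of a local symbol, as
// done above, combines the original (unpromoted) name with the source file
// recorded in the thinlto_src_file metadata; a minimal sketch (the helper name
// is an assumption for the example):
static GlobalValue::GUID guidOfOriginalLocal(StringRef OrigName,
                                             StringRef SrcFile) {
  // Locals are keyed by an identifier that folds in the defining file, so a
  // local "bar" in a.cpp and a local "bar" in b.cpp get distinct GUIDs.
  std::string OrigId = GlobalValue::getGlobalIdentifier(
      OrigName, GlobalValue::InternalLinkage, SrcFile);
  return GlobalValue::getGUIDAssumingExternalLinkage(OrigId);
}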
5644
5645bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo(
5646 Module &M) {
5647 ICallAnalysis = std::make_unique<ICallPromotionAnalysis>();
5648 Symtab = std::make_unique<InstrProfSymtab>();
5649 // Don't add canonical names, to avoid adding multiple functions to the
5650 // symtab when they share the same root name with "." suffixes stripped.
5651 // If we pick the wrong one then this could lead to incorrect ICP and calling
5652 // a memprof clone that we don't actually create (resulting in linker unsats).
5653 // What this means is that the GUID of the function (or its PGOFuncName
5654 // metadata) *must* match that in the VP metadata to allow promotion.
5655 // In practice this should not be a limitation, since local functions should
5656 // have PGOFuncName metadata and global function names shouldn't need any
5657 // special handling (they should not get the ".llvm.*" suffix that the
5658 // canonicalization handling is attempting to strip).
5659 if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) {
5660 std::string SymtabFailure = toString(std::move(E));
5661 M.getContext().emitError("Failed to create symtab: " + SymtabFailure);
5662 return false;
5663 }
5664 return true;
5665}
5666
5667#ifndef NDEBUG
5668// Sanity check that the MIB stack ids match between the summary and
5669// instruction metadata.
5670 static void checkAllocContextIds(
5671 const AllocInfo &AllocNode, const MDNode *MemProfMD,
5672 const CallStack<MDNode, MDNode::op_iterator> &CallsiteContext,
5673 const ModuleSummaryIndex *ImportSummary) {
5674 auto MIBIter = AllocNode.MIBs.begin();
5675 for (auto &MDOp : MemProfMD->operands()) {
5676 assert(MIBIter != AllocNode.MIBs.end());
5677 auto StackIdIndexIter = MIBIter->StackIdIndices.begin();
5678 auto *MIBMD = cast<const MDNode>(MDOp);
5679 MDNode *StackMDNode = getMIBStackNode(MIBMD);
5680 assert(StackMDNode);
5681 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
5682 auto ContextIterBegin =
5683 StackContext.beginAfterSharedPrefix(CallsiteContext);
5684 // Skip the checking on the first iteration.
5685 uint64_t LastStackContextId =
5686 (ContextIterBegin != StackContext.end() && *ContextIterBegin == 0) ? 1
5687 : 0;
5688 for (auto ContextIter = ContextIterBegin; ContextIter != StackContext.end();
5689 ++ContextIter) {
5690 // If this is a direct recursion, simply skip the duplicate
5691 // entries, to be consistent with how the summary ids were
5692 // generated during ModuleSummaryAnalysis.
5693 if (LastStackContextId == *ContextIter)
5694 continue;
5695 LastStackContextId = *ContextIter;
5696 assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
5697 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
5698 *ContextIter);
5699 StackIdIndexIter++;
5700 }
5701 MIBIter++;
5702 }
5703}
5704#endif
5705
5706bool MemProfContextDisambiguation::applyImport(Module &M) {
5707 assert(ImportSummary);
5708 bool Changed = false;
5709
5710 // We also need to clone any aliases that reference cloned functions, because
5711 // the modified callsites may invoke via the alias. Keep track of the aliases
5712 // for each function.
5713 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
5714 FuncToAliasMap;
5715 for (auto &A : M.aliases()) {
5716 auto *Aliasee = A.getAliaseeObject();
5717 if (auto *F = dyn_cast<Function>(Aliasee))
5718 FuncToAliasMap[F].insert(&A);
5719 }
5720
5721 if (!initializeIndirectCallPromotionInfo(M))
5722 return false;
5723
5724 for (auto &F : M) {
5725 if (F.isDeclaration() || isMemProfClone(F))
5726 continue;
5727
5728 OptimizationRemarkEmitter ORE(&F);
5729
5730 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
5731 bool ClonesCreated = false;
5732 unsigned NumClonesCreated = 0;
5733 auto CloneFuncIfNeeded = [&](unsigned NumClones, FunctionSummary *FS) {
5734 // We should at least have version 0 which is the original copy.
5735 assert(NumClones > 0);
5736 // If only one copy is needed, use the original.
5737 if (NumClones == 1)
5738 return;
5739 // If we already performed cloning of this function, confirm that the
5740 // requested number of clones matches (the thin link should ensure the
5741 // number of clones for each constituent callsite is consistent within
5742 // each function), before returning.
5743 if (ClonesCreated) {
5744 assert(NumClonesCreated == NumClones);
5745 return;
5746 }
5747 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap, FS);
5748 // The first "clone" is the original copy, which doesn't have a VMap.
5749 assert(VMaps.size() == NumClones - 1);
5750 Changed = true;
5751 ClonesCreated = true;
5752 NumClonesCreated = NumClones;
5753 };
5754
5755 auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
5756 Function *CalledFunction, FunctionSummary *FS) {
5757 // Perform cloning if not yet done.
5758 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size(), FS);
5759
5760 assert(!isMemProfClone(*CalledFunction));
5761
5762 // Because we update the cloned calls by calling setCalledOperand (see
5763 // comment below), out of an abundance of caution make sure the called
5764 // function was actually the called operand (or its aliasee). We also
5765 // strip pointer casts when looking for calls (to match behavior during
5766 // summary generation), however, with opaque pointers in theory this
5767 // should not be an issue. Note we still clone the current function
5768 // (containing this call) above, as that could be needed for its callers.
5769 auto *GA = dyn_cast_or_null<GlobalAlias>(CB->getCalledOperand());
5770 if (CalledFunction != CB->getCalledOperand() &&
5771 (!GA || CalledFunction != GA->getAliaseeObject())) {
5772 SkippedCallsCloning++;
5773 return;
5774 }
5775 // Update the calls per the summary info.
5776 // Save orig name since it gets updated in the first iteration
5777 // below.
5778 auto CalleeOrigName = CalledFunction->getName();
5779 for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
5780 // If the VMap is empty, this clone was a duplicate of another and was
5781 // created as an alias or a declaration.
5782 if (J > 0 && VMaps[J - 1]->empty())
5783 continue;
5784 // Do nothing if this version calls the original version of its
5785 // callee.
5786 if (!StackNode.Clones[J])
5787 continue;
5788 auto NewF = M.getOrInsertFunction(
5789 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
5790 CalledFunction->getFunctionType());
5791 CallBase *CBClone;
5792 // Copy 0 is the original function.
5793 if (!J)
5794 CBClone = CB;
5795 else
5796 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5797 // Set the called operand directly instead of calling setCalledFunction,
5798 // as the latter mutates the function type on the call. In rare cases
5799 // we may have a slightly different type on a callee function
5800 // declaration due to it being imported from a different module with
5801 // incomplete types. We really just want to change the name of the
5802 // function to the clone, and not make any type changes.
5803 CBClone->setCalledOperand(NewF.getCallee());
5804 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
5805 << ore::NV("Call", CBClone) << " in clone "
5806 << ore::NV("Caller", CBClone->getFunction())
5807 << " assigned to call function clone "
5808 << ore::NV("Callee", NewF.getCallee()));
5809 }
5810 };
5811
5812 // Locate the summary for F.
5813 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
5814 // If not found, this could be an imported local (see comment in
5815 // findValueInfoForFunc). Skip for now as it will be cloned in its original
5816 // module (where it would have been promoted to global scope so should
5817 // satisfy any reference in this module).
5818 if (!TheFnVI)
5819 continue;
5820
5821 auto *GVSummary =
5822 ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
5823 if (!GVSummary) {
5824 // Must have been imported, use the summary which matches the definition
5825 // (might be multiple if this was a linkonce_odr).
5826 auto SrcModuleMD = F.getMetadata("thinlto_src_module");
5827 assert(SrcModuleMD &&
5828 "enable-import-metadata is needed to emit thinlto_src_module");
5829 StringRef SrcModule =
5830 dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
5831 for (auto &GVS : TheFnVI.getSummaryList()) {
5832 if (GVS->modulePath() == SrcModule) {
5833 GVSummary = GVS.get();
5834 break;
5835 }
5836 }
5837 assert(GVSummary && GVSummary->modulePath() == SrcModule);
5838 }
5839
5840 // If this was an imported alias skip it as we won't have the function
5841 // summary, and it should be cloned in the original module.
5842 if (isa<AliasSummary>(GVSummary))
5843 continue;
5844
5845 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
5846
5847 if (FS->allocs().empty() && FS->callsites().empty())
5848 continue;
5849
5850 auto SI = FS->callsites().begin();
5851 auto AI = FS->allocs().begin();
5852
5853 // To handle callsite infos synthesized for tail calls which have missing
5854 // frames in the profiled context, map callee VI to the synthesized callsite
5855 // info.
5856 DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
5857 // Iterate the callsites for this function in reverse, since we place all
5858 // those synthesized for tail calls at the end.
5859 for (auto CallsiteIt = FS->callsites().rbegin();
5860 CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
5861 auto &Callsite = *CallsiteIt;
5862 // Stop as soon as we see a non-synthesized callsite info (see comment
5863 // above loop). All the entries added for discovered tail calls have empty
5864 // stack ids.
5865 if (!Callsite.StackIdIndices.empty())
5866 break;
5867 MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
5868 }
5869
5870 // Keeps track of needed ICP for the function.
5871 SmallVector<ICallAnalysisData> ICallAnalysisInfo;
5872
5873 // Assume for now that the instructions are in the exact same order
5874 // as when the summary was created, but confirm this is correct by
5875 // matching the stack ids.
5876 for (auto &BB : F) {
5877 for (auto &I : BB) {
5878 auto *CB = dyn_cast<CallBase>(&I);
5879 // Same handling as when creating module summary.
5880 if (!mayHaveMemprofSummary(CB))
5881 continue;
5882
5883 auto *CalledValue = CB->getCalledOperand();
5884 auto *CalledFunction = CB->getCalledFunction();
5885 if (CalledValue && !CalledFunction) {
5886 CalledValue = CalledValue->stripPointerCasts();
5887 // Stripping pointer casts can reveal a called function.
5888 CalledFunction = dyn_cast<Function>(CalledValue);
5889 }
5890 // Check if this is an alias to a function. If so, get the
5891 // called aliasee for the checks below.
5892 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
5893 assert(!CalledFunction &&
5894 "Expected null called function in callsite for alias");
5895 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
5896 }
5897
5898 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
5899 I.getMetadata(LLVMContext::MD_callsite));
5900 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
5901
5902 // Include allocs that were already assigned a memprof function
5903 // attribute in the statistics. Only do this for those that do not have
5904 // memprof metadata, since we add an "ambiguous" memprof attribute by
5905 // default.
5906 if (CB->getAttributes().hasFnAttr("memprof") && !MemProfMD) {
5907 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
5908 ? AllocTypeColdThinBackend++
5909 : AllocTypeNotColdThinBackend++;
5910 OrigAllocsThinBackend++;
5911 AllocVersionsThinBackend++;
5912 if (!MaxAllocVersionsThinBackend)
5913 MaxAllocVersionsThinBackend = 1;
5914 continue;
5915 }
5916
5917 if (MemProfMD) {
5918 // Consult the next alloc node.
5919 assert(AI != FS->allocs().end());
5920 auto &AllocNode = *(AI++);
5921
5922#ifndef NDEBUG
5923 checkAllocContextIds(AllocNode, MemProfMD, CallsiteContext,
5924 ImportSummary);
5925#endif
5926
5927 // Perform cloning if not yet done.
5928 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size(), FS);
5929
5930 OrigAllocsThinBackend++;
5931 AllocVersionsThinBackend += AllocNode.Versions.size();
5932 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
5933 MaxAllocVersionsThinBackend = AllocNode.Versions.size();
5934
5935 // If there is only one version, that means we didn't end up
5936 // considering this function for cloning, and in that case the alloc
5937 // will still be None type or should have gotten the default NotCold.
5938 // Skip it, but only after calling the clone helper, since that does
5939 // some sanity checks confirming we haven't yet decided we need cloning.
5940 // We might have a single version that is cold due to the
5941 // MinClonedColdBytePercent heuristic; make sure we don't skip in that
5942 // case.
5943 if (AllocNode.Versions.size() == 1 &&
5944 (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
5945 assert((AllocationType)AllocNode.Versions[0] ==
5946 AllocationType::NotCold ||
5947 (AllocationType)AllocNode.Versions[0] ==
5948 AllocationType::None);
5949 UnclonableAllocsThinBackend++;
5950 continue;
5951 }
5952
5953 // All versions should have a singular allocation type.
5954 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
5955 return Type == ((uint8_t)AllocationType::NotCold |
5956 (uint8_t)AllocationType::Cold);
5957 }));
5958
5959 // Update the allocation types per the summary info.
5960 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
5961 // If the VMap is empty, this clone was a duplicate of another and
5962 // was created as an alias or a declaration.
5963 if (J > 0 && VMaps[J - 1]->empty())
5964 continue;
5965 // Ignore any that didn't get an assigned allocation type.
5966 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
5967 continue;
5968 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
5969 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
5970 : AllocTypeNotColdThinBackend++;
5971 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
5972 auto A = llvm::Attribute::get(F.getContext(), "memprof",
5973 AllocTypeString);
5974 CallBase *CBClone;
5975 // Copy 0 is the original function.
5976 if (!J)
5977 CBClone = CB;
5978 else
5979 // Since VMaps are only created for new clones, we index with
5980 // clone J-1 (J==0 is the original clone and does not have a VMaps
5981 // entry).
5982 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5984 CBClone->addFnAttr(A);
5985 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
5986 << ore::NV("AllocationCall", CBClone) << " in clone "
5987 << ore::NV("Caller", CBClone->getFunction())
5988 << " marked with memprof allocation attribute "
5989 << ore::NV("Attribute", AllocTypeString));
5990 }
5991 } else if (!CallsiteContext.empty()) {
5992 if (!CalledFunction) {
5993#ifndef NDEBUG
5994 // We should have skipped inline assembly calls.
5995 auto *CI = dyn_cast<CallInst>(CB);
5996 assert(!CI || !CI->isInlineAsm());
5997#endif
5998 // We should have skipped direct calls via a Constant.
5999 assert(CalledValue && !isa<Constant>(CalledValue));
6000
6001 // This is an indirect call, see if we have profile information and
6002 // whether any clones were recorded for the profiled targets (that
6003 // we synthesized CallsiteInfo summary records for when building the
6004 // index).
6005 auto NumClones =
6006 recordICPInfo(CB, FS->callsites(), SI, ICallAnalysisInfo);
6007
6008 // Perform cloning if not yet done. This is done here in case
6009 // we don't need to do ICP, but might need to clone this
6010 // function as it is the target of other cloned calls.
6011 if (NumClones)
6012 CloneFuncIfNeeded(NumClones, FS);
6013 }
6014
6015 else {
6016 // Consult the next callsite node.
6017 assert(SI != FS->callsites().end());
6018 auto &StackNode = *(SI++);
6019
6020#ifndef NDEBUG
6021 // Sanity check that the stack ids match between the summary and
6022 // instruction metadata.
6023 auto StackIdIndexIter = StackNode.StackIdIndices.begin();
6024 for (auto StackId : CallsiteContext) {
6025 assert(StackIdIndexIter != StackNode.StackIdIndices.end());
6026 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
6027 StackId);
6028 StackIdIndexIter++;
6029 }
6030#endif
6031
6032 CloneCallsite(StackNode, CB, CalledFunction, FS);
6033 }
6034 } else if (CB->isTailCall() && CalledFunction) {
6035 // Locate the synthesized callsite info for the callee VI, if any was
6036 // created, and use that for cloning.
6037 ValueInfo CalleeVI =
6038 findValueInfoForFunc(*CalledFunction, M, ImportSummary, &F);
6039 if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
6040 auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
6041 assert(Callsite != MapTailCallCalleeVIToCallsite.end());
6042 CloneCallsite(Callsite->second, CB, CalledFunction, FS);
6043 }
6044 }
6045 }
6046 }
6047
6048 // Now do any promotion required for cloning.
6049 performICP(M, FS->callsites(), VMaps, ICallAnalysisInfo, ORE);
6050 }
6051
6052 // We skip some of the functions and instructions above, so remove all the
6053 // metadata in a single sweep here.
6054 for (auto &F : M) {
6055 // We can skip memprof clones because createFunctionClones already strips
6056 // the metadata from the newly created clones.
6057 if (F.isDeclaration() || isMemProfClone(F))
6058 continue;
6059 for (auto &BB : F) {
6060 for (auto &I : BB) {
6061 if (!isa<CallBase>(I))
6062 continue;
6063 I.setMetadata(LLVMContext::MD_memprof, nullptr);
6064 I.setMetadata(LLVMContext::MD_callsite, nullptr);
6065 }
6066 }
6067 }
6068
6069 return Changed;
6070}
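// For illustration, a worked example of the per-version retargeting applied in
// applyImport above, assuming a callsite summary with Clones = {0, 2, 1} (the
// values and the ".memprof.<N>" suffix produced by getMemProfFuncName are
// assumptions for the example):
//   - foo (version 0) keeps calling bar, since Clones[0] == 0.
//   - foo.memprof.1 (version 1) is retargeted to call bar.memprof.2.
//   - foo.memprof.2 (version 2) is retargeted to call bar.memprof.1.
// Each retarget only swaps the called operand by name; the call's type and
// arguments are left untouched.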
6071
6072unsigned MemProfContextDisambiguation::recordICPInfo(
6073 CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
6074 ArrayRef<CallsiteInfo>::iterator &SI,
6075 SmallVector<ICallAnalysisData> &ICallAnalysisInfo) {
6076 // First see if we have profile information for this indirect call.
6077 uint32_t NumCandidates;
6078 uint64_t TotalCount;
6079 auto CandidateProfileData =
6080 ICallAnalysis->getPromotionCandidatesForInstruction(
6081 CB, TotalCount, NumCandidates, MaxSummaryIndirectEdges);
6082 if (CandidateProfileData.empty())
6083 return 0;
6084
6085 // Iterate through all of the candidate profiled targets along with the
6086 // CallsiteInfo summary records synthesized for them when building the index,
6087 // and see if any are cloned and/or refer to clones.
6088 bool ICPNeeded = false;
6089 unsigned NumClones = 0;
6090 size_t CallsiteInfoStartIndex = std::distance(AllCallsites.begin(), SI);
6091 for (const auto &Candidate : CandidateProfileData) {
6092#ifndef NDEBUG
6093 auto CalleeValueInfo =
6094#endif
6095 ImportSummary->getValueInfo(Candidate.Value);
6096 // We might not have a ValueInfo if this is a distributed
6097 // ThinLTO backend and decided not to import that function.
6098 assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
6099 assert(SI != AllCallsites.end());
6100 auto &StackNode = *(SI++);
6101 // See if any of the clones of the indirect callsite for this
6102 // profiled target should call a cloned version of the profiled
6103 // target. We only need to do the ICP here if so.
6104 ICPNeeded |= llvm::any_of(StackNode.Clones,
6105 [](unsigned CloneNo) { return CloneNo != 0; });
6106 // Every callsite in the same function should have been cloned the same
6107 // number of times.
6108 assert(!NumClones || NumClones == StackNode.Clones.size());
6109 NumClones = StackNode.Clones.size();
6110 }
6111 if (!ICPNeeded)
6112 return NumClones;
6113 // Save information for ICP, which is performed later to avoid messing up the
6114 // current function traversal.
6115 ICallAnalysisInfo.push_back({CB, CandidateProfileData.vec(), NumCandidates,
6116 TotalCount, CallsiteInfoStartIndex});
6117 return NumClones;
6118}
6119
6120void MemProfContextDisambiguation::performICP(
6121 Module &M, ArrayRef<CallsiteInfo> AllCallsites,
6122 ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
6123 ArrayRef<ICallAnalysisData> ICallAnalysisInfo,
6124 OptimizationRemarkEmitter &ORE) {
6125 // Now do any promotion required for cloning. Specifically, for each
6126 // recorded ICP candidate (which was only recorded because one clone of that
6127 // candidate should call a cloned target), we perform ICP (speculative
6128 // devirtualization) for each clone of the callsite, and update its callee
6129 // to the appropriate clone. Note that the ICP compares against the original
6130 // version of the target, which is what is in the vtable.
6131 for (auto &Info : ICallAnalysisInfo) {
6132 auto *CB = Info.CB;
6133 auto CallsiteIndex = Info.CallsiteInfoStartIndex;
6134 auto TotalCount = Info.TotalCount;
6135 unsigned NumPromoted = 0;
6136 unsigned NumClones = 0;
6137
6138 for (auto &Candidate : Info.CandidateProfileData) {
6139 auto &StackNode = AllCallsites[CallsiteIndex++];
6140
6141 // All calls in the same function must have the same number of clones.
6142 assert(!NumClones || NumClones == StackNode.Clones.size());
6143 NumClones = StackNode.Clones.size();
6144
6145 // See if the target is in the module. If it wasn't imported, it is
6146 // possible that this profile could have been collected on a different
6147 // target (or version of the code), and we need to be conservative
6148 // (similar to what is done in the ICP pass).
6149 Function *TargetFunction = Symtab->getFunction(Candidate.Value);
6150 if (TargetFunction == nullptr ||
6151 // Any ThinLTO global dead symbol removal should have already
6152 // occurred, so it should be safe to promote when the target is a
6153 // declaration.
6154 // TODO: Remove internal option once more fully tested.
6156 TargetFunction->isDeclaration())) {
6157 ORE.emit([&]() {
6158 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB)
6159 << "Memprof cannot promote indirect call: target with md5sum "
6160 << ore::NV("target md5sum", Candidate.Value) << " not found";
6161 });
6162 // FIXME: See if we can use the new declaration importing support to
6163 // at least get the declarations imported for this case. Hot indirect
6164 // targets should have been imported normally, however.
6165 continue;
6166 }
6167
6168 // Check if legal to promote
6169 const char *Reason = nullptr;
6170 if (!isLegalToPromote(*CB, TargetFunction, &Reason)) {
6171 ORE.emit([&]() {
6172 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
6173 << "Memprof cannot promote indirect call to "
6174 << ore::NV("TargetFunction", TargetFunction)
6175 << " with count of " << ore::NV("TotalCount", TotalCount)
6176 << ": " << Reason;
6177 });
6178 continue;
6179 }
6180
6181 assert(!isMemProfClone(*TargetFunction));
6182
6183 // Handle each call clone, applying ICP so that each clone directly
6184 // calls the specified callee clone, guarded by the appropriate ICP
6185 // check.
6186 CallBase *CBClone = CB;
6187 for (unsigned J = 0; J < NumClones; J++) {
6188 // If the VMap is empty, this clone was a duplicate of another and was
6189 // created as an alias or a declaration.
6190 if (J > 0 && VMaps[J - 1]->empty())
6191 continue;
6192 // Copy 0 is the original function.
6193 if (J > 0)
6194 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
6195 // We do the promotion using the original name, so that the comparison
6196 // is against the name in the vtable. Then just below, change the new
6197 // direct call to call the cloned function.
6198 auto &DirectCall =
6199 pgo::promoteIndirectCall(*CBClone, TargetFunction, Candidate.Count,
6200 TotalCount, isSamplePGO, &ORE);
6201 auto *TargetToUse = TargetFunction;
6202 // Call original if this version calls the original version of its
6203 // callee.
6204 if (StackNode.Clones[J]) {
6205 TargetToUse =
6206 cast<Function>(M.getOrInsertFunction(
6207 getMemProfFuncName(TargetFunction->getName(),
6208 StackNode.Clones[J]),
6209 TargetFunction->getFunctionType())
6210 .getCallee());
6211 }
6212 DirectCall.setCalledFunction(TargetToUse);
6213 // During matching we generate synthetic VP metadata for indirect calls
6214 // not already having any, from the memprof profile's callee GUIDs. If
6215 // we subsequently promote and inline those callees, we currently lose
6216 // the ability to generate this synthetic VP metadata. Optionally apply
6217 // a noinline attribute to promoted direct calls, where the threshold is
6218 // set to capture synthetic VP metadata targets which get a count of 1.
6219 if (MemProfICPNoInlineThreshold &&
6220 Candidate.Count < MemProfICPNoInlineThreshold)
6221 DirectCall.setIsNoInline();
6222 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
6223 << ore::NV("Call", CBClone) << " in clone "
6224 << ore::NV("Caller", CBClone->getFunction())
6225 << " promoted and assigned to call function clone "
6226 << ore::NV("Callee", TargetToUse));
6227 }
6228
6229 // Update TotalCount (all clones should get same count above)
6230 TotalCount -= Candidate.Count;
6231 NumPromoted++;
6232 }
6233 // Adjust the MD.prof metadata for all clones, now that we have the new
6234 // TotalCount and the number promoted.
6235 CallBase *CBClone = CB;
6236 for (unsigned J = 0; J < NumClones; J++) {
6237 // If the VMap is empty, this clone was a duplicate of another and was
6238 // created as an alias or a declaration.
6239 if (J > 0 && VMaps[J - 1]->empty())
6240 continue;
6241 // Copy 0 is the original function.
6242 if (J > 0)
6243 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
6244 // First delete the old one.
6245 CBClone->setMetadata(LLVMContext::MD_prof, nullptr);
6246 // If all promoted, we don't need the MD.prof metadata.
6247 // Otherwise we need to update it with the un-promoted records.
6248 if (TotalCount != 0)
6249 annotateValueSite(
6250 M, *CBClone, ArrayRef(Info.CandidateProfileData).slice(NumPromoted),
6251 TotalCount, IPVK_IndirectCallTarget, Info.NumCandidates);
6252 }
6253 }
6254}
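// For illustration, each promotion performed above turns an indirect call in a
// clone into a guarded direct call; at the source level the resulting shape is
// roughly the following sketch (names are assumptions for the example; the
// guard compares against the original target, which is what function pointers
// and vtables hold):
static int dispatchExample(int (*FnPtr)(int), int (*ProfiledTarget)(int),
                           int (*ProfiledTargetClone)(int), int Arg) {
  if (FnPtr == ProfiledTarget)       // Guard inserted by the promotion.
    return ProfiledTargetClone(Arg); // Direct call retargeted to the clone.
  return FnPtr(Arg);                 // Fallback remains indirect.
}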
6255
6256template <typename DerivedCCG, typename FuncTy, typename CallTy>
6257bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
6258 if (DumpCCG) {
6259 dbgs() << "CCG before cloning:\n";
6260 dbgs() << *this;
6261 }
6262 if (ExportToDot)
6263 exportToDot("postbuild");
6264
6265 if (VerifyCCG) {
6266 check();
6267 }
6268
6269 identifyClones();
6270
6271 if (VerifyCCG) {
6272 check();
6273 }
6274
6275 if (DumpCCG) {
6276 dbgs() << "CCG after cloning:\n";
6277 dbgs() << *this;
6278 }
6279 if (ExportToDot)
6280 exportToDot("cloned");
6281
6282 bool Changed = assignFunctions();
6283
6284 if (DumpCCG) {
6285 dbgs() << "CCG after assigning function clones:\n";
6286 dbgs() << *this;
6287 }
6288 if (ExportToDot)
6289 exportToDot("clonefuncassign");
6290
6292 printTotalSizes(errs());
6293
6294 return Changed;
6295}
6296
6297bool MemProfContextDisambiguation::processModule(
6298 Module &M,
6299 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
6300
6301 // If we have an import summary, then the cloning decisions were made during
6302 // the thin link on the index. Apply them and return.
6303 if (ImportSummary)
6304 return applyImport(M);
6305
6306 // TODO: If/when other types of memprof cloning are enabled beyond just for
6307 // hot and cold, we will need to change this to individually control the
6308 // AllocationType passed to addStackNodesForMIB during CCG construction.
6309 // Note that we specifically check this after applying imports above, so that
6310 // the option doesn't need to be passed to distributed ThinLTO backend
6311 // clang processes, which won't necessarily have visibility into the linker
6312 // dependences. Instead the information is communicated from the LTO link to
6313 // the backends via the combined summary index.
6314 if (!SupportsHotColdNew)
6315 return false;
6316
6317 ModuleCallsiteContextGraph CCG(M, OREGetter);
6318 return CCG.process();
6319}
6320
6321 MemProfContextDisambiguation::MemProfContextDisambiguation(
6322 const ModuleSummaryIndex *Summary, bool isSamplePGO)
6323 : ImportSummary(Summary), isSamplePGO(isSamplePGO) {
6324 // Check the dot graph printing options once here, to make sure we have valid
6325 // and expected combinations.
6326 if (DotGraphScope == DotScope::Alloc && !AllocIdForDot.getNumOccurrences())
6328 "-memprof-dot-scope=alloc requires -memprof-dot-alloc-id");
6330 !ContextIdForDot.getNumOccurrences())
6332 "-memprof-dot-scope=context requires -memprof-dot-context-id");
6333 if (DotGraphScope == DotScope::All && AllocIdForDot.getNumOccurrences() &&
6334 ContextIdForDot.getNumOccurrences())
6336 "-memprof-dot-scope=all can't have both -memprof-dot-alloc-id and "
6337 "-memprof-dot-context-id");
6338 if (ImportSummary) {
6339 // The MemProfImportSummary should only be used for testing ThinLTO
6340 // distributed backend handling via opt, in which case we don't have a
6341 // summary from the pass pipeline.
6342 assert(MemProfImportSummary.empty());
6343 return;
6344 }
6345 if (MemProfImportSummary.empty())
6346 return;
6347
6348 auto ReadSummaryFile =
6349 errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
6350 if (!ReadSummaryFile) {
6351 logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
6352 "Error loading file '" + MemProfImportSummary +
6353 "': ");
6354 return;
6355 }
6356 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
6357 if (!ImportSummaryForTestingOrErr) {
6358 logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
6359 "Error parsing file '" + MemProfImportSummary +
6360 "': ");
6361 return;
6362 }
6363 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
6364 ImportSummary = ImportSummaryForTesting.get();
6365}
6366
6367 PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
6368 ModuleAnalysisManager &AM) {
6369 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
6370 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
6371 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
6372 };
6373 if (!processModule(M, OREGetter))
6374 return PreservedAnalyses::all();
6375 return PreservedAnalyses::none();
6376}
6377
6378 void MemProfContextDisambiguation::run(
6379 ModuleSummaryIndex &Index,
6380 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
6381 isPrevailing) {
6382 // TODO: If/when other types of memprof cloning are enabled beyond just for
6383 // hot and cold, we will need to change this to individually control the
6384 // AllocationType passed to addStackNodesForMIB during CCG construction.
6385 // The index was set from the option, so these should be in sync.
6386 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
6387 if (!SupportsHotColdNew)
6388 return;
6389
6390 IndexCallsiteContextGraph CCG(Index, isPrevailing);
6391 CCG.process();
6392}
6393
6394// Strips MemProf attributes and metadata. Can be invoked by the pass pipeline
6395// when we don't have an index that has recorded that we are linking with
6396// allocation libraries containing the necessary APIs for downstream
6397// transformations.
6399 // The profile matcher applies hotness attributes directly for allocations,
6400 // and those will cause us to generate calls to the hot/cold interfaces
6401 // unconditionally. If supports-hot-cold-new was not enabled in the LTO
6402 // link then assume we don't want these calls (e.g. not linking with
6403 // the appropriate library, or otherwise trying to disable this behavior).
6404 bool Changed = false;
6405 for (auto &F : M) {
6406 for (auto &BB : F) {
6407 for (auto &I : BB) {
6408 auto *CI = dyn_cast<CallBase>(&I);
6409 if (!CI)
6410 continue;
6411 if (CI->hasFnAttr("memprof")) {
6412 CI->removeFnAttr("memprof");
6413 Changed = true;
6414 }
6415 if (!CI->hasMetadata(LLVMContext::MD_callsite)) {
6416 assert(!CI->hasMetadata(LLVMContext::MD_memprof));
6417 continue;
6418 }
6419 // Strip off all memprof metadata as it is no longer needed.
6420 // Importantly, this avoids the addition of new memprof attributes
6421 // after inlining propagation.
6422 CI->setMetadata(LLVMContext::MD_memprof, nullptr);
6423 CI->setMetadata(LLVMContext::MD_callsite, nullptr);
6424 Changed = true;
6425 }
6426 }
6427 }
6428 if (!Changed)
6429 return PreservedAnalyses::all();
6430 return PreservedAnalyses::none();
6431}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
aarch64 promote const
AMDGPU Prepare AGPR Alloc
Unify divergent function exit nodes
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
#define DEBUG_TYPE
Module.h This file contains the declarations for the Module class.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
Machine Check Debug Module
This file implements a map that provides insertion order iteration.
static cl::opt< unsigned > TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5), cl::Hidden, cl::desc("Max depth to recursively search for missing " "frames through tail calls."))
uint64_t ComputeHash(const FunctionSummary *FS, unsigned I)
static cl::opt< DotScope > DotGraphScope("memprof-dot-scope", cl::desc("Scope of graph to export to dot"), cl::Hidden, cl::init(DotScope::All), cl::values(clEnumValN(DotScope::All, "all", "Export full callsite graph"), clEnumValN(DotScope::Alloc, "alloc", "Export only nodes with contexts feeding given " "-memprof-dot-alloc-id"), clEnumValN(DotScope::Context, "context", "Export only nodes with given -memprof-dot-context-id")))
static cl::opt< bool > DoMergeIteration("memprof-merge-iteration", cl::init(true), cl::Hidden, cl::desc("Iteratively apply merging on a node to catch new callers"))
static bool isMemProfClone(const Function &F)
static cl::opt< unsigned > AllocIdForDot("memprof-dot-alloc-id", cl::init(0), cl::Hidden, cl::desc("Id of alloc to export if -memprof-dot-scope=alloc " "or to highlight if -memprof-dot-scope=all"))
static cl::opt< unsigned > ContextIdForDot("memprof-dot-context-id", cl::init(0), cl::Hidden, cl::desc("Id of context to export if -memprof-dot-scope=context or to " "highlight otherwise"))
static cl::opt< bool > ExportToDot("memprof-export-to-dot", cl::init(false), cl::Hidden, cl::desc("Export graph to dot files."))
static void checkEdge(const std::shared_ptr< ContextEdge< DerivedCCG, FuncTy, CallTy > > &Edge)
static cl::opt< bool > AllowRecursiveCallsites("memprof-allow-recursive-callsites", cl::init(true), cl::Hidden, cl::desc("Allow cloning of callsites involved in recursive cycles"))
bool checkColdOrNotCold(uint8_t AllocType)
static ValueInfo findValueInfoForFunc(const Function &F, const Module &M, const ModuleSummaryIndex *ImportSummary, const Function *CallingFunc=nullptr)
static cl::opt< bool > CloneRecursiveContexts("memprof-clone-recursive-contexts", cl::init(true), cl::Hidden, cl::desc("Allow cloning of contexts through recursive cycles"))
static std::string getAllocTypeString(uint8_t AllocTypes)
static cl::opt< unsigned > MemProfICPNoInlineThreshold("memprof-icp-noinline-threshold", cl::init(2), cl::Hidden, cl::desc("Minimum absolute count for promoted target to be inlinable"))
bool DOTGraphTraits< constCallsiteContextGraph< DerivedCCG, FuncTy, CallTy > * >::DoHighlight
static unsigned getMemProfCloneNum(const Function &F)
static SmallVector< std::unique_ptr< ValueToValueMapTy >, 4 > createFunctionClones(Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE, std::map< const Function *, SmallPtrSet< const GlobalAlias *, 1 > > &FuncToAliasMap, FunctionSummary *FS)
static cl::opt< bool > VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden, cl::desc("Perform verification checks on CallingContextGraph."))
static void checkNode(const ContextNode< DerivedCCG, FuncTy, CallTy > *Node, bool CheckEdges=true)
static cl::opt< bool > MergeClones("memprof-merge-clones", cl::init(true), cl::Hidden, cl::desc("Merge clones before assigning functions"))
static std::string getMemProfFuncName(Twine Base, unsigned CloneNo)
static cl::opt< std::string > MemProfImportSummary("memprof-import-summary", cl::desc("Import summary to use for testing the ThinLTO backend via opt"), cl::Hidden)
static const std::string MemProfCloneSuffix
static void updateSubprogramLinkageName(Function *NewFunc, StringRef Name)
static cl::opt< bool > AllowRecursiveContexts("memprof-allow-recursive-contexts", cl::init(true), cl::Hidden, cl::desc("Allow cloning of contexts having recursive cycles"))
static cl::opt< std::string > DotFilePathPrefix("memprof-dot-file-path-prefix", cl::init(""), cl::Hidden, cl::value_desc("filename"), cl::desc("Specify the path prefix of the MemProf dot files."))
static cl::opt< bool > VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden, cl::desc("Perform frequent verification checks on nodes."))
static void checkAllocContextIds(const AllocInfo &AllocNode, const MDNode *MemProfMD, const CallStack< MDNode, MDNode::op_iterator > &CallsiteContext, const ModuleSummaryIndex *ImportSummary)
static cl::opt< bool > DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden, cl::desc("Dump CallingContextGraph to stdout after each stage."))
AllocType
This is the interface to build a ModuleSummaryIndex for a module.
ModuleSummaryIndex.h This file contains the declarations the classes that hold the module index and s...
#define P(N)
if(auto Err=PB.parsePassPipeline(MPM, Passes)) return wrap(std MPM run * Mod
FunctionAnalysisManager FAM
if(PassOpts->AAPipeline)
std::pair< BasicBlock *, BasicBlock * > Edge
This file defines generic set operations that may be used on set's of different types,...
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:114
void print(OutputBuffer &OB) const
ValueInfo getAliaseeVI() const
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
iterator end() const
Definition ArrayRef.h:131
const_pointer iterator
Definition ArrayRef.h:47
iterator begin() const
Definition ArrayRef.h:130
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
static LLVM_ABI Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
void addFnAttr(Attribute::AttrKind Kind)
Adds the attribute to the function.
void setCalledOperand(Value *V)
Subprogram description. Uses SubclassData1.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition DenseMap.h:174
iterator end()
Definition DenseMap.h:81
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
void reserve(size_type NumEntries)
Grow the densemap so that it can contain at least NumEntries items before resizing again.
Definition DenseMap.h:114
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
Function summary information to aid decisions and implementation of importing.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
DISubprogram * getSubprogram() const
Get the attached subprogram.
const Function & getFunction() const
Definition Function.h:164
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
static LLVM_ABI GlobalAlias * create(Type *Ty, unsigned AddressSpace, LinkageTypes Linkage, const Twine &Name, Constant *Aliasee, Module *Parent)
If a parent module is specified, the alias is automatically inserted into the end of the specified mo...
Definition Globals.cpp:598
Function and variable summary information to aid decisions and implementation of importing.
static LLVM_ABI GUID getGUIDAssumingExternalLinkage(StringRef GlobalName)
Return a 64-bit global unique ID constructed from the name of a global symbol.
Definition Globals.cpp:77
static bool isLocalLinkage(LinkageTypes Linkage)
LLVM_ABI bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition Globals.cpp:328
uint64_t GUID
Declare a type to represent a global unique identifier for a global value.
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI void eraseFromParent()
This method unlinks 'this' from the containing module and deletes it.
Definition Globals.cpp:93
static LLVM_ABI std::string getGlobalIdentifier(StringRef Name, GlobalValue::LinkageTypes Linkage, StringRef FileName)
Return the modified name for a global value suitable to be used as the key for a global lookup (e....
Definition Globals.cpp:161
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Metadata node.
Definition Metadata.h:1078
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1442
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1440
unsigned getNumOperands() const
Return number of MDNode operands.
Definition Metadata.h:1448
LLVM_ABI TempMDNode clone() const
Create a (temporary) clone of this.
Definition Metadata.cpp:669
static std::enable_if_t< std::is_base_of< MDNode, T >::value, T * > replaceWithUniqued(std::unique_ptr< T, TempMDNodeDeleter > N)
Replace a temporary node with a uniqued one.
Definition Metadata.h:1317
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:608
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
size_type count(const KeyT &Key) const
Definition MapVector.h:150
MemProfContextDisambiguation(const ModuleSummaryIndex *Summary=nullptr, bool isSamplePGO=false)
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
static ErrorOr< std::unique_ptr< MemoryBuffer > > getFile(const Twine &Filename, bool IsText=false, bool RequiresNullTerminator=true, bool IsVolatile=false, std::optional< Align > Alignment=std::nullopt)
Open the specified file as a MemoryBuffer, returning a new MemoryBuffer if successful,...
Class to hold module path string table and global value map, and encapsulate methods for operating on...
static StringRef getOriginalNameBeforePromote(StringRef Name)
Helper to obtain the unpromoted name for a global value (or the original name if not promoted).
ValueInfo getValueInfo(const GlobalValueSummaryMapTy::value_type &R) const
Return a ValueInfo for the index value_type (convenient when iterating index).
uint64_t getStackIdAtIndex(unsigned Index) const
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
LLVMContext & getContext() const
Get the global data context.
Definition Module.h:285
A NodeSet contains a set of SUnit DAG nodes with additional information that assigns a priority to th...
unsigned size() const
bool insert(SUnit *SU)
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition Analysis.h:115
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
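
A minimal sketch of the usual pass boilerplate around PreservedAnalyses: report none() when the module was mutated and all() otherwise. MyPass is a placeholder name, not a pass defined in this file.

#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"

using namespace llvm;

struct MyPass : PassInfoMixin<MyPass> {
  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM) {
    bool Changed = false;
    // ... transform M, setting Changed whenever anything is modified ...
    return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
  }
};
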
A class that wrap the SHA1 algorithm.
Definition SHA1.h:27
LLVM_ABI void update(ArrayRef< uint8_t > Data)
Digest more data.
Definition SHA1.cpp:208
LLVM_ABI std::array< uint8_t, 20 > result()
Return the current raw 160-bits SHA1 for the digested data since the last call to init().
Definition SHA1.cpp:288
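
A small usage sketch for the SHA1 helper, not specific to this pass: feed bytes with update() and read the 160-bit digest with result().

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/SHA1.h"
#include <array>
#include <cstdint>

using namespace llvm;

std::array<uint8_t, 20> hashString(StringRef S) {
  SHA1 Hasher;
  // Digest the raw bytes; update() may be called repeatedly for streamed data.
  Hasher.update(ArrayRef<uint8_t>(
      reinterpret_cast<const uint8_t *>(S.data()), S.size()));
  // result() returns the digest of everything seen since the last init().
  return Hasher.result();
}
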
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
void reserve(size_t Size)
Grow the DenseSet so that it can contain at least NumEntries items before resizing again.
Definition DenseSet.h:96
void insert_range(Range &&R)
Definition DenseSet.h:228
size_type size() const
Definition DenseSet.h:87
void swap(DenseSetImpl &RHS)
Definition DenseSet.h:102
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
bool erase(const ValueT &V)
Definition DenseSet.h:100
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
An efficient, type-erasing, non-owning reference to a callable.
Helper class to iterate through stack ids in both metadata (memprof MIB and callsite) and the corresp...
CallStackIterator beginAfterSharedPrefix(const CallStack &Other)
CallStackIterator end() const
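
A hedged sketch of walking a memprof stack node with the CallStack helper; each dereference yields one 64-bit stack id, and beginAfterSharedPrefix lets a caller skip frames already covered by another context. The helper collectIds and the assumption that StackMD is a well-formed memprof stack node are illustrative.

#include "llvm/Analysis/MemoryProfileInfo.h"
#include "llvm/IR/Metadata.h"
#include <cstdint>
#include <vector>

using namespace llvm;
using namespace llvm::memprof;

std::vector<uint64_t> collectIds(const MDNode *StackMD) {
  std::vector<uint64_t> Ids;
  // Wrap the metadata node; iteration produces the stack ids it encodes.
  CallStack<MDNode, MDNode::op_iterator> Stack(StackMD);
  for (auto StackId : Stack)
    Ids.push_back(StackId);
  return Ids;
}
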
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
CallInst * Call
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Entry
Definition COFF.h:862
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ CE
Windows NT (Windows on ARM)
Definition MCAsmInfo.h:48
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > dyn_extract(Y &&MD)
Extract a Value from Metadata, if any.
Definition Metadata.h:695
LLVM_ABI AllocationType getMIBAllocType(const MDNode *MIB)
Returns the allocation type from an MIB metadata node.
LLVM_ABI bool metadataMayIncludeContextSizeInfo()
Whether the alloc memprof metadata may include context size info for some MIBs (but possibly not all)...
LLVM_ABI bool hasSingleAllocType(uint8_t AllocTypes)
True if the AllocTypes bitmask contains just a single type.
LLVM_ABI std::string getAllocTypeAttributeString(AllocationType Type)
Returns the string to use in attributes with the given type.
LLVM_ABI MDNode * getMIBStackNode(const MDNode *MIB)
Returns the stack node from an MIB metadata node.
LLVM_ABI void removeAnyExistingAmbiguousAttribute(CallBase *CB)
Removes any existing "ambiguous" memprof attribute.
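
A hedged sketch of reading !memprof metadata from an allocation call with the helpers above: each operand of the !memprof node is one MIB, carrying an allocation type and a stack node. The function inspectAlloc is illustrative only, not the pass's own traversal.

#include "llvm/Analysis/MemoryProfileInfo.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace llvm::memprof;

void inspectAlloc(CallBase *CB) {
  MDNode *MemProfMD = CB->getMetadata(LLVMContext::MD_memprof);
  if (!MemProfMD)
    return;
  for (const MDOperand &Op : MemProfMD->operands()) {
    auto *MIB = dyn_cast<MDNode>(Op);
    if (!MIB)
      continue;
    // Allocation behavior recorded for this context (e.g. cold vs notcold).
    AllocationType Type = getMIBAllocType(MIB);
    // The per-context call stack node carried by this MIB.
    MDNode *StackNode = getMIBStackNode(MIB);
    if (!StackNode)
      continue;
    errs() << getAllocTypeAttributeString(Type) << " context with "
           << StackNode->getNumOperands() << " frames\n";
  }
}
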
DiagnosticInfoOptimizationBase::Argument NV
LLVM_ABI CallBase & promoteIndirectCall(CallBase &CB, Function *F, uint64_t Count, uint64_t TotalCount, bool AttachProfToDirectCall, OptimizationRemarkEmitter *ORE)
uint32_t NodeId
Definition RDFGraph.h:262
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
bool empty() const
Definition BasicBlock.h:101
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
uint64_t read64le(const void *P)
Definition Endian.h:435
void write32le(void *P, uint32_t V)
Definition Endian.h:475
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
cl::opt< unsigned > MinClonedColdBytePercent("memprof-cloning-cold-threshold", cl::init(100), cl::Hidden, cl::desc("Min percent of cold bytes to hint alloc cold during cloning"))
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI void logAllUnhandledErrors(Error E, raw_ostream &OS, Twine ErrorBanner={})
Log all errors (if any) in E to OS.
Definition Error.cpp:65
void stable_sort(R &&Range)
Definition STLExtras.h:2106
cl::opt< bool > MemProfReportHintedSizes("memprof-report-hinted-sizes", cl::init(false), cl::Hidden, cl::desc("Report total allocation sizes of hinted allocations"))
LLVM_ABI bool isLegalToPromote(const CallBase &CB, Function *Callee, const char **FailureReason=nullptr)
Return true if the given indirect call site can be made to call Callee.
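
A hedged sketch combining the two promotion helpers: check legality first, then rewrite the indirect call to target a known callee. It assumes the pgo::promoteIndirectCall declaration from PGOInstrumentation.h; the profile counts are placeholders and no remarks emitter is passed.

#include "llvm/IR/InstrTypes.h"
#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
#include "llvm/Transforms/Utils/CallPromotionUtils.h"

using namespace llvm;

bool tryPromote(CallBase &CB, Function *Target, uint64_t Count,
                uint64_t TotalCount) {
  const char *Reason = nullptr;
  // Bail out if the call cannot legally be redirected to this callee.
  if (!isLegalToPromote(CB, Target, &Reason))
    return false;
  // Rewrite the indirect call; branch-weight profile metadata is attached
  // when AttachProfToDirectCall is true.
  pgo::promoteIndirectCall(CB, Target, Count, TotalCount,
                           /*AttachProfToDirectCall=*/true, /*ORE=*/nullptr);
  return true;
}
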
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2530
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI bool mayHaveMemprofSummary(const CallBase *CB)
Returns true if the instruction could have memprof metadata, used to ensure consistency between summa...
constexpr from_range_t from_range
static cl::opt< bool > MemProfRequireDefinitionForPromotion("memprof-require-definition-for-promotion", cl::init(false), cl::Hidden, cl::desc("Require target function definition when promoting indirect calls"))
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
cl::opt< unsigned > MemProfTopNImportant("memprof-top-n-important", cl::init(10), cl::Hidden, cl::desc("Number of largest cold contexts to consider important"))
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff every element of A is also in B.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2184
void set_subtract(S1Ty &S1, const S2Ty &S2)
set_subtract(A, B) - Compute A := A - B
InnerAnalysisManagerProxy< FunctionAnalysisManager, Module > FunctionAnalysisManagerModuleProxy
Provide the FunctionAnalysisManager to Module proxy.
raw_ostream & WriteGraph(raw_ostream &O, const GraphType &G, bool ShortNames=false, const Twine &Title="")
bool set_intersects(const S1Ty &S1, const S2Ty &S2)
set_intersects(A, B) - Return true iff A ^ B is non-empty.
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1150
LLVM_ABI Expected< std::unique_ptr< ModuleSummaryIndex > > getModuleSummaryIndex(MemoryBufferRef Buffer)
Parse the specified bitcode buffer, returning the module summary index.
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
LLVM_ABI void annotateValueSite(Module &M, Instruction &Inst, const InstrProfRecord &InstrProfR, InstrProfValueKind ValueKind, uint32_t SiteIndx, uint32_t MaxMDCount=3)
Get the value profile data for value site SiteIdx from InstrProfR and annotate the instruction Inst w...
cl::opt< unsigned > MaxSummaryIndirectEdges("module-summary-max-indirect-edges", cl::init(0), cl::Hidden, cl::desc("Max number of summary edges added from " "indirect call profile metadata"))
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
bool set_union(S1Ty &S1, const S2Ty &S2)
set_union(A, B) - Compute A := A u B, return whether A changed.
cl::opt< bool > SupportsHotColdNew
Indicate we are linking with an allocator that supports hot/cold operator new interfaces.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
S1Ty set_intersection(const S1Ty &S1, const S2Ty &S2)
set_intersection(A, B) - Return A ^ B
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
cl::opt< bool > EnableMemProfContextDisambiguation
Enable MemProf context disambiguation for thin link.
S1Ty set_difference(const S1Ty &S1, const S2Ty &S2)
set_difference(A, B) - Return A - B
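
A small sketch exercising the SetOperations.h helpers referenced above on DenseSet<unsigned>; the element values are arbitrary and the expected results are noted in comments.

#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SetOperations.h"

using namespace llvm;

void setOps() {
  DenseSet<unsigned> A = {1, 2, 3};
  const DenseSet<unsigned> B = {2, 3, 4};
  bool Grew = set_union(A, B);                   // A == {1,2,3,4}, Grew == true
  set_intersect(A, B);                           // A == {2,3,4}
  set_subtract(A, B);                            // A == {}
  DenseSet<unsigned> C = set_intersection(B, B); // C == {2,3,4}
  DenseSet<unsigned> D = set_difference(B, C);   // D == {}
  bool Sub = set_is_subset(C, B);                // true: every element of C is in B
  (void)Grew; (void)Sub; (void)D;
}
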
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
Expected< T > errorOrToExpected(ErrorOr< T > &&EO)
Convert an ErrorOr<T> to an Expected<T>.
Definition Error.h:1245
ArrayRef(const T &OneElt) -> ArrayRef< T >
std::string toString(const APInt &I, unsigned Radix, bool Signed, bool formatAsCLiteral=false, bool UpperCase=true, bool InsertSeparators=false)
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1770
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1945
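
A quick sketch of the range-based STLExtras helpers listed here (any_of, none_of, find_if, is_contained) over a SmallVector; the predicates and values are arbitrary.

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

using namespace llvm;

void rangeHelpers() {
  SmallVector<int> V = {3, 5, 8, 13};
  bool AnyEven = any_of(V, [](int X) { return X % 2 == 0; }); // true (8)
  bool NoneNeg = none_of(V, [](int X) { return X < 0; });     // true
  auto It = find_if(V, [](int X) { return X > 6; });          // points at 8
  bool HasFive = is_contained(V, 5);                          // true
  (void)AnyEven; (void)NoneNeg; (void)It; (void)HasFive;
}
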
LLVM_ABI Function * CloneFunction(Function *F, ValueToValueMapTy &VMap, ClonedCodeInfo *CodeInfo=nullptr)
Return a copy of the specified function and add it to that function's module.
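
A hedged sketch of CloneFunction: produce an in-module copy of F under a new name, with VMap recording the old-to-new value mapping. The ".clone" suffix is illustrative only, not the naming scheme used by this pass.

#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

using namespace llvm;

Function *makeClone(Function &F) {
  ValueToValueMapTy VMap;
  // The clone is inserted into F's module; callers update call sites separately.
  Function *NewF = CloneFunction(&F, VMap);
  NewF->setName(F.getName() + ".clone");
  return NewF;
}
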
AnalysisManager< Module > ModuleAnalysisManager
Convenience typedef for the Module analysis manager.
Definition MIRParser.h:39
cl::opt< bool > MemProfFixupImportant("memprof-fixup-important", cl::init(true), cl::Hidden, cl::desc("Enables edge fixup for important contexts"))
#define N
static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter, GraphType G)
static const ContextNode< DerivedCCG, FuncTy, CallTy > * GetCallee(const EdgePtrTy &P)
std::unique_ptr< ContextNode< DerivedCCG, FuncTy, CallTy > > NodePtrTy
mapped_iterator< typename std::vector< std::shared_ptr< ContextEdge< DerivedCCG, FuncTy, CallTy > > >::const_iterator, decltype(&GetCallee)> ChildIteratorType
mapped_iterator< typename std::vector< NodePtrTy >::const_iterator, decltype(&getNode)> nodes_iterator
std::shared_ptr< ContextEdge< DerivedCCG, FuncTy, CallTy > > EdgePtrTy
Summary of memprof metadata on allocations.
std::vector< MIBInfo > MIBs
SmallVector< unsigned > StackIdIndices
SmallVector< unsigned > Clones
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits(bool simple=false)
An information struct used to provide DenseMap with the various necessary components for a given valu...
typename GraphType::UnknownGraphTypeError NodeRef
Definition GraphTraits.h:95
Struct that holds a reference to a particular GUID in a global value summary.
ArrayRef< std::unique_ptr< GlobalValueSummary > > getSummaryList() const
GlobalValue::GUID getGUID() const
PointerUnion< CallsiteInfo *, AllocInfo * > SimpleType
static SimpleType getSimplifiedValue(IndexCall &Val)
const PointerUnion< CallsiteInfo *, AllocInfo * > SimpleType
static SimpleType getSimplifiedValue(const IndexCall &Val)
Define a template that can be specialized by smart pointers to reflect the fact that they are automat...
Definition Casting.h:34