LLVM 23.0.0git
MemProfContextDisambiguation.cpp
Go to the documentation of this file.
1//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements support for context disambiguation of allocation
10// calls for profile guided heap optimization. Specifically, it uses Memprof
11// profiles which indicate context specific allocation behavior (currently
12// distinguishing cold vs hot memory allocations). Cloning is performed to
13// expose the cold allocation call contexts, and the allocation calls are
14// subsequently annotated with an attribute for later transformation.
15//
16// The transformations can be performed either directly on IR (regular LTO), or
17// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18// Both types of LTO operate on the same base graph representation, which
19// uses CRTP to support either IR or Index formats.
20//
21//===----------------------------------------------------------------------===//
22
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/ADT/DenseSet.h"
26#include "llvm/ADT/MapVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/Module.h"
40#include "llvm/Pass.h"
44#include "llvm/Support/SHA1.h"
46#include "llvm/Transforms/IPO.h"
50#include <deque>
51#include <sstream>
52#include <vector>
53using namespace llvm;
54using namespace llvm::memprof;
55
56#define DEBUG_TYPE "memprof-context-disambiguation"
57
58STATISTIC(FunctionClonesAnalysis,
59 "Number of function clones created during whole program analysis");
60STATISTIC(FunctionClonesThinBackend,
61 "Number of function clones created during ThinLTO backend");
62STATISTIC(FunctionsClonedThinBackend,
63 "Number of functions that had clones created during ThinLTO backend");
65 FunctionCloneDuplicatesThinBackend,
66 "Number of function clone duplicates detected during ThinLTO backend");
67STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
68 "cloned) during whole program analysis");
69STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
70 "during whole program analysis");
71STATISTIC(AllocTypeNotColdThinBackend,
72 "Number of not cold static allocations (possibly cloned) during "
73 "ThinLTO backend");
74STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
75 "(possibly cloned) during ThinLTO backend");
76STATISTIC(OrigAllocsThinBackend,
77 "Number of original (not cloned) allocations with memprof profiles "
78 "during ThinLTO backend");
80 AllocVersionsThinBackend,
81 "Number of allocation versions (including clones) during ThinLTO backend");
82STATISTIC(MaxAllocVersionsThinBackend,
83 "Maximum number of allocation versions created for an original "
84 "allocation during ThinLTO backend");
85STATISTIC(UnclonableAllocsThinBackend,
86 "Number of unclonable ambigous allocations during ThinLTO backend");
87STATISTIC(RemovedEdgesWithMismatchedCallees,
88 "Number of edges removed due to mismatched callees (profiled vs IR)");
89STATISTIC(FoundProfiledCalleeCount,
90 "Number of profiled callees found via tail calls");
91STATISTIC(FoundProfiledCalleeDepth,
92 "Aggregate depth of profiled callees found via tail calls");
93STATISTIC(FoundProfiledCalleeMaxDepth,
94 "Maximum depth of profiled callees found via tail calls");
95STATISTIC(FoundProfiledCalleeNonUniquelyCount,
96 "Number of profiled callees found via multiple tail call chains");
97STATISTIC(DeferredBackedges, "Number of backedges with deferred cloning");
98STATISTIC(NewMergedNodes, "Number of new nodes created during merging");
99STATISTIC(NonNewMergedNodes, "Number of non new nodes used during merging");
100STATISTIC(MissingAllocForContextId,
101 "Number of missing alloc nodes for context ids");
102STATISTIC(SkippedCallsCloning,
103 "Number of calls skipped during cloning due to unexpected operand");
104STATISTIC(MismatchedCloneAssignments,
105 "Number of callsites assigned to call multiple non-matching clones");
106STATISTIC(TotalMergeInvokes, "Number of merge invocations for nodes");
107STATISTIC(TotalMergeIters, "Number of merge iterations for nodes");
108STATISTIC(MaxMergeIters, "Max merge iterations for nodes");
109STATISTIC(NumImportantContextIds, "Number of important context ids");
110STATISTIC(NumFixupEdgeIdsInserted, "Number of fixup edge ids inserted");
111STATISTIC(NumFixupEdgesAdded, "Number of fixup edges added");
112STATISTIC(NumFixedContexts, "Number of contexts with fixed edges");
113STATISTIC(AliaseesPrevailingInDiffModuleFromAlias,
114 "Number of aliasees prevailing in a different module than its alias");
115
117 "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
118 cl::value_desc("filename"),
119 cl::desc("Specify the path prefix of the MemProf dot files."));
120
121static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
123 cl::desc("Export graph to dot files."));
124
125// TODO: Remove this option once new handling is validated more widely.
127 "memprof-merge-iteration", cl::init(true), cl::Hidden,
128 cl::desc("Iteratively apply merging on a node to catch new callers"));
129
130// How much of the graph to export to dot.
132 All, // The full CCG graph.
133 Alloc, // Only contexts for the specified allocation.
134 Context, // Only the specified context.
135};
136
138 "memprof-dot-scope", cl::desc("Scope of graph to export to dot"),
141 clEnumValN(DotScope::All, "all", "Export full callsite graph"),
143 "Export only nodes with contexts feeding given "
144 "-memprof-dot-alloc-id"),
145 clEnumValN(DotScope::Context, "context",
146 "Export only nodes with given -memprof-dot-context-id")));
147
149 AllocIdForDot("memprof-dot-alloc-id", cl::init(0), cl::Hidden,
150 cl::desc("Id of alloc to export if -memprof-dot-scope=alloc "
151 "or to highlight if -memprof-dot-scope=all"));
152
154 "memprof-dot-context-id", cl::init(0), cl::Hidden,
155 cl::desc("Id of context to export if -memprof-dot-scope=context or to "
156 "highlight otherwise"));
157
158static cl::opt<bool>
159 DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
160 cl::desc("Dump CallingContextGraph to stdout after each stage."));
161
162static cl::opt<bool>
163 VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
164 cl::desc("Perform verification checks on CallingContextGraph."));
165
166static cl::opt<bool>
167 VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
168 cl::desc("Perform frequent verification checks on nodes."));
169
171 "memprof-import-summary",
172 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
173 cl::Hidden);
174
176 TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
178 cl::desc("Max depth to recursively search for missing "
179 "frames through tail calls."));
180
181// Optionally enable cloning of callsites involved with recursive cycles
183 "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden,
184 cl::desc("Allow cloning of callsites involved in recursive cycles"));
185
187 "memprof-clone-recursive-contexts", cl::init(true), cl::Hidden,
188 cl::desc("Allow cloning of contexts through recursive cycles"));
189
190// Generally this is needed for correct assignment of allocation clones to
191// function clones, however, allow it to be disabled for debugging while the
192// functionality is new and being tested more widely.
193static cl::opt<bool>
194 MergeClones("memprof-merge-clones", cl::init(true), cl::Hidden,
195 cl::desc("Merge clones before assigning functions"));
196
197// When disabled, try to detect and prevent cloning of recursive contexts.
198// This is only necessary until we support cloning through recursive cycles.
199// Leave on by default for now, as disabling requires a little bit of compile
200// time overhead and doesn't affect correctness, it will just inflate the cold
201// hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled.
203 "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden,
204 cl::desc("Allow cloning of contexts having recursive cycles"));
205
206// Set the minimum absolute count threshold for allowing inlining of indirect
207// calls promoted during cloning.
209 "memprof-icp-noinline-threshold", cl::init(0), cl::Hidden,
210 cl::desc("Minimum absolute count for promoted target to be inlinable"));
211
212namespace llvm {
214 "enable-memprof-context-disambiguation", cl::Hidden,
215 cl::desc("Enable MemProf context disambiguation"));
216
217// Indicate we are linking with an allocator that supports hot/cold operator
218// new interfaces.
220 "supports-hot-cold-new", cl::init(false), cl::Hidden,
221 cl::desc("Linking with hot/cold operator new interfaces"));
222
224 "memprof-require-definition-for-promotion", cl::init(false), cl::Hidden,
225 cl::desc(
226 "Require target function definition when promoting indirect calls"));
227
230
232 "memprof-top-n-important", cl::init(10), cl::Hidden,
233 cl::desc("Number of largest cold contexts to consider important"));
234
236 "memprof-fixup-important", cl::init(true), cl::Hidden,
237 cl::desc("Enables edge fixup for important contexts"));
238
240
241} // namespace llvm
242
243namespace {
244
245/// CRTP base for graphs built from either IR or ThinLTO summary index.
246///
247/// The graph represents the call contexts in all memprof metadata on allocation
248/// calls, with nodes for the allocations themselves, as well as for the calls
249/// in each context. The graph is initially built from the allocation memprof
250/// metadata (or summary) MIBs. It is then updated to match calls with callsite
251/// metadata onto the nodes, updating it to reflect any inlining performed on
252/// those calls.
253///
254/// Each MIB (representing an allocation's call context with allocation
255/// behavior) is assigned a unique context id during the graph build. The edges
256/// and nodes in the graph are decorated with the context ids they carry. This
257/// is used to correctly update the graph when cloning is performed so that we
258/// can uniquify the context for a single (possibly cloned) allocation.
259template <typename DerivedCCG, typename FuncTy, typename CallTy>
260class CallsiteContextGraph {
261public:
262 CallsiteContextGraph() = default;
263 CallsiteContextGraph(const CallsiteContextGraph &) = default;
264 CallsiteContextGraph(CallsiteContextGraph &&) = default;
265
266 /// Main entry point to perform analysis and transformations on graph.
267 bool process(function_ref<void(StringRef, StringRef, const Twine &)>
268 EmitRemark = nullptr,
269 bool AllowExtraAnalysis = false);
270
271 /// Perform cloning on the graph necessary to uniquely identify the allocation
272 /// behavior of an allocation based on its context.
273 void identifyClones();
274
275 /// Assign callsite clones to functions, cloning functions as needed to
276 /// accommodate the combinations of their callsite clones reached by callers.
277 /// For regular LTO this clones functions and callsites in the IR, but for
278 /// ThinLTO the cloning decisions are noted in the summaries and later applied
279 /// in applyImport.
280 bool assignFunctions();
281
282 void dump() const;
283 void print(raw_ostream &OS) const;
284 void printTotalSizes(raw_ostream &OS,
285 function_ref<void(StringRef, StringRef, const Twine &)>
286 EmitRemark = nullptr) const;
287
289 const CallsiteContextGraph &CCG) {
290 CCG.print(OS);
291 return OS;
292 }
293
294 friend struct GraphTraits<
295 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
296 friend struct DOTGraphTraits<
297 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
298
299 void exportToDot(std::string Label) const;
300
301 /// Represents a function clone via FuncTy pointer and clone number pair.
302 struct FuncInfo final
303 : public std::pair<FuncTy *, unsigned /*Clone number*/> {
304 using Base = std::pair<FuncTy *, unsigned>;
305 FuncInfo(const Base &B) : Base(B) {}
306 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
307 explicit operator bool() const { return this->first != nullptr; }
308 FuncTy *func() const { return this->first; }
309 unsigned cloneNo() const { return this->second; }
310 };
311
312 /// Represents a callsite clone via CallTy and clone number pair.
313 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
314 using Base = std::pair<CallTy, unsigned>;
315 CallInfo(const Base &B) : Base(B) {}
316 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
317 : Base(Call, CloneNo) {}
318 explicit operator bool() const { return (bool)this->first; }
319 CallTy call() const { return this->first; }
320 unsigned cloneNo() const { return this->second; }
321 void setCloneNo(unsigned N) { this->second = N; }
322 void print(raw_ostream &OS) const {
323 if (!operator bool()) {
324 assert(!cloneNo());
325 OS << "null Call";
326 return;
327 }
328 call()->print(OS);
329 OS << "\t(clone " << cloneNo() << ")";
330 }
331 void dump() const {
332 print(dbgs());
333 dbgs() << "\n";
334 }
335 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
336 Call.print(OS);
337 return OS;
338 }
339 };
340
341 struct ContextEdge;
342
343 /// Node in the Callsite Context Graph
344 struct ContextNode {
345 // Assigned to nodes as they are created, useful for debugging.
346 unsigned NodeId = 0;
347
348 // Keep this for now since in the IR case where we have an Instruction* it
349 // is not as immediately discoverable. Used for printing richer information
350 // when dumping graph.
351 bool IsAllocation;
352
353 // Keeps track of when the Call was reset to null because there was
354 // recursion.
355 bool Recursive = false;
356
357 // This will be formed by ORing together the AllocationType enum values
358 // for contexts including this node.
359 uint8_t AllocTypes = 0;
360
361 // The corresponding allocation or interior call. This is the primary call
362 // for which we have created this node.
363 CallInfo Call;
364
365 // List of other calls that can be treated the same as the primary call
366 // through cloning. I.e. located in the same function and have the same
367 // (possibly pruned) stack ids. They will be updated the same way as the
368 // primary call when assigning to function clones.
369 SmallVector<CallInfo, 0> MatchingCalls;
370
371 // For alloc nodes this is a unique id assigned when constructed, and for
372 // callsite stack nodes it is the original stack id when the node is
373 // constructed from the memprof MIB metadata on the alloc nodes. Note that
374 // this is only used when matching callsite metadata onto the stack nodes
375 // created when processing the allocation memprof MIBs, and for labeling
376 // nodes in the dot graph. Therefore we don't bother to assign a value for
377 // clones.
378 uint64_t OrigStackOrAllocId = 0;
379
380 // Edges to all callees in the profiled call stacks.
381 // TODO: Should this be a map (from Callee node) for more efficient lookup?
382 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
383
384 // Edges to all callers in the profiled call stacks.
385 // TODO: Should this be a map (from Caller node) for more efficient lookup?
386 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
387
388 // Returns true if we need to look at the caller edges (in addition to the
389 // callee edges) for determining the node context ids and allocation type.
390 bool useCallerEdgesForContextInfo() const {
391 // Typically if the callee edges are empty either the caller edges are
392 // also empty, or this is an allocation (leaf node). However, if we are
393 // allowing recursive callsites and contexts this will be violated for
394 // incompletely cloned recursive cycles.
395 assert(!CalleeEdges.empty() || CallerEdges.empty() || IsAllocation ||
397 // When cloning for a recursive context, during cloning we might be in the
398 // midst of cloning for a recurrence and have moved context ids off of a
399 // caller edge onto the clone but not yet off of the incoming caller
400 // (back) edge. If we don't look at those we miss the fact that this node
401 // still has context ids of interest.
402 return IsAllocation || CloneRecursiveContexts;
403 }
404
405 // Compute the context ids for this node from the union of its edge context
406 // ids.
407 DenseSet<uint32_t> getContextIds() const {
408 unsigned Count = 0;
409 // Compute the number of ids for reserve below. In general we only need to
410 // look at one set of edges, typically the callee edges, since other than
411 // allocations and in some cases during recursion cloning, all the context
412 // ids on the callers should also flow out via callee edges.
413 for (auto &Edge : CalleeEdges.empty() ? CallerEdges : CalleeEdges)
414 Count += Edge->getContextIds().size();
415 DenseSet<uint32_t> ContextIds;
416 ContextIds.reserve(Count);
418 CalleeEdges, useCallerEdgesForContextInfo()
419 ? CallerEdges
420 : std::vector<std::shared_ptr<ContextEdge>>());
421 for (const auto &Edge : Edges)
422 ContextIds.insert_range(Edge->getContextIds());
423 return ContextIds;
424 }
425
426 // Compute the allocation type for this node from the OR of its edge
427 // allocation types.
428 uint8_t computeAllocType() const {
429 uint8_t BothTypes =
433 CalleeEdges, useCallerEdgesForContextInfo()
434 ? CallerEdges
435 : std::vector<std::shared_ptr<ContextEdge>>());
436 for (const auto &Edge : Edges) {
437 AllocType |= Edge->AllocTypes;
438 // Bail early if alloc type reached both, no further refinement.
439 if (AllocType == BothTypes)
440 return AllocType;
441 }
442 return AllocType;
443 }
444
445 // The context ids set for this node is empty if its edge context ids are
446 // also all empty.
447 bool emptyContextIds() const {
449 CalleeEdges, useCallerEdgesForContextInfo()
450 ? CallerEdges
451 : std::vector<std::shared_ptr<ContextEdge>>());
452 for (const auto &Edge : Edges) {
453 if (!Edge->getContextIds().empty())
454 return false;
455 }
456 return true;
457 }
458
459 // List of clones of this ContextNode, initially empty.
460 std::vector<ContextNode *> Clones;
461
462 // If a clone, points to the original uncloned node.
463 ContextNode *CloneOf = nullptr;
464
465 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
466
467 ContextNode(bool IsAllocation, CallInfo C)
468 : IsAllocation(IsAllocation), Call(C) {}
469
470 void addClone(ContextNode *Clone) {
471 if (CloneOf) {
472 CloneOf->Clones.push_back(Clone);
473 Clone->CloneOf = CloneOf;
474 } else {
475 Clones.push_back(Clone);
476 assert(!Clone->CloneOf);
477 Clone->CloneOf = this;
478 }
479 }
480
481 ContextNode *getOrigNode() {
482 if (!CloneOf)
483 return this;
484 return CloneOf;
485 }
486
487 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
488 unsigned int ContextId);
489
490 ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
491 ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
492 void eraseCalleeEdge(const ContextEdge *Edge);
493 void eraseCallerEdge(const ContextEdge *Edge);
494
495 void setCall(CallInfo C) { Call = std::move(C); }
496
497 bool hasCall() const { return (bool)Call.call(); }
498
499 void printCall(raw_ostream &OS) const { Call.print(OS); }
500
501 // True if this node was effectively removed from the graph, in which case
502 // it should have an allocation type of None and empty context ids.
503 bool isRemoved() const {
504 // Typically if the callee edges are empty either the caller edges are
505 // also empty, or this is an allocation (leaf node). However, if we are
506 // allowing recursive callsites and contexts this will be violated for
507 // incompletely cloned recursive cycles.
509 (AllocTypes == (uint8_t)AllocationType::None) ==
510 emptyContextIds());
511 return AllocTypes == (uint8_t)AllocationType::None;
512 }
513
514 void dump() const;
515 void print(raw_ostream &OS) const;
516
517 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
518 Node.print(OS);
519 return OS;
520 }
521 };
522
523 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
524 /// callee.
525 struct ContextEdge {
526 ContextNode *Callee;
527 ContextNode *Caller;
528
529 // This will be formed by ORing together the AllocationType enum values
530 // for contexts including this edge.
531 uint8_t AllocTypes = 0;
532
533 // Set just before initiating cloning when cloning of recursive contexts is
534 // enabled. Used to defer cloning of backedges until we have done cloning of
535 // the callee node for non-backedge caller edges. This exposes cloning
536 // opportunities through the backedge of the cycle.
537 // TODO: Note that this is not updated during cloning, and it is unclear
538 // whether that would be needed.
539 bool IsBackedge = false;
540
541 // The set of IDs for contexts including this edge.
542 DenseSet<uint32_t> ContextIds;
543
544 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
545 DenseSet<uint32_t> ContextIds)
546 : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
547 ContextIds(std::move(ContextIds)) {}
548
549 DenseSet<uint32_t> &getContextIds() { return ContextIds; }
550
551 // Helper to clear the fields of this edge when we are removing it from the
552 // graph.
553 inline void clear() {
554 ContextIds.clear();
555 AllocTypes = (uint8_t)AllocationType::None;
556 Caller = nullptr;
557 Callee = nullptr;
558 }
559
560 // Check if edge was removed from the graph. This is useful while iterating
561 // over a copy of edge lists when performing operations that mutate the
562 // graph in ways that might remove one of the edges.
563 inline bool isRemoved() const {
564 if (Callee || Caller)
565 return false;
566 // Any edges that have been removed from the graph but are still in a
567 // shared_ptr somewhere should have all fields null'ed out by clear()
568 // above.
569 assert(AllocTypes == (uint8_t)AllocationType::None);
570 assert(ContextIds.empty());
571 return true;
572 }
573
574 void dump() const;
575 void print(raw_ostream &OS) const;
576
577 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
578 Edge.print(OS);
579 return OS;
580 }
581 };
582
583 /// Helpers to remove edges that have allocation type None (due to not
584 /// carrying any context ids) after transformations.
585 void removeNoneTypeCalleeEdges(ContextNode *Node);
586 void removeNoneTypeCallerEdges(ContextNode *Node);
587 void
588 recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
590
591protected:
592 /// Get a list of nodes corresponding to the stack ids in the given callsite
593 /// context.
594 template <class NodeT, class IteratorT>
595 std::vector<uint64_t>
596 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
597
598 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
599 /// metadata (or summary).
600 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
601
602 /// Adds nodes for the given MIB stack ids.
603 template <class NodeT, class IteratorT>
604 void addStackNodesForMIB(
605 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
607 ArrayRef<ContextTotalSize> ContextSizeInfo,
608 std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold);
609
610 /// Matches all callsite metadata (or summary) to the nodes created for
611 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
612 /// inlining performed on those callsite instructions.
613 void updateStackNodes();
614
615 /// Optionally fixup edges for the N largest cold contexts to better enable
616 /// cloning. This is particularly helpful if the context includes recursion
617 /// as well as inlining, resulting in a single stack node for multiple stack
618 /// ids in the context. With recursion it is particularly difficult to get the
619 /// edge updates correct as in the general case we have lost the original
620 /// stack id ordering for the context. Do more expensive fixup for the largest
621 /// contexts, controlled by MemProfTopNImportant and MemProfFixupImportant.
622 void fixupImportantContexts();
623
624 /// Update graph to conservatively handle any callsite stack nodes that target
625 /// multiple different callee target functions.
626 void handleCallsitesWithMultipleTargets();
627
628 /// Mark backedges via the standard DFS based backedge algorithm.
629 void markBackedges();
630
631 /// Merge clones generated during cloning for different allocations but that
632 /// are called by the same caller node, to ensure proper function assignment.
633 void mergeClones();
634
635 // Try to partition calls on the given node (already placed into the AllCalls
636 // array) by callee function, creating new copies of Node as needed to hold
637 // calls with different callees, and moving the callee edges appropriately.
638 // Returns true if partitioning was successful.
639 bool partitionCallsByCallee(
640 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
641 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode);
642
643 /// Save lists of calls with MemProf metadata in each function, for faster
644 /// iteration.
645 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
646
647 /// Map from callsite node to the enclosing caller function.
648 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
649
650 // When exporting to dot, and an allocation id is specified, contains the
651 // context ids on that allocation.
652 DenseSet<uint32_t> DotAllocContextIds;
653
654private:
655 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
656
657 // Structure to keep track of information for each call as we are matching
658 // non-allocation callsites onto context nodes created from the allocation
659 // call metadata / summary contexts.
660 struct CallContextInfo {
661 // The callsite we're trying to match.
662 CallTy Call;
663 // The callsites stack ids that have a context node in the graph.
664 std::vector<uint64_t> StackIds;
665 // The function containing this callsite.
666 const FuncTy *Func;
667 // Initially empty, if needed this will be updated to contain the context
668 // ids for use in a new context node created for this callsite.
669 DenseSet<uint32_t> ContextIds;
670 };
671
672 /// Helper to remove edge from graph, updating edge iterator if it is provided
673 /// (in which case CalleeIter indicates which edge list is being iterated).
674 /// This will also perform the necessary clearing of the ContextEdge members
675 /// to enable later checking if the edge has been removed (since we may have
676 /// other copies of the shared_ptr in existence, and in fact rely on this to
677 /// enable removal while iterating over a copy of a node's edge list).
678 void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr,
679 bool CalleeIter = true);
680
681 /// Assigns the given Node to calls at or inlined into the location with
682 /// the Node's stack id, after post order traversing and processing its
683 /// caller nodes. Uses the call information recorded in the given
684 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
685 /// as needed. Called by updateStackNodes which sets up the given
686 /// StackIdToMatchingCalls map.
687 void assignStackNodesPostOrder(
688 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
689 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
690 DenseMap<CallInfo, CallInfo> &CallToMatchingCall,
691 const DenseSet<uint32_t> &ImportantContextIds);
692
693 /// Duplicates the given set of context ids, updating the provided
694 /// map from each original id with the newly generated context ids,
695 /// and returning the new duplicated id set.
696 DenseSet<uint32_t> duplicateContextIds(
697 const DenseSet<uint32_t> &StackSequenceContextIds,
698 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
699
700 /// Propagates all duplicated context ids across the graph.
701 void propagateDuplicateContextIds(
702 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
703
704 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
705 /// else to its callers. Also updates OrigNode's edges to remove any context
706 /// ids moved to the newly created edge.
707 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
708 bool TowardsCallee,
709 DenseSet<uint32_t> RemainingContextIds);
710
711 /// Get the stack id corresponding to the given Id or Index (for IR this will
712 /// return itself, for a summary index this will return the id recorded in the
713 /// index for that stack id index value).
714 uint64_t getStackId(uint64_t IdOrIndex) const {
715 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
716 }
717
718 /// Returns true if the given call targets the callee of the given edge, or if
719 /// we were able to identify the call chain through intermediate tail calls.
720 /// In the latter case new context nodes are added to the graph for the
721 /// identified tail calls, and their synthesized nodes are added to
722 /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
723 /// the updated edges and to prepare it for an increment in the caller.
724 bool
725 calleesMatch(CallTy Call, EdgeIter &EI,
726 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
727
728 // Return the callee function of the given call, or nullptr if it can't be
729 // determined
730 const FuncTy *getCalleeFunc(CallTy Call) {
731 return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call);
732 }
733
734 /// Returns true if the given call targets the given function, or if we were
735 /// able to identify the call chain through intermediate tail calls (in which
736 /// case FoundCalleeChain will be populated).
737 bool calleeMatchesFunc(
738 CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
739 std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
740 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
741 Call, Func, CallerFunc, FoundCalleeChain);
742 }
743
744 /// Returns true if both call instructions have the same callee.
745 bool sameCallee(CallTy Call1, CallTy Call2) {
746 return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2);
747 }
748
749 /// Get a list of nodes corresponding to the stack ids in the given
750 /// callsite's context.
751 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
752 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
753 Call);
754 }
755
756 /// Get the last stack id in the context for callsite.
757 uint64_t getLastStackId(CallTy Call) {
758 return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
759 }
760
761 /// Update the allocation call to record type of allocated memory.
762 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
763 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
764 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
765 }
766
767 /// Get the AllocationType assigned to the given allocation instruction clone.
768 AllocationType getAllocationCallType(const CallInfo &Call) const {
769 return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
770 }
771
772 /// Update non-allocation call to invoke (possibly cloned) function
773 /// CalleeFunc.
774 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
775 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
776 }
777
778 /// Clone the given function for the given callsite, recording mapping of all
779 /// of the functions tracked calls to their new versions in the CallMap.
780 /// Assigns new clones to clone number CloneNo.
781 FuncInfo cloneFunctionForCallsite(
782 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
783 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
784 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
785 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
786 }
787
788 /// Gets a label to use in the dot graph for the given call clone in the given
789 /// function.
790 std::string getLabel(const FuncTy *Func, const CallTy Call,
791 unsigned CloneNo) const {
792 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
793 }
794
795 // Create and return a new ContextNode.
796 ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr,
797 CallInfo C = CallInfo()) {
798 NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C));
799 auto *NewNode = NodeOwner.back().get();
800 if (F)
801 NodeToCallingFunc[NewNode] = F;
802 NewNode->NodeId = NodeOwner.size();
803 return NewNode;
804 }
805
806 /// Helpers to find the node corresponding to the given call or stackid.
/// getNodeForInst consults the allocation call map first and falls back to
/// the non-allocation call map. getNodeForStackId returns nullptr when no
/// node has been recorded for StackId.
807 ContextNode *getNodeForInst(const CallInfo &C);
808 ContextNode *getNodeForAlloc(const CallInfo &C);
809 ContextNode *getNodeForStackId(uint64_t StackId);
810
811 /// Computes the alloc type corresponding to the given context ids, by
812 /// unioning their recorded alloc types.
/// The result is a bitmask of AllocationType values; the scan stops early
/// once both the Cold and NotCold bits are set.
813 uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds) const;
814
815 /// Returns the allocation type of the intersection of the contexts of two
816 /// nodes (based on their provided context id sets), optimized for the case
817 /// when Node1Ids is smaller than Node2Ids.
/// The implementation iterates over Node1Ids and probes Node2Ids for each id.
818 uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
819 const DenseSet<uint32_t> &Node2Ids) const;
820
821 /// Returns the allocation type of the intersection of the contexts of two
822 /// nodes (based on their provided context id sets).
/// Forwards to intersectAllocTypesImpl, passing the smaller set as the one
/// to iterate over.
823 uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
824 const DenseSet<uint32_t> &Node2Ids) const;
825
826 /// Create a clone of Edge's callee and move Edge to that new callee node,
827 /// performing the necessary context id and allocation type updates.
828 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
829 /// moved to an edge to the new callee.
/// ContextIdsToMove defaults to the empty set.
/// NOTE(review): the returned node is presumably the newly created clone --
/// confirm against the definition.
830 ContextNode *
831 moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
832 DenseSet<uint32_t> ContextIdsToMove = {});
833
834 /// Change the callee of Edge to existing callee clone NewCallee, performing
835 /// the necessary context id and allocation type updates.
836 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
837 /// moved to an edge to the new callee.
/// NOTE(review): NewClone presumably signals that NewCallee was just created
/// by the caller -- confirm against the definition.
838 void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
839 ContextNode *NewCallee,
840 bool NewClone = false,
841 DenseSet<uint32_t> ContextIdsToMove = {});
842
843 /// Change the caller of the given callee edge Edge to be
844 /// NewCaller, performing the necessary context id and allocation type
845 /// updates. This is similar to the above moveEdgeToExistingCalleeClone, but
846 /// a simplified version of it as we always move the given edge and all of its
847 /// context ids.
848 void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
849 ContextNode *NewCaller);
850
851 /// Recursive helper for marking backedges via DFS.
/// NOTE(review): Visited and CurrentStack are presumably the set of nodes
/// already explored and the set of nodes on the active DFS path -- confirm
/// against the definition.
852 void markBackedges(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
853 DenseSet<const ContextNode *> &CurrentStack);
854
855 /// Recursive helper for merging clones.
856 void
857 mergeClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
858 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
859 /// Main worker for merging callee clones for a given node.
860 void mergeNodeCalleeClones(
861 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
862 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
863 /// Helper to find other callers of the given set of callee edges that can
864 /// share the same callee merge node.
/// Callers found are reported through the OtherCallersToShareMerge
/// out-parameter.
865 void findOtherCallersToShareMerge(
866 ContextNode *Node, std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
867 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
868 DenseSet<ContextNode *> &OtherCallersToShareMerge);
869
870 /// Recursively perform cloning on the graph for the given Node and its
871 /// callers, in order to uniquely identify the allocation behavior of an
872 /// allocation given its context. The context ids of the allocation being
873 /// processed are given in AllocContextIds.
874 void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
875 const DenseSet<uint32_t> &AllocContextIds);
876
877 /// Map from each context ID to the AllocationType assigned to that context.
/// Entries are added by addStackNodesForMIB (one per MIB context) and by
/// duplicateContextIds (copying the type of the original id).
878 DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
879
880 /// Map from each contextID to the profiled full contexts and their total
881 /// sizes (there may be more than one due to context trimming),
882 /// optionally populated when requested (via MemProfReportHintedSizes or
883 /// MinClonedColdBytePercent).
/// addStackNodesForMIB only creates an entry when it is handed a non-empty
/// ContextSizeInfo array.
884 DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;
885
886 /// Identifies the context node created for a stack id when adding the MIB
887 /// contexts to the graph. This is used to locate the context nodes when
888 /// trying to assign the corresponding callsites with those stack ids to these
889 /// nodes.
/// Written by addStackNodesForMIB and queried through getNodeForStackId.
890 DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
891
892 /// Saves information for the contexts identified as important (the largest
893 /// cold contexts up to MemProfTopNImportant).
894 struct ImportantContextInfo {
895 // The original list of leaf first stack ids corresponding to this context.
// Appended to by addStackNodesForMIB while it walks the MIB's stack context.
896 std::vector<uint64_t> StackIds;
897 // Max length of stack ids corresponding to a single stack ContextNode for
898 // this context (i.e. the max length of a key in StackIdsToNode below).
// Maintained by recordStackNode.
899 unsigned MaxLength = 0;
900 // Mapping of slices of the stack ids to the corresponding ContextNode
901 // (there can be multiple stack ids due to inlining). Populated when
902 // updating stack nodes while matching them to the IR or summary.
903 std::map<std::vector<uint64_t>, ContextNode *> StackIdsToNode;
904 };
905
906 // Map of important full context ids to information about each.
// addStackNodesForMIB erases an id's entry when that context is evicted from
// the top-N set.
907 DenseMap<uint32_t, ImportantContextInfo> ImportantContextIdInfo;
908
909 // For each important context id found in Node (if any), records the list of
910 // stack ids that corresponded to the given callsite Node. There can be more
911 // than one in the case of inlining.
912 void recordStackNode(std::vector<uint64_t> &StackIds, ContextNode *Node,
913 // We pass in the Node's context ids to avoid the
914 // overhead of computing them as the caller already has
915 // them in some cases.
916 const DenseSet<uint32_t> &NodeContextIds,
917 const DenseSet<uint32_t> &ImportantContextIds) {
919 assert(ImportantContextIds.empty());
920 return;
921 }
923 set_intersection(NodeContextIds, ImportantContextIds);
924 if (Ids.empty())
925 return;
926 auto Size = StackIds.size();
927 for (auto Id : Ids) {
928 auto &Entry = ImportantContextIdInfo[Id];
929 Entry.StackIdsToNode[StackIds] = Node;
930 // Keep track of the max to simplify later analysis.
931 if (Size > Entry.MaxLength)
932 Entry.MaxLength = Size;
933 }
934 }
935
936 /// Maps to track the calls to their corresponding nodes in the graph.
/// addAllocNode inserts into the allocation map; getNodeForInst consults the
/// allocation map first and then the non-allocation map.
937 MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
938 MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
939
940 /// Owner of all ContextNode unique_ptrs.
/// createNewNode appends here and derives each NodeId from the resulting size.
941 std::vector<std::unique_ptr<ContextNode>> NodeOwner;
942
943 /// Perform sanity checks on graph when requested.
944 void check() const;
945
946 /// Keeps track of the last unique context id assigned.
/// It is pre-incremented before each use, so the first id handed out is 1.
947 unsigned int LastContextId = 0;
948};
949
// Alias templates that let code outside CallsiteContextGraph name its nested
// ContextNode, ContextEdge, FuncInfo and CallInfo types without spelling out
// the full dependent type each time.
950template <typename DerivedCCG, typename FuncTy, typename CallTy>
951using ContextNode =
952 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
953template <typename DerivedCCG, typename FuncTy, typename CallTy>
954using ContextEdge =
955 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
956template <typename DerivedCCG, typename FuncTy, typename CallTy>
957using FuncInfo =
958 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
959template <typename DerivedCCG, typename FuncTy, typename CallTy>
960using CallInfo =
961 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
962
963/// CRTP derived class for graphs built from IR (regular LTO).
/// Instantiates the base graph with Function as the function type and
/// Instruction * as the call type.
964class ModuleCallsiteContextGraph
965 : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
966 Instruction *> {
967public:
968 ModuleCallsiteContextGraph(
969 Module &M,
970 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
971
972private:
// The base class reaches the private implementations below through
// static_cast<DerivedCCG *>(this), hence the friend declaration.
973 friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
974 Instruction *>;
975
// IR-specific implementations of the hooks the CRTP base forwards to.
976 uint64_t getStackId(uint64_t IdOrIndex) const;
977 const Function *getCalleeFunc(Instruction *Call);
978 bool calleeMatchesFunc(
979 Instruction *Call, const Function *Func, const Function *CallerFunc,
980 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
981 bool sameCallee(Instruction *Call1, Instruction *Call2);
982 bool findProfiledCalleeThroughTailCalls(
983 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
984 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
985 bool &FoundMultipleCalleeChains);
986 uint64_t getLastStackId(Instruction *Call);
987 std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
988 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
989 AllocationType getAllocationCallType(const CallInfo &Call) const;
990 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
991 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
992 Instruction *>::FuncInfo
993 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
994 DenseMap<CallInfo, CallInfo> &CallMap,
995 std::vector<CallInfo> &CallsWithMetadataInFunc,
996 unsigned CloneNo);
997 std::string getLabel(const Function *Func, const Instruction *Call,
998 unsigned CloneNo) const;
999
// NOTE(review): presumably the module and remark-emitter getter handed to the
// constructor -- confirm against the constructor definition.
1000 const Module &Mod;
1001 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
1002};
1003
1004/// Represents a call in the summary index graph, which can either be an
1005/// allocation or an interior callsite node in an allocation's context.
1006/// Holds a pointer to the corresponding data structure in the index.
1007struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
1008 IndexCall() : PointerUnion() {}
1009 IndexCall(std::nullptr_t) : IndexCall() {}
1010 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
1011 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
1012 IndexCall(PointerUnion PT) : PointerUnion(PT) {}
1013
1014 IndexCall *operator->() { return this; }
1015
1016 void print(raw_ostream &OS) const {
1017 PointerUnion<CallsiteInfo *, AllocInfo *> Base = *this;
1019 OS << *AI;
1020 } else {
1022 assert(CI);
1023 OS << *CI;
1024 }
1025 }
1026};
1027} // namespace
1028
1029namespace llvm {
1030template <> struct simplify_type<IndexCall> {
1032 static SimpleType getSimplifiedValue(IndexCall &Val) { return Val; }
1033};
1034template <> struct simplify_type<const IndexCall> {
1036 static SimpleType getSimplifiedValue(const IndexCall &Val) { return Val; }
1037};
1038} // namespace llvm
1039
1040namespace {
1041/// CRTP derived class for graphs built from summary index (ThinLTO).
/// Instantiates the base graph with FunctionSummary as the function type and
/// IndexCall as the call type.
1042class IndexCallsiteContextGraph
1043 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1044 IndexCall> {
1045public:
1046 IndexCallsiteContextGraph(
1047 ModuleSummaryIndex &Index,
1048 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1049 isPrevailing);
1050
1051 ~IndexCallsiteContextGraph() {
1052 // Now that we are done with the graph it is safe to add the new
1053 // CallsiteInfo structs to the function summary vectors. The graph nodes
1054 // point into locations within these vectors, so we don't want to add them
1055 // any earlier.
1056 for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
1057 auto *FS = I.first;
// Each synthesized CallsiteInfo is moved out of its owning unique_ptr.
1058 for (auto &Callsite : I.second)
1059 FS->addCallsite(std::move(*Callsite.second));
1060 }
1061 }
1062
1063private:
// The base class reaches the private implementations below through
// static_cast<DerivedCCG *>(this), hence the friend declaration.
1064 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1065 IndexCall>;
1066
// Summary-index-specific implementations of the hooks the CRTP base forwards
// to.
1067 uint64_t getStackId(uint64_t IdOrIndex) const;
1068 const FunctionSummary *getCalleeFunc(IndexCall &Call);
1069 bool calleeMatchesFunc(
1070 IndexCall &Call, const FunctionSummary *Func,
1071 const FunctionSummary *CallerFunc,
1072 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
1073 bool sameCallee(IndexCall &Call1, IndexCall &Call2);
1074 bool findProfiledCalleeThroughTailCalls(
1075 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
1076 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
1077 bool &FoundMultipleCalleeChains);
1078 uint64_t getLastStackId(IndexCall &Call);
1079 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
1080 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
1081 AllocationType getAllocationCallType(const CallInfo &Call) const;
1082 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
1083 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
1084 IndexCall>::FuncInfo
1085 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
1086 DenseMap<CallInfo, CallInfo> &CallMap,
1087 std::vector<CallInfo> &CallsWithMetadataInFunc,
1088 unsigned CloneNo);
1089 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
1090 unsigned CloneNo) const;
1091 DenseSet<GlobalValue::GUID> findAliaseeGUIDsPrevailingInDifferentModule();
1092
1093 // Saves mapping from function summaries containing memprof records back to
1094 // its VI, for use in checking and debugging.
1095 std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
1096
// NOTE(review): presumably the index and prevailing-symbol callback handed to
// the constructor -- confirm against the constructor definition.
1097 const ModuleSummaryIndex &Index;
1098 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1099 isPrevailing;
1100
1101 // Saves/owns the callsite info structures synthesized for missing tail call
1102 // frames that we discover while building the graph.
1103 // It maps from the summary of the function making the tail call, to a map
1104 // of callee ValueInfo to corresponding synthesized callsite info.
// The destructor hands these over to their function summaries.
1105 DenseMap<FunctionSummary *,
1106 std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
1107 FunctionCalleesToSynthesizedCallsiteInfos;
1108};
1109} // namespace
1110
1111template <>
1112struct llvm::DenseMapInfo<CallsiteContextGraph<
1113 ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
// Allows the summary index graph's CallInfo to be used as a DenseMap key by
// inheriting the DenseMapInfo of std::pair<IndexCall, unsigned>.
// NOTE(review): presumably CallInfo is itself derived from that pair type --
// confirm against the CallInfo definition.
1115template <>
1116struct llvm::DenseMapInfo<CallsiteContextGraph<
1117 IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
1118 : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
// Allows IndexCall to be used as a DenseMap key by inheriting the
// DenseMapInfo of the PointerUnion it derives from.
1119template <>
1120struct llvm::DenseMapInfo<IndexCall>
1121 : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
1122
1123namespace {
1124
1125// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
1126// type we should actually use on the corresponding allocation.
1127// If we can't clone a node that has NotCold+Cold alloc type, we will fall
1128// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
1129// from NotCold.
1130AllocationType allocTypeToUse(uint8_t AllocTypes) {
1131 assert(AllocTypes != (uint8_t)AllocationType::None);
1132 if (AllocTypes ==
1135 else
1136 return (AllocationType)AllocTypes;
1137}
1138
1139// Helper to check if the alloc types for all edges recorded in the
1140// InAllocTypes vector match the alloc types for all edges in the Edges
1141// vector.
1142template <typename DerivedCCG, typename FuncTy, typename CallTy>
1143bool allocTypesMatch(
1144 const std::vector<uint8_t> &InAllocTypes,
1145 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
1146 &Edges) {
1147 // This should be called only when the InAllocTypes vector was computed for
1148 // this set of Edges. Make sure the sizes are the same.
1149 assert(InAllocTypes.size() == Edges.size());
1150 return std::equal(
1151 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(),
1152 [](const uint8_t &l,
1153 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
1154 // Can share if one of the edges is None type - don't
1155 // care about the type along that edge as it doesn't
1156 // exist for those context ids.
1157 if (l == (uint8_t)AllocationType::None ||
1158 r->AllocTypes == (uint8_t)AllocationType::None)
1159 return true;
1160 return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
1161 });
1162}
1163
1164// Helper to check if the alloc types for all edges recorded in the
1165// InAllocTypes vector match the alloc types for callee edges in the given
1166// clone. Because the InAllocTypes were computed from the original node's callee
1167// edges, and other cloning could have happened after this clone was created, we
1168// need to find the matching clone callee edge, which may or may not exist.
1169template <typename DerivedCCG, typename FuncTy, typename CallTy>
1170bool allocTypesMatchClone(
1171 const std::vector<uint8_t> &InAllocTypes,
1172 const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) {
1173 const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf;
1174 assert(Node);
1175 // InAllocTypes should have been computed for the original node's callee
1176 // edges.
1177 assert(InAllocTypes.size() == Node->CalleeEdges.size());
1178 // First create a map of the clone callee edge callees to the edge alloc type.
1180 EdgeCalleeMap;
1181 for (const auto &E : Clone->CalleeEdges) {
1182 assert(!EdgeCalleeMap.contains(E->Callee));
1183 EdgeCalleeMap[E->Callee] = E->AllocTypes;
1184 }
1185 // Next, walk the original node's callees, and look for the corresponding
1186 // clone edge to that callee.
1187 for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) {
1188 auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee);
1189 // Not found is ok, we will simply add an edge if we use this clone.
1190 if (Iter == EdgeCalleeMap.end())
1191 continue;
1192 // Can share if one of the edges is None type - don't
1193 // care about the type along that edge as it doesn't
1194 // exist for those context ids.
1195 if (InAllocTypes[I] == (uint8_t)AllocationType::None ||
1196 Iter->second == (uint8_t)AllocationType::None)
1197 continue;
1198 if (allocTypeToUse(Iter->second) != allocTypeToUse(InAllocTypes[I]))
1199 return false;
1200 }
1201 return true;
1202}
1203
1204} // end anonymous namespace
1205
1206template <typename DerivedCCG, typename FuncTy, typename CallTy>
1207typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1208CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
1209 const CallInfo &C) {
1210 ContextNode *Node = getNodeForAlloc(C);
1211 if (Node)
1212 return Node;
1213
1214 return NonAllocationCallToContextNodeMap.lookup(C);
1215}
1216
1217template <typename DerivedCCG, typename FuncTy, typename CallTy>
1218typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1219CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
1220 const CallInfo &C) {
1221 return AllocationCallToContextNodeMap.lookup(C);
1222}
1223
1224template <typename DerivedCCG, typename FuncTy, typename CallTy>
1225typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1226CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
1227 uint64_t StackId) {
1228 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
1229 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
1230 return StackEntryNode->second;
1231 return nullptr;
1232}
1233
1234template <typename DerivedCCG, typename FuncTy, typename CallTy>
1235void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1236 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
1237 unsigned int ContextId) {
1238 for (auto &Edge : CallerEdges) {
1239 if (Edge->Caller == Caller) {
1240 Edge->AllocTypes |= (uint8_t)AllocType;
1241 Edge->getContextIds().insert(ContextId);
1242 return;
1243 }
1244 }
1245 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
1246 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
1247 CallerEdges.push_back(Edge);
1248 Caller->CalleeEdges.push_back(Edge);
1249}
1250
1251template <typename DerivedCCG, typename FuncTy, typename CallTy>
1252void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph(
1253 ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) {
1254 assert(!EI || (*EI)->get() == Edge);
1255 assert(!Edge->isRemoved());
1256 // Save the Caller and Callee pointers so we can erase Edge from their edge
1257 // lists after clearing Edge below. We do the clearing first in case it is
1258 // destructed after removing from the edge lists (if those were the last
1259 // shared_ptr references to Edge).
1260 auto *Callee = Edge->Callee;
1261 auto *Caller = Edge->Caller;
1262
1263 // Make sure the edge fields are cleared out so we can properly detect
1264 // removed edges if Edge is not destructed because there is still a shared_ptr
1265 // reference.
1266 Edge->clear();
1267
1268#ifndef NDEBUG
1269 auto CalleeCallerCount = Callee->CallerEdges.size();
1270 auto CallerCalleeCount = Caller->CalleeEdges.size();
1271#endif
1272 if (!EI) {
1273 Callee->eraseCallerEdge(Edge);
1274 Caller->eraseCalleeEdge(Edge);
1275 } else if (CalleeIter) {
1276 Callee->eraseCallerEdge(Edge);
1277 *EI = Caller->CalleeEdges.erase(*EI);
1278 } else {
1279 Caller->eraseCalleeEdge(Edge);
1280 *EI = Callee->CallerEdges.erase(*EI);
1281 }
1282 assert(Callee->CallerEdges.size() < CalleeCallerCount);
1283 assert(Caller->CalleeEdges.size() < CallerCalleeCount);
1284}
1285
1286template <typename DerivedCCG, typename FuncTy, typename CallTy>
1287void CallsiteContextGraph<
1288 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
1289 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1290 auto Edge = *EI;
1291 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1292 assert(Edge->ContextIds.empty());
1293 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
1294 } else
1295 ++EI;
1296 }
1297}
1298
1299template <typename DerivedCCG, typename FuncTy, typename CallTy>
1300void CallsiteContextGraph<
1301 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) {
1302 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
1303 auto Edge = *EI;
1304 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1305 assert(Edge->ContextIds.empty());
1306 Edge->Caller->eraseCalleeEdge(Edge.get());
1307 EI = Node->CallerEdges.erase(EI);
1308 } else
1309 ++EI;
1310 }
1311}
1312
1313template <typename DerivedCCG, typename FuncTy, typename CallTy>
1314typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1315CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1316 findEdgeFromCallee(const ContextNode *Callee) {
1317 for (const auto &Edge : CalleeEdges)
1318 if (Edge->Callee == Callee)
1319 return Edge.get();
1320 return nullptr;
1321}
1322
1323template <typename DerivedCCG, typename FuncTy, typename CallTy>
1324typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1325CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1326 findEdgeFromCaller(const ContextNode *Caller) {
1327 for (const auto &Edge : CallerEdges)
1328 if (Edge->Caller == Caller)
1329 return Edge.get();
1330 return nullptr;
1331}
1332
1333template <typename DerivedCCG, typename FuncTy, typename CallTy>
1334void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1335 eraseCalleeEdge(const ContextEdge *Edge) {
1336 auto EI = llvm::find_if(
1337 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
1338 return CalleeEdge.get() == Edge;
1339 });
1340 assert(EI != CalleeEdges.end());
1341 CalleeEdges.erase(EI);
1342}
1343
1344template <typename DerivedCCG, typename FuncTy, typename CallTy>
1345void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1346 eraseCallerEdge(const ContextEdge *Edge) {
1347 auto EI = llvm::find_if(
1348 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
1349 return CallerEdge.get() == Edge;
1350 });
1351 assert(EI != CallerEdges.end());
1352 CallerEdges.erase(EI);
1353}
1354
1355template <typename DerivedCCG, typename FuncTy, typename CallTy>
1356uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
1357 DenseSet<uint32_t> &ContextIds) const {
1358 uint8_t BothTypes =
1359 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1360 uint8_t AllocType = (uint8_t)AllocationType::None;
1361 for (auto Id : ContextIds) {
1362 AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1363 // Bail early if alloc type reached both, no further refinement.
1364 if (AllocType == BothTypes)
1365 return AllocType;
1366 }
1367 return AllocType;
1368}
1369
1370template <typename DerivedCCG, typename FuncTy, typename CallTy>
1371uint8_t
1372CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
1373 const DenseSet<uint32_t> &Node1Ids,
1374 const DenseSet<uint32_t> &Node2Ids) const {
1375 uint8_t BothTypes =
1376 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1377 uint8_t AllocType = (uint8_t)AllocationType::None;
1378 for (auto Id : Node1Ids) {
1379 if (!Node2Ids.count(Id))
1380 continue;
1381 AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1382 // Bail early if alloc type reached both, no further refinement.
1383 if (AllocType == BothTypes)
1384 return AllocType;
1385 }
1386 return AllocType;
1387}
1388
1389template <typename DerivedCCG, typename FuncTy, typename CallTy>
1390uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
1391 const DenseSet<uint32_t> &Node1Ids,
1392 const DenseSet<uint32_t> &Node2Ids) const {
1393 if (Node1Ids.size() < Node2Ids.size())
1394 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
1395 else
1396 return intersectAllocTypesImpl(Node2Ids, Node1Ids);
1397}
1398
1399template <typename DerivedCCG, typename FuncTy, typename CallTy>
1400typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1401CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
1402 CallInfo Call, const FuncTy *F) {
1403 assert(!getNodeForAlloc(Call));
1404 ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, Call);
1405 AllocationCallToContextNodeMap[Call] = AllocNode;
1406 // Use LastContextId as a uniq id for MIB allocation nodes.
1407 AllocNode->OrigStackOrAllocId = LastContextId;
1408 // Alloc type should be updated as we add in the MIBs. We should assert
1409 // afterwards that it is not still None.
1410 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1411
1412 return AllocNode;
1413}
1414
1415static std::string getAllocTypeString(uint8_t AllocTypes) {
1416 if (!AllocTypes)
1417 return "None";
1418 std::string Str;
1419 if (AllocTypes & (uint8_t)AllocationType::NotCold)
1420 Str += "NotCold";
1421 if (AllocTypes & (uint8_t)AllocationType::Cold)
1422 Str += "Cold";
1423 return Str;
1424}
1425
1426template <typename DerivedCCG, typename FuncTy, typename CallTy>
1427template <class NodeT, class IteratorT>
1428void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
1429 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
1430 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
1431 ArrayRef<ContextTotalSize> ContextSizeInfo,
1432 std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold) {
1433 // Treating the hot alloc type as NotCold before the disambiguation for "hot"
1434 // is done.
1435 if (AllocType == AllocationType::Hot)
1436 AllocType = AllocationType::NotCold;
1437
1438 ContextIdToAllocationType[++LastContextId] = AllocType;
1439
1440 bool IsImportant = false;
1441 if (!ContextSizeInfo.empty()) {
1442 auto &Entry = ContextIdToContextSizeInfos[LastContextId];
1443 // If this is a cold allocation, and we are collecting non-zero largest
1444 // contexts, see if this is a candidate.
1445 if (AllocType == AllocationType::Cold && MemProfTopNImportant > 0) {
1446 uint64_t TotalCold = 0;
1447 for (auto &CSI : ContextSizeInfo)
1448 TotalCold += CSI.TotalSize;
1449 // Record this context if either we haven't found the first top-n largest
1450 // yet, or if it is larger than the smallest already recorded.
1451 if (TotalSizeToContextIdTopNCold.size() < MemProfTopNImportant ||
1452 // Since TotalSizeToContextIdTopNCold is a std::map, it is implicitly
1453 // sorted in ascending size of its key which is the size.
1454 TotalCold > TotalSizeToContextIdTopNCold.begin()->first) {
1455 if (TotalSizeToContextIdTopNCold.size() == MemProfTopNImportant) {
1456 // Remove old one and its associated entries.
1457 auto IdToRemove = TotalSizeToContextIdTopNCold.begin()->second;
1458 TotalSizeToContextIdTopNCold.erase(
1459 TotalSizeToContextIdTopNCold.begin());
1460 assert(ImportantContextIdInfo.count(IdToRemove));
1461 ImportantContextIdInfo.erase(IdToRemove);
1462 }
1463 TotalSizeToContextIdTopNCold[TotalCold] = LastContextId;
1464 IsImportant = true;
1465 }
1466 }
1467 Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
1468 }
1469
1470 // Update alloc type and context ids for this MIB.
1471 AllocNode->AllocTypes |= (uint8_t)AllocType;
1472
1473 // Now add or update nodes for each stack id in alloc's context.
1474 // Later when processing the stack ids on non-alloc callsites we will adjust
1475 // for any inlining in the context.
1476 ContextNode *PrevNode = AllocNode;
1477 // Look for recursion (direct recursion should have been collapsed by
1478 // module summary analysis, here we should just be detecting mutual
1479 // recursion). Mark these nodes so we don't try to clone.
1480 SmallSet<uint64_t, 8> StackIdSet;
1481 // Skip any on the allocation call (inlining).
1482 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
1483 ContextIter != StackContext.end(); ++ContextIter) {
1484 auto StackId = getStackId(*ContextIter);
1485 if (IsImportant)
1486 ImportantContextIdInfo[LastContextId].StackIds.push_back(StackId);
1487 ContextNode *StackNode = getNodeForStackId(StackId);
1488 if (!StackNode) {
1489 StackNode = createNewNode(/*IsAllocation=*/false);
1490 StackEntryIdToContextNodeMap[StackId] = StackNode;
1491 StackNode->OrigStackOrAllocId = StackId;
1492 }
1493 // Marking a node recursive will prevent its cloning completely, even for
1494 // non-recursive contexts flowing through it.
1496 auto Ins = StackIdSet.insert(StackId);
1497 if (!Ins.second)
1498 StackNode->Recursive = true;
1499 }
1500 StackNode->AllocTypes |= (uint8_t)AllocType;
1501 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1502 PrevNode = StackNode;
1503 }
1504}
1505
1506template <typename DerivedCCG, typename FuncTy, typename CallTy>
1507DenseSet<uint32_t>
1508CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1509 const DenseSet<uint32_t> &StackSequenceContextIds,
1510 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1511 DenseSet<uint32_t> NewContextIds;
1512 for (auto OldId : StackSequenceContextIds) {
1513 NewContextIds.insert(++LastContextId);
1514 OldToNewContextIds[OldId].insert(LastContextId);
1515 assert(ContextIdToAllocationType.count(OldId));
1516 // The new context has the same allocation type and size info as original.
1517 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1518 auto CSI = ContextIdToContextSizeInfos.find(OldId);
1519 if (CSI != ContextIdToContextSizeInfos.end())
1520 ContextIdToContextSizeInfos[LastContextId] = CSI->second;
1521 if (DotAllocContextIds.contains(OldId))
1522 DotAllocContextIds.insert(LastContextId);
1523 }
1524 return NewContextIds;
1525}
1526
1527template <typename DerivedCCG, typename FuncTy, typename CallTy>
1528void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1529 propagateDuplicateContextIds(
1530 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1531 // Build a set of duplicated context ids corresponding to the input id set.
1532 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1533 DenseSet<uint32_t> NewIds;
1534 for (auto Id : ContextIds)
1535 if (auto NewId = OldToNewContextIds.find(Id);
1536 NewId != OldToNewContextIds.end())
1537 NewIds.insert_range(NewId->second);
1538 return NewIds;
1539 };
1540
1541 // Recursively update context ids sets along caller edges.
1542 auto UpdateCallers = [&](ContextNode *Node,
1543 DenseSet<const ContextEdge *> &Visited,
1544 auto &&UpdateCallers) -> void {
1545 for (const auto &Edge : Node->CallerEdges) {
1546 auto Inserted = Visited.insert(Edge.get());
1547 if (!Inserted.second)
1548 continue;
1549 ContextNode *NextNode = Edge->Caller;
1550 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1551 // Only need to recursively iterate to NextNode via this caller edge if
1552 // it resulted in any added ids to NextNode.
1553 if (!NewIdsToAdd.empty()) {
1554 Edge->getContextIds().insert_range(NewIdsToAdd);
1555 UpdateCallers(NextNode, Visited, UpdateCallers);
1556 }
1557 }
1558 };
1559
1560 DenseSet<const ContextEdge *> Visited;
1561 for (auto &Entry : AllocationCallToContextNodeMap) {
1562 auto *Node = Entry.second;
1563 UpdateCallers(Node, Visited, UpdateCallers);
1564 }
1565}
1566
1567template <typename DerivedCCG, typename FuncTy, typename CallTy>
1568void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1569 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
1570 // This must be passed by value to make a copy since it will be adjusted
1571 // as ids are moved.
1572 DenseSet<uint32_t> RemainingContextIds) {
1573 auto &OrigEdges =
1574 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1575 DenseSet<uint32_t> RecursiveContextIds;
1576 DenseSet<uint32_t> AllCallerContextIds;
1578 // Identify which context ids are recursive which is needed to properly
1579 // update the RemainingContextIds set. The relevant recursive context ids
1580 // are those that are in multiple edges.
1581 for (auto &CE : OrigEdges) {
1582 AllCallerContextIds.reserve(CE->getContextIds().size());
1583 for (auto Id : CE->getContextIds())
1584 if (!AllCallerContextIds.insert(Id).second)
1585 RecursiveContextIds.insert(Id);
1586 }
1587 }
1588 // Increment iterator in loop so that we can remove edges as needed.
1589 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1590 auto Edge = *EI;
1591 DenseSet<uint32_t> NewEdgeContextIds;
1592 DenseSet<uint32_t> NotFoundContextIds;
1593 // Remove any matching context ids from Edge, return set that were found and
1594 // removed, these are the new edge's context ids. Also update the remaining
1595 // (not found ids).
1596 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1597 NotFoundContextIds);
1598 // Update the remaining context ids set for the later edges. This is a
1599 // compile time optimization.
1600 if (RecursiveContextIds.empty()) {
1601 // No recursive ids, so all of the previously remaining context ids that
1602 // were not seen on this edge are the new remaining set.
1603 RemainingContextIds.swap(NotFoundContextIds);
1604 } else {
1605 // Keep the recursive ids in the remaining set as we expect to see those
1606 // on another edge. We can remove the non-recursive remaining ids that
1607 // were seen on this edge, however. We already have the set of remaining
1608 // ids that were on this edge (in NewEdgeContextIds). Figure out which are
1609 // non-recursive and only remove those. Note that despite the higher
1610 // overhead of updating the remaining context ids set when recursion
1611 // handling is enabled, it was found to be at worst performance neutral
1612 // and in one case a clear win.
1613 DenseSet<uint32_t> NonRecursiveRemainingCurEdgeIds =
1614 set_difference(NewEdgeContextIds, RecursiveContextIds);
1615 set_subtract(RemainingContextIds, NonRecursiveRemainingCurEdgeIds);
1616 }
1617 // If no matching context ids for this edge, skip it.
1618 if (NewEdgeContextIds.empty()) {
1619 ++EI;
1620 continue;
1621 }
1622 if (TowardsCallee) {
1623 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1624 auto NewEdge = std::make_shared<ContextEdge>(
1625 Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
1626 NewNode->CalleeEdges.push_back(NewEdge);
1627 NewEdge->Callee->CallerEdges.push_back(NewEdge);
1628 } else {
1629 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1630 auto NewEdge = std::make_shared<ContextEdge>(
1631 NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
1632 NewNode->CallerEdges.push_back(NewEdge);
1633 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1634 }
1635 // Remove old edge if context ids empty.
1636 if (Edge->getContextIds().empty()) {
1637 removeEdgeFromGraph(Edge.get(), &EI, TowardsCallee);
1638 continue;
1639 }
1640 ++EI;
1641 }
1642}
1643
1644template <typename DerivedCCG, typename FuncTy, typename CallTy>
1645static void checkEdge(
1646 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1647 // Confirm that alloc type is not None and that we have at least one context
1648 // id.
1649 assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1650 assert(!Edge->ContextIds.empty());
1651}
1652
1653template <typename DerivedCCG, typename FuncTy, typename CallTy>
1654static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
1655 bool CheckEdges = true) {
1656 if (Node->isRemoved())
1657 return;
1658#ifndef NDEBUG
1659 // Compute node's context ids once for use in asserts.
1660 auto NodeContextIds = Node->getContextIds();
1661#endif
1662 // Node's context ids should be the union of both its callee and caller edge
1663 // context ids.
1664 if (Node->CallerEdges.size()) {
1665 DenseSet<uint32_t> CallerEdgeContextIds(
1666 Node->CallerEdges.front()->ContextIds);
1667 for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
1668 if (CheckEdges)
1670 set_union(CallerEdgeContextIds, Edge->ContextIds);
1671 }
1672 // Node can have more context ids than callers if some contexts terminate at
1673 // node and some are longer. If we are allowing recursive callsites and
1674 // contexts this will be violated for incompletely cloned recursive cycles,
1675 // so skip the checking in that case.
1677 NodeContextIds == CallerEdgeContextIds ||
1678 set_is_subset(CallerEdgeContextIds, NodeContextIds));
1679 }
1680 if (Node->CalleeEdges.size()) {
1681 DenseSet<uint32_t> CalleeEdgeContextIds(
1682 Node->CalleeEdges.front()->ContextIds);
1683 for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
1684 if (CheckEdges)
1686 set_union(CalleeEdgeContextIds, Edge->getContextIds());
1687 }
1688 // If we are allowing recursive callsites and contexts this will be violated
1689 // for incompletely cloned recursive cycles, so skip the checking in that
1690 // case.
1692 NodeContextIds == CalleeEdgeContextIds);
1693 }
1694 // FIXME: Since this checking is only invoked under an option, we should
1695 // change the error checking from using assert to something that will trigger
1696 // an error on a release build.
1697#ifndef NDEBUG
1698 // Make sure we don't end up with duplicate edges between the same caller and
1699 // callee.
1701 for (const auto &E : Node->CalleeEdges)
1702 NodeSet.insert(E->Callee);
1703 assert(NodeSet.size() == Node->CalleeEdges.size());
1704#endif
1705}
1706
1707template <typename DerivedCCG, typename FuncTy, typename CallTy>
1708void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1709 assignStackNodesPostOrder(ContextNode *Node,
1710 DenseSet<const ContextNode *> &Visited,
1711 DenseMap<uint64_t, std::vector<CallContextInfo>>
1712 &StackIdToMatchingCalls,
1713 DenseMap<CallInfo, CallInfo> &CallToMatchingCall,
1714 const DenseSet<uint32_t> &ImportantContextIds) {
1715 auto Inserted = Visited.insert(Node);
1716 if (!Inserted.second)
1717 return;
1718 // Post order traversal. Iterate over a copy since we may add nodes and
1719 // therefore new callers during the recursive call, invalidating any
1720 // iterator over the original edge vector. We don't need to process these
1721 // new nodes as they were already processed on creation.
1722 auto CallerEdges = Node->CallerEdges;
1723 for (auto &Edge : CallerEdges) {
1724 // Skip any that have been removed during the recursion.
1725 if (Edge->isRemoved()) {
1726 assert(!is_contained(Node->CallerEdges, Edge));
1727 continue;
1728 }
1729 assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
1730 CallToMatchingCall, ImportantContextIds);
1731 }
1732
1733 // If this node's stack id is in the map, update the graph to contain new
1734 // nodes representing any inlining at interior callsites. Note we move the
1735 // associated context ids over to the new nodes.
1736
1737 // Ignore this node if it is for an allocation or we didn't record any
1738 // stack id lists ending at it.
1739 if (Node->IsAllocation ||
1740 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1741 return;
1742
1743 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1744 // Handle the simple case first. A single call with a single stack id.
1745 // In this case there is no need to create any new context nodes, simply
1746 // assign the context node for stack id to this Call.
1747 if (Calls.size() == 1) {
1748 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1749 if (Ids.size() == 1) {
1750 assert(SavedContextIds.empty());
1751 // It should be this Node
1752 assert(Node == getNodeForStackId(Ids[0]));
1753 if (Node->Recursive)
1754 return;
1755 Node->setCall(Call);
1756 NonAllocationCallToContextNodeMap[Call] = Node;
1757 NodeToCallingFunc[Node] = Func;
1758 recordStackNode(Ids, Node, Node->getContextIds(), ImportantContextIds);
1759 return;
1760 }
1761 }
1762
1763#ifndef NDEBUG
1764 // Find the node for the last stack id, which should be the same
1765 // across all calls recorded for this id, and is this node's id.
1766 uint64_t LastId = Node->OrigStackOrAllocId;
1767 ContextNode *LastNode = getNodeForStackId(LastId);
1768 // We should only have kept stack ids that had nodes.
1769 assert(LastNode);
1770 assert(LastNode == Node);
1771#else
1772 ContextNode *LastNode = Node;
1773#endif
1774
1775 // Compute the last node's context ids once, as it is shared by all calls in
1776 // this entry.
1777 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1778
1779 [[maybe_unused]] bool PrevIterCreatedNode = false;
1780 bool CreatedNode = false;
1781 for (unsigned I = 0; I < Calls.size();
1782 I++, PrevIterCreatedNode = CreatedNode) {
1783 CreatedNode = false;
1784 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1785 // Skip any for which we didn't assign any ids, these don't get a node in
1786 // the graph.
1787 if (SavedContextIds.empty()) {
1788 // If this call has a matching call (located in the same function and
1789 // having the same stack ids), simply add it to the context node created
1790 // for its matching call earlier. These can be treated the same through
1791 // cloning and get updated at the same time.
1792 if (!CallToMatchingCall.contains(Call))
1793 continue;
1794 auto MatchingCall = CallToMatchingCall[Call];
1795 if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
1796 // This should only happen if we had a prior iteration, and it didn't
1797 // create a node because of the below recomputation of context ids
1798 // finding none remaining and continuing early.
1799 assert(I > 0 && !PrevIterCreatedNode);
1800 continue;
1801 }
1802 NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
1803 Call);
1804 continue;
1805 }
1806
1807 assert(LastId == Ids.back());
1808
1809 // Recompute the context ids for this stack id sequence (the
1810 // intersection of the context ids of the corresponding nodes).
1811 // Start with the ids we saved in the map for this call, which could be
1812 // duplicated context ids. We have to recompute as we might have overlap
1813 // overlap between the saved context ids for different last nodes, and
1814 // removed them already during the post order traversal.
1815 set_intersect(SavedContextIds, LastNodeContextIds);
1816 ContextNode *PrevNode = LastNode;
1817 bool Skip = false;
1818 // Iterate backwards through the stack Ids, starting after the last Id
1819 // in the list, which was handled once outside for all Calls.
1820 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1821 auto Id = *IdIter;
1822 ContextNode *CurNode = getNodeForStackId(Id);
1823 // We should only have kept stack ids that had nodes and weren't
1824 // recursive.
1825 assert(CurNode);
1826 assert(!CurNode->Recursive);
1827
1828 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1829 if (!Edge) {
1830 Skip = true;
1831 break;
1832 }
1833 PrevNode = CurNode;
1834
1835 // Update the context ids, which is the intersection of the ids along
1836 // all edges in the sequence.
1837 set_intersect(SavedContextIds, Edge->getContextIds());
1838
1839 // If we now have no context ids for clone, skip this call.
1840 if (SavedContextIds.empty()) {
1841 Skip = true;
1842 break;
1843 }
1844 }
1845 if (Skip)
1846 continue;
1847
1848 // Create new context node.
1849 ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, Func, Call);
1850 NonAllocationCallToContextNodeMap[Call] = NewNode;
1851 CreatedNode = true;
1852 NewNode->AllocTypes = computeAllocType(SavedContextIds);
1853
1854 ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1855 assert(FirstNode);
1856
1857 // Connect to callees of innermost stack frame in inlined call chain.
1858 // This updates context ids for FirstNode's callee's to reflect those
1859 // moved to NewNode.
1860 connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);
1861
1862 // Connect to callers of outermost stack frame in inlined call chain.
1863 // This updates context ids for FirstNode's caller's to reflect those
1864 // moved to NewNode.
1865 connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
1866
1867 // Now we need to remove context ids from edges/nodes between First and
1868 // Last Node.
1869 PrevNode = nullptr;
1870 for (auto Id : Ids) {
1871 ContextNode *CurNode = getNodeForStackId(Id);
1872 // We should only have kept stack ids that had nodes.
1873 assert(CurNode);
1874
1875 // Remove the context ids moved to NewNode from CurNode, and the
1876 // edge from the prior node.
1877 if (PrevNode) {
1878 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1879 // If the sequence contained recursion, we might have already removed
1880 // some edges during the connectNewNode calls above.
1881 if (!PrevEdge) {
1882 PrevNode = CurNode;
1883 continue;
1884 }
1885 set_subtract(PrevEdge->getContextIds(), SavedContextIds);
1886 if (PrevEdge->getContextIds().empty())
1887 removeEdgeFromGraph(PrevEdge);
1888 }
1889 // Since we update the edges from leaf to tail, only look at the callee
1890 // edges. This isn't an alloc node, so if there are no callee edges, the
1891 // alloc type is None.
1892 CurNode->AllocTypes = CurNode->CalleeEdges.empty()
1893 ? (uint8_t)AllocationType::None
1894 : CurNode->computeAllocType();
1895 PrevNode = CurNode;
1896 }
1897
1898 recordStackNode(Ids, NewNode, SavedContextIds, ImportantContextIds);
1899
1900 if (VerifyNodes) {
1901 checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
1902 for (auto Id : Ids) {
1903 ContextNode *CurNode = getNodeForStackId(Id);
1904 // We should only have kept stack ids that had nodes.
1905 assert(CurNode);
1906 checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
1907 }
1908 }
1909 }
1910}
1911
1912template <typename DerivedCCG, typename FuncTy, typename CallTy>
1913void CallsiteContextGraph<DerivedCCG, FuncTy,
1914 CallTy>::fixupImportantContexts() {
1915 if (ImportantContextIdInfo.empty())
1916 return;
1917
1918 // Update statistics as we are done building this map at this point.
1919 NumImportantContextIds = ImportantContextIdInfo.size();
1920
1922 return;
1923
1924 if (ExportToDot)
1925 exportToDot("beforestackfixup");
1926
1927 // For each context we identified as important, walk through the saved context
1928 // stack ids in order from leaf upwards, and make sure all edges are correct.
1929 // These can be difficult to get right when updating the graph while mapping
1930 // nodes onto summary or IR, especially when there is recursion. In
1931 // particular, when we have created new nodes to reflect inlining, it is
1932 // sometimes impossible to know exactly how to update the edges in the face of
1933 // recursion, as we have lost the original ordering of the stack ids in the
1934 // contexts.
1935 // TODO: Consider only doing this if we detect the context has recursive
1936 // cycles.
1937 //
1938 // I.e. assume we have a context with stack ids like: {A B A C A D E}
1939 // and let's say A was inlined into B, C, and D. The original graph will have
1940 // multiple recursive cycles through A. When we match the original context
1941 // nodes onto the IR or summary, we will merge {A B} into one context node,
1942 // {A C} onto another, and {A D} onto another. Looking at the stack sequence
1943 // above, we should end up with a non-cyclic set of edges like:
1944 // {AB} <- {AC} <- {AD} <- E. However, because we normally have lost the
1945 // original ordering, we won't get the edges correct initially (it's
1946 // impossible without the original ordering). Here we do the fixup (add and
1947 // removing edges where necessary) for this context. In the
1948 // ImportantContextInfo struct in this case we should have a MaxLength = 2,
1949 // and map entries for {A B}, {A C}, {A D}, and {E}.
1950 for (auto &[CurContextId, Info] : ImportantContextIdInfo) {
1951 if (Info.StackIdsToNode.empty())
1952 continue;
1953 bool Changed = false;
1954 ContextNode *PrevNode = nullptr;
1955 ContextNode *CurNode = nullptr;
1956 DenseSet<const ContextEdge *> VisitedEdges;
1957 ArrayRef<uint64_t> AllStackIds(Info.StackIds);
1958 // Try to identify what callsite ContextNode maps to which slice of the
1959 // context's ordered stack ids.
1960 for (unsigned I = 0; I < AllStackIds.size(); I++, PrevNode = CurNode) {
1961 // We will do this greedily, trying up to MaxLength stack ids in a row, to
1962 // see if we recorded a context node for that sequence.
1963 auto Len = Info.MaxLength;
1964 auto LenToEnd = AllStackIds.size() - I;
1965 if (Len > LenToEnd)
1966 Len = LenToEnd;
1967 CurNode = nullptr;
1968 // Try to find a recorded context node starting with the longest length
1969 // recorded, and on down until we check for just a single stack node.
1970 for (; Len > 0; Len--) {
1971 // Get the slice of the original stack id sequence to check.
1972 auto CheckStackIds = AllStackIds.slice(I, Len);
1973 auto EntryIt = Info.StackIdsToNode.find(CheckStackIds);
1974 if (EntryIt == Info.StackIdsToNode.end())
1975 continue;
1976 CurNode = EntryIt->second;
1977 // Skip forward so we don't try to look for the ones we just matched.
1978 // We increment by Len - 1, because the outer for loop will increment I.
1979 I += Len - 1;
1980 break;
1981 }
1982 // Give up if we couldn't find a node. Since we need to clone from the
1983 // leaf allocation upwards, no sense in doing anymore fixup further up
1984 // the context if we couldn't match part of the original stack context
1985 // onto a callsite node.
1986 if (!CurNode)
1987 break;
1988 // No edges to fix up until we have a pair of nodes that should be
1989 // adjacent in the graph.
1990 if (!PrevNode)
1991 continue;
1992 // See if we already have a call edge from CurNode to PrevNode.
1993 auto *CurEdge = PrevNode->findEdgeFromCaller(CurNode);
1994 if (CurEdge) {
1995 // We already have an edge. Make sure it contains this context id.
1996 if (CurEdge->getContextIds().insert(CurContextId).second) {
1997 NumFixupEdgeIdsInserted++;
1998 Changed = true;
1999 }
2000 } else {
2001 // No edge exists - add one.
2002 NumFixupEdgesAdded++;
2003 DenseSet<uint32_t> ContextIds({CurContextId});
2004 auto AllocType = computeAllocType(ContextIds);
2005 auto NewEdge = std::make_shared<ContextEdge>(
2006 PrevNode, CurNode, AllocType, std::move(ContextIds));
2007 PrevNode->CallerEdges.push_back(NewEdge);
2008 CurNode->CalleeEdges.push_back(NewEdge);
2009 // Save the new edge for the below handling.
2010 CurEdge = NewEdge.get();
2011 Changed = true;
2012 }
2013 VisitedEdges.insert(CurEdge);
2014 // Now remove this context id from any other caller edges calling
2015 // PrevNode.
2016 for (auto &Edge : PrevNode->CallerEdges) {
2017 // Skip the edge updating/created above and edges we have already
2018 // visited (due to recursion).
2019 if (Edge.get() != CurEdge && !VisitedEdges.contains(Edge.get()))
2020 Edge->getContextIds().erase(CurContextId);
2021 }
2022 }
2023 if (Changed)
2024 NumFixedContexts++;
2025 }
2026}
2027
2028template <typename DerivedCCG, typename FuncTy, typename CallTy>
2029void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
2030 // Map of stack id to all calls with that as the last (outermost caller)
2031 // callsite id that has a context node (some might not due to pruning
2032 // performed during matching of the allocation profile contexts).
2033 // The CallContextInfo contains the Call and a list of its stack ids with
2034 // ContextNodes, the function containing Call, and the set of context ids
2035 // the analysis will eventually identify for use in any new node created
2036 // for that callsite.
2037 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
2038 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
2039 for (auto &Call : CallsWithMetadata) {
2040 // Ignore allocations, already handled.
2041 if (AllocationCallToContextNodeMap.count(Call))
2042 continue;
2043 auto StackIdsWithContextNodes =
2044 getStackIdsWithContextNodesForCall(Call.call());
2045 // If there were no nodes created for MIBs on allocs (maybe this was in
2046 // the unambiguous part of the MIB stack that was pruned), ignore.
2047 if (StackIdsWithContextNodes.empty())
2048 continue;
2049 // Otherwise, record this Call along with the list of ids for the last
2050 // (outermost caller) stack id with a node.
2051 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
2052 {Call.call(), StackIdsWithContextNodes, Func, {}});
2053 }
2054 }
2055
2056 // First make a pass through all stack ids that correspond to a call,
2057 // as identified in the above loop. Compute the context ids corresponding to
2058 // each of these calls when they correspond to multiple stack ids due to
2059 // due to inlining. Perform any duplication of context ids required when
2060 // there is more than one call with the same stack ids. Their (possibly newly
2061 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
2062 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
2063 // Save a map from each call to any that are found to match it. I.e. located
2064 // in the same function and have the same (possibly pruned) stack ids. We use
2065 // this to avoid creating extra graph nodes as they can be treated the same.
2066 DenseMap<CallInfo, CallInfo> CallToMatchingCall;
2067 for (auto &It : StackIdToMatchingCalls) {
2068 auto &Calls = It.getSecond();
2069 // Skip single calls with a single stack id. These don't need a new node.
2070 if (Calls.size() == 1) {
2071 auto &Ids = Calls[0].StackIds;
2072 if (Ids.size() == 1)
2073 continue;
2074 }
2075 // In order to do the best and maximal matching of inlined calls to context
2076 // node sequences we will sort the vectors of stack ids in descending order
2077 // of length, and within each length, lexicographically by stack id. The
2078 // latter is so that we can specially handle calls that have identical stack
2079 // id sequences (either due to cloning or artificially because of the MIB
2080 // context pruning). Those with the same Ids are then sorted by function to
2081 // facilitate efficiently mapping them to the same context node.
2082 // Because the functions are pointers, to ensure a stable sort first assign
2083 // each function pointer to its first index in the Calls array, and then use
2084 // that to sort by.
2085 DenseMap<const FuncTy *, unsigned> FuncToIndex;
2086 for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
2087 FuncToIndex.insert({CallCtxInfo.Func, Idx});
2089 Calls,
2090 [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
2091 return A.StackIds.size() > B.StackIds.size() ||
2092 (A.StackIds.size() == B.StackIds.size() &&
2093 (A.StackIds < B.StackIds ||
2094 (A.StackIds == B.StackIds &&
2095 FuncToIndex[A.Func] < FuncToIndex[B.Func])));
2096 });
2097
2098 // Find the node for the last stack id, which should be the same
2099 // across all calls recorded for this id, and is the id for this
2100 // entry in the StackIdToMatchingCalls map.
2101 uint64_t LastId = It.getFirst();
2102 ContextNode *LastNode = getNodeForStackId(LastId);
2103 // We should only have kept stack ids that had nodes.
2104 assert(LastNode);
2105
2106 if (LastNode->Recursive)
2107 continue;
2108
2109 // Initialize the context ids with the last node's. We will subsequently
2110 // refine the context ids by computing the intersection along all edges.
2111 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
2112 assert(!LastNodeContextIds.empty());
2113
2114#ifndef NDEBUG
2115 // Save the set of functions seen for a particular set of the same stack
2116 // ids. This is used to ensure that they have been correctly sorted to be
2117 // adjacent in the Calls list, since we rely on that to efficiently place
2118 // all such matching calls onto the same context node.
2119 DenseSet<const FuncTy *> MatchingIdsFuncSet;
2120#endif
2121
2122 for (unsigned I = 0; I < Calls.size(); I++) {
2123 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
2124 assert(SavedContextIds.empty());
2125 assert(LastId == Ids.back());
2126
2127#ifndef NDEBUG
2128 // If this call has a different set of ids than the last one, clear the
2129 // set used to ensure they are sorted properly.
2130 if (I > 0 && Ids != Calls[I - 1].StackIds)
2131 MatchingIdsFuncSet.clear();
2132#endif
2133
2134 // First compute the context ids for this stack id sequence (the
2135 // intersection of the context ids of the corresponding nodes).
2136 // Start with the remaining saved ids for the last node.
2137 assert(!LastNodeContextIds.empty());
2138 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
2139
2140 ContextNode *PrevNode = LastNode;
2141 ContextNode *CurNode = LastNode;
2142 bool Skip = false;
2143
2144 // Iterate backwards through the stack Ids, starting after the last Id
2145 // in the list, which was handled once outside for all Calls.
2146 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
2147 auto Id = *IdIter;
2148 CurNode = getNodeForStackId(Id);
2149 // We should only have kept stack ids that had nodes.
2150 assert(CurNode);
2151
2152 if (CurNode->Recursive) {
2153 Skip = true;
2154 break;
2155 }
2156
2157 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
2158 // If there is no edge then the nodes belong to different MIB contexts,
2159 // and we should skip this inlined context sequence. For example, this
2160 // particular inlined context may include stack ids A->B, and we may
2161 // indeed have nodes for both A and B, but it is possible that they were
2162 // never profiled in sequence in a single MIB for any allocation (i.e.
2163 // we might have profiled an allocation that involves the callsite A,
2164 // but through a different one of its callee callsites, and we might
2165 // have profiled an allocation that involves callsite B, but reached
2166 // from a different caller callsite).
2167 if (!Edge) {
2168 Skip = true;
2169 break;
2170 }
2171 PrevNode = CurNode;
2172
2173 // Update the context ids, which is the intersection of the ids along
2174 // all edges in the sequence.
2175 set_intersect(StackSequenceContextIds, Edge->getContextIds());
2176
2177 // If we now have no context ids for clone, skip this call.
2178 if (StackSequenceContextIds.empty()) {
2179 Skip = true;
2180 break;
2181 }
2182 }
2183 if (Skip)
2184 continue;
2185
2186 // If some of this call's stack ids did not have corresponding nodes (due
2187 // to pruning), don't include any context ids for contexts that extend
2188 // beyond these nodes. Otherwise we would be matching part of unrelated /
2189 // not fully matching stack contexts. To do this, subtract any context ids
2190 // found in caller nodes of the last node found above.
2191 if (Ids.back() != getLastStackId(Call)) {
2192 for (const auto &PE : LastNode->CallerEdges) {
2193 set_subtract(StackSequenceContextIds, PE->getContextIds());
2194 if (StackSequenceContextIds.empty())
2195 break;
2196 }
2197 // If we now have no context ids for clone, skip this call.
2198 if (StackSequenceContextIds.empty())
2199 continue;
2200 }
2201
2202#ifndef NDEBUG
2203 // If the prior call had the same stack ids this set would not be empty.
2204 // Check if we already have a call that "matches" because it is located
2205 // in the same function. If the Calls list was sorted properly we should
2206 // not encounter this situation as all such entries should be adjacent
2207 // and processed in bulk further below.
2208 assert(!MatchingIdsFuncSet.contains(Func));
2209
2210 MatchingIdsFuncSet.insert(Func);
2211#endif
2212
2213 // Check if the next set of stack ids is the same (since the Calls vector
2214 // of tuples is sorted by the stack ids we can just look at the next one).
2215 // If so, save them in the CallToMatchingCall map so that they get
2216 // assigned to the same context node, and skip them.
2217 bool DuplicateContextIds = false;
2218 for (unsigned J = I + 1; J < Calls.size(); J++) {
2219 auto &CallCtxInfo = Calls[J];
2220 auto &NextIds = CallCtxInfo.StackIds;
2221 if (NextIds != Ids)
2222 break;
2223 auto *NextFunc = CallCtxInfo.Func;
2224 if (NextFunc != Func) {
2225 // We have another Call with the same ids but that cannot share this
2226 // node, must duplicate ids for it.
2227 DuplicateContextIds = true;
2228 break;
2229 }
2230 auto &NextCall = CallCtxInfo.Call;
2231 CallToMatchingCall[NextCall] = Call;
2232 // Update I so that it gets incremented correctly to skip this call.
2233 I = J;
2234 }
2235
2236 // If we don't have duplicate context ids, then we can assign all the
2237 // context ids computed for the original node sequence to this call.
2238 // If there are duplicate calls with the same stack ids then we synthesize
2239 // new context ids that are duplicates of the originals. These are
2240 // assigned to SavedContextIds, which is a reference into the map entry
2241 // for this call, allowing us to access these ids later on.
2242 OldToNewContextIds.reserve(OldToNewContextIds.size() +
2243 StackSequenceContextIds.size());
2244 SavedContextIds =
2245 DuplicateContextIds
2246 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
2247 : StackSequenceContextIds;
2248 assert(!SavedContextIds.empty());
2249
2250 if (!DuplicateContextIds) {
2251 // Update saved last node's context ids to remove those that are
2252 // assigned to other calls, so that it is ready for the next call at
2253 // this stack id.
2254 set_subtract(LastNodeContextIds, StackSequenceContextIds);
2255 if (LastNodeContextIds.empty())
2256 break;
2257 }
2258 }
2259 }
2260
2261 // Propagate the duplicate context ids over the graph.
2262 propagateDuplicateContextIds(OldToNewContextIds);
2263
2264 if (VerifyCCG)
2265 check();
2266
2267 // Now perform a post-order traversal over the graph, starting with the
2268 // allocation nodes, essentially processing nodes from callers to callees.
2269 // For any that contains an id in the map, update the graph to contain new
2270 // nodes representing any inlining at interior callsites. Note we move the
2271 // associated context ids over to the new nodes.
2272 DenseSet<const ContextNode *> Visited;
2273 DenseSet<uint32_t> ImportantContextIds(llvm::from_range,
2274 ImportantContextIdInfo.keys());
2275 for (auto &Entry : AllocationCallToContextNodeMap)
2276 assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
2277 CallToMatchingCall, ImportantContextIds);
2278
2279 fixupImportantContexts();
2280
2281 if (VerifyCCG)
2282 check();
2283}
2284
2285uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
2286 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2287 Call->getMetadata(LLVMContext::MD_callsite));
2288 return CallsiteContext.back();
2289}
2290
2291uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
2293 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2294 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
2295 // Need to convert index into stack id.
2296 return Index.getStackIdAtIndex(CallsiteContext.back());
2297}
2298
// Separator placed between a function's base name and its clone number when
// naming memprof clones (see getMemProfFuncName).
static const std::string MemProfCloneSuffix(".memprof.");
2300
2301static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
2302 // We use CloneNo == 0 to refer to the original version, which doesn't get
2303 // renamed with a suffix.
2304 if (!CloneNo)
2305 return Base.str();
2306 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
2307}
2308
2309static bool isMemProfClone(const Function &F) {
2310 return F.getName().contains(MemProfCloneSuffix);
2311}
2312
2313// Return the clone number of the given function by extracting it from the
2314// memprof suffix. Assumes the caller has already confirmed it is a memprof
2315// clone.
2316static unsigned getMemProfCloneNum(const Function &F) {
2318 auto Pos = F.getName().find_last_of('.');
2319 assert(Pos > 0);
2320 unsigned CloneNo;
2321 bool Err = F.getName().drop_front(Pos + 1).getAsInteger(10, CloneNo);
2322 assert(!Err);
2323 (void)Err;
2324 return CloneNo;
2325}
2326
2327std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
2328 const Instruction *Call,
2329 unsigned CloneNo) const {
2330 return (Twine(Call->getFunction()->getName()) + " -> " +
2331 cast<CallBase>(Call)->getCalledFunction()->getName())
2332 .str();
2333}
2334
2335std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
2336 const IndexCall &Call,
2337 unsigned CloneNo) const {
2338 auto VI = FSToVIMap.find(Func);
2339 assert(VI != FSToVIMap.end());
2340 std::string CallerName = getMemProfFuncName(VI->second.name(), CloneNo);
2342 return CallerName + " -> alloc";
2343 else {
2344 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call);
2345 return CallerName + " -> " +
2346 getMemProfFuncName(Callsite->Callee.name(),
2347 Callsite->Clones[CloneNo]);
2348 }
2349}
2350
2351std::vector<uint64_t>
2352ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
2353 Instruction *Call) {
2354 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2355 Call->getMetadata(LLVMContext::MD_callsite));
2356 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
2357 CallsiteContext);
2358}
2359
2360std::vector<uint64_t>
2361IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
2363 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2364 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
2365 return getStackIdsWithContextNodes<CallsiteInfo,
2366 SmallVector<unsigned>::const_iterator>(
2367 CallsiteContext);
2368}
2369
2370template <typename DerivedCCG, typename FuncTy, typename CallTy>
2371template <class NodeT, class IteratorT>
2372std::vector<uint64_t>
2373CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
2374 CallStack<NodeT, IteratorT> &CallsiteContext) {
2375 std::vector<uint64_t> StackIds;
2376 for (auto IdOrIndex : CallsiteContext) {
2377 auto StackId = getStackId(IdOrIndex);
2378 ContextNode *Node = getNodeForStackId(StackId);
2379 if (!Node)
2380 break;
2381 StackIds.push_back(StackId);
2382 }
2383 return StackIds;
2384}
2385
2386ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
2387 Module &M,
2388 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
2389 : Mod(M), OREGetter(OREGetter) {
2390 // Map for keeping track of the largest cold contexts up to the number given
2391 // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys
2392 // must be sorted.
2393 std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold;
2394 for (auto &F : M) {
2395 std::vector<CallInfo> CallsWithMetadata;
2396 for (auto &BB : F) {
2397 for (auto &I : BB) {
2398 if (!isa<CallBase>(I))
2399 continue;
2400 if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
2401 CallsWithMetadata.push_back(&I);
2402 auto *AllocNode = addAllocNode(&I, &F);
2403 auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
2404 assert(CallsiteMD);
2405 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
2406 // Add all of the MIBs and their stack nodes.
2407 for (auto &MDOp : MemProfMD->operands()) {
2408 auto *MIBMD = cast<const MDNode>(MDOp);
2409 std::vector<ContextTotalSize> ContextSizeInfo;
2410 // Collect the context size information if it exists.
2411 if (MIBMD->getNumOperands() > 2) {
2412 for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
2413 MDNode *ContextSizePair =
2414 dyn_cast<MDNode>(MIBMD->getOperand(I));
2415 assert(ContextSizePair->getNumOperands() == 2);
2417 ContextSizePair->getOperand(0))
2418 ->getZExtValue();
2420 ContextSizePair->getOperand(1))
2421 ->getZExtValue();
2422 ContextSizeInfo.push_back({FullStackId, TotalSize});
2423 }
2424 }
2428 addStackNodesForMIB<MDNode, MDNode::op_iterator>(
2429 AllocNode, StackContext, CallsiteContext,
2430 getMIBAllocType(MIBMD), ContextSizeInfo,
2431 TotalSizeToContextIdTopNCold);
2432 }
2433 // If exporting the graph to dot and an allocation id of interest was
2434 // specified, record all the context ids for this allocation node.
2435 if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2436 DotAllocContextIds = AllocNode->getContextIds();
2437 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2438 // Memprof and callsite metadata on memory allocations no longer
2439 // needed.
2440 I.setMetadata(LLVMContext::MD_memprof, nullptr);
2441 I.setMetadata(LLVMContext::MD_callsite, nullptr);
2442 }
2443 // For callsite metadata, add to list for this function for later use.
2444 else if (I.getMetadata(LLVMContext::MD_callsite)) {
2445 CallsWithMetadata.push_back(&I);
2446 }
2447 }
2448 }
2449 if (!CallsWithMetadata.empty())
2450 FuncToCallsWithMetadata[&F] = CallsWithMetadata;
2451 }
2452
2453 if (DumpCCG) {
2454 dbgs() << "CCG before updating call stack chains:\n";
2455 dbgs() << *this;
2456 }
2457
2458 if (ExportToDot)
2459 exportToDot("prestackupdate");
2460
2461 updateStackNodes();
2462
2463 if (ExportToDot)
2464 exportToDot("poststackupdate");
2465
2466 handleCallsitesWithMultipleTargets();
2467
2468 markBackedges();
2469
2470 // Strip off remaining callsite metadata, no longer needed.
2471 for (auto &FuncEntry : FuncToCallsWithMetadata)
2472 for (auto &Call : FuncEntry.second)
2473 Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
2474}
2475
2476// Finds the set of GUIDs for weak aliasees that are prevailing in different
2477// modules than any of their aliases. We need to handle these specially.
2479IndexCallsiteContextGraph::findAliaseeGUIDsPrevailingInDifferentModule() {
2480 DenseSet<GlobalValue::GUID> AliaseeGUIDs;
2481 for (auto &I : Index) {
2482 auto VI = Index.getValueInfo(I);
2483 for (auto &S : VI.getSummaryList()) {
2484 // We only care about aliases to functions.
2485 auto *AS = dyn_cast<AliasSummary>(S.get());
2486 if (!AS)
2487 continue;
2488 auto *AliaseeSummary = &AS->getAliasee();
2489 auto *AliaseeFS = dyn_cast<FunctionSummary>(AliaseeSummary);
2490 if (!AliaseeFS)
2491 continue;
2492 // Skip this summary if it is not for the prevailing symbol for this GUID.
2493 // The linker doesn't resolve local linkage values so don't check whether
2494 // those are prevailing.
2495 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2496 !isPrevailing(VI.getGUID(), S.get()))
2497 continue;
2498 // Prevailing aliasee could be in a different module only if it is weak.
2499 if (!GlobalValue::isWeakForLinker(AliaseeSummary->linkage()))
2500 continue;
2501 auto AliaseeGUID = AS->getAliaseeGUID();
2502 // If the aliasee copy in this module is not prevailing, record it.
2503 if (!isPrevailing(AliaseeGUID, AliaseeSummary))
2504 AliaseeGUIDs.insert(AliaseeGUID);
2505 }
2506 }
2507 AliaseesPrevailingInDiffModuleFromAlias += AliaseeGUIDs.size();
2508 return AliaseeGUIDs;
2509}
2510
2511IndexCallsiteContextGraph::IndexCallsiteContextGraph(
2512 ModuleSummaryIndex &Index,
2513 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
2514 isPrevailing)
2515 : Index(Index), isPrevailing(isPrevailing) {
2516 // Since we use the aliasee summary info to create the necessary clones for
2517 // its aliases, conservatively skip recording the aliasee function's callsites
2518 // in the CCG for any that are prevailing in a different module than one of
2519 // its aliases. We could record the necessary information to do this in the
2520 // summary, but this case should not be common.
2521 DenseSet<GlobalValue::GUID> GUIDsToSkip =
2522 findAliaseeGUIDsPrevailingInDifferentModule();
2523 // Map for keeping track of the largest cold contexts up to the number given
2524 // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys
2525 // must be sorted.
2526 std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold;
2527 // Sort by GUID for deterministic graph construction order.
2528 // TODO: This sort has a measurable cost on the thin link when memprof is
2529 // enabled. Investigate gating it behind an option that is only enabled for
2530 // tests that check internal state.
2531 for (const auto &I : Index.sortedGlobalValueSummariesRange()) {
2532 auto VI = Index.getValueInfo(I);
2533 if (GUIDsToSkip.contains(VI.getGUID()))
2534 continue;
2535 for (auto &S : VI.getSummaryList()) {
2536 // We should only add the prevailing nodes. Otherwise we may try to clone
2537 // in a weak copy that won't be linked (and may be different than the
2538 // prevailing version).
2539 // We only keep the memprof summary on the prevailing copy now when
2540 // building the combined index, as a space optimization, however don't
2541 // rely on this optimization. The linker doesn't resolve local linkage
2542 // values so don't check whether those are prevailing.
2543 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2544 !isPrevailing(VI.getGUID(), S.get()))
2545 continue;
2546 auto *FS = dyn_cast<FunctionSummary>(S.get());
2547 if (!FS)
2548 continue;
2549 std::vector<CallInfo> CallsWithMetadata;
2550 if (!FS->allocs().empty()) {
2551 for (auto &AN : FS->mutableAllocs()) {
2552 // This can happen because of recursion elimination handling that
2553 // currently exists in ModuleSummaryAnalysis. Skip these for now.
2554 // We still added them to the summary because we need to be able to
2555 // correlate properly in applyImport in the backends.
2556 if (AN.MIBs.empty())
2557 continue;
2558 IndexCall AllocCall(&AN);
2559 CallsWithMetadata.push_back(AllocCall);
2560 auto *AllocNode = addAllocNode(AllocCall, FS);
2561 // Pass an empty CallStack to the CallsiteContext (second)
2562 // parameter, since for ThinLTO we already collapsed out the inlined
2563 // stack ids on the allocation call during ModuleSummaryAnalysis.
2565 EmptyContext;
2566 unsigned I = 0;
2568 AN.ContextSizeInfos.size() == AN.MIBs.size());
2569 // Now add all of the MIBs and their stack nodes.
2570 for (auto &MIB : AN.MIBs) {
2572 StackContext(&MIB);
2573 std::vector<ContextTotalSize> ContextSizeInfo;
2574 if (!AN.ContextSizeInfos.empty()) {
2575 for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
2576 ContextSizeInfo.push_back({FullStackId, TotalSize});
2577 }
2578 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
2579 AllocNode, StackContext, EmptyContext, MIB.AllocType,
2580 ContextSizeInfo, TotalSizeToContextIdTopNCold);
2581 I++;
2582 }
2583 // If exporting the graph to dot and an allocation id of interest was
2584 // specified, record all the context ids for this allocation node.
2585 if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2586 DotAllocContextIds = AllocNode->getContextIds();
2587 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2588 // Initialize version 0 on the summary alloc node to the current alloc
2589 // type, unless it has both types in which case make it default, so
2590 // that in the case where we aren't able to clone the original version
2591 // always ends up with the default allocation behavior.
2592 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
2593 }
2594 }
2595 // For callsite metadata, add to list for this function for later use.
2596 if (!FS->callsites().empty())
2597 for (auto &SN : FS->mutableCallsites()) {
2598 IndexCall StackNodeCall(&SN);
2599 CallsWithMetadata.push_back(StackNodeCall);
2600 }
2601
2602 if (!CallsWithMetadata.empty())
2603 FuncToCallsWithMetadata[FS] = CallsWithMetadata;
2604
2605 if (!FS->allocs().empty() || !FS->callsites().empty())
2606 FSToVIMap[FS] = VI;
2607 }
2608 }
2609
2610 if (DumpCCG) {
2611 dbgs() << "CCG before updating call stack chains:\n";
2612 dbgs() << *this;
2613 }
2614
2615 if (ExportToDot)
2616 exportToDot("prestackupdate");
2617
2618 updateStackNodes();
2619
2620 if (ExportToDot)
2621 exportToDot("poststackupdate");
2622
2623 handleCallsitesWithMultipleTargets();
2624
2625 markBackedges();
2626}
2627
2628template <typename DerivedCCG, typename FuncTy, typename CallTy>
2629void CallsiteContextGraph<DerivedCCG, FuncTy,
2630 CallTy>::handleCallsitesWithMultipleTargets() {
2631 // Look for and workaround callsites that call multiple functions.
2632 // This can happen for indirect calls, which needs better handling, and in
2633 // more rare cases (e.g. macro expansion).
2634 // TODO: To fix this for indirect calls we will want to perform speculative
2635 // devirtualization using either the normal PGO info with ICP, or using the
2636 // information in the profiled MemProf contexts. We can do this prior to
2637 // this transformation for regular LTO, and for ThinLTO we can simulate that
2638 // effect in the summary and perform the actual speculative devirtualization
2639 // while cloning in the ThinLTO backend.
2640
2641 // Keep track of the new nodes synthesized for discovered tail calls missing
2642 // from the profiled contexts.
2643 MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
2644
2645 std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode;
2646 for (auto &Entry : NonAllocationCallToContextNodeMap) {
2647 auto *Node = Entry.second;
2648 assert(Node->Clones.empty());
2649 // Check all node callees and see if in the same function.
2650 // We need to check all of the calls recorded in this Node, because in some
2651 // cases we may have had multiple calls with the same debug info calling
2652 // different callees. This can happen, for example, when an object is
2653 // constructed in the paramter list - the destructor call of the object has
2654 // the same debug info (line/col) as the call the object was passed to.
2655 // Here we will prune any that don't match all callee nodes.
2656 std::vector<CallInfo> AllCalls;
2657 AllCalls.reserve(Node->MatchingCalls.size() + 1);
2658 AllCalls.push_back(Node->Call);
2659 llvm::append_range(AllCalls, Node->MatchingCalls);
2660
2661 // First see if we can partition the calls by callee function, creating new
2662 // nodes to host each set of calls calling the same callees. This is
2663 // necessary for support indirect calls with ThinLTO, for which we
2664 // synthesized CallsiteInfo records for each target. They will all have the
2665 // same callsite stack ids and would be sharing a context node at this
2666 // point. We need to perform separate cloning for each, which will be
2667 // applied along with speculative devirtualization in the ThinLTO backends
2668 // as needed. Note this does not currently support looking through tail
2669 // calls, it is unclear if we need that for indirect call targets.
2670 // First partition calls by callee func. Map indexed by func, value is
2671 // struct with list of matching calls, assigned node.
2672 if (partitionCallsByCallee(Node, AllCalls, NewCallToNode))
2673 continue;
2674
2675 auto It = AllCalls.begin();
2676 // Iterate through the calls until we find the first that matches.
2677 for (; It != AllCalls.end(); ++It) {
2678 auto ThisCall = *It;
2679 bool Match = true;
2680 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
2681 ++EI) {
2682 auto Edge = *EI;
2683 if (!Edge->Callee->hasCall())
2684 continue;
2685 assert(NodeToCallingFunc.count(Edge->Callee));
2686 // Check if the called function matches that of the callee node.
2687 if (!calleesMatch(ThisCall.call(), EI, TailCallToContextNodeMap)) {
2688 Match = false;
2689 break;
2690 }
2691 }
2692 // Found a call that matches the callee nodes, we can quit now.
2693 if (Match) {
2694 // If the first match is not the primary call on the Node, update it
2695 // now. We will update the list of matching calls further below.
2696 if (Node->Call != ThisCall) {
2697 Node->setCall(ThisCall);
2698 // We need to update the NonAllocationCallToContextNodeMap, but don't
2699 // want to do this during iteration over that map, so save the calls
2700 // that need updated entries.
2701 NewCallToNode.push_back({ThisCall, Node});
2702 }
2703 break;
2704 }
2705 }
2706 // We will update this list below (or leave it cleared if there was no
2707 // match found above).
2708 Node->MatchingCalls.clear();
2709 // If we hit the end of the AllCalls vector, no call matching the callee
2710 // nodes was found, clear the call information in the node.
2711 if (It == AllCalls.end()) {
2712 RemovedEdgesWithMismatchedCallees++;
2713 // Work around by setting Node to have a null call, so it gets
2714 // skipped during cloning. Otherwise assignFunctions will assert
2715 // because its data structures are not designed to handle this case.
2716 Node->setCall(CallInfo());
2717 continue;
2718 }
2719 // Now add back any matching calls that call the same function as the
2720 // matching primary call on Node.
2721 for (++It; It != AllCalls.end(); ++It) {
2722 auto ThisCall = *It;
2723 if (!sameCallee(Node->Call.call(), ThisCall.call()))
2724 continue;
2725 Node->MatchingCalls.push_back(ThisCall);
2726 }
2727 }
2728
2729 // Remove all mismatched nodes identified in the above loop from the node map
2730 // (checking whether they have a null call which is set above). For a
2731 // MapVector like NonAllocationCallToContextNodeMap it is much more efficient
2732 // to do the removal via remove_if than by individually erasing entries above.
2733 // Also remove any entries if we updated the node's primary call above.
2734 NonAllocationCallToContextNodeMap.remove_if([](const auto &it) {
2735 return !it.second->hasCall() || it.second->Call != it.first;
2736 });
2737
2738 // Add entries for any new primary calls recorded above.
2739 for (auto &[Call, Node] : NewCallToNode)
2740 NonAllocationCallToContextNodeMap[Call] = Node;
2741
2742 // Add the new nodes after the above loop so that the iteration is not
2743 // invalidated.
2744 for (auto &[Call, Node] : TailCallToContextNodeMap)
2745 NonAllocationCallToContextNodeMap[Call] = Node;
2746}
2747
2748template <typename DerivedCCG, typename FuncTy, typename CallTy>
2749bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee(
2750 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
2751 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) {
2752 // Struct to keep track of all the calls having the same callee function,
2753 // and the node we eventually assign to them. Eventually we will record the
2754 // context node assigned to this group of calls.
2755 struct CallsWithSameCallee {
2756 std::vector<CallInfo> Calls;
2757 ContextNode *Node = nullptr;
2758 };
2759
2760 // First partition calls by callee function. Build map from each function
2761 // to the list of matching calls.
2763 for (auto ThisCall : AllCalls) {
2764 auto *F = getCalleeFunc(ThisCall.call());
2765 if (F)
2766 CalleeFuncToCallInfo[F].Calls.push_back(ThisCall);
2767 }
2768
2769 // Next, walk through all callee edges. For each callee node, get its
2770 // containing function and see if it was recorded in the above map (meaning we
2771 // have at least one matching call). Build another map from each callee node
2772 // with a matching call to the structure instance created above containing all
2773 // the calls.
2775 for (const auto &Edge : Node->CalleeEdges) {
2776 if (!Edge->Callee->hasCall())
2777 continue;
2778 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2779 if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc))
2780 CalleeNodeToCallInfo[Edge->Callee] =
2781 &CalleeFuncToCallInfo[ProfiledCalleeFunc];
2782 }
2783
2784 // If there are entries in the second map, then there were no matching
2785 // calls/callees, nothing to do here. Return so we can go to the handling that
2786 // looks through tail calls.
2787 if (CalleeNodeToCallInfo.empty())
2788 return false;
2789
2790 // Walk through all callee edges again. Any and all callee edges that didn't
2791 // match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a
2792 // new caller node (UnmatchedCalleesNode) which gets a null call so that it is
2793 // ignored during cloning. If it is in the map, then we use the node recorded
2794 // in that entry (creating it if needed), and move the callee edge to it.
2795 // The first callee will use the original node instead of creating a new one.
2796 // Note that any of the original calls on this node (in AllCalls) that didn't
2797 // have a callee function automatically get dropped from the node as part of
2798 // this process.
2799 ContextNode *UnmatchedCalleesNode = nullptr;
2800 // Track whether we already assigned original node to a callee.
2801 bool UsedOrigNode = false;
2802 assert(NodeToCallingFunc[Node]);
2803 // Iterate over a copy of Node's callee edges, since we may need to remove
2804 // edges in moveCalleeEdgeToNewCaller, and this simplifies the handling and
2805 // makes it less error-prone.
2806 auto CalleeEdges = Node->CalleeEdges;
2807 for (auto &Edge : CalleeEdges) {
2808 if (!Edge->Callee->hasCall())
2809 continue;
2810
2811 // Will be updated below to point to whatever (caller) node this callee edge
2812 // should be moved to.
2813 ContextNode *CallerNodeToUse = nullptr;
2814
2815 // Handle the case where there were no matching calls first. Move this
2816 // callee edge to the UnmatchedCalleesNode, creating it if needed.
2817 if (!CalleeNodeToCallInfo.contains(Edge->Callee)) {
2818 if (!UnmatchedCalleesNode)
2819 UnmatchedCalleesNode =
2820 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2821 CallerNodeToUse = UnmatchedCalleesNode;
2822 } else {
2823 // Look up the information recorded for this callee node, and use the
2824 // recorded caller node (creating it if needed).
2825 auto *Info = CalleeNodeToCallInfo[Edge->Callee];
2826 if (!Info->Node) {
2827 // If we haven't assigned any callees to the original node use it.
2828 if (!UsedOrigNode) {
2829 Info->Node = Node;
2830 // Clear the set of matching calls which will be updated below.
2831 Node->MatchingCalls.clear();
2832 UsedOrigNode = true;
2833 } else
2834 Info->Node =
2835 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2836 assert(!Info->Calls.empty());
2837 // The first call becomes the primary call for this caller node, and the
2838 // rest go in the matching calls list.
2839 Info->Node->setCall(Info->Calls.front());
2840 llvm::append_range(Info->Node->MatchingCalls,
2841 llvm::drop_begin(Info->Calls));
2842 // Save the primary call to node correspondence so that we can update
2843 // the NonAllocationCallToContextNodeMap, which is being iterated in the
2844 // caller of this function.
2845 NewCallToNode.push_back({Info->Node->Call, Info->Node});
2846 }
2847 CallerNodeToUse = Info->Node;
2848 }
2849
2850 // Don't need to move edge if we are using the original node;
2851 if (CallerNodeToUse == Node)
2852 continue;
2853
2854 moveCalleeEdgeToNewCaller(Edge, CallerNodeToUse);
2855 }
2856 // Now that we are done moving edges, clean up any caller edges that ended
2857 // up with no type or context ids. During moveCalleeEdgeToNewCaller all
2858 // caller edges from Node are replicated onto the new callers, and it
2859 // simplifies the handling to leave them until we have moved all
2860 // edges/context ids.
2861 for (auto &I : CalleeNodeToCallInfo)
2862 removeNoneTypeCallerEdges(I.second->Node);
2863 if (UnmatchedCalleesNode)
2864 removeNoneTypeCallerEdges(UnmatchedCalleesNode);
2865 removeNoneTypeCallerEdges(Node);
2866
2867 return true;
2868}
2869
2870uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2871 // In the Module (IR) case this is already the Id.
2872 return IdOrIndex;
2873}
2874
2875uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2876 // In the Index case this is an index into the stack id list in the summary
2877 // index, convert it to an Id.
2878 return Index.getStackIdAtIndex(IdOrIndex);
2879}
2880
2881template <typename DerivedCCG, typename FuncTy, typename CallTy>
2882bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
2883 CallTy Call, EdgeIter &EI,
2884 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
2885 auto Edge = *EI;
2886 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2887 const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
2888 // Will be populated in order of callee to caller if we find a chain of tail
2889 // calls between the profiled caller and callee.
2890 std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
2891 if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
2892 FoundCalleeChain))
2893 return false;
2894
2895 // The usual case where the profiled callee matches that of the IR/summary.
2896 if (FoundCalleeChain.empty())
2897 return true;
2898
2899 auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
2900 auto *CurEdge = Callee->findEdgeFromCaller(Caller);
2901 // If there is already an edge between these nodes, simply update it and
2902 // return.
2903 if (CurEdge) {
2904 CurEdge->ContextIds.insert_range(Edge->ContextIds);
2905 CurEdge->AllocTypes |= Edge->AllocTypes;
2906 return;
2907 }
2908 // Otherwise, create a new edge and insert it into the caller and callee
2909 // lists.
2910 auto NewEdge = std::make_shared<ContextEdge>(
2911 Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
2912 Callee->CallerEdges.push_back(NewEdge);
2913 if (Caller == Edge->Caller) {
2914 // If we are inserting the new edge into the current edge's caller, insert
2915 // the new edge before the current iterator position, and then increment
2916 // back to the current edge.
2917 EI = Caller->CalleeEdges.insert(EI, NewEdge);
2918 ++EI;
2919 assert(*EI == Edge &&
2920 "Iterator position not restored after insert and increment");
2921 } else
2922 Caller->CalleeEdges.push_back(NewEdge);
2923 };
2924
2925 // Create new nodes for each found callee and connect in between the profiled
2926 // caller and callee.
2927 auto *CurCalleeNode = Edge->Callee;
2928 for (auto &[NewCall, Func] : FoundCalleeChain) {
2929 ContextNode *NewNode = nullptr;
2930 // First check if we have already synthesized a node for this tail call.
2931 if (TailCallToContextNodeMap.count(NewCall)) {
2932 NewNode = TailCallToContextNodeMap[NewCall];
2933 NewNode->AllocTypes |= Edge->AllocTypes;
2934 } else {
2935 FuncToCallsWithMetadata[Func].push_back({NewCall});
2936 // Create Node and record node info.
2937 NewNode = createNewNode(/*IsAllocation=*/false, Func, NewCall);
2938 TailCallToContextNodeMap[NewCall] = NewNode;
2939 NewNode->AllocTypes = Edge->AllocTypes;
2940 }
2941
2942 // Hook up node to its callee node
2943 AddEdge(NewNode, CurCalleeNode);
2944
2945 CurCalleeNode = NewNode;
2946 }
2947
2948 // Hook up edge's original caller to new callee node.
2949 AddEdge(Edge->Caller, CurCalleeNode);
2950
2951#ifndef NDEBUG
2952 // Save this because Edge's fields get cleared below when removed.
2953 auto *Caller = Edge->Caller;
2954#endif
2955
2956 // Remove old edge
2957 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
2958
2959 // To simplify the increment of EI in the caller, subtract one from EI.
2960 // In the final AddEdge call we would have either added a new callee edge,
2961 // to Edge->Caller, or found an existing one. Either way we are guaranteed
2962 // that there is at least one callee edge.
2963 assert(!Caller->CalleeEdges.empty());
2964 --EI;
2965
2966 return true;
2967}
2968
2969bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2970 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
2971 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
2972 bool &FoundMultipleCalleeChains) {
2973 // Stop recursive search if we have already explored the maximum specified
2974 // depth.
2976 return false;
2977
2978 auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
2979 FoundCalleeChain.push_back({Callsite, F});
2980 };
2981
2982 auto *CalleeFunc = dyn_cast<Function>(CurCallee);
2983 if (!CalleeFunc) {
2984 auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
2985 assert(Alias);
2986 CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
2987 assert(CalleeFunc);
2988 }
2989
2990 // Look for tail calls in this function, and check if they either call the
2991 // profiled callee directly, or indirectly (via a recursive search).
2992 // Only succeed if there is a single unique tail call chain found between the
2993 // profiled caller and callee, otherwise we could perform incorrect cloning.
2994 bool FoundSingleCalleeChain = false;
2995 for (auto &BB : *CalleeFunc) {
2996 for (auto &I : BB) {
2997 auto *CB = dyn_cast<CallBase>(&I);
2998 if (!CB || !CB->isTailCall())
2999 continue;
3000 auto *CalledValue = CB->getCalledOperand();
3001 auto *CalledFunction = CB->getCalledFunction();
3002 if (CalledValue && !CalledFunction) {
3003 CalledValue = CalledValue->stripPointerCasts();
3004 // Stripping pointer casts can reveal a called function.
3005 CalledFunction = dyn_cast<Function>(CalledValue);
3006 }
3007 // Check if this is an alias to a function. If so, get the
3008 // called aliasee for the checks below.
3009 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
3010 assert(!CalledFunction &&
3011 "Expected null called function in callsite for alias");
3012 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
3013 }
3014 if (!CalledFunction)
3015 continue;
3016 if (CalledFunction == ProfiledCallee) {
3017 if (FoundSingleCalleeChain) {
3018 FoundMultipleCalleeChains = true;
3019 return false;
3020 }
3021 FoundSingleCalleeChain = true;
3022 FoundProfiledCalleeCount++;
3023 FoundProfiledCalleeDepth += Depth;
3024 if (Depth > FoundProfiledCalleeMaxDepth)
3025 FoundProfiledCalleeMaxDepth = Depth;
3026 SaveCallsiteInfo(&I, CalleeFunc);
3027 } else if (findProfiledCalleeThroughTailCalls(
3028 ProfiledCallee, CalledFunction, Depth + 1,
3029 FoundCalleeChain, FoundMultipleCalleeChains)) {
3030 // findProfiledCalleeThroughTailCalls should not have returned
3031 // true if FoundMultipleCalleeChains.
3032 assert(!FoundMultipleCalleeChains);
3033 if (FoundSingleCalleeChain) {
3034 FoundMultipleCalleeChains = true;
3035 return false;
3036 }
3037 FoundSingleCalleeChain = true;
3038 SaveCallsiteInfo(&I, CalleeFunc);
3039 } else if (FoundMultipleCalleeChains)
3040 return false;
3041 }
3042 }
3043
3044 return FoundSingleCalleeChain;
3045}
3046
3047const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) {
3048 auto *CB = dyn_cast<CallBase>(Call);
3049 if (!CB->getCalledOperand() || CB->isIndirectCall())
3050 return nullptr;
3051 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
3052 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
3053 if (Alias)
3054 return dyn_cast<Function>(Alias->getAliasee());
3055 return dyn_cast<Function>(CalleeVal);
3056}
3057
3058bool ModuleCallsiteContextGraph::calleeMatchesFunc(
3059 Instruction *Call, const Function *Func, const Function *CallerFunc,
3060 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
3061 auto *CB = dyn_cast<CallBase>(Call);
3062 if (!CB->getCalledOperand() || CB->isIndirectCall())
3063 return false;
3064 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
3065 auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
3066 if (CalleeFunc == Func)
3067 return true;
3068 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
3069 if (Alias && Alias->getAliasee() == Func)
3070 return true;
3071
3072 // Recursively search for the profiled callee through tail calls starting with
3073 // the actual Callee. The discovered tail call chain is saved in
3074 // FoundCalleeChain, and we will fixup the graph to include these callsites
3075 // after returning.
3076 // FIXME: We will currently redo the same recursive walk if we find the same
3077 // mismatched callee from another callsite. We can improve this with more
3078 // bookkeeping of the created chain of new nodes for each mismatch.
3079 unsigned Depth = 1;
3080 bool FoundMultipleCalleeChains = false;
3081 if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
3082 FoundCalleeChain,
3083 FoundMultipleCalleeChains)) {
3084 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
3085 << Func->getName() << " from " << CallerFunc->getName()
3086 << " that actually called " << CalleeVal->getName()
3087 << (FoundMultipleCalleeChains
3088 ? " (found multiple possible chains)"
3089 : "")
3090 << "\n");
3091 if (FoundMultipleCalleeChains)
3092 FoundProfiledCalleeNonUniquelyCount++;
3093 return false;
3094 }
3095
3096 return true;
3097}
3098
3099bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1,
3100 Instruction *Call2) {
3101 auto *CB1 = cast<CallBase>(Call1);
3102 if (!CB1->getCalledOperand() || CB1->isIndirectCall())
3103 return false;
3104 auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts();
3105 auto *CalleeFunc1 = dyn_cast<Function>(CalleeVal1);
3106 auto *CB2 = cast<CallBase>(Call2);
3107 if (!CB2->getCalledOperand() || CB2->isIndirectCall())
3108 return false;
3109 auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts();
3110 auto *CalleeFunc2 = dyn_cast<Function>(CalleeVal2);
3111 return CalleeFunc1 == CalleeFunc2;
3112}
3113
3114bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
3115 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
3116 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
3117 bool &FoundMultipleCalleeChains) {
3118 // Stop recursive search if we have already explored the maximum specified
3119 // depth.
3121 return false;
3122
3123 auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
3124 // Make a CallsiteInfo for each discovered callee, if one hasn't already
3125 // been synthesized.
3126 if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
3127 !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
3128 // StackIds is empty (we don't have debug info available in the index for
3129 // these callsites)
3130 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
3131 std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
3132 CallsiteInfo *NewCallsiteInfo =
3133 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
3134 FoundCalleeChain.push_back({NewCallsiteInfo, FS});
3135 };
3136
3137 // Look for tail calls in this function, and check if they either call the
3138 // profiled callee directly, or indirectly (via a recursive search).
3139 // Only succeed if there is a single unique tail call chain found between the
3140 // profiled caller and callee, otherwise we could perform incorrect cloning.
3141 bool FoundSingleCalleeChain = false;
3142 for (auto &S : CurCallee.getSummaryList()) {
3143 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
3144 !isPrevailing(CurCallee.getGUID(), S.get()))
3145 continue;
3146 auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
3147 if (!FS)
3148 continue;
3149 auto FSVI = CurCallee;
3150 auto *AS = dyn_cast<AliasSummary>(S.get());
3151 if (AS)
3152 FSVI = AS->getAliaseeVI();
3153 for (auto &CallEdge : FS->calls()) {
3154 if (!CallEdge.second.hasTailCall())
3155 continue;
3156 if (CallEdge.first == ProfiledCallee) {
3157 if (FoundSingleCalleeChain) {
3158 FoundMultipleCalleeChains = true;
3159 return false;
3160 }
3161 FoundSingleCalleeChain = true;
3162 FoundProfiledCalleeCount++;
3163 FoundProfiledCalleeDepth += Depth;
3164 if (Depth > FoundProfiledCalleeMaxDepth)
3165 FoundProfiledCalleeMaxDepth = Depth;
3166 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
3167 // Add FS to FSToVIMap in case it isn't already there.
3168 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
3169 FSToVIMap[FS] = FSVI;
3170 } else if (findProfiledCalleeThroughTailCalls(
3171 ProfiledCallee, CallEdge.first, Depth + 1,
3172 FoundCalleeChain, FoundMultipleCalleeChains)) {
3173 // findProfiledCalleeThroughTailCalls should not have returned
3174 // true if FoundMultipleCalleeChains.
3175 assert(!FoundMultipleCalleeChains);
3176 if (FoundSingleCalleeChain) {
3177 FoundMultipleCalleeChains = true;
3178 return false;
3179 }
3180 FoundSingleCalleeChain = true;
3181 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
3182 // Add FS to FSToVIMap in case it isn't already there.
3183 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
3184 FSToVIMap[FS] = FSVI;
3185 } else if (FoundMultipleCalleeChains)
3186 return false;
3187 }
3188 }
3189
3190 return FoundSingleCalleeChain;
3191}
3192
3193const FunctionSummary *
3194IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) {
3195 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
3196 if (Callee.getSummaryList().empty())
3197 return nullptr;
3198 return dyn_cast<FunctionSummary>(Callee.getSummaryList()[0]->getBaseObject());
3199}
3200
3201bool IndexCallsiteContextGraph::calleeMatchesFunc(
3202 IndexCall &Call, const FunctionSummary *Func,
3203 const FunctionSummary *CallerFunc,
3204 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
3205 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
3206 // If there is no summary list then this is a call to an externally defined
3207 // symbol.
3208 AliasSummary *Alias =
3209 Callee.getSummaryList().empty()
3210 ? nullptr
3211 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
3212 assert(FSToVIMap.count(Func));
3213 auto FuncVI = FSToVIMap[Func];
3214 if (Callee == FuncVI ||
3215 // If callee is an alias, check the aliasee, since only function
3216 // summary base objects will contain the stack node summaries and thus
3217 // get a context node.
3218 (Alias && Alias->getAliaseeVI() == FuncVI))
3219 return true;
3220
3221 // Recursively search for the profiled callee through tail calls starting with
3222 // the actual Callee. The discovered tail call chain is saved in
3223 // FoundCalleeChain, and we will fixup the graph to include these callsites
3224 // after returning.
3225 // FIXME: We will currently redo the same recursive walk if we find the same
3226 // mismatched callee from another callsite. We can improve this with more
3227 // bookkeeping of the created chain of new nodes for each mismatch.
3228 unsigned Depth = 1;
3229 bool FoundMultipleCalleeChains = false;
3230 if (!findProfiledCalleeThroughTailCalls(
3231 FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
3232 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
3233 << " from " << FSToVIMap[CallerFunc]
3234 << " that actually called " << Callee
3235 << (FoundMultipleCalleeChains
3236 ? " (found multiple possible chains)"
3237 : "")
3238 << "\n");
3239 if (FoundMultipleCalleeChains)
3240 FoundProfiledCalleeNonUniquelyCount++;
3241 return false;
3242 }
3243
3244 return true;
3245}
3246
3247bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) {
3248 ValueInfo Callee1 = dyn_cast_if_present<CallsiteInfo *>(Call1)->Callee;
3249 ValueInfo Callee2 = dyn_cast_if_present<CallsiteInfo *>(Call2)->Callee;
3250 return Callee1 == Callee2;
3251}
3252
3253template <typename DerivedCCG, typename FuncTy, typename CallTy>
3254void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
3255 const {
3256 print(dbgs());
3257 dbgs() << "\n";
3258}
3259
3260template <typename DerivedCCG, typename FuncTy, typename CallTy>
3261void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
3262 raw_ostream &OS) const {
3263 OS << "Node " << this << "\n";
3264 OS << "\t";
3265 printCall(OS);
3266 if (Recursive)
3267 OS << " (recursive)";
3268 OS << "\n";
3269 if (!MatchingCalls.empty()) {
3270 OS << "\tMatchingCalls:\n";
3271 for (auto &MatchingCall : MatchingCalls) {
3272 OS << "\t";
3273 MatchingCall.print(OS);
3274 OS << "\n";
3275 }
3276 }
3277 OS << "\tNodeId: " << NodeId << "\n";
3278 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
3279 OS << "\tContextIds:";
3280 // Make a copy of the computed context ids that we can sort for stability.
3281 auto ContextIds = getContextIds();
3282 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3283 std::sort(SortedIds.begin(), SortedIds.end());
3284 for (auto Id : SortedIds)
3285 OS << " " << Id;
3286 OS << "\n";
3287 OS << "\tCalleeEdges:\n";
3288 for (auto &Edge : CalleeEdges)
3289 OS << "\t\t" << *Edge << " (Callee NodeId: " << Edge->Callee->NodeId
3290 << ")\n";
3291 OS << "\tCallerEdges:\n";
3292 for (auto &Edge : CallerEdges)
3293 OS << "\t\t" << *Edge << " (Caller NodeId: " << Edge->Caller->NodeId
3294 << ")\n";
3295 if (!Clones.empty()) {
3296 OS << "\tClones: ";
3297 ListSeparator LS;
3298 for (auto *C : Clones)
3299 OS << LS << C << " NodeId: " << C->NodeId;
3300 OS << "\n";
3301 } else if (CloneOf) {
3302 OS << "\tClone of " << CloneOf << " NodeId: " << CloneOf->NodeId << "\n";
3303 }
3304}
3305
3306template <typename DerivedCCG, typename FuncTy, typename CallTy>
3307void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
3308 const {
3309 print(dbgs());
3310 dbgs() << "\n";
3311}
3312
3313template <typename DerivedCCG, typename FuncTy, typename CallTy>
3314void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
3315 raw_ostream &OS) const {
3316 OS << "Edge from Callee " << Callee << " to Caller: " << Caller
3317 << (IsBackedge ? " (BE)" : "")
3318 << " AllocTypes: " << getAllocTypeString(AllocTypes);
3319 OS << " ContextIds:";
3320 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3321 std::sort(SortedIds.begin(), SortedIds.end());
3322 for (auto Id : SortedIds)
3323 OS << " " << Id;
3324}
3325
3326template <typename DerivedCCG, typename FuncTy, typename CallTy>
3327void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
3328 print(dbgs());
3329}
3330
3331template <typename DerivedCCG, typename FuncTy, typename CallTy>
3332void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
3333 raw_ostream &OS) const {
3334 OS << "Callsite Context Graph:\n";
3335 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3336 for (const auto Node : nodes<GraphType>(this)) {
3337 if (Node->isRemoved())
3338 continue;
3339 Node->print(OS);
3340 OS << "\n";
3341 }
3342}
3343
3344template <typename DerivedCCG, typename FuncTy, typename CallTy>
3345void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
3346 raw_ostream &OS,
3347 function_ref<void(StringRef, StringRef, const Twine &)> EmitRemark) const {
3348 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3349 for (const auto Node : nodes<GraphType>(this)) {
3350 if (Node->isRemoved())
3351 continue;
3352 if (!Node->IsAllocation)
3353 continue;
3354 DenseSet<uint32_t> ContextIds = Node->getContextIds();
3355 auto AllocTypeFromCall = getAllocationCallType(Node->Call);
3356 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3357 std::sort(SortedIds.begin(), SortedIds.end());
3358 for (auto Id : SortedIds) {
3359 auto TypeI = ContextIdToAllocationType.find(Id);
3360 assert(TypeI != ContextIdToAllocationType.end());
3361 auto CSI = ContextIdToContextSizeInfos.find(Id);
3362 if (CSI != ContextIdToContextSizeInfos.end()) {
3363 for (auto &Info : CSI->second) {
3364 std::string Msg =
3365 "MemProf hinting: " + getAllocTypeString((uint8_t)TypeI->second) +
3366 " full allocation context " + std::to_string(Info.FullStackId) +
3367 " with total size " + std::to_string(Info.TotalSize) + " is " +
3368 getAllocTypeString(Node->AllocTypes) + " after cloning";
3369 if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
3370 Msg += " marked " + getAllocTypeString((uint8_t)AllocTypeFromCall) +
3371 " due to cold byte percent";
3372 // Print the internal context id to aid debugging and visualization.
3373 Msg += " (internal context id " + std::to_string(Id) + ")";
3375 OS << Msg << "\n";
3376 if (EmitRemark)
3377 EmitRemark(DEBUG_TYPE, "MemProfReport", Msg);
3378 }
3379 } else {
3380 // This is only emitted if the context size info is not present.
3381 std::string Msg =
3382 "MemProf hinting: " + getAllocTypeString((uint8_t)TypeI->second) +
3383 " context is " + getAllocTypeString(Node->AllocTypes) +
3384 " after cloning";
3385 if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
3386 Msg += " marked " + getAllocTypeString((uint8_t)AllocTypeFromCall) +
3387 " due to cold byte percent";
3388 // Print the internal context id to aid debugging and visualization.
3389 Msg += " (internal context id " + std::to_string(Id) + ")";
3391 OS << Msg << "\n";
3392 if (EmitRemark)
3393 EmitRemark(DEBUG_TYPE, "MemProfReport", Msg);
3394 }
3395 }
3396 }
3397}
3398
3399template <typename DerivedCCG, typename FuncTy, typename CallTy>
3400void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
3401 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3402 for (const auto Node : nodes<GraphType>(this)) {
3403 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3404 for (auto &Edge : Node->CallerEdges)
3406 }
3407}
3408
3409template <typename DerivedCCG, typename FuncTy, typename CallTy>
3410struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
3411 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3412 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
3413
3414 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
3415 static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
3416
3419 decltype(&getNode)>;
3420
3422 return nodes_iterator(G->NodeOwner.begin(), &getNode);
3423 }
3424
3426 return nodes_iterator(G->NodeOwner.end(), &getNode);
3427 }
3428
3430 return G->NodeOwner.begin()->get();
3431 }
3432
3433 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
3434 static const ContextNode<DerivedCCG, FuncTy, CallTy> *
3436 return P->Callee;
3437 }
3438
3440 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
3441 DerivedCCG, FuncTy, CallTy>>>::const_iterator,
3442 decltype(&GetCallee)>;
3443
3445 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
3446 }
3447
3449 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
3450 }
3451};
3452
3453template <typename DerivedCCG, typename FuncTy, typename CallTy>
3454struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
3455 : public DefaultDOTGraphTraits {
3456 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {
3457 // If the user requested the full graph to be exported, but provided an
3458 // allocation id, or if the user gave a context id and requested more than
3459 // just a specific context to be exported, note that highlighting is
3460 // enabled.
3461 DoHighlight =
3462 (AllocIdForDot.getNumOccurrences() && DotGraphScope == DotScope::All) ||
3463 (ContextIdForDot.getNumOccurrences() &&
3465 }
3466
3467 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3469 using NodeRef = typename GTraits::NodeRef;
3470 using ChildIteratorType = typename GTraits::ChildIteratorType;
3471
3472 static std::string getNodeLabel(NodeRef Node, GraphType G) {
3473 std::string LabelString =
3474 (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
3475 Twine(Node->OrigStackOrAllocId) + " NodeId: " + Twine(Node->NodeId))
3476 .str();
3477 LabelString += "\n";
3478 if (Node->hasCall()) {
3479 auto Func = G->NodeToCallingFunc.find(Node);
3480 assert(Func != G->NodeToCallingFunc.end());
3481 LabelString +=
3482 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
3483 for (auto &MatchingCall : Node->MatchingCalls) {
3484 LabelString += "\n";
3485 LabelString += G->getLabel(Func->second, MatchingCall.call(),
3486 MatchingCall.cloneNo());
3487 }
3488 } else {
3489 LabelString += "null call";
3490 if (Node->Recursive)
3491 LabelString += " (recursive)";
3492 else
3493 LabelString += " (external)";
3494 }
3495 return LabelString;
3496 }
3497
3499 auto ContextIds = Node->getContextIds();
3500 // If highlighting enabled, see if this node contains any of the context ids
3501 // of interest. If so, it will use a different color and a larger fontsize
3502 // (which makes the node larger as well).
3503 bool Highlight = false;
3504 if (DoHighlight) {
3505 assert(ContextIdForDot.getNumOccurrences() ||
3506 AllocIdForDot.getNumOccurrences());
3507 if (ContextIdForDot.getNumOccurrences())
3508 Highlight = ContextIds.contains(ContextIdForDot);
3509 else
3510 Highlight = set_intersects(ContextIds, G->DotAllocContextIds);
3511 }
3512 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
3513 getContextIds(ContextIds) + "\"")
3514 .str();
3515 // Default fontsize is 14
3516 if (Highlight)
3517 AttributeString += ",fontsize=\"30\"";
3518 AttributeString +=
3519 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes, Highlight) + "\"")
3520 .str();
3521 if (Node->CloneOf) {
3522 AttributeString += ",color=\"blue\"";
3523 AttributeString += ",style=\"filled,bold,dashed\"";
3524 } else
3525 AttributeString += ",style=\"filled\"";
3526 return AttributeString;
3527 }
3528
3529 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
3530 GraphType G) {
3531 auto &Edge = *(ChildIter.getCurrent());
3532 // If highlighting enabled, see if this edge contains any of the context ids
3533 // of interest. If so, it will use a different color and a heavier arrow
3534 // size and weight (the larger weight makes the highlighted path
3535 // straighter).
3536 bool Highlight = false;
3537 if (DoHighlight) {
3538 assert(ContextIdForDot.getNumOccurrences() ||
3539 AllocIdForDot.getNumOccurrences());
3540 if (ContextIdForDot.getNumOccurrences())
3541 Highlight = Edge->ContextIds.contains(ContextIdForDot);
3542 else
3543 Highlight = set_intersects(Edge->ContextIds, G->DotAllocContextIds);
3544 }
3545 auto Color = getColor(Edge->AllocTypes, Highlight);
3546 std::string AttributeString =
3547 (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
3548 // fillcolor is the arrow head and color is the line
3549 Twine(",fillcolor=\"") + Color + "\"" + Twine(",color=\"") + Color +
3550 "\"")
3551 .str();
3552 if (Edge->IsBackedge)
3553 AttributeString += ",style=\"dotted\"";
3554 // Default penwidth and weight are both 1.
3555 if (Highlight)
3556 AttributeString += ",penwidth=\"2.0\",weight=\"2\"";
3557 return AttributeString;
3558 }
3559
3560 // Since the NodeOwners list includes nodes that are no longer connected to
3561 // the graph, skip them here.
3563 if (Node->isRemoved())
3564 return true;
3565 // If a scope smaller than the full graph was requested, see if this node
3566 // contains any of the context ids of interest.
3568 return !set_intersects(Node->getContextIds(), G->DotAllocContextIds);
3570 return !Node->getContextIds().contains(ContextIdForDot);
3571 return false;
3572 }
3573
3574private:
3575 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
3576 std::string IdString = "ContextIds:";
3577 if (ContextIds.size() < 100) {
3578 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3579 std::sort(SortedIds.begin(), SortedIds.end());
3580 for (auto Id : SortedIds)
3581 IdString += (" " + Twine(Id)).str();
3582 } else {
3583 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
3584 }
3585 return IdString;
3586 }
3587
3588 static std::string getColor(uint8_t AllocTypes, bool Highlight) {
3589 // If DoHighlight is not enabled, we want to use the highlight colors for
3590 // NotCold and Cold, and the non-highlight color for NotCold+Cold. This is
3591 // both compatible with the color scheme before highlighting was supported,
3592 // and for the NotCold+Cold color the non-highlight color is a bit more
3593 // readable.
3594 if (AllocTypes == (uint8_t)AllocationType::NotCold)
3595 // Color "brown1" actually looks like a lighter red.
3596 return !DoHighlight || Highlight ? "brown1" : "lightpink";
3597 if (AllocTypes == (uint8_t)AllocationType::Cold)
3598 return !DoHighlight || Highlight ? "cyan" : "lightskyblue";
3599 if (AllocTypes ==
3600 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
3601 return Highlight ? "magenta" : "mediumorchid1";
3602 return "gray";
3603 }
3604
3605 static std::string getNodeId(NodeRef Node) {
3606 std::stringstream SStream;
3607 SStream << std::hex << "N0x" << (unsigned long long)Node;
3608 std::string Result = SStream.str();
3609 return Result;
3610 }
3611
3612 // True if we should highlight a specific context or allocation's contexts in
3613 // the emitted graph.
3614 static bool DoHighlight;
3615};
3616
3617template <typename DerivedCCG, typename FuncTy, typename CallTy>
3618bool DOTGraphTraits<
3619 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>::DoHighlight =
3620 false;
3621
3622template <typename DerivedCCG, typename FuncTy, typename CallTy>
3623void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
3624 std::string Label) const {
3625 WriteGraph(this, "", false, Label,
3626 DotFilePathPrefix + "ccg." + Label + ".dot");
3627}
3628
3629template <typename DerivedCCG, typename FuncTy, typename CallTy>
3630typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
3631CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
3632 const std::shared_ptr<ContextEdge> &Edge,
3633 DenseSet<uint32_t> ContextIdsToMove) {
3634 ContextNode *Node = Edge->Callee;
3635 assert(NodeToCallingFunc.count(Node));
3636 ContextNode *Clone =
3637 createNewNode(Node->IsAllocation, NodeToCallingFunc[Node], Node->Call);
3638 Node->addClone(Clone);
3639 Clone->MatchingCalls = Node->MatchingCalls;
3640 moveEdgeToExistingCalleeClone(Edge, Clone, /*NewClone=*/true,
3641 ContextIdsToMove);
3642 return Clone;
3643}
3644
3645template <typename DerivedCCG, typename FuncTy, typename CallTy>
3646void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3647 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
3648 ContextNode *NewCallee, bool NewClone,
3649 DenseSet<uint32_t> ContextIdsToMove) {
3650 // NewCallee and Edge's current callee must be clones of the same original
3651 // node (Edge's current callee may be the original node too).
3652 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
3653
3654 bool EdgeIsRecursive = Edge->Callee == Edge->Caller;
3655
3656 ContextNode *OldCallee = Edge->Callee;
3657
3658 // We might already have an edge to the new callee from earlier cloning for a
3659 // different allocation. If one exists we will reuse it.
3660 auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
3661
3662 // Callers will pass an empty ContextIdsToMove set when they want to move the
3663 // edge. Copy in Edge's ids for simplicity.
3664 if (ContextIdsToMove.empty())
3665 ContextIdsToMove = Edge->getContextIds();
3666
3667 // If we are moving all of Edge's ids, then just move the whole Edge.
3668 // Otherwise only move the specified subset, to a new edge if needed.
3669 if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
3670 // First, update the alloc types on New Callee from Edge.
3671 // Do this before we potentially clear Edge's fields below!
3672 NewCallee->AllocTypes |= Edge->AllocTypes;
3673 // Moving the whole Edge.
3674 if (ExistingEdgeToNewCallee) {
3675 // Since we already have an edge to NewCallee, simply move the ids
3676 // onto it, and remove the existing Edge.
3677 ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3678 ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
3679 assert(Edge->ContextIds == ContextIdsToMove);
3680 removeEdgeFromGraph(Edge.get());
3681 } else {
3682 // Otherwise just reconnect Edge to NewCallee.
3683 Edge->Callee = NewCallee;
3684 NewCallee->CallerEdges.push_back(Edge);
3685 // Remove it from callee where it was previously connected.
3686 OldCallee->eraseCallerEdge(Edge.get());
3687 // Don't need to update Edge's context ids since we are simply
3688 // reconnecting it.
3689 }
3690 } else {
3691 // Only moving a subset of Edge's ids.
3692 // Compute the alloc type of the subset of ids being moved.
3693 auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
3694 if (ExistingEdgeToNewCallee) {
3695 // Since we already have an edge to NewCallee, simply move the ids
3696 // onto it.
3697 ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3698 ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
3699 } else {
3700 // Otherwise, create a new edge to NewCallee for the ids being moved.
3701 auto NewEdge = std::make_shared<ContextEdge>(
3702 NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
3703 Edge->Caller->CalleeEdges.push_back(NewEdge);
3704 NewCallee->CallerEdges.push_back(NewEdge);
3705 }
3706 // In either case, need to update the alloc types on NewCallee, and remove
3707 // those ids and update the alloc type on the original Edge.
3708 NewCallee->AllocTypes |= CallerEdgeAllocType;
3709 set_subtract(Edge->ContextIds, ContextIdsToMove);
3710 Edge->AllocTypes = computeAllocType(Edge->ContextIds);
3711 }
3712 // Now walk the old callee node's callee edges and move Edge's context ids
3713 // over to the corresponding edge into the clone (which is created here if
3714 // this is a newly created clone).
3715 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
3716 ContextNode *CalleeToUse = OldCalleeEdge->Callee;
3717 // If this is a direct recursion edge, use NewCallee (the clone) as the
3718 // callee as well, so that any edge updated/created here is also direct
3719 // recursive.
3720 if (CalleeToUse == OldCallee) {
3721 // If this is a recursive edge, see if we already moved a recursive edge
3722 // (which would have to have been this one) - if we were only moving a
3723 // subset of context ids it would still be on OldCallee.
3724 if (EdgeIsRecursive) {
3725 assert(OldCalleeEdge == Edge);
3726 continue;
3727 }
3728 CalleeToUse = NewCallee;
3729 }
3730 // The context ids moving to the new callee are the subset of this edge's
3731 // context ids and the context ids on the caller edge being moved.
3732 DenseSet<uint32_t> EdgeContextIdsToMove =
3733 set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
3734 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
3735 OldCalleeEdge->AllocTypes =
3736 computeAllocType(OldCalleeEdge->getContextIds());
3737 if (!NewClone) {
3738 // Update context ids / alloc type on corresponding edge to NewCallee.
3739 // There is a chance this may not exist if we are reusing an existing
3740 // clone, specifically during function assignment, where we would have
3741 // removed none type edges after creating the clone. If we can't find
3742 // a corresponding edge there, fall through to the cloning below.
3743 if (auto *NewCalleeEdge = NewCallee->findEdgeFromCallee(CalleeToUse)) {
3744 NewCalleeEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3745 NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
3746 continue;
3747 }
3748 }
3749 auto NewEdge = std::make_shared<ContextEdge>(
3750 CalleeToUse, NewCallee, computeAllocType(EdgeContextIdsToMove),
3751 EdgeContextIdsToMove);
3752 NewCallee->CalleeEdges.push_back(NewEdge);
3753 NewEdge->Callee->CallerEdges.push_back(NewEdge);
3754 }
3755 // Recompute the node alloc type now that its callee edges have been
3756 // updated (since we will compute from those edges).
3757 OldCallee->AllocTypes = OldCallee->computeAllocType();
3758 // OldCallee alloc type should be None iff its context id set is now empty.
3759 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
3760 OldCallee->emptyContextIds());
3761 if (VerifyCCG) {
3762 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
3763 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
3764 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
3765 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
3766 /*CheckEdges=*/false);
3767 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
3768 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
3769 /*CheckEdges=*/false);
3770 }
3771}
3772
3773template <typename DerivedCCG, typename FuncTy, typename CallTy>
3774void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3775 moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
3776 ContextNode *NewCaller) {
3777 auto *OldCallee = Edge->Callee;
3778 auto *NewCallee = OldCallee;
3779 // If this edge was direct recursive, make any new/updated edge also direct
3780 // recursive to NewCaller.
3781 bool Recursive = Edge->Caller == Edge->Callee;
3782 if (Recursive)
3783 NewCallee = NewCaller;
3784
3785 ContextNode *OldCaller = Edge->Caller;
3786 OldCaller->eraseCalleeEdge(Edge.get());
3787
3788 // We might already have an edge to the new caller. If one exists we will
3789 // reuse it.
3790 auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(NewCallee);
3791
3792 if (ExistingEdgeToNewCaller) {
3793 // Since we already have an edge to NewCaller, simply move the ids
3794 // onto it, and remove the existing Edge.
3795 ExistingEdgeToNewCaller->getContextIds().insert_range(
3796 Edge->getContextIds());
3797 ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes;
3798 Edge->ContextIds.clear();
3799 Edge->AllocTypes = (uint8_t)AllocationType::None;
3800 OldCallee->eraseCallerEdge(Edge.get());
3801 } else {
3802 // Otherwise just reconnect Edge to NewCaller.
3803 Edge->Caller = NewCaller;
3804 NewCaller->CalleeEdges.push_back(Edge);
3805 if (Recursive) {
3806 assert(NewCallee == NewCaller);
3807 // In the case of (direct) recursive edges, we update the callee as well
3808 // so that it becomes recursive on the new caller.
3809 Edge->Callee = NewCallee;
3810 NewCallee->CallerEdges.push_back(Edge);
3811 OldCallee->eraseCallerEdge(Edge.get());
3812 }
3813 // Don't need to update Edge's context ids since we are simply
3814 // reconnecting it.
3815 }
3816 // In either case, need to update the alloc types on New Caller.
3817 NewCaller->AllocTypes |= Edge->AllocTypes;
3818
3819 // Now walk the old caller node's caller edges and move Edge's context ids
3820 // over to the corresponding edge into the node (which is created here if
3821 // this is a newly created node). We can tell whether this is a newly created
3822 // node by seeing if it has any caller edges yet.
3823#ifndef NDEBUG
3824 bool IsNewNode = NewCaller->CallerEdges.empty();
3825#endif
3826 // If we just moved a direct recursive edge, presumably its context ids should
3827 // also flow out of OldCaller via some other non-recursive callee edge. We
3828 // don't want to remove the recursive context ids from other caller edges yet,
3829 // otherwise the context ids get into an inconsistent state on OldCaller.
3830 // We will update these context ids on the non-recursive caller edge when and
3831 // if they are updated on the non-recursive callee.
3832 if (!Recursive) {
3833 for (auto &OldCallerEdge : OldCaller->CallerEdges) {
3834 auto OldCallerCaller = OldCallerEdge->Caller;
3835 // The context ids moving to the new caller are the subset of this edge's
3836 // context ids and the context ids on the callee edge being moved.
3837 DenseSet<uint32_t> EdgeContextIdsToMove = set_intersection(
3838 OldCallerEdge->getContextIds(), Edge->getContextIds());
3839 if (OldCaller == OldCallerCaller) {
3840 OldCallerCaller = NewCaller;
3841 // Don't actually move this one. The caller will move it directly via a
3842 // call to this function with this as the Edge if it is appropriate to
3843 // move to a diff node that has a matching callee (itself).
3844 continue;
3845 }
3846 set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
3847 OldCallerEdge->AllocTypes =
3848 computeAllocType(OldCallerEdge->getContextIds());
3849 // In this function we expect that any pre-existing node already has edges
3850 // from the same callers as the old node. That should be true in the
3851 // current use case, where we will remove None-type edges after copying
3852 // over all caller edges from the callee.
3853 auto *ExistingCallerEdge = NewCaller->findEdgeFromCaller(OldCallerCaller);
3854 // Since we would have skipped caller edges when moving a direct recursive
3855 // edge, this may not hold true when recursive handling enabled.
3856 assert(IsNewNode || ExistingCallerEdge || AllowRecursiveCallsites);
3857 if (ExistingCallerEdge) {
3858 ExistingCallerEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3859 ExistingCallerEdge->AllocTypes |=
3860 computeAllocType(EdgeContextIdsToMove);
3861 continue;
3862 }
3863 auto NewEdge = std::make_shared<ContextEdge>(
3864 NewCaller, OldCallerCaller, computeAllocType(EdgeContextIdsToMove),
3865 EdgeContextIdsToMove);
3866 NewCaller->CallerEdges.push_back(NewEdge);
3867 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
3868 }
3869 }
3870 // Recompute the node alloc type now that its caller edges have been
3871 // updated (since we will compute from those edges).
3872 OldCaller->AllocTypes = OldCaller->computeAllocType();
3873 // OldCaller alloc type should be None iff its context id set is now empty.
3874 assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) ==
3875 OldCaller->emptyContextIds());
3876 if (VerifyCCG) {
3877 checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false);
3878 checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false);
3879 for (const auto &OldCallerEdge : OldCaller->CallerEdges)
3880 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller,
3881 /*CheckEdges=*/false);
3882 for (const auto &NewCallerEdge : NewCaller->CallerEdges)
3883 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller,
3884 /*CheckEdges=*/false);
3885 }
3886}
3887
3888template <typename DerivedCCG, typename FuncTy, typename CallTy>
3889void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3890 recursivelyRemoveNoneTypeCalleeEdges(
3891 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
3892 auto Inserted = Visited.insert(Node);
3893 if (!Inserted.second)
3894 return;
3895
3896 removeNoneTypeCalleeEdges(Node);
3897
3898 for (auto *Clone : Node->Clones)
3899 recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
3900
3901 // The recursive call may remove some of this Node's caller edges.
3902 // Iterate over a copy and skip any that were removed.
3903 auto CallerEdges = Node->CallerEdges;
3904 for (auto &Edge : CallerEdges) {
3905 // Skip any that have been removed by an earlier recursive call.
3906 if (Edge->isRemoved()) {
3907 assert(!is_contained(Node->CallerEdges, Edge));
3908 continue;
3909 }
3910 recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
3911 }
3912}
3913
3914// This is the standard DFS based backedge discovery algorithm.
3915template <typename DerivedCCG, typename FuncTy, typename CallTy>
3916void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges() {
3917 // If we are cloning recursive contexts, find and mark backedges from all root
3918 // callers, using the typical DFS based backedge analysis.
3920 return;
3921 DenseSet<const ContextNode *> Visited;
3922 DenseSet<const ContextNode *> CurrentStack;
3923 for (auto &Entry : NonAllocationCallToContextNodeMap) {
3924 auto *Node = Entry.second;
3925 if (Node->isRemoved())
3926 continue;
3927 // It is a root if it doesn't have callers.
3928 if (!Node->CallerEdges.empty())
3929 continue;
3930 markBackedges(Node, Visited, CurrentStack);
3931 assert(CurrentStack.empty());
3932 }
3933}
3934
3935// Recursive helper for above markBackedges method.
3936template <typename DerivedCCG, typename FuncTy, typename CallTy>
3937void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges(
3938 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3939 DenseSet<const ContextNode *> &CurrentStack) {
3940 auto I = Visited.insert(Node);
3941 // We should only call this for unvisited nodes.
3942 assert(I.second);
3943 (void)I;
3944 for (auto &CalleeEdge : Node->CalleeEdges) {
3945 auto *Callee = CalleeEdge->Callee;
3946 if (Visited.count(Callee)) {
3947 // Since this was already visited we need to check if it is currently on
3948 // the recursive stack in which case it is a backedge.
3949 if (CurrentStack.count(Callee))
3950 CalleeEdge->IsBackedge = true;
3951 continue;
3952 }
3953 CurrentStack.insert(Callee);
3954 markBackedges(Callee, Visited, CurrentStack);
3955 CurrentStack.erase(Callee);
3956 }
3957}
3958
3959template <typename DerivedCCG, typename FuncTy, typename CallTy>
3960void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
3961 DenseSet<const ContextNode *> Visited;
3962 for (auto &Entry : AllocationCallToContextNodeMap) {
3963 Visited.clear();
3964 identifyClones(Entry.second, Visited, Entry.second->getContextIds());
3965 }
3966 Visited.clear();
3967 for (auto &Entry : AllocationCallToContextNodeMap)
3968 recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
3969 if (VerifyCCG)
3970 check();
3971}
3972
// Helper function to check whether an AllocType is Cold, NotCold, or both.
3980
3981template <typename DerivedCCG, typename FuncTy, typename CallTy>
3982void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
3983 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3984 const DenseSet<uint32_t> &AllocContextIds) {
3985 if (VerifyNodes)
3986 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3987 assert(!Node->CloneOf);
3988
3989 // If Node as a null call, then either it wasn't found in the module (regular
3990 // LTO) or summary index (ThinLTO), or there were other conditions blocking
3991 // cloning (e.g. recursion, calls multiple targets, etc).
3992 // Do this here so that we don't try to recursively clone callers below, which
3993 // isn't useful at least for this node.
3994 if (!Node->hasCall())
3995 return;
3996
3997 // No need to look at any callers if allocation type already unambiguous.
3998 if (hasSingleAllocType(Node->AllocTypes))
3999 return;
4000
4001#ifndef NDEBUG
4002 auto Insert =
4003#endif
4004 Visited.insert(Node);
4005 // We should not have visited this node yet.
4006 assert(Insert.second);
4007 // The recursive call to identifyClones may delete the current edge from the
4008 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
4009 // in an iterator and having recursive call erase from it. Other edges may
4010 // also get removed during the recursion, which will have null Callee and
4011 // Caller pointers (and are deleted later), so we skip those below.
4012 {
4013 auto CallerEdges = Node->CallerEdges;
4014 for (auto &Edge : CallerEdges) {
4015 // Skip any that have been removed by an earlier recursive call.
4016 if (Edge->isRemoved()) {
4017 assert(!is_contained(Node->CallerEdges, Edge));
4018 continue;
4019 }
4020 // Defer backedges. See comments further below where these edges are
4021 // handled during the cloning of this Node.
4022 if (Edge->IsBackedge) {
4023 // We should only mark these if cloning recursive contexts, where we
4024 // need to do this deferral.
4026 continue;
4027 }
4028 // Ignore any caller we previously visited via another edge.
4029 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
4030 identifyClones(Edge->Caller, Visited, AllocContextIds);
4031 }
4032 }
4033 }
4034
4035 // Check if we reached an unambiguous call or have have only a single caller.
4036 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
4037 return;
4038
4039 // We need to clone.
4040
4041 // Try to keep the original version as alloc type NotCold. This will make
4042 // cases with indirect calls or any other situation with an unknown call to
4043 // the original function get the default behavior. We do this by sorting the
4044 // CallerEdges of the Node we will clone by alloc type.
4045 //
4046 // Give NotCold edge the lowest sort priority so those edges are at the end of
4047 // the caller edges vector, and stay on the original version (since the below
4048 // code clones greedily until it finds all remaining edges have the same type
4049 // and leaves the remaining ones on the original Node).
4050 //
4051 // We shouldn't actually have any None type edges, so the sorting priority for
4052 // that is arbitrary, and we assert in that case below.
4053 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
4054 /*Cold*/ 1,
4055 /*NotColdCold*/ 2};
4056 llvm::stable_sort(Node->CallerEdges,
4057 [&](const std::shared_ptr<ContextEdge> &A,
4058 const std::shared_ptr<ContextEdge> &B) {
4059 // Nodes with non-empty context ids should be sorted
4060 // before those with empty context ids.
4061 if (A->ContextIds.empty())
4062 // Either B ContextIds are non-empty (in which case we
4063 // should return false because B < A), or B ContextIds
4064 // are empty, in which case they are equal, and we
4065 // should maintain the original relative ordering.
4066 return false;
4067 if (B->ContextIds.empty())
4068 return true;
4069
4070 if (A->AllocTypes == B->AllocTypes)
4071 // Use the first context id for each edge as a
4072 // tie-breaker.
4073 return *A->ContextIds.begin() < *B->ContextIds.begin();
4074 return AllocTypeCloningPriority[A->AllocTypes] <
4075 AllocTypeCloningPriority[B->AllocTypes];
4076 });
4077
4078 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
4079
4080 DenseSet<uint32_t> RecursiveContextIds;
4082 // If we are allowing recursive callsites, but have also disabled recursive
4083 // contexts, look for context ids that show up in multiple caller edges.
4085 DenseSet<uint32_t> AllCallerContextIds;
4086 for (auto &CE : Node->CallerEdges) {
4087 // Resize to the largest set of caller context ids, since we know the
4088 // final set will be at least that large.
4089 AllCallerContextIds.reserve(CE->getContextIds().size());
4090 for (auto Id : CE->getContextIds())
4091 if (!AllCallerContextIds.insert(Id).second)
4092 RecursiveContextIds.insert(Id);
4093 }
4094 }
4095
4096 // Iterate until we find no more opportunities for disambiguating the alloc
4097 // types via cloning. In most cases this loop will terminate once the Node
4098 // has a single allocation type, in which case no more cloning is needed.
4099 // Iterate over a copy of Node's caller edges, since we may need to remove
4100 // edges in the moveEdgeTo* methods, and this simplifies the handling and
4101 // makes it less error-prone.
4102 auto CallerEdges = Node->CallerEdges;
4103 for (auto &CallerEdge : CallerEdges) {
4104 // Skip any that have been removed by an earlier recursive call.
4105 if (CallerEdge->isRemoved()) {
4106 assert(!is_contained(Node->CallerEdges, CallerEdge));
4107 continue;
4108 }
4109 assert(CallerEdge->Callee == Node);
4110
4111 // See if cloning the prior caller edge left this node with a single alloc
4112 // type or a single caller. In that case no more cloning of Node is needed.
4113 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
4114 break;
4115
4116 // If the caller was not successfully matched to a call in the IR/summary,
4117 // there is no point in trying to clone for it as we can't update that call.
4118 if (!CallerEdge->Caller->hasCall())
4119 continue;
4120
4121 // Only need to process the ids along this edge pertaining to the given
4122 // allocation.
4123 auto CallerEdgeContextsForAlloc =
4124 set_intersection(CallerEdge->getContextIds(), AllocContextIds);
4125 if (!RecursiveContextIds.empty())
4126 CallerEdgeContextsForAlloc =
4127 set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds);
4128 if (CallerEdgeContextsForAlloc.empty())
4129 continue;
4130
4131 auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
4132
4133 // Compute the node callee edge alloc types corresponding to the context ids
4134 // for this caller edge.
4135 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
4136 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
4137 for (auto &CalleeEdge : Node->CalleeEdges)
4138 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
4139 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
4140
4141 // Don't clone if doing so will not disambiguate any alloc types amongst
4142 // caller edges (including the callee edges that would be cloned).
4143 // Otherwise we will simply move all edges to the clone.
4144 //
4145 // First check if by cloning we will disambiguate the caller allocation
4146 // type from node's allocation type. Query allocTypeToUse so that we don't
4147 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
4148 // neither of these should be None type.
4149 //
4150 // Then check if by cloning node at least one of the callee edges will be
4151 // disambiguated by splitting out different context ids.
4152 //
4153 // However, always do the cloning if this is a backedge, in which case we
4154 // have not yet cloned along this caller edge.
4155 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
4156 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
4157 if (!CallerEdge->IsBackedge &&
4158 allocTypeToUse(CallerAllocTypeForAlloc) ==
4159 allocTypeToUse(Node->AllocTypes) &&
4160 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
4161 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
4162 continue;
4163 }
4164
4165 if (CallerEdge->IsBackedge) {
4166 // We should only mark these if cloning recursive contexts, where we
4167 // need to do this deferral.
4169 DeferredBackedges++;
4170 }
4171
4172 // If this is a backedge, we now do recursive cloning starting from its
4173 // caller since we may have moved unambiguous caller contexts to a clone
4174 // of this Node in a previous iteration of the current loop, giving more
4175 // opportunity for cloning through the backedge. Because we sorted the
4176 // caller edges earlier so that cold caller edges are first, we would have
4177 // visited and cloned this node for any unamibiguously cold non-recursive
4178 // callers before any ambiguous backedge callers. Note that we don't do this
4179 // if the caller is already cloned or visited during cloning (e.g. via a
4180 // different context path from the allocation).
4181 // TODO: Can we do better in the case where the caller was already visited?
4182 if (CallerEdge->IsBackedge && !CallerEdge->Caller->CloneOf &&
4183 !Visited.count(CallerEdge->Caller)) {
4184 const auto OrigIdCount = CallerEdge->getContextIds().size();
4185 // Now do the recursive cloning of this backedge's caller, which was
4186 // deferred earlier.
4187 identifyClones(CallerEdge->Caller, Visited, CallerEdgeContextsForAlloc);
4188 removeNoneTypeCalleeEdges(CallerEdge->Caller);
4189 // See if the recursive call to identifyClones moved the context ids to a
4190 // new edge from this node to a clone of caller, and switch to looking at
4191 // that new edge so that we clone Node for the new caller clone.
4192 bool UpdatedEdge = false;
4193 if (OrigIdCount > CallerEdge->getContextIds().size()) {
4194 for (auto E : Node->CallerEdges) {
4195 // Only interested in clones of the current edges caller.
4196 if (E->Caller->CloneOf != CallerEdge->Caller)
4197 continue;
4198 // See if this edge contains any of the context ids originally on the
4199 // current caller edge.
4200 auto CallerEdgeContextsForAllocNew =
4201 set_intersection(CallerEdgeContextsForAlloc, E->getContextIds());
4202 if (CallerEdgeContextsForAllocNew.empty())
4203 continue;
4204 // Make sure we don't pick a previously existing caller edge of this
4205 // Node, which would be processed on a different iteration of the
4206 // outer loop over the saved CallerEdges.
4207 if (llvm::is_contained(CallerEdges, E))
4208 continue;
4209 // The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge
4210 // are updated further below for all cases where we just invoked
4211 // identifyClones recursively.
4212 CallerEdgeContextsForAlloc.swap(CallerEdgeContextsForAllocNew);
4213 CallerEdge = E;
4214 UpdatedEdge = true;
4215 break;
4216 }
4217 }
4218 // If cloning removed this edge (and we didn't update it to a new edge
4219 // above), we're done with this edge. It's possible we moved all of the
4220 // context ids to an existing clone, in which case there's no need to do
4221 // further processing for them.
4222 if (CallerEdge->isRemoved())
4223 continue;
4224
4225 // Now we need to update the information used for the cloning decisions
4226 // further below, as we may have modified edges and their context ids.
4227
4228 // Note if we changed the CallerEdge above we would have already updated
4229 // the context ids.
4230 if (!UpdatedEdge) {
4231 CallerEdgeContextsForAlloc = set_intersection(
4232 CallerEdgeContextsForAlloc, CallerEdge->getContextIds());
4233 if (CallerEdgeContextsForAlloc.empty())
4234 continue;
4235 }
4236 // Update the other information that depends on the edges and on the now
4237 // updated CallerEdgeContextsForAlloc.
4238 CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
4239 CalleeEdgeAllocTypesForCallerEdge.clear();
4240 for (auto &CalleeEdge : Node->CalleeEdges) {
4241 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
4242 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
4243 }
4244 }
4245
4246 // First see if we can use an existing clone. Check each clone and its
4247 // callee edges for matching alloc types.
4248 ContextNode *Clone = nullptr;
4249 for (auto *CurClone : Node->Clones) {
4250 if (allocTypeToUse(CurClone->AllocTypes) !=
4251 allocTypeToUse(CallerAllocTypeForAlloc))
4252 continue;
4253
4254 bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) &&
4255 hasSingleAllocType(CallerAllocTypeForAlloc);
4256 // The above check should mean that if both have single alloc types that
4257 // they should be equal.
4258 assert(!BothSingleAlloc ||
4259 CurClone->AllocTypes == CallerAllocTypeForAlloc);
4260
4261 // If either both have a single alloc type (which are the same), or if the
4262 // clone's callee edges have the same alloc types as those for the current
4263 // allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge),
4264 // then we can reuse this clone.
4265 if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>(
4266 CalleeEdgeAllocTypesForCallerEdge, CurClone)) {
4267 Clone = CurClone;
4268 break;
4269 }
4270 }
4271
4272 // The edge iterator is adjusted when we move the CallerEdge to the clone.
4273 if (Clone)
4274 moveEdgeToExistingCalleeClone(CallerEdge, Clone, /*NewClone=*/false,
4275 CallerEdgeContextsForAlloc);
4276 else
4277 Clone = moveEdgeToNewCalleeClone(CallerEdge, CallerEdgeContextsForAlloc);
4278
4279 // Sanity check that no alloc types on clone or its edges are None.
4280 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
4281 }
4282
4283 // We should still have some context ids on the original Node.
4284 assert(!Node->emptyContextIds());
4285
4286 // Sanity check that no alloc types on node or edges are None.
4287 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
4288
4289 if (VerifyNodes)
4290 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
4291}
4292
4293void ModuleCallsiteContextGraph::updateAllocationCall(
4294 CallInfo &Call, AllocationType AllocType) {
4295 std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
4297 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
4298 "memprof", AllocTypeString);
4299 cast<CallBase>(Call.call())->addFnAttr(A);
4300 OREGetter(Call.call()->getFunction())
4301 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
4302 << ore::NV("AllocationCall", Call.call()) << " in clone "
4303 << ore::NV("Caller", Call.call()->getFunction())
4304 << " marked with memprof allocation attribute "
4305 << ore::NV("Attribute", AllocTypeString));
4306}
4307
4308void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
4310 auto *AI = cast<AllocInfo *>(Call.call());
4311 assert(AI);
4312 assert(AI->Versions.size() > Call.cloneNo());
4313 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
4314}
4315
4317ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
4318 const auto *CB = cast<CallBase>(Call.call());
4319 if (!CB->getAttributes().hasFnAttr("memprof"))
4320 return AllocationType::None;
4321 return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
4322 ? AllocationType::Cold
4323 : AllocationType::NotCold;
4324}
4325
4327IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
4328 const auto *AI = cast<AllocInfo *>(Call.call());
4329 assert(AI->Versions.size() > Call.cloneNo());
4330 return (AllocationType)AI->Versions[Call.cloneNo()];
4331}
4332
4333void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
4334 FuncInfo CalleeFunc) {
4335 auto *CurF = getCalleeFunc(CallerCall.call());
4336 auto NewCalleeCloneNo = CalleeFunc.cloneNo();
4337 if (isMemProfClone(*CurF)) {
4338 // If we already assigned this callsite to call a specific non-default
4339 // clone (i.e. not the original function which is clone 0), ensure that we
4340 // aren't trying to now update it to call a different clone, which is
4341 // indicative of a bug in the graph or function assignment.
4342 auto CurCalleeCloneNo = getMemProfCloneNum(*CurF);
4343 if (CurCalleeCloneNo != NewCalleeCloneNo) {
4344 LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
4345 << CurCalleeCloneNo << " now " << NewCalleeCloneNo
4346 << "\n");
4347 MismatchedCloneAssignments++;
4348 }
4349 }
4350 if (NewCalleeCloneNo > 0)
4351 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
4352 OREGetter(CallerCall.call()->getFunction())
4353 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
4354 << ore::NV("Call", CallerCall.call()) << " in clone "
4355 << ore::NV("Caller", CallerCall.call()->getFunction())
4356 << " assigned to call function clone "
4357 << ore::NV("Callee", CalleeFunc.func()));
4358}
4359
4360void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
4361 FuncInfo CalleeFunc) {
4362 auto *CI = cast<CallsiteInfo *>(CallerCall.call());
4363 assert(CI &&
4364 "Caller cannot be an allocation which should not have profiled calls");
4365 assert(CI->Clones.size() > CallerCall.cloneNo());
4366 auto NewCalleeCloneNo = CalleeFunc.cloneNo();
4367 auto &CurCalleeCloneNo = CI->Clones[CallerCall.cloneNo()];
4368 // If we already assigned this callsite to call a specific non-default
4369 // clone (i.e. not the original function which is clone 0), ensure that we
4370 // aren't trying to now update it to call a different clone, which is
4371 // indicative of a bug in the graph or function assignment.
4372 if (CurCalleeCloneNo != 0 && CurCalleeCloneNo != NewCalleeCloneNo) {
4373 LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
4374 << CurCalleeCloneNo << " now " << NewCalleeCloneNo
4375 << "\n");
4376 MismatchedCloneAssignments++;
4377 }
4378 CurCalleeCloneNo = NewCalleeCloneNo;
4379}
4380
4381// Update the debug information attached to NewFunc to use the clone Name. Note
4382// this needs to be done for both any existing DISubprogram for the definition,
4383// as well as any separate declaration DISubprogram.
4385 assert(Name == NewFunc->getName());
4386 auto *SP = NewFunc->getSubprogram();
4387 if (!SP)
4388 return;
4389 auto *MDName = MDString::get(NewFunc->getParent()->getContext(), Name);
4390 SP->replaceLinkageName(MDName);
4391 DISubprogram *Decl = SP->getDeclaration();
4392 if (!Decl)
4393 return;
4394 TempDISubprogram NewDecl = Decl->clone();
4395 NewDecl->replaceLinkageName(MDName);
4396 SP->replaceDeclaration(MDNode::replaceWithUniqued(std::move(NewDecl)));
4397}
4398
4399CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
4400 Instruction *>::FuncInfo
4401ModuleCallsiteContextGraph::cloneFunctionForCallsite(
4402 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
4403 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4404 // Use existing LLVM facilities for cloning and obtaining Call in clone
4405 ValueToValueMapTy VMap;
4406 auto *NewFunc = CloneFunction(Func.func(), VMap);
4407 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
4408 assert(!Func.func()->getParent()->getFunction(Name));
4409 NewFunc->setName(Name);
4410 updateSubprogramLinkageName(NewFunc, Name);
4411 for (auto &Inst : CallsWithMetadataInFunc) {
4412 // This map always has the initial version in it.
4413 assert(Inst.cloneNo() == 0);
4414 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
4415 }
4416 OREGetter(Func.func())
4417 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
4418 << "created clone " << ore::NV("NewFunction", NewFunc));
4419 return {NewFunc, CloneNo};
4420}
4421
4422CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
4423 IndexCall>::FuncInfo
4424IndexCallsiteContextGraph::cloneFunctionForCallsite(
4425 FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap,
4426 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4427 // Check how many clones we have of Call (and therefore function).
4428 // The next clone number is the current size of versions array.
4429 // Confirm this matches the CloneNo provided by the caller, which is based on
4430 // the number of function clones we have.
4431 assert(CloneNo == (isa<AllocInfo *>(Call.call())
4432 ? cast<AllocInfo *>(Call.call())->Versions.size()
4433 : cast<CallsiteInfo *>(Call.call())->Clones.size()));
4434 // Walk all the instructions in this function. Create a new version for
4435 // each (by adding an entry to the Versions/Clones summary array), and copy
4436 // over the version being called for the function clone being cloned here.
4437 // Additionally, add an entry to the CallMap for the new function clone,
4438 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
4439 // to the new call clone.
4440 for (auto &Inst : CallsWithMetadataInFunc) {
4441 // This map always has the initial version in it.
4442 assert(Inst.cloneNo() == 0);
4443 if (auto *AI = dyn_cast<AllocInfo *>(Inst.call())) {
4444 assert(AI->Versions.size() == CloneNo);
4445 // We assign the allocation type later (in updateAllocationCall), just add
4446 // an entry for it here.
4447 AI->Versions.push_back(0);
4448 } else {
4449 auto *CI = cast<CallsiteInfo *>(Inst.call());
4450 assert(CI && CI->Clones.size() == CloneNo);
4451 // We assign the clone number later (in updateCall), just add an entry for
4452 // it here.
4453 CI->Clones.push_back(0);
4454 }
4455 CallMap[Inst] = {Inst.call(), CloneNo};
4456 }
4457 return {Func.func(), CloneNo};
4458}
4459
4460// We perform cloning for each allocation node separately. However, this
4461// sometimes results in a situation where the same node calls multiple
4462// clones of the same callee, created for different allocations. This
4463// causes issues when assigning functions to these clones, as each node can
4464// in reality only call a single callee clone.
4465//
4466// To address this, before assigning functions, merge callee clone nodes as
4467// needed using a post order traversal from the allocations. We attempt to
4468// use existing clones as the merge node when legal, and to share them
4469// among callers with the same properties (callers calling the same set of
4470// callee clone nodes for the same allocations).
4471//
4472// Without this fix, in some cases incorrect function assignment will lead
4473// to calling the wrong allocation clone.
4474template <typename DerivedCCG, typename FuncTy, typename CallTy>
4475void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones() {
4476 if (!MergeClones)
4477 return;
4478
4479 // Generate a map from context id to the associated allocation node for use
4480 // when merging clones.
4481 DenseMap<uint32_t, ContextNode *> ContextIdToAllocationNode;
4482 for (auto &Entry : AllocationCallToContextNodeMap) {
4483 auto *Node = Entry.second;
4484 for (auto Id : Node->getContextIds())
4485 ContextIdToAllocationNode[Id] = Node->getOrigNode();
4486 for (auto *Clone : Node->Clones) {
4487 for (auto Id : Clone->getContextIds())
4488 ContextIdToAllocationNode[Id] = Clone->getOrigNode();
4489 }
4490 }
4491
4492 // Post order traversal starting from allocations to ensure each callsite
4493 // calls a single clone of its callee. Callee nodes that are clones of each
4494 // other are merged (via new merge nodes if needed) to achieve this.
4495 DenseSet<const ContextNode *> Visited;
4496 for (auto &Entry : AllocationCallToContextNodeMap) {
4497 auto *Node = Entry.second;
4498
4499 mergeClones(Node, Visited, ContextIdToAllocationNode);
4500
4501 // Make a copy so the recursive post order traversal that may create new
4502 // clones doesn't mess up iteration. Note that the recursive traversal
4503 // itself does not call mergeClones on any of these nodes, which are all
4504 // (clones of) allocations.
4505 auto Clones = Node->Clones;
4506 for (auto *Clone : Clones)
4507 mergeClones(Clone, Visited, ContextIdToAllocationNode);
4508 }
4509
4510 if (DumpCCG) {
4511 dbgs() << "CCG after merging:\n";
4512 dbgs() << *this;
4513 }
4514 if (ExportToDot)
4515 exportToDot("aftermerge");
4516
4517 if (VerifyCCG) {
4518 check();
4519 }
4520}
4521
4522// Recursive helper for above mergeClones method.
4523template <typename DerivedCCG, typename FuncTy, typename CallTy>
4524void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones(
4525 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4526 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4527 auto Inserted = Visited.insert(Node);
4528 if (!Inserted.second)
4529 return;
4530
4531 // Iteratively perform merging on this node to handle new caller nodes created
4532 // during the recursive traversal. We could do something more elegant such as
4533 // maintain a worklist, but this is a simple approach that doesn't cause a
4534 // measureable compile time effect, as most nodes don't have many caller
4535 // edges to check.
4536 bool FoundUnvisited = true;
4537 unsigned Iters = 0;
4538 while (FoundUnvisited) {
4539 Iters++;
4540 FoundUnvisited = false;
4541 // Make a copy since the recursive call may move a caller edge to a new
4542 // callee, messing up the iterator.
4543 auto CallerEdges = Node->CallerEdges;
4544 for (auto CallerEdge : CallerEdges) {
4545 // Skip any caller edge moved onto a different callee during recursion.
4546 if (CallerEdge->Callee != Node)
4547 continue;
4548 // If we found an unvisited caller, note that we should check the caller
4549 // edges again as mergeClones may add or change caller nodes.
4550 if (DoMergeIteration && !Visited.contains(CallerEdge->Caller))
4551 FoundUnvisited = true;
4552 mergeClones(CallerEdge->Caller, Visited, ContextIdToAllocationNode);
4553 }
4554 }
4555
4556 TotalMergeInvokes++;
4557 TotalMergeIters += Iters;
4558 if (Iters > MaxMergeIters)
4559 MaxMergeIters = Iters;
4560
4561 // Merge for this node after we handle its callers.
4562 mergeNodeCalleeClones(Node, Visited, ContextIdToAllocationNode);
4563}
4564
4565template <typename DerivedCCG, typename FuncTy, typename CallTy>
4566void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeNodeCalleeClones(
4567 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4568 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4569 // Ignore Node if we moved all of its contexts to clones.
4570 if (Node->emptyContextIds())
4571 return;
4572
4573 // First identify groups of clones among Node's callee edges, by building
4574 // a map from each callee base node to the associated callee edges from Node.
4575 MapVector<ContextNode *, std::vector<std::shared_ptr<ContextEdge>>>
4576 OrigNodeToCloneEdges;
4577 for (const auto &E : Node->CalleeEdges) {
4578 auto *Callee = E->Callee;
4579 if (!Callee->CloneOf && Callee->Clones.empty())
4580 continue;
4581 ContextNode *Base = Callee->getOrigNode();
4582 OrigNodeToCloneEdges[Base].push_back(E);
4583 }
4584
4585 // Helper for callee edge sorting below. Return true if A's callee has fewer
4586 // caller edges than B, or if A is a clone and B is not, or if A's first
4587 // context id is smaller than B's.
4588 auto CalleeCallerEdgeLessThan = [](const std::shared_ptr<ContextEdge> &A,
4589 const std::shared_ptr<ContextEdge> &B) {
4590 if (A->Callee->CallerEdges.size() != B->Callee->CallerEdges.size())
4591 return A->Callee->CallerEdges.size() < B->Callee->CallerEdges.size();
4592 if (A->Callee->CloneOf && !B->Callee->CloneOf)
4593 return true;
4594 else if (!A->Callee->CloneOf && B->Callee->CloneOf)
4595 return false;
4596 // Use the first context id for each edge as a
4597 // tie-breaker.
4598 return *A->ContextIds.begin() < *B->ContextIds.begin();
4599 };
4600
4601 // Process each set of callee clones called by Node, performing the needed
4602 // merging.
4603 for (auto Entry : OrigNodeToCloneEdges) {
4604 // CalleeEdges is the set of edges from Node reaching callees that are
4605 // mutual clones of each other.
4606 auto &CalleeEdges = Entry.second;
4607 auto NumCalleeClones = CalleeEdges.size();
4608 // A single edge means there is no merging needed.
4609 if (NumCalleeClones == 1)
4610 continue;
4611 // Sort the CalleeEdges calling this group of clones in ascending order of
4612 // their caller edge counts, putting the original non-clone node first in
4613 // cases of a tie. This simplifies finding an existing node to use as the
4614 // merge node.
4615 llvm::stable_sort(CalleeEdges, CalleeCallerEdgeLessThan);
4616
4617 /// Find other callers of the given set of callee edges that can
4618 /// share the same callee merge node. See the comments at this method
4619 /// definition for details.
4620 DenseSet<ContextNode *> OtherCallersToShareMerge;
4621 findOtherCallersToShareMerge(Node, CalleeEdges, ContextIdToAllocationNode,
4622 OtherCallersToShareMerge);
4623
4624 // Now do the actual merging. Identify existing or create a new MergeNode
4625 // during the first iteration. Move each callee over, along with edges from
4626 // other callers we've determined above can share the same merge node.
4627 ContextNode *MergeNode = nullptr;
4628 DenseMap<ContextNode *, unsigned> CallerToMoveCount;
4629 for (auto CalleeEdge : CalleeEdges) {
4630 auto *OrigCallee = CalleeEdge->Callee;
4631 // If we don't have a MergeNode yet (only happens on the first iteration,
4632 // as a new one will be created when we go to move the first callee edge
4633 // over as needed), see if we can use this callee.
4634 if (!MergeNode) {
4635 // If there are no other callers, simply use this callee.
4636 if (CalleeEdge->Callee->CallerEdges.size() == 1) {
4637 MergeNode = OrigCallee;
4638 NonNewMergedNodes++;
4639 continue;
4640 }
4641 // Otherwise, if we have identified other caller nodes that can share
4642 // the merge node with Node, see if all of OrigCallee's callers are
4643 // going to share the same merge node. In that case we can use callee
4644 // (since all of its callers would move to the new merge node).
4645 if (!OtherCallersToShareMerge.empty()) {
4646 bool MoveAllCallerEdges = true;
4647 for (auto CalleeCallerE : OrigCallee->CallerEdges) {
4648 if (CalleeCallerE == CalleeEdge)
4649 continue;
4650 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller)) {
4651 MoveAllCallerEdges = false;
4652 break;
4653 }
4654 }
4655 // If we are going to move all callers over, we can use this callee as
4656 // the MergeNode.
4657 if (MoveAllCallerEdges) {
4658 MergeNode = OrigCallee;
4659 NonNewMergedNodes++;
4660 continue;
4661 }
4662 }
4663 }
4664 // Move this callee edge, creating a new merge node if necessary.
4665 if (MergeNode) {
4666 assert(MergeNode != OrigCallee);
4667 moveEdgeToExistingCalleeClone(CalleeEdge, MergeNode,
4668 /*NewClone*/ false);
4669 } else {
4670 MergeNode = moveEdgeToNewCalleeClone(CalleeEdge);
4671 NewMergedNodes++;
4672 }
4673 // Now move all identified edges from other callers over to the merge node
4674 // as well.
4675 if (!OtherCallersToShareMerge.empty()) {
4676 // Make and iterate over a copy of OrigCallee's caller edges because
4677 // some of these will be moved off of the OrigCallee and that would mess
4678 // up the iteration from OrigCallee.
4679 auto OrigCalleeCallerEdges = OrigCallee->CallerEdges;
4680 for (auto &CalleeCallerE : OrigCalleeCallerEdges) {
4681 if (CalleeCallerE == CalleeEdge)
4682 continue;
4683 if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller))
4684 continue;
4685 CallerToMoveCount[CalleeCallerE->Caller]++;
4686 moveEdgeToExistingCalleeClone(CalleeCallerE, MergeNode,
4687 /*NewClone*/ false);
4688 }
4689 }
4690 removeNoneTypeCalleeEdges(OrigCallee);
4691 removeNoneTypeCalleeEdges(MergeNode);
4692 }
4693 }
4694}
4695
4696// Look for other nodes that have edges to the same set of callee
4697// clones as the current Node. Those can share the eventual merge node
4698// (reducing cloning and binary size overhead) iff:
4699// - they have edges to the same set of callee clones
4700// - each callee edge reaches a subset of the same allocations as Node's
4701// corresponding edge to the same callee clone.
4702// The second requirement is to ensure that we don't undo any of the
4703// necessary cloning to distinguish contexts with different allocation
4704// behavior.
4705// FIXME: This is somewhat conservative, as we really just need to ensure
4706// that they don't reach the same allocations as contexts on edges from Node
4707// going to any of the *other* callee clones being merged. However, that
4708// requires more tracking and checking to get right.
4709template <typename DerivedCCG, typename FuncTy, typename CallTy>
4710void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
4711 findOtherCallersToShareMerge(
4712 ContextNode *Node,
4713 std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
4714 DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
4715 DenseSet<ContextNode *> &OtherCallersToShareMerge) {
4716 auto NumCalleeClones = CalleeEdges.size();
4717 // This map counts how many edges to the same callee clone exist for other
4718 // caller nodes of each callee clone.
4719 DenseMap<ContextNode *, unsigned> OtherCallersToSharedCalleeEdgeCount;
4720 // Counts the number of other caller nodes that have edges to all callee
4721 // clones that don't violate the allocation context checking.
4722 unsigned PossibleOtherCallerNodes = 0;
4723
4724 // We only need to look at other Caller nodes if the first callee edge has
4725 // multiple callers (recall they are sorted in ascending order above).
4726 if (CalleeEdges[0]->Callee->CallerEdges.size() < 2)
4727 return;
4728
4729 // For each callee edge:
4730 // - Collect the count of other caller nodes calling the same callees.
4731 // - Collect the alloc nodes reached by contexts on each callee edge.
4732 DenseMap<ContextEdge *, DenseSet<ContextNode *>> CalleeEdgeToAllocNodes;
4733 for (auto CalleeEdge : CalleeEdges) {
4734 assert(CalleeEdge->Callee->CallerEdges.size() > 1);
4735 // For each other caller of the same callee, increment the count of
4736 // edges reaching the same callee clone.
4737 for (auto CalleeCallerEdges : CalleeEdge->Callee->CallerEdges) {
4738 if (CalleeCallerEdges->Caller == Node) {
4739 assert(CalleeCallerEdges == CalleeEdge);
4740 continue;
4741 }
4742 OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller]++;
4743 // If this caller edge now reaches all of the same callee clones,
4744 // increment the count of candidate other caller nodes.
4745 if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller] ==
4746 NumCalleeClones)
4747 PossibleOtherCallerNodes++;
4748 }
4749 // Collect the alloc nodes reached by contexts on each callee edge, for
4750 // later analysis.
4751 for (auto Id : CalleeEdge->getContextIds()) {
4752 auto *Alloc = ContextIdToAllocationNode.lookup(Id);
4753 if (!Alloc) {
4754 // FIXME: unclear why this happens occasionally, presumably
4755 // imperfect graph updates possibly with recursion.
4756 MissingAllocForContextId++;
4757 continue;
4758 }
4759 CalleeEdgeToAllocNodes[CalleeEdge.get()].insert(Alloc);
4760 }
4761 }
4762
4763 // Now walk the callee edges again, and make sure that for each candidate
4764 // caller node all of its edges to the callees reach the same allocs (or
4765 // a subset) as those along the corresponding callee edge from Node.
4766 for (auto CalleeEdge : CalleeEdges) {
4767 assert(CalleeEdge->Callee->CallerEdges.size() > 1);
4768 // Stop if we do not have any (more) candidate other caller nodes.
4769 if (!PossibleOtherCallerNodes)
4770 break;
4771 auto &CurCalleeAllocNodes = CalleeEdgeToAllocNodes[CalleeEdge.get()];
4772 // Check each other caller of this callee clone.
4773 for (auto &CalleeCallerE : CalleeEdge->Callee->CallerEdges) {
4774 // Not interested in the callee edge from Node itself.
4775 if (CalleeCallerE == CalleeEdge)
4776 continue;
4777 // Skip any callers that didn't have callee edges to all the same
4778 // callee clones.
4779 if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] !=
4780 NumCalleeClones)
4781 continue;
4782 // Make sure that each context along edge from candidate caller node
4783 // reaches an allocation also reached by this callee edge from Node.
4784 for (auto Id : CalleeCallerE->getContextIds()) {
4785 auto *Alloc = ContextIdToAllocationNode.lookup(Id);
4786 if (!Alloc)
4787 continue;
4788 // If not, simply reset the map entry to 0 so caller is ignored, and
4789 // reduce the count of candidate other caller nodes.
4790 if (!CurCalleeAllocNodes.contains(Alloc)) {
4791 OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] = 0;
4792 PossibleOtherCallerNodes--;
4793 break;
4794 }
4795 }
4796 }
4797 }
4798
4799 if (!PossibleOtherCallerNodes)
4800 return;
4801
4802 // Build the set of other caller nodes that can use the same callee merge
4803 // node.
4804 for (auto &[OtherCaller, Count] : OtherCallersToSharedCalleeEdgeCount) {
4805 if (Count != NumCalleeClones)
4806 continue;
4807 OtherCallersToShareMerge.insert(OtherCaller);
4808 }
4809}
4810
4811// This method assigns cloned callsites to functions, cloning the functions as
4812// needed. The assignment is greedy and proceeds roughly as follows:
4813//
4814// For each function Func:
4815// For each call with graph Node having clones:
4816// Initialize ClonesWorklist to Node and its clones
4817// Initialize NodeCloneCount to 0
4818// While ClonesWorklist is not empty:
4819// Clone = pop front ClonesWorklist
4820// NodeCloneCount++
4821// If Func has been cloned less than NodeCloneCount times:
4822// If NodeCloneCount is 1:
4823// Assign Clone to original Func
4824// Continue
4825// Create a new function clone
4826// If other callers not assigned to call a function clone yet:
4827// Assign them to call new function clone
4828// Continue
4829// Assign any other caller calling the cloned version to new clone
4830//
4831// For each caller of Clone:
4832// If caller is assigned to call a specific function clone:
4833// If we cannot assign Clone to that function clone:
4834// Create new callsite Clone NewClone
4835// Add NewClone to ClonesWorklist
4836// Continue
4837// Assign Clone to existing caller's called function clone
4838// Else:
4839// If Clone not already assigned to a function clone:
4840// Assign to first function clone without assignment
4841// Assign caller to selected function clone
4842// For each call with graph Node having clones:
4843// If number func clones > number call's callsite Node clones:
4844// Record func CallInfo clones without Node clone in UnassignedCallClones
4845// For callsite Nodes in DFS order from allocations:
4846// If IsAllocation:
4847// Update allocation with alloc type
4848// Else:
4849// For Call, all MatchingCalls, and associated UnassignedCallClones:
4850// Update call to call recorded callee clone
4851//
4852template <typename DerivedCCG, typename FuncTy, typename CallTy>
4853bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
4854 bool Changed = false;
4855
4856 mergeClones();
4857
4858 // Keep track of the assignment of nodes (callsites) to function clones they
4859 // call.
4860 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
4861
4862 // Update caller node to call function version CalleeFunc, by recording the
4863 // assignment in CallsiteToCalleeFuncCloneMap.
4864 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
4865 const FuncInfo &CalleeFunc) {
4866 assert(Caller->hasCall());
4867 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
4868 };
4869
4870 // Information for a single clone of this Func.
4871 struct FuncCloneInfo {
4872 // The function clone.
4873 FuncInfo FuncClone;
4874 // Remappings of each call of interest (from original uncloned call to the
4875 // corresponding cloned call in this function clone).
4876 DenseMap<CallInfo, CallInfo> CallMap;
4877 };
4878
4879 // Map to keep track of information needed to update calls in function clones
4880 // when their corresponding callsite node was not itself cloned for that
4881 // function clone. Because of call context pruning (i.e. we only keep as much
4882 // caller information as needed to distinguish hot vs cold), we may not have
4883 // caller edges coming to each callsite node from all possible function
4884 // callers. A function clone may get created for other callsites in the
4885 // function for which there are caller edges that were not pruned. Any other
4886 // callsites in that function clone, which were not themselved cloned for
4887 // that function clone, should get updated the same way as the corresponding
4888 // callsite in the original function (which may call a clone of its callee).
4889 //
4890 // We build this map after completing function cloning for each function, so
4891 // that we can record the information from its call maps before they are
4892 // destructed. The map will be used as we update calls to update any still
4893 // unassigned call clones. Note that we may create new node clones as we clone
4894 // other functions, so later on we check which node clones were still not
4895 // created. To this end, the inner map is a map from function clone number to
4896 // the list of calls cloned for that function (can be more than one due to the
4897 // Node's MatchingCalls array).
4898 //
4899 // The alternative is creating new callsite clone nodes below as we clone the
4900 // function, but that is tricker to get right and likely more overhead.
4901 //
4902 // Inner map is a std::map so sorted by key (clone number), in order to get
4903 // ordered remarks in the full LTO case.
4904 DenseMap<const ContextNode *, std::map<unsigned, SmallVector<CallInfo, 0>>>
4905 UnassignedCallClones;
4906
4907 // Walk all functions for which we saw calls with memprof metadata, and handle
4908 // cloning for each of its calls.
4909 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
4910 FuncInfo OrigFunc(Func);
4911 // Map from each clone number of OrigFunc to information about that function
4912 // clone (the function clone FuncInfo and call remappings). The index into
4913 // the vector is the clone number, as function clones are created and
4914 // numbered sequentially.
4915 std::vector<FuncCloneInfo> FuncCloneInfos;
4916 for (auto &Call : CallsWithMetadata) {
4917 ContextNode *Node = getNodeForInst(Call);
4918 // Skip call if we do not have a node for it (all uses of its stack ids
4919 // were either on inlined chains or pruned from the MIBs), or if we did
4920 // not create any clones for it.
4921 if (!Node || Node->Clones.empty())
4922 continue;
4923 assert(Node->hasCall() &&
4924 "Not having a call should have prevented cloning");
4925
4926 // Track the assignment of function clones to clones of the current
4927 // callsite Node being handled.
4928 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
4929
4930 // Assign callsite version CallsiteClone to function version FuncClone,
4931 // and also assign (possibly cloned) Call to CallsiteClone.
4932 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
4933 CallInfo &Call,
4934 ContextNode *CallsiteClone,
4935 bool IsAlloc) {
4936 // Record the clone of callsite node assigned to this function clone.
4937 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
4938
4939 assert(FuncCloneInfos.size() > FuncClone.cloneNo());
4940 DenseMap<CallInfo, CallInfo> &CallMap =
4941 FuncCloneInfos[FuncClone.cloneNo()].CallMap;
4942 CallInfo CallClone(Call);
4943 if (auto It = CallMap.find(Call); It != CallMap.end())
4944 CallClone = It->second;
4945 CallsiteClone->setCall(CallClone);
4946 // Need to do the same for all matching calls.
4947 for (auto &MatchingCall : Node->MatchingCalls) {
4948 CallInfo CallClone(MatchingCall);
4949 if (auto It = CallMap.find(MatchingCall); It != CallMap.end())
4950 CallClone = It->second;
4951 // Updates the call in the list.
4952 MatchingCall = CallClone;
4953 }
4954 };
4955
4956 // Invokes moveEdgeToNewCalleeClone which creates a new clone, and then
4957 // performs the necessary fixups (removing none type edges, and
4958 // importantly, propagating any function call assignment of the original
4959 // node to the new clone).
4960 auto MoveEdgeToNewCalleeCloneAndSetUp =
4961 [&](const std::shared_ptr<ContextEdge> &Edge) {
4962 ContextNode *OrigCallee = Edge->Callee;
4963 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge);
4964 removeNoneTypeCalleeEdges(NewClone);
4965 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
4966 // If the original Callee was already assigned to call a specific
4967 // function version, make sure its new clone is assigned to call
4968 // that same function clone.
4969 if (CallsiteToCalleeFuncCloneMap.count(OrigCallee))
4970 RecordCalleeFuncOfCallsite(
4971 NewClone, CallsiteToCalleeFuncCloneMap[OrigCallee]);
4972 return NewClone;
4973 };
4974
4975 // Keep track of the clones of callsite Node that need to be assigned to
4976 // function clones. This list may be expanded in the loop body below if we
4977 // find additional cloning is required.
4978 std::deque<ContextNode *> ClonesWorklist;
4979 // Ignore original Node if we moved all of its contexts to clones.
4980 if (!Node->emptyContextIds())
4981 ClonesWorklist.push_back(Node);
4982 llvm::append_range(ClonesWorklist, Node->Clones);
4983
4984 // Now walk through all of the clones of this callsite Node that we need,
4985 // and determine the assignment to a corresponding clone of the current
4986 // function (creating new function clones as needed).
4987 unsigned NodeCloneCount = 0;
4988 while (!ClonesWorklist.empty()) {
4989 ContextNode *Clone = ClonesWorklist.front();
4990 ClonesWorklist.pop_front();
4991 NodeCloneCount++;
4992 if (VerifyNodes)
4994
4995 // Need to create a new function clone if we have more callsite clones
4996 // than existing function clones, which would have been assigned to an
4997 // earlier clone in the list (we assign callsite clones to function
4998 // clones greedily).
4999 if (FuncCloneInfos.size() < NodeCloneCount) {
5000 // If this is the first callsite copy, assign to original function.
5001 if (NodeCloneCount == 1) {
5002 // Since FuncCloneInfos is empty in this case, no clones have
5003 // been created for this function yet, and no callers should have
5004 // been assigned a function clone for this callee node yet.
5006 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
5007 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
5008 }));
5009 // Initialize with empty call map, assign Clone to original function
5010 // and its callers, and skip to the next clone.
5011 FuncCloneInfos.push_back(
5012 {OrigFunc, DenseMap<CallInfo, CallInfo>()});
5013 AssignCallsiteCloneToFuncClone(
5014 OrigFunc, Call, Clone,
5015 AllocationCallToContextNodeMap.count(Call));
5016 for (auto &CE : Clone->CallerEdges) {
5017 // Ignore any caller that does not have a recorded callsite Call.
5018 if (!CE->Caller->hasCall())
5019 continue;
5020 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
5021 }
5022 continue;
5023 }
5024
5025 // First locate which copy of OrigFunc to clone again. If a caller
5026 // of this callsite clone was already assigned to call a particular
5027 // function clone, we need to redirect all of those callers to the
5028 // new function clone, and update their other callees within this
5029 // function.
5030 FuncInfo PreviousAssignedFuncClone;
5031 auto EI = llvm::find_if(
5032 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
5033 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
5034 });
5035 bool CallerAssignedToCloneOfFunc = false;
5036 if (EI != Clone->CallerEdges.end()) {
5037 const std::shared_ptr<ContextEdge> &Edge = *EI;
5038 PreviousAssignedFuncClone =
5039 CallsiteToCalleeFuncCloneMap[Edge->Caller];
5040 CallerAssignedToCloneOfFunc = true;
5041 }
5042
5043 // Clone function and save it along with the CallInfo map created
5044 // during cloning in the FuncCloneInfos.
5045 DenseMap<CallInfo, CallInfo> NewCallMap;
5046 unsigned CloneNo = FuncCloneInfos.size();
5047 assert(CloneNo > 0 && "Clone 0 is the original function, which "
5048 "should already exist in the map");
5049 FuncInfo NewFuncClone = cloneFunctionForCallsite(
5050 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
5051 FuncCloneInfos.push_back({NewFuncClone, std::move(NewCallMap)});
5052 FunctionClonesAnalysis++;
5053 Changed = true;
5054
5055 // If no caller callsites were already assigned to a clone of this
5056 // function, we can simply assign this clone to the new func clone
5057 // and update all callers to it, then skip to the next clone.
5058 if (!CallerAssignedToCloneOfFunc) {
5059 AssignCallsiteCloneToFuncClone(
5060 NewFuncClone, Call, Clone,
5061 AllocationCallToContextNodeMap.count(Call));
5062 for (auto &CE : Clone->CallerEdges) {
5063 // Ignore any caller that does not have a recorded callsite Call.
5064 if (!CE->Caller->hasCall())
5065 continue;
5066 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
5067 }
5068 continue;
5069 }
5070
5071 // We may need to do additional node cloning in this case.
5072 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
5073 // that were previously assigned to call PreviousAssignedFuncClone,
5074 // to record that they now call NewFuncClone.
5075 // The none type edge removal may remove some of this Clone's caller
5076 // edges, if it is reached via another of its caller's callees.
5077 // Iterate over a copy and skip any that were removed.
5078 auto CallerEdges = Clone->CallerEdges;
5079 for (auto CE : CallerEdges) {
5080 // Skip any that have been removed on an earlier iteration.
5081 if (CE->isRemoved()) {
5082 assert(!is_contained(Clone->CallerEdges, CE));
5083 continue;
5084 }
5085 assert(CE);
5086 // Ignore any caller that does not have a recorded callsite Call.
5087 if (!CE->Caller->hasCall())
5088 continue;
5089
5090 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
5091 // We subsequently fall through to later handling that
5092 // will perform any additional cloning required for
5093 // callers that were calling other function clones.
5094 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
5095 PreviousAssignedFuncClone)
5096 continue;
5097
5098 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
5099
5100 // If we are cloning a function that was already assigned to some
5101 // callers, then essentially we are creating new callsite clones
5102 // of the other callsites in that function that are reached by those
5103 // callers. Clone the other callees of the current callsite's caller
5104 // that were already assigned to PreviousAssignedFuncClone
5105 // accordingly. This is important since we subsequently update the
5106 // calls from the nodes in the graph and their assignments to callee
5107 // functions recorded in CallsiteToCalleeFuncCloneMap.
5108 // The none type edge removal may remove some of this caller's
5109 // callee edges, if it is reached via another of its callees.
5110 // Iterate over a copy and skip any that were removed.
5111 auto CalleeEdges = CE->Caller->CalleeEdges;
5112 for (auto CalleeEdge : CalleeEdges) {
5113 // Skip any that have been removed on an earlier iteration when
5114 // cleaning up newly None type callee edges.
5115 if (CalleeEdge->isRemoved()) {
5116 assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge));
5117 continue;
5118 }
5119 assert(CalleeEdge);
5120 ContextNode *Callee = CalleeEdge->Callee;
5121 // Skip the current callsite, we are looking for other
5122 // callsites Caller calls, as well as any that does not have a
5123 // recorded callsite Call.
5124 if (Callee == Clone || !Callee->hasCall())
5125 continue;
5126 // Skip direct recursive calls. We don't need/want to clone the
5127 // caller node again, and this loop will not behave as expected if
5128 // we tried.
5129 if (Callee == CalleeEdge->Caller)
5130 continue;
5131 ContextNode *NewClone =
5132 MoveEdgeToNewCalleeCloneAndSetUp(CalleeEdge);
5133 // Moving the edge may have resulted in some none type
5134 // callee edges on the original Callee.
5135 removeNoneTypeCalleeEdges(Callee);
5136 // Update NewClone with the new Call clone of this callsite's Call
5137 // created for the new function clone created earlier.
5138 // Recall that we have already ensured when building the graph
5139 // that each caller can only call callsites within the same
5140 // function, so we are guaranteed that Callee Call is in the
5141 // current OrigFunc.
5142 // CallMap is set up as indexed by original Call at clone 0.
5143 CallInfo OrigCall(Callee->getOrigNode()->Call);
5144 OrigCall.setCloneNo(0);
5145 DenseMap<CallInfo, CallInfo> &CallMap =
5146 FuncCloneInfos[NewFuncClone.cloneNo()].CallMap;
5147 assert(CallMap.count(OrigCall));
5148 CallInfo NewCall(CallMap[OrigCall]);
5149 assert(NewCall);
5150 NewClone->setCall(NewCall);
5151 // Need to do the same for all matching calls.
5152 for (auto &MatchingCall : NewClone->MatchingCalls) {
5153 CallInfo OrigMatchingCall(MatchingCall);
5154 OrigMatchingCall.setCloneNo(0);
5155 assert(CallMap.count(OrigMatchingCall));
5156 CallInfo NewCall(CallMap[OrigMatchingCall]);
5157 assert(NewCall);
5158 // Updates the call in the list.
5159 MatchingCall = NewCall;
5160 }
5161 }
5162 }
5163 // Fall through to handling below to perform the recording of the
5164 // function for this callsite clone. This enables handling of cases
5165 // where the callers were assigned to different clones of a function.
5166 }
5167
5168 auto FindFirstAvailFuncClone = [&]() {
5169 // Find first function in FuncCloneInfos without an assigned
5170 // clone of this callsite Node. We should always have one
5171 // available at this point due to the earlier cloning when the
5172 // FuncCloneInfos size was smaller than the clone number.
5173 for (auto &CF : FuncCloneInfos) {
5174 if (!FuncCloneToCurNodeCloneMap.count(CF.FuncClone))
5175 return CF.FuncClone;
5176 }
5178 "Expected an available func clone for this callsite clone");
5179 };
5180
5181 // See if we can use existing function clone. Walk through
5182 // all caller edges to see if any have already been assigned to
5183 // a clone of this callsite's function. If we can use it, do so. If not,
5184 // because that function clone is already assigned to a different clone
5185 // of this callsite, then we need to clone again.
5186 // Basically, this checking is needed to handle the case where different
5187 // caller functions/callsites may need versions of this function
5188 // containing different mixes of callsite clones across the different
5189 // callsites within the function. If that happens, we need to create
5190 // additional function clones to handle the various combinations.
5191 //
5192 // Keep track of any new clones of this callsite created by the
5193 // following loop, as well as any existing clone that we decided to
5194 // assign this clone to.
5195 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
5196 FuncInfo FuncCloneAssignedToCurCallsiteClone;
5197 // Iterate over a copy of Clone's caller edges, since we may need to
5198 // remove edges in the moveEdgeTo* methods, and this simplifies the
5199 // handling and makes it less error-prone.
5200 auto CloneCallerEdges = Clone->CallerEdges;
5201 for (auto &Edge : CloneCallerEdges) {
5202 // Skip removed edges (due to direct recursive edges updated when
5203 // updating callee edges when moving an edge and subsequently
5204 // removed by call to removeNoneTypeCalleeEdges on the Clone).
5205 if (Edge->isRemoved())
5206 continue;
5207 // Ignore any caller that does not have a recorded callsite Call.
5208 if (!Edge->Caller->hasCall())
5209 continue;
5210 // If this caller already assigned to call a version of OrigFunc, need
5211 // to ensure we can assign this callsite clone to that function clone.
5212 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
5213 FuncInfo FuncCloneCalledByCaller =
5214 CallsiteToCalleeFuncCloneMap[Edge->Caller];
5215 // First we need to confirm that this function clone is available
5216 // for use by this callsite node clone.
5217 //
5218 // While FuncCloneToCurNodeCloneMap is built only for this Node and
5219 // its callsite clones, one of those callsite clones X could have
5220 // been assigned to the same function clone called by Edge's caller
5221 // - if Edge's caller calls another callsite within Node's original
5222 // function, and that callsite has another caller reaching clone X.
5223 // We need to clone Node again in this case.
5224 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
5225 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
5226 Clone) ||
5227 // Detect when we have multiple callers of this callsite that
5228 // have already been assigned to specific, and different, clones
5229 // of OrigFunc (due to other unrelated callsites in Func they
5230 // reach via call contexts). Is this Clone of callsite Node
5231 // assigned to a different clone of OrigFunc? If so, clone Node
5232 // again.
5233 (FuncCloneAssignedToCurCallsiteClone &&
5234 FuncCloneAssignedToCurCallsiteClone !=
5235 FuncCloneCalledByCaller)) {
5236 // We need to use a different newly created callsite clone, in
5237 // order to assign it to another new function clone on a
5238 // subsequent iteration over the Clones array (adjusted below).
5239 // Note we specifically do not reset the
5240 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
5241 // when this new clone is processed later we know which version of
5242 // the function to copy (so that other callsite clones we have
5243 // assigned to that function clone are properly cloned over). See
5244 // comments in the function cloning handling earlier.
5245
5246 // Check if we already have cloned this callsite again while
5247 // walking through caller edges, for a caller calling the same
5248 // function clone. If so, we can move this edge to that new clone
5249 // rather than creating yet another new clone.
5250 if (FuncCloneToNewCallsiteCloneMap.count(
5251 FuncCloneCalledByCaller)) {
5252 ContextNode *NewClone =
5253 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
5254 moveEdgeToExistingCalleeClone(Edge, NewClone);
5255 // Cleanup any none type edges cloned over.
5256 removeNoneTypeCalleeEdges(NewClone);
5257 } else {
5258 // Create a new callsite clone.
5259 ContextNode *NewClone = MoveEdgeToNewCalleeCloneAndSetUp(Edge);
5260 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
5261 NewClone;
5262 // Add to list of clones and process later.
5263 ClonesWorklist.push_back(NewClone);
5264 }
5265 // Moving the caller edge may have resulted in some none type
5266 // callee edges.
5267 removeNoneTypeCalleeEdges(Clone);
5268 // We will handle the newly created callsite clone in a subsequent
5269 // iteration over this Node's Clones.
5270 continue;
5271 }
5272
5273 // Otherwise, we can use the function clone already assigned to this
5274 // caller.
5275 if (!FuncCloneAssignedToCurCallsiteClone) {
5276 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
5277 // Assign Clone to FuncCloneCalledByCaller
5278 AssignCallsiteCloneToFuncClone(
5279 FuncCloneCalledByCaller, Call, Clone,
5280 AllocationCallToContextNodeMap.count(Call));
5281 } else
5282 // Don't need to do anything - callsite is already calling this
5283 // function clone.
5284 assert(FuncCloneAssignedToCurCallsiteClone ==
5285 FuncCloneCalledByCaller);
5286
5287 } else {
5288 // We have not already assigned this caller to a version of
5289 // OrigFunc. Do the assignment now.
5290
5291 // First check if we have already assigned this callsite clone to a
5292 // clone of OrigFunc for another caller during this iteration over
5293 // its caller edges.
5294 if (!FuncCloneAssignedToCurCallsiteClone) {
5295 FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
5296 assert(FuncCloneAssignedToCurCallsiteClone);
5297 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
5298 AssignCallsiteCloneToFuncClone(
5299 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
5300 AllocationCallToContextNodeMap.count(Call));
5301 } else
5302 assert(FuncCloneToCurNodeCloneMap
5303 [FuncCloneAssignedToCurCallsiteClone] == Clone);
5304 // Update callers to record function version called.
5305 RecordCalleeFuncOfCallsite(Edge->Caller,
5306 FuncCloneAssignedToCurCallsiteClone);
5307 }
5308 }
5309 // If we didn't assign a function clone to this callsite clone yet, e.g.
5310 // none of its callers has a non-null call, do the assignment here.
5311 // We want to ensure that every callsite clone is assigned to some
5312 // function clone, so that the call updates below work as expected.
5313 // In particular if this is the original callsite, we want to ensure it
5314 // is assigned to the original function, otherwise the original function
5315 // will appear available for assignment to other callsite clones,
5316 // leading to unintended effects. For one, the unknown and not updated
5317 // callers will call into cloned paths leading to the wrong hints,
5318 // because they still call the original function (clone 0). Also,
5319 // because all callsites start out as being clone 0 by default, we can't
5320 // easily distinguish between callsites explicitly assigned to clone 0
5321 // vs those never assigned, which can lead to multiple updates of the
5322 // calls when invoking updateCall below, with mismatched clone values.
5323 // TODO: Add a flag to the callsite nodes or some other mechanism to
5324 // better distinguish and identify callsite clones that are not getting
5325 // assigned to function clones as expected.
5326 if (!FuncCloneAssignedToCurCallsiteClone) {
5327 FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
5328 assert(FuncCloneAssignedToCurCallsiteClone &&
5329 "No available func clone for this callsite clone");
5330 AssignCallsiteCloneToFuncClone(
5331 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
5332 /*IsAlloc=*/AllocationCallToContextNodeMap.contains(Call));
5333 }
5334 }
5335 if (VerifyCCG) {
5337 for (const auto &PE : Node->CalleeEdges)
5339 for (const auto &CE : Node->CallerEdges)
5341 for (auto *Clone : Node->Clones) {
5343 for (const auto &PE : Clone->CalleeEdges)
5345 for (const auto &CE : Clone->CallerEdges)
5347 }
5348 }
5349 }
5350
5351 if (FuncCloneInfos.size() < 2)
5352 continue;
5353
5354 // In this case there is more than just the original function copy.
5355 // Record call clones of any callsite nodes in the function that did not
5356 // themselves get cloned for all of the function clones.
5357 for (auto &Call : CallsWithMetadata) {
5358 ContextNode *Node = getNodeForInst(Call);
5359 if (!Node || !Node->hasCall() || Node->emptyContextIds())
5360 continue;
5361 // If Node has enough clones already to cover all function clones, we can
5362 // skip it. Need to add one for the original copy.
5363 // Use >= in case there were clones that were skipped due to having empty
5364 // context ids
5365 if (Node->Clones.size() + 1 >= FuncCloneInfos.size())
5366 continue;
5367 // First collect all function clones we cloned this callsite node for.
5368 // They may not be sequential due to empty clones e.g.
5369 DenseSet<unsigned> NodeCallClones;
5370 for (auto *C : Node->Clones)
5371 NodeCallClones.insert(C->Call.cloneNo());
5372 unsigned I = 0;
5373 // Now check all the function clones.
5374 for (auto &FC : FuncCloneInfos) {
5375 // Function clones should be sequential.
5376 assert(FC.FuncClone.cloneNo() == I);
5377 // Skip the first clone which got the original call.
5378 // Also skip any other clones created for this Node.
5379 if (++I == 1 || NodeCallClones.contains(I)) {
5380 continue;
5381 }
5382 // Record the call clones created for this callsite in this function
5383 // clone.
5384 auto &CallVector = UnassignedCallClones[Node][I];
5385 DenseMap<CallInfo, CallInfo> &CallMap = FC.CallMap;
5386 if (auto It = CallMap.find(Call); It != CallMap.end()) {
5387 CallInfo CallClone = It->second;
5388 CallVector.push_back(CallClone);
5389 } else {
5390 // All but the original clone (skipped earlier) should have an entry
5391 // for all calls.
5392 assert(false && "Expected to find call in CallMap");
5393 }
5394 // Need to do the same for all matching calls.
5395 for (auto &MatchingCall : Node->MatchingCalls) {
5396 if (auto It = CallMap.find(MatchingCall); It != CallMap.end()) {
5397 CallInfo CallClone = It->second;
5398 CallVector.push_back(CallClone);
5399 } else {
5400 // All but the original clone (skipped earlier) should have an entry
5401 // for all calls.
5402 assert(false && "Expected to find call in CallMap");
5403 }
5404 }
5405 }
5406 }
5407 }
5408
5409 uint8_t BothTypes =
5410 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
5411
5412 auto UpdateCalls = [&](ContextNode *Node,
5413 DenseSet<const ContextNode *> &Visited,
5414 auto &&UpdateCalls) {
5415 auto Inserted = Visited.insert(Node);
5416 if (!Inserted.second)
5417 return;
5418
5419 for (auto *Clone : Node->Clones)
5420 UpdateCalls(Clone, Visited, UpdateCalls);
5421
5422 for (auto &Edge : Node->CallerEdges)
5423 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
5424
5425 // Skip if either no call to update, or if we ended up with no context ids
5426 // (we moved all edges onto other clones).
5427 if (!Node->hasCall() || Node->emptyContextIds())
5428 return;
5429
5430 if (Node->IsAllocation) {
5431 auto AT = allocTypeToUse(Node->AllocTypes);
5432 // If the allocation type is ambiguous, and more aggressive hinting
5433 // has been enabled via the MinClonedColdBytePercent flag, see if this
5434 // allocation should be hinted cold anyway because its fraction cold bytes
5435 // allocated is at least the given threshold.
5436 if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
5437 !ContextIdToContextSizeInfos.empty()) {
5438 uint64_t TotalCold = 0;
5439 uint64_t Total = 0;
5440 for (auto Id : Node->getContextIds()) {
5441 auto TypeI = ContextIdToAllocationType.find(Id);
5442 assert(TypeI != ContextIdToAllocationType.end());
5443 auto CSI = ContextIdToContextSizeInfos.find(Id);
5444 if (CSI != ContextIdToContextSizeInfos.end()) {
5445 for (auto &Info : CSI->second) {
5446 Total += Info.TotalSize;
5447 if (TypeI->second == AllocationType::Cold)
5448 TotalCold += Info.TotalSize;
5449 }
5450 }
5451 }
5452 if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
5453 AT = AllocationType::Cold;
5454 }
5455 updateAllocationCall(Node->Call, AT);
5456 assert(Node->MatchingCalls.empty());
5457 return;
5458 }
5459
5460 if (!CallsiteToCalleeFuncCloneMap.count(Node))
5461 return;
5462
5463 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
5464 updateCall(Node->Call, CalleeFunc);
5465 // Update all the matching calls as well.
5466 for (auto &Call : Node->MatchingCalls)
5467 updateCall(Call, CalleeFunc);
5468
5469 // Now update all calls recorded earlier that are still in function clones
5470 // which don't have a clone of this callsite node.
5471 if (!UnassignedCallClones.contains(Node))
5472 return;
5473 DenseSet<unsigned> NodeCallClones;
5474 for (auto *C : Node->Clones)
5475 NodeCallClones.insert(C->Call.cloneNo());
5476 // Note that we already confirmed Node is in this map a few lines above.
5477 auto &ClonedCalls = UnassignedCallClones[Node];
5478 for (auto &[CloneNo, CallVector] : ClonedCalls) {
5479 // Should start at 1 as we never create an entry for original node.
5480 assert(CloneNo > 0);
5481 // If we subsequently created a clone, skip this one.
5482 if (NodeCallClones.contains(CloneNo))
5483 continue;
5484 // Use the original Node's CalleeFunc.
5485 for (auto &Call : CallVector)
5486 updateCall(Call, CalleeFunc);
5487 }
5488 };
5489
5490 // Performs DFS traversal starting from allocation nodes to update calls to
5491 // reflect cloning decisions recorded earlier. For regular LTO this will
5492 // update the actual calls in the IR to call the appropriate function clone
5493 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
5494 // are recorded in the summary entries.
5495 DenseSet<const ContextNode *> Visited;
5496 for (auto &Entry : AllocationCallToContextNodeMap)
5497 UpdateCalls(Entry.second, Visited, UpdateCalls);
5498
5499 return Changed;
5500}
5501
5502// Compute a SHA1 hash of the callsite and alloc version information of clone I
5503// in the summary, to use in detection of duplicate clones.
5505 SHA1 Hasher;
5506 // Update hash with any callsites that call non-default (non-zero) callee
5507 // versions.
5508 for (auto &SN : FS->callsites()) {
5509 // In theory all callsites and allocs in this function should have the same
5510 // number of clone entries, but handle any discrepancies gracefully below
5511 // for NDEBUG builds.
5512 assert(
5513 SN.Clones.size() > I &&
5514 "Callsite summary has fewer entries than other summaries in function");
5515 if (SN.Clones.size() <= I || !SN.Clones[I])
5516 continue;
5517 uint8_t Data[sizeof(SN.Clones[I])];
5518 support::endian::write32le(Data, SN.Clones[I]);
5519 Hasher.update(Data);
5520 }
5521 // Update hash with any allocs that have non-default (non-None) hints.
5522 for (auto &AN : FS->allocs()) {
5523 // In theory all callsites and allocs in this function should have the same
5524 // number of clone entries, but handle any discrepancies gracefully below
5525 // for NDEBUG builds.
5526 assert(AN.Versions.size() > I &&
5527 "Alloc summary has fewer entries than other summaries in function");
5528 if (AN.Versions.size() <= I ||
5529 (AllocationType)AN.Versions[I] == AllocationType::None)
5530 continue;
5531 Hasher.update(ArrayRef<uint8_t>(&AN.Versions[I], 1));
5532 }
5533 return support::endian::read64le(Hasher.result().data());
5534}
5535
5537 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
5539 &FuncToAliasMap,
5540 FunctionSummary *FS) {
5541 auto TakeDeclNameAndReplace = [](GlobalValue *DeclGV, GlobalValue *NewGV) {
5542 // We might have created this when adjusting callsite in another
5543 // function. It should be a declaration.
5544 assert(DeclGV->isDeclaration());
5545 NewGV->takeName(DeclGV);
5546 DeclGV->replaceAllUsesWith(NewGV);
5547 DeclGV->eraseFromParent();
5548 };
5549
5550 // Handle aliases to this function, and create analogous alias clones to the
5551 // provided clone of this function.
5552 auto CloneFuncAliases = [&](Function *NewF, unsigned I) {
5553 if (!FuncToAliasMap.count(&F))
5554 return;
5555 for (auto *A : FuncToAliasMap[&F]) {
5556 std::string AliasName = getMemProfFuncName(A->getName(), I);
5557 auto *PrevA = M.getNamedAlias(AliasName);
5558 auto *NewA = GlobalAlias::create(A->getValueType(),
5559 A->getType()->getPointerAddressSpace(),
5560 A->getLinkage(), AliasName, NewF);
5561 NewA->copyAttributesFrom(A);
5562 if (PrevA)
5563 TakeDeclNameAndReplace(PrevA, NewA);
5564 }
5565 };
5566
5567 // The first "clone" is the original copy, we should only call this if we
5568 // needed to create new clones.
5569 assert(NumClones > 1);
5571 VMaps.reserve(NumClones - 1);
5572 FunctionsClonedThinBackend++;
5573
5574 // Map of hash of callsite/alloc versions to the instantiated function clone
5575 // (possibly the original) implementing those calls. Used to avoid
5576 // instantiating duplicate function clones.
5577 // FIXME: Ideally the thin link would not generate such duplicate clones to
5578 // start with, but right now it happens due to phase ordering in the function
5579 // assignment and possible new clones that produces. We simply make each
5580 // duplicate an alias to the matching instantiated clone recorded in the map
5581 // (except for available_externally which are made declarations as they would
5582 // be aliases in the prevailing module, and available_externally aliases are
5583 // not well supported right now).
5585
5586 // Save the hash of the original function version.
5587 HashToFunc[ComputeHash(FS, 0)] = &F;
5588
5589 for (unsigned I = 1; I < NumClones; I++) {
5590 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
5591 std::string Name = getMemProfFuncName(F.getName(), I);
5592 auto Hash = ComputeHash(FS, I);
5593 // If this clone would duplicate a previously seen clone, don't generate the
5594 // duplicate clone body, just make an alias to satisfy any (potentially
5595 // cross-module) references.
5596 if (HashToFunc.contains(Hash)) {
5597 FunctionCloneDuplicatesThinBackend++;
5598 auto *Func = HashToFunc[Hash];
5599 if (Func->hasAvailableExternallyLinkage()) {
5600 // Skip these as EliminateAvailableExternallyPass does not handle
5601 // available_externally aliases correctly and we end up with an
5602 // available_externally alias to a declaration. Just create a
5603 // declaration for now as we know we will have a definition in another
5604 // module.
5605 auto Decl = M.getOrInsertFunction(Name, Func->getFunctionType());
5606 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5607 << "created clone decl " << ore::NV("Decl", Decl.getCallee()));
5608 continue;
5609 }
5610 auto *PrevF = M.getFunction(Name);
5611 auto *Alias = GlobalAlias::create(Name, Func);
5612 if (PrevF)
5613 TakeDeclNameAndReplace(PrevF, Alias);
5614 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5615 << "created clone alias " << ore::NV("Alias", Alias));
5616
5617 // Now handle aliases to this function, and clone those as well.
5618 CloneFuncAliases(Func, I);
5619 continue;
5620 }
5621 auto *NewF = CloneFunction(&F, *VMaps.back());
5622 HashToFunc[Hash] = NewF;
5623 FunctionClonesThinBackend++;
5624 // Strip memprof and callsite metadata from clone as they are no longer
5625 // needed.
5626 for (auto &BB : *NewF) {
5627 for (auto &Inst : BB) {
5628 Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
5629 Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
5630 }
5631 }
5632 auto *PrevF = M.getFunction(Name);
5633 if (PrevF)
5634 TakeDeclNameAndReplace(PrevF, NewF);
5635 else
5636 NewF->setName(Name);
5637 updateSubprogramLinkageName(NewF, Name);
5638 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5639 << "created clone " << ore::NV("NewFunction", NewF));
5640
5641 // Now handle aliases to this function, and clone those as well.
5642 CloneFuncAliases(NewF, I);
5643 }
5644 return VMaps;
5645}
5646
5647// Locate the summary for F. This is complicated by the fact that it might
5648// have been internalized or promoted.
5650 const ModuleSummaryIndex *ImportSummary,
5651 const Function *CallingFunc = nullptr) {
5652 // FIXME: Ideally we would retain the original GUID in some fashion on the
5653 // function (e.g. as metadata), but for now do our best to locate the
5654 // summary without that information.
5655 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
5656 if (!TheFnVI)
5657 // See if theFn was internalized, by checking index directly with
5658 // original name (this avoids the name adjustment done by getGUID() for
5659 // internal symbols).
5660 TheFnVI = ImportSummary->getValueInfo(
5662 if (TheFnVI)
5663 return TheFnVI;
5664 // Now query with the original name before any promotion was performed.
5665 StringRef OrigName =
5667 // When this pass is enabled, we always add thinlto_src_file provenance
5668 // metadata to imported function definitions, which allows us to recreate the
5669 // original internal symbol's GUID.
5670 auto SrcFileMD = F.getMetadata("thinlto_src_file");
5671 // If this is a call to an imported/promoted local for which we didn't import
5672 // the definition, the metadata will not exist on the declaration. However,
5673 // since we are doing this early, before any inlining in the LTO backend, we
5674 // can simply look at the metadata on the calling function which must have
5675 // been from the same module if F was an internal symbol originally.
5676 if (!SrcFileMD && F.isDeclaration()) {
5677 // We would only call this for a declaration for a direct callsite, in which
5678 // case the caller would have provided the calling function pointer.
5679 assert(CallingFunc);
5680 SrcFileMD = CallingFunc->getMetadata("thinlto_src_file");
5681 // If this is a promoted local (OrigName != F.getName()), since this is a
5682 // declaration, it must be imported from a different module and therefore we
5683 // should always find the metadata on its calling function. Any call to a
5684 // promoted local that came from this module should still be a definition.
5685 assert(SrcFileMD || OrigName == F.getName());
5686 }
5687 StringRef SrcFile = M.getSourceFileName();
5688 if (SrcFileMD)
5689 SrcFile = dyn_cast<MDString>(SrcFileMD->getOperand(0))->getString();
5690 std::string OrigId = GlobalValue::getGlobalIdentifier(
5691 OrigName, GlobalValue::InternalLinkage, SrcFile);
5692 TheFnVI = ImportSummary->getValueInfo(
5694 // Internal func in original module may have gotten a numbered suffix if we
5695 // imported an external function with the same name. This happens
5696 // automatically during IR linking for naming conflicts. It would have to
5697 // still be internal in that case (otherwise it would have been renamed on
5698 // promotion in which case we wouldn't have a naming conflict).
5699 if (!TheFnVI && OrigName == F.getName() && F.hasLocalLinkage() &&
5700 F.getName().contains('.')) {
5701 OrigName = F.getName().rsplit('.').first;
5703 OrigName, GlobalValue::InternalLinkage, SrcFile);
5704 TheFnVI = ImportSummary->getValueInfo(
5706 }
5707 // The only way we may not have a VI is if this is a declaration created for
5708 // an imported reference. For distributed ThinLTO we may not have a VI for
5709 // such declarations in the distributed summary.
5710 assert(TheFnVI || F.isDeclaration());
5711 return TheFnVI;
5712}
5713
5714bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo(
5715 Module &M) {
5716 ICallAnalysis = std::make_unique<ICallPromotionAnalysis>();
5717 Symtab = std::make_unique<InstrProfSymtab>();
5718 // Don't add canonical names, to avoid multiple functions to the symtab
5719 // when they both have the same root name with "." suffixes stripped.
5720 // If we pick the wrong one then this could lead to incorrect ICP and calling
5721 // a memprof clone that we don't actually create (resulting in linker unsats).
5722 // What this means is that the GUID of the function (or its PGOFuncName
5723 // metadata) *must* match that in the VP metadata to allow promotion.
5724 // In practice this should not be a limitation, since local functions should
5725 // have PGOFuncName metadata and global function names shouldn't need any
5726 // special handling (they should not get the ".llvm.*" suffix that the
5727 // canonicalization handling is attempting to strip).
5728 if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) {
5729 std::string SymtabFailure = toString(std::move(E));
5730 M.getContext().emitError("Failed to create symtab: " + SymtabFailure);
5731 return false;
5732 }
5733 return true;
5734}
5735
5736#ifndef NDEBUG
5737// Sanity check that the MIB stack ids match between the summary and
5738// instruction metadata.
5740 const AllocInfo &AllocNode, const MDNode *MemProfMD,
5741 const CallStack<MDNode, MDNode::op_iterator> &CallsiteContext,
5742 const ModuleSummaryIndex *ImportSummary) {
5743 auto MIBIter = AllocNode.MIBs.begin();
5744 for (auto &MDOp : MemProfMD->operands()) {
5745 assert(MIBIter != AllocNode.MIBs.end());
5746 auto StackIdIndexIter = MIBIter->StackIdIndices.begin();
5747 auto *MIBMD = cast<const MDNode>(MDOp);
5748 MDNode *StackMDNode = getMIBStackNode(MIBMD);
5749 assert(StackMDNode);
5750 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
5751 auto ContextIterBegin =
5752 StackContext.beginAfterSharedPrefix(CallsiteContext);
5753 // Skip the checking on the first iteration.
5754 uint64_t LastStackContextId =
5755 (ContextIterBegin != StackContext.end() && *ContextIterBegin == 0) ? 1
5756 : 0;
5757 for (auto ContextIter = ContextIterBegin; ContextIter != StackContext.end();
5758 ++ContextIter) {
5759 // If this is a direct recursion, simply skip the duplicate
5760 // entries, to be consistent with how the summary ids were
5761 // generated during ModuleSummaryAnalysis.
5762 if (LastStackContextId == *ContextIter)
5763 continue;
5764 LastStackContextId = *ContextIter;
5765 assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
5766 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
5767 *ContextIter);
5768 StackIdIndexIter++;
5769 }
5770 MIBIter++;
5771 }
5772}
5773#endif
5774
5775bool MemProfContextDisambiguation::applyImport(Module &M) {
5776 assert(ImportSummary);
5777 bool Changed = false;
5778
5779 // We also need to clone any aliases that reference cloned functions, because
5780 // the modified callsites may invoke via the alias. Keep track of the aliases
5781 // for each function.
5782 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
5783 FuncToAliasMap;
5784 for (auto &A : M.aliases()) {
5785 auto *Aliasee = A.getAliaseeObject();
5786 if (auto *F = dyn_cast<Function>(Aliasee))
5787 FuncToAliasMap[F].insert(&A);
5788 }
5789
5790 if (!initializeIndirectCallPromotionInfo(M))
5791 return false;
5792
5793 for (auto &F : M) {
5794 if (F.isDeclaration() || isMemProfClone(F))
5795 continue;
5796
5797 OptimizationRemarkEmitter ORE(&F);
5798
5800 bool ClonesCreated = false;
5801 unsigned NumClonesCreated = 0;
5802 auto CloneFuncIfNeeded = [&](unsigned NumClones, FunctionSummary *FS) {
5803 // We should at least have version 0 which is the original copy.
5804 assert(NumClones > 0);
5805 // If only one copy needed use original.
5806 if (NumClones == 1)
5807 return;
5808 // If we already performed cloning of this function, confirm that the
5809 // requested number of clones matches (the thin link should ensure the
5810 // number of clones for each constituent callsite is consistent within
5811 // each function), before returning.
5812 if (ClonesCreated) {
5813 assert(NumClonesCreated == NumClones);
5814 return;
5815 }
5816 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap, FS);
5817 // The first "clone" is the original copy, which doesn't have a VMap.
5818 assert(VMaps.size() == NumClones - 1);
5819 Changed = true;
5820 ClonesCreated = true;
5821 NumClonesCreated = NumClones;
5822 };
5823
5824 auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
5825 Function *CalledFunction, FunctionSummary *FS) {
5826 // Perform cloning if not yet done.
5827 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size(), FS);
5828
5829 assert(!isMemProfClone(*CalledFunction));
5830
5831 // Because we update the cloned calls by calling setCalledOperand (see
5832 // comment below), out of an abundance of caution make sure the called
5833 // function was actually the called operand (or its aliasee). We also
5834 // strip pointer casts when looking for calls (to match behavior during
5835 // summary generation), however, with opaque pointers in theory this
5836 // should not be an issue. Note we still clone the current function
5837 // (containing this call) above, as that could be needed for its callers.
5838 auto *GA = dyn_cast_or_null<GlobalAlias>(CB->getCalledOperand());
5839 if (CalledFunction != CB->getCalledOperand() &&
5840 (!GA || CalledFunction != GA->getAliaseeObject())) {
5841 SkippedCallsCloning++;
5842 return;
5843 }
5844 // Update the calls per the summary info.
5845 // Save orig name since it gets updated in the first iteration
5846 // below.
5847 auto CalleeOrigName = CalledFunction->getName();
5848 for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
5849 // If the VMap is empty, this clone was a duplicate of another and was
5850 // created as an alias or a declaration.
5851 if (J > 0 && VMaps[J - 1]->empty())
5852 continue;
5853 // Do nothing if this version calls the original version of its
5854 // callee.
5855 if (!StackNode.Clones[J])
5856 continue;
5857 auto NewF = M.getOrInsertFunction(
5858 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
5859 CalledFunction->getFunctionType());
5860 CallBase *CBClone;
5861 // Copy 0 is the original function.
5862 if (!J)
5863 CBClone = CB;
5864 else
5865 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5866 // Set the called operand directly instead of calling setCalledFunction,
5867 // as the latter mutates the function type on the call. In rare cases
5868 // we may have a slightly different type on a callee function
5869 // declaration due to it being imported from a different module with
5870 // incomplete types. We really just want to change the name of the
5871 // function to the clone, and not make any type changes.
5872 CBClone->setCalledOperand(NewF.getCallee());
5873 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
5874 << ore::NV("Call", CBClone) << " in clone "
5875 << ore::NV("Caller", CBClone->getFunction())
5876 << " assigned to call function clone "
5877 << ore::NV("Callee", NewF.getCallee()));
5878 }
5879 };
5880
5881 // Locate the summary for F.
5882 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
5883 // If not found, this could be an imported local (see comment in
5884 // findValueInfoForFunc). Skip for now as it will be cloned in its original
5885 // module (where it would have been promoted to global scope so should
5886 // satisfy any reference in this module).
5887 if (!TheFnVI)
5888 continue;
5889
5890 auto *GVSummary =
5891 ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
5892 if (!GVSummary) {
5893 // Must have been imported, use the summary which matches the definition.
5894 // (might be multiple if this was a linkonce_odr).
5895 auto SrcModuleMD = F.getMetadata("thinlto_src_module");
5896 assert(SrcModuleMD &&
5897 "enable-import-metadata is needed to emit thinlto_src_module");
5898 StringRef SrcModule =
5899 dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
5900 for (auto &GVS : TheFnVI.getSummaryList()) {
5901 if (GVS->modulePath() == SrcModule) {
5902 GVSummary = GVS.get();
5903 break;
5904 }
5905 }
5906 // TODO: Put back the assert once we have metadata on imported copies of
5907 // aliases linking them back to the original alias GUID, which would allow
5908 // us to locate the alias summary here.
5909 // assert(GVSummary && GVSummary->modulePath() == SrcModule);
5910 }
5911
5912 // GVSummary can be null if this is a function imported as a copy of an
5913 // alias, and we don't have the aliasee's summary in our distributed index.
5914 // TODO: Once we can locate the original GUID for imported aliases (e.g. via
5915 // TBD additional metadata), we should find the alias summary instead, and
5916 // we can remove this check and fall back to the original check below.
5917 if (!GVSummary)
5918 continue;
5919
5920 // If this was an imported alias skip it as we won't have the function
5921 // summary, and it should be cloned in the original module.
5922 if (isa<AliasSummary>(GVSummary))
5923 continue;
5924
5925 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
5926
5927 if (FS->allocs().empty() && FS->callsites().empty())
5928 continue;
5929
5930 auto SI = FS->callsites().begin();
5931 auto AI = FS->allocs().begin();
5932
5933 // To handle callsite infos synthesized for tail calls which have missing
5934 // frames in the profiled context, map callee VI to the synthesized callsite
5935 // info.
5936 DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
5937 // Iterate the callsites for this function in reverse, since we place all
5938 // those synthesized for tail calls at the end.
5939 for (auto CallsiteIt = FS->callsites().rbegin();
5940 CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
5941 auto &Callsite = *CallsiteIt;
5942 // Stop as soon as we see a non-synthesized callsite info (see comment
5943 // above loop). All the entries added for discovered tail calls have empty
5944 // stack ids.
5945 if (!Callsite.StackIdIndices.empty())
5946 break;
5947 MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
5948 }
5949
5950 // Keeps track of needed ICP for the function.
5951 SmallVector<ICallAnalysisData> ICallAnalysisInfo;
5952
5953 // Assume for now that the instructions are in the exact same order
5954 // as when the summary was created, but confirm this is correct by
5955 // matching the stack ids.
5956 for (auto &BB : F) {
5957 for (auto &I : BB) {
5958 auto *CB = dyn_cast<CallBase>(&I);
5959 // Same handling as when creating module summary.
5960 if (!mayHaveMemprofSummary(CB))
5961 continue;
5962
5963 auto *CalledValue = CB->getCalledOperand();
5964 auto *CalledFunction = CB->getCalledFunction();
5965 if (CalledValue && !CalledFunction) {
5966 CalledValue = CalledValue->stripPointerCasts();
5967 // Stripping pointer casts can reveal a called function.
5968 CalledFunction = dyn_cast<Function>(CalledValue);
5969 }
5970 // Check if this is an alias to a function. If so, get the
5971 // called aliasee for the checks below.
5972 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
5973 assert(!CalledFunction &&
5974 "Expected null called function in callsite for alias");
5975 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
5976 }
5977
5978 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
5979 I.getMetadata(LLVMContext::MD_callsite));
5980 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
5981
5982 // Include allocs that were already assigned a memprof function
5983 // attribute in the statistics. Only do this for those that do not have
5984 // memprof metadata, since we add an "ambiguous" memprof attribute by
5985 // default.
5986 if (CB->getAttributes().hasFnAttr("memprof") && !MemProfMD) {
5987 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
5988 ? AllocTypeColdThinBackend++
5989 : AllocTypeNotColdThinBackend++;
5990 OrigAllocsThinBackend++;
5991 AllocVersionsThinBackend++;
5992 if (!MaxAllocVersionsThinBackend)
5993 MaxAllocVersionsThinBackend = 1;
5994 continue;
5995 }
5996
5997 if (MemProfMD) {
5998 // Consult the next alloc node.
5999 assert(AI != FS->allocs().end());
6000 auto &AllocNode = *(AI++);
6001
6002#ifndef NDEBUG
6003 checkAllocContextIds(AllocNode, MemProfMD, CallsiteContext,
6004 ImportSummary);
6005#endif
6006
6007 // Perform cloning if not yet done.
6008 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size(), FS);
6009
6010 OrigAllocsThinBackend++;
6011 AllocVersionsThinBackend += AllocNode.Versions.size();
6012 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
6013 MaxAllocVersionsThinBackend = AllocNode.Versions.size();
6014
6015 // If there is only one version that means we didn't end up
6016 // considering this function for cloning, and in that case the alloc
6017 // will still be none type or should have gotten the default NotCold.
6018 // Skip that after calling clone helper since that does some sanity
6019 // checks that confirm we haven't decided yet that we need cloning.
6020 // We might have a single version that is cold due to the
6021 // MinClonedColdBytePercent heuristic, make sure we don't skip in that
6022 // case.
6023 if (AllocNode.Versions.size() == 1 &&
6024 (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
6025 assert((AllocationType)AllocNode.Versions[0] ==
6026 AllocationType::NotCold ||
6027 (AllocationType)AllocNode.Versions[0] ==
6028 AllocationType::None);
6029 UnclonableAllocsThinBackend++;
6030 continue;
6031 }
6032
6033 // All versions should have a singular allocation type.
6034 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
6035 return Type == ((uint8_t)AllocationType::NotCold |
6036 (uint8_t)AllocationType::Cold);
6037 }));
6038
6039 // Update the allocation types per the summary info.
6040 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
6041 // If the VMap is empty, this clone was a duplicate of another and
6042 // was created as an alias or a declaration.
6043 if (J > 0 && VMaps[J - 1]->empty())
6044 continue;
6045 // Ignore any that didn't get an assigned allocation type.
6046 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
6047 continue;
6048 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
6049 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
6050 : AllocTypeNotColdThinBackend++;
6051 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
6052 auto A = llvm::Attribute::get(F.getContext(), "memprof",
6053 AllocTypeString);
6054 CallBase *CBClone;
6055 // Copy 0 is the original function.
6056 if (!J)
6057 CBClone = CB;
6058 else
6059 // Since VMaps are only created for new clones, we index with
6060 // clone J-1 (J==0 is the original clone and does not have a VMaps
6061 // entry).
6062 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
6064 CBClone->addFnAttr(A);
6065 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
6066 << ore::NV("AllocationCall", CBClone) << " in clone "
6067 << ore::NV("Caller", CBClone->getFunction())
6068 << " marked with memprof allocation attribute "
6069 << ore::NV("Attribute", AllocTypeString));
6070 }
6071 } else if (!CallsiteContext.empty()) {
6072 if (!CalledFunction) {
6073#ifndef NDEBUG
6074 // We should have skipped inline assembly calls.
6075 auto *CI = dyn_cast<CallInst>(CB);
6076 assert(!CI || !CI->isInlineAsm());
6077#endif
6078 // We should have skipped direct calls via a Constant.
6079 assert(CalledValue && !isa<Constant>(CalledValue));
6080
6081 // This is an indirect call, see if we have profile information and
6082 // whether any clones were recorded for the profiled targets (that
6083 // we synthesized CallsiteInfo summary records for when building the
6084 // index).
6085 auto NumClones =
6086 recordICPInfo(CB, FS->callsites(), SI, ICallAnalysisInfo);
6087
6088 // Perform cloning if not yet done. This is done here in case
6089 // we don't need to do ICP, but might need to clone this
6090 // function as it is the target of other cloned calls.
6091 if (NumClones)
6092 CloneFuncIfNeeded(NumClones, FS);
6093 }
6094
6095 else {
6096 // Consult the next callsite node.
6097 assert(SI != FS->callsites().end());
6098 auto &StackNode = *(SI++);
6099
6100#ifndef NDEBUG
6101 // Sanity check that the stack ids match between the summary and
6102 // instruction metadata.
6103 auto StackIdIndexIter = StackNode.StackIdIndices.begin();
6104 for (auto StackId : CallsiteContext) {
6105 assert(StackIdIndexIter != StackNode.StackIdIndices.end());
6106 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
6107 StackId);
6108 StackIdIndexIter++;
6109 }
6110#endif
6111
6112 CloneCallsite(StackNode, CB, CalledFunction, FS);
6113 }
6114 } else if (CB->isTailCall() && CalledFunction) {
6115 // Locate the synthesized callsite info for the callee VI, if any was
6116 // created, and use that for cloning.
6117 ValueInfo CalleeVI =
6118 findValueInfoForFunc(*CalledFunction, M, ImportSummary, &F);
6119 if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
6120 auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
6121 assert(Callsite != MapTailCallCalleeVIToCallsite.end());
6122 CloneCallsite(Callsite->second, CB, CalledFunction, FS);
6123 }
6124 }
6125 }
6126 }
6127
6128 // Now do any promotion required for cloning.
6129 performICP(M, FS->callsites(), VMaps, ICallAnalysisInfo, ORE);
6130 }
6131
6132 // We skip some of the functions and instructions above, so remove all the
6133 // metadata in a single sweep here.
6134 for (auto &F : M) {
6135 // We can skip memprof clones because createFunctionClones already strips
6136 // the metadata from the newly created clones.
6137 if (F.isDeclaration() || isMemProfClone(F))
6138 continue;
6139 for (auto &BB : F) {
6140 for (auto &I : BB) {
6141 if (!isa<CallBase>(I))
6142 continue;
6143 I.setMetadata(LLVMContext::MD_memprof, nullptr);
6144 I.setMetadata(LLVMContext::MD_callsite, nullptr);
6145 }
6146 }
6147 }
6148
6149 return Changed;
6150}
6151
// Examines the promotion candidates reported by ICallAnalysis for the indirect
// call CB, consuming one CallsiteInfo record (via SI) per candidate.
// Returns 0 when there are no candidates; otherwise returns Clones.size() of
// the candidates' CallsiteInfo records. If any of those records contains a
// non-zero clone number, an entry for CB (with the candidate data and the
// index in AllCallsites at which SI stood on entry) is appended to
// ICallAnalysisInfo.
// NOTE(review): SI is used throughout the body, but the parameter line that
// declares it is not present in this listing (numbering jumps from 6153 to
// 6155) -- confirm its type against the original source.
6152unsigned MemProfContextDisambiguation::recordICPInfo(
6153 CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
6155 SmallVector<ICallAnalysisData> &ICallAnalysisInfo) {
6156 // First see if we have profile information for this indirect call.
6157 uint32_t NumCandidates;
6158 uint64_t TotalCount;
6159 auto CandidateProfileData =
6160 ICallAnalysis->getPromotionCandidatesForInstruction(
6161 CB, TotalCount, NumCandidates, MaxSummaryIndirectEdges);
6162 if (CandidateProfileData.empty())
6163 return 0;
6164
6165 // Iterate through all of the candidate profiled targets along with the
6166 // CallsiteInfo summary records synthesized for them when building the index,
6167 // and see if any are cloned and/or refer to clones.
6168 bool ICPNeeded = false;
6169 unsigned NumClones = 0;
// Remember where this call's records begin before SI is advanced by the loop.
6170 size_t CallsiteInfoStartIndex = std::distance(AllCallsites.begin(), SI);
6171 for (const auto &Candidate : CandidateProfileData) {
6172#ifndef NDEBUG
6173 auto CalleeValueInfo =
6174#endif
6175 ImportSummary->getValueInfo(Candidate.Value);
6176 // We might not have a ValueInfo if this is a distributed
6177 // ThinLTO backend and decided not to import that function.
// NOTE(review): the first assertion below dereferences SI before the second
// one checks SI against AllCallsites.end(); consider swapping their order.
6178 assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
6179 assert(SI != AllCallsites.end());
6180 auto &StackNode = *(SI++);
6181 // See if any of the clones of the indirect callsite for this
6182 // profiled target should call a cloned version of the profiled
6183 // target. We only need to do the ICP here if so.
6184 ICPNeeded |= llvm::any_of(StackNode.Clones,
6185 [](unsigned CloneNo) { return CloneNo != 0; });
6186 // Every callsite in the same function should have been cloned the same
6187 // number of times.
6188 assert(!NumClones || NumClones == StackNode.Clones.size());
6189 NumClones = StackNode.Clones.size();
6190 }
// Even when no promotion is required, the clone count is still returned to
// the caller.
6191 if (!ICPNeeded)
6192 return NumClones;
6193 // Save information for ICP, which is performed later to avoid messing up the
6194 // current function traversal.
6195 ICallAnalysisInfo.push_back({CB, CandidateProfileData.vec(), NumCandidates,
6196 TotalCount, CallsiteInfoStartIndex});
6197 return NumClones;
6198}
6199
// Carries out the indirect call promotions recorded in ICallAnalysisInfo.
// For each recorded call, its candidates are walked in step with the
// AllCallsites records starting at the entry's CallsiteInfoStartIndex.
// Candidates whose target cannot be used (not found through Symtab, or not
// legal to promote) are collected in RemainingCandidates and reported with a
// missed-optimization remark; every other candidate is promoted in the
// original call and in each clone of it found through VMaps, and its count is
// subtracted from TotalCount. Finally the MD_prof metadata on each copy of the
// call is removed and, if TotalCount is non-zero, re-created from
// RemainingCandidates.
6200void MemProfContextDisambiguation::performICP(
6201 Module &M, ArrayRef<CallsiteInfo> AllCallsites,
6202 ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
6203 ArrayRef<ICallAnalysisData> ICallAnalysisInfo,
6204 OptimizationRemarkEmitter &ORE) {
6205 // Now do any promotion required for cloning. Specifically, for each
6206 // recorded ICP candidate (which was only recorded because one clone of that
6207 // candidate should call a cloned target), we perform ICP (speculative
6208 // devirtualization) for each clone of the callsite, and update its callee
6209 // to the appropriate clone. Note that the ICP compares against the original
6210 // version of the target, which is what is in the vtable.
6211 for (auto &Info : ICallAnalysisInfo) {
6212 auto *CB = Info.CB;
6213 auto CallsiteIndex = Info.CallsiteInfoStartIndex;
6214 auto TotalCount = Info.TotalCount;
6215 unsigned NumClones = 0;
6216 SmallVector<InstrProfValueData, 8> RemainingCandidates;
6217
6218 for (auto &Candidate : Info.CandidateProfileData) {
// One AllCallsites record is consumed per candidate, including candidates
// that end up not being promoted below.
6219 auto &StackNode = AllCallsites[CallsiteIndex++];
6220
6221 // All calls in the same function must have the same number of clones.
6222 assert(!NumClones || NumClones == StackNode.Clones.size());
6223 NumClones = StackNode.Clones.size();
6224
6225 // See if the target is in the module. If it wasn't imported, it is
6226 // possible that this profile could have been collected on a different
6227 // target (or version of the code), and we need to be conservative
6228 // (similar to what is done in the ICP pass).
6229 Function *TargetFunction = Symtab->getFunction(Candidate.Value);
6230 if (TargetFunction == nullptr ||
6231 // Any ThinLTO global dead symbol removal should have already
6232 // occurred, so it should be safe to promote when the target is a
6233 // declaration.
6234 // TODO: Remove internal option once more fully tested.
// NOTE(review): listing line 6235 is absent, leaving the parentheses of this
// condition unbalanced; part of the condition is not visible here -- check
// the original source.
6236 TargetFunction->isDeclaration())) {
6237 ORE.emit([&]() {
6238 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB)
6239 << "Memprof cannot promote indirect call: target with md5sum "
6240 << ore::NV("target md5sum", Candidate.Value) << " not found";
6241 });
6242 // FIXME: See if we can use the new declaration importing support to
6243 // at least get the declarations imported for this case. Hot indirect
6244 // targets should have been imported normally, however.
6245 RemainingCandidates.push_back(Candidate);
6246 continue;
6247 }
6248
6249 // Check if legal to promote
6250 const char *Reason = nullptr;
6251 if (!isLegalToPromote(*CB, TargetFunction, &Reason)) {
6252 ORE.emit([&]() {
6253 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
6254 << "Memprof cannot promote indirect call to "
6255 << ore::NV("TargetFunction", TargetFunction)
6256 << " with count of " << ore::NV("TotalCount", TotalCount)
6257 << ": " << Reason;
6258 });
6259 RemainingCandidates.push_back(Candidate);
6260 continue;
6261 }
6262
6263 assert(!isMemProfClone(*TargetFunction));
6264
6265 // Handle each call clone, applying ICP so that each clone directly
6266 // calls the specified callee clone, guarded by the appropriate ICP
6267 // check.
6268 CallBase *CBClone = CB;
6269 for (unsigned J = 0; J < NumClones; J++) {
6270 // If the VMap is empty, this clone was a duplicate of another and was
6271 // created as an alias or a declaration.
6272 if (J > 0 && VMaps[J - 1]->empty())
6273 continue;
6274 // Copy 0 is the original function.
// VMaps holds entries only for clones 1..N, hence the J - 1 index.
6275 if (J > 0)
6276 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
6277 // We do the promotion using the original name, so that the comparison
6278 // is against the name in the vtable. Then just below, change the new
6279 // direct call to call the cloned function.
6280 auto &DirectCall =
6281 pgo::promoteIndirectCall(*CBClone, TargetFunction, Candidate.Count,
6282 TotalCount, isSamplePGO, &ORE);
6283 auto *TargetToUse = TargetFunction;
6284 // Call original if this version calls the original version of its
6285 // callee.
// A non-zero Clones[J] selects a callee clone, which is looked up (or
// declared) in M under the name produced by getMemProfFuncName.
6286 if (StackNode.Clones[J]) {
6287 TargetToUse =
6288 cast<Function>(M.getOrInsertFunction(
6289 getMemProfFuncName(TargetFunction->getName(),
6290 StackNode.Clones[J]),
6291 TargetFunction->getFunctionType())
6292 .getCallee());
6293 }
6294 DirectCall.setCalledFunction(TargetToUse);
6295 // During matching we generate synthetic VP metadata for indirect calls
6296 // not already having any, from the memprof profile's callee GUIDs. If
6297 // we subsequently promote and inline those callees, we currently lose
6298 // the ability to generate this synthetic VP metadata. Optionally apply
6299 // a noinline attribute to promoted direct calls, where the threshold is
6300 // set to capture synthetic VP metadata targets which get a count of 1.
// NOTE(review): listing line 6301 is absent; the beginning of the condition
// that guards setIsNoInline() is not visible here.
6302 Candidate.Count < MemProfICPNoInlineThreshold)
6303 DirectCall.setIsNoInline();
6304 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
6305 << ore::NV("Call", CBClone) << " in clone "
6306 << ore::NV("Caller", CBClone->getFunction())
6307 << " promoted and assigned to call function clone "
6308 << ore::NV("Callee", TargetToUse));
6309 }
6310
6311 // Update TotalCount (all clones should get same count above)
6312 TotalCount -= Candidate.Count;
6313 }
6314 // Adjust the MD.prof metadata for all clones, now that we have the new
6315 // TotalCount and the remaining candidates.
6316 CallBase *CBClone = CB;
6317 for (unsigned J = 0; J < NumClones; J++) {
6318 // If the VMap is empty, this clone was a duplicate of another and was
6319 // created as an alias or a declaration.
6320 if (J > 0 && VMaps[J - 1]->empty())
6321 continue;
6322 // Copy 0 is the original function.
6323 if (J > 0)
6324 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
6325 // First delete the old one.
6326 CBClone->setMetadata(LLVMContext::MD_prof, nullptr);
6327 // If all promoted, we don't need the MD.prof metadata.
6328 // Otherwise we need update with the un-promoted records back.
6329 if (TotalCount != 0)
6330 annotateValueSite(M, *CBClone, RemainingCandidates, TotalCount,
6331 IPVK_IndirectCallTarget, Info.NumCandidates);
6332 }
6333 }
6334}
6335
6336template <typename DerivedCCG, typename FuncTy, typename CallTy>
6337bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process(
6338 function_ref<void(StringRef, StringRef, const Twine &)> EmitRemark,
6339 bool AllowExtraAnalysis) {
6340 if (DumpCCG) {
6341 dbgs() << "CCG before cloning:\n";
6342 dbgs() << *this;
6343 }
6344 if (ExportToDot)
6345 exportToDot("postbuild");
6346
6347 if (VerifyCCG) {
6348 check();
6349 }
6350
6351 identifyClones();
6352
6353 if (VerifyCCG) {
6354 check();
6355 }
6356
6357 if (DumpCCG) {
6358 dbgs() << "CCG after cloning:\n";
6359 dbgs() << *this;
6360 }
6361 if (ExportToDot)
6362 exportToDot("cloned");
6363
6364 bool Changed = assignFunctions();
6365
6366 if (DumpCCG) {
6367 dbgs() << "CCG after assigning function clones:\n";
6368 dbgs() << *this;
6369 }
6370 if (ExportToDot)
6371 exportToDot("clonefuncassign");
6372
6373 if (MemProfReportHintedSizes || AllowExtraAnalysis)
6374 printTotalSizes(errs(), EmitRemark);
6375
6376 return Changed;
6377}
6378
6379bool MemProfContextDisambiguation::processModule(
6380 Module &M,
6381 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
6382
6383 // If we have an import summary, then the cloning decisions were made during
6384 // the thin link on the index. Apply them and return.
6385 if (ImportSummary)
6386 return applyImport(M);
6387
6388 // TODO: If/when other types of memprof cloning are enabled beyond just for
6389 // hot and cold, we will need to change this to individually control the
6390 // AllocationType passed to addStackNodesForMIB during CCG construction.
6391 // Note that we specifically check this after applying imports above, so that
6392 // the option isn't needed to be passed to distributed ThinLTO backend
6393 // clang processes, which won't necessarily have visibility into the linker
6394 // dependences. Instead the information is communicated from the LTO link to
6395 // the backends via the combined summary index.
6396 if (!SupportsHotColdNew)
6397 return false;
6398
6399 ModuleCallsiteContextGraph CCG(M, OREGetter);
6400 // TODO: Set up remarks for regular LTO. We need to decide what function to
6401 // use in the callback.
6402 return CCG.process();
6403}
6404
// NOTE(review): this definition is incomplete in the listing -- lines 6405,
// 6411, 6413, 6415, 6419, 6426 and 6433 are absent, including the line that
// names the function. The notes below describe only what is visible.
//
// Initializes ImportSummary and isSamplePGO from the parameters, validates the
// combination of the -memprof-dot-* options, and -- when no summary was passed
// in and MemProfImportSummary is non-empty -- obtains a summary index from
// that file, stores it in ImportSummaryForTesting and points ImportSummary at
// it. Failures while loading or parsing are logged to errs() and leave
// ImportSummary unchanged.
6406 const ModuleSummaryIndex *Summary, bool isSamplePGO)
6407 : ImportSummary(Summary), isSamplePGO(isSamplePGO) {
6408 // Check the dot graph printing options once here, to make sure we have valid
6409 // and expected combinations.
// NOTE(review): the statements that consume the three message strings below
// are on lines missing from this listing.
6410 if (DotGraphScope == DotScope::Alloc && !AllocIdForDot.getNumOccurrences())
6412 "-memprof-dot-scope=alloc requires -memprof-dot-alloc-id");
6414 !ContextIdForDot.getNumOccurrences())
6416 "-memprof-dot-scope=context requires -memprof-dot-context-id");
6417 if (DotGraphScope == DotScope::All && AllocIdForDot.getNumOccurrences() &&
6418 ContextIdForDot.getNumOccurrences())
6420 "-memprof-dot-scope=all can't have both -memprof-dot-alloc-id and "
6421 "-memprof-dot-context-id");
6422 if (ImportSummary) {
6423 // The MemProfImportSummary should only be used for testing ThinLTO
6424 // distributed backend handling via opt, in which case we don't have a
6425 // summary from the pass pipeline.
6427 return;
6428 }
// Nothing to load when the testing option was not given.
6429 if (MemProfImportSummary.empty())
6430 return;
6431
// NOTE(review): the initializer of ReadSummaryFile is on a line missing from
// this listing (6433).
6432 auto ReadSummaryFile =
6434 if (!ReadSummaryFile) {
6435 logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
6436 "Error loading file '" + MemProfImportSummary +
6437 "': ");
6438 return;
6439 }
6440 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
6441 if (!ImportSummaryForTestingOrErr) {
6442 logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
6443 "Error parsing file '" + MemProfImportSummary +
6444 "': ");
6445 return;
6446 }
// Keep the parsed index in ImportSummaryForTesting and expose it through
// ImportSummary.
6447 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
6448 ImportSummary = ImportSummaryForTesting.get();
6449}
6450
// NOTE(review): the signature of this function (listing lines 6451-6452) is
// not present in this listing; only the body is visible.
//
// Obtains the function analysis manager for M, wraps it in a callback that
// returns the OptimizationRemarkEmitter for a given function, and hands both
// to processModule(). All analyses are reported as preserved when
// processModule() returns false; otherwise none are.
6453 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
6454 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
6455 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
6456 };
6457 if (!processModule(M, OREGetter))
6458 return PreservedAnalyses::all();
6459 return PreservedAnalyses::none();
6460}
6461
// NOTE(review): parts of this definition are not present in the listing
// (lines 6462, 6464 and 6477), including the line that names the function and
// the initializer of AllowExtraAnalysis.
//
// Builds an IndexCallsiteContextGraph from Index and isPrevailing and
// processes it, forwarding EmitRemark and AllowExtraAnalysis. Does nothing
// when SupportsHotColdNew is not set.
6463 ModuleSummaryIndex &Index,
6465 isPrevailing,
6466 LLVMContext &Ctx,
6467 function_ref<void(StringRef, StringRef, const Twine &)> EmitRemark) {
6468 // TODO: If/when other types of memprof cloning are enabled beyond just for
6469 // hot and cold, we will need to change this to individually control the
6470 // AllocationType passed to addStackNodesForMIB during CCG construction.
6471 // The index was set from the option, so these should be in sync.
6472 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
6473 if (!SupportsHotColdNew)
6474 return;
6475
6476 bool AllowExtraAnalysis =
6478
6479 IndexCallsiteContextGraph CCG(Index, isPrevailing);
// The return value of process() is not used here.
6480 CCG.process(EmitRemark, AllowExtraAnalysis);
6481}
6482
6483// Strips MemProf attributes and metadata. Can be invoked by the pass pipeline
6484// when we don't have an index that has recorded that we are linking with
6485// allocation libraries containing the necessary APIs for downstream
6486// transformations.
// NOTE(review): the signature line of this function (listing line 6487) is not
// present in this listing; the body iterates over a module M.
//
// Reports all analyses as preserved when nothing was removed, and none
// otherwise.
6488 // The profile matcher applies hotness attributes directly for allocations,
6489 // and those will cause us to generate calls to the hot/cold interfaces
6490 // unconditionally. If supports-hot-cold-new was not enabled in the LTO
6491 // link then assume we don't want these calls (e.g. not linking with
6492 // the appropriate library, or otherwise trying to disable this behavior).
6493 bool Changed = false;
6494 for (auto &F : M) {
6495 for (auto &BB : F) {
6496 for (auto &I : BB) {
// Only call instructions are examined.
6497 auto *CI = dyn_cast<CallBase>(&I);
6498 if (!CI)
6499 continue;
6500 if (CI->hasFnAttr("memprof")) {
6501 CI->removeFnAttr("memprof");
6502 Changed = true;
6503 }
// A call without callsite metadata is asserted to carry no memprof metadata
// either, so there is nothing further to strip from it.
6504 if (!CI->hasMetadata(LLVMContext::MD_callsite)) {
6505 assert(!CI->hasMetadata(LLVMContext::MD_memprof));
6506 continue;
6507 }
6508 // Strip off all memprof metadata as it is no longer needed.
6509 // Importantly, this avoids the addition of new memprof attributes
6510 // after inlining propagation.
6511 CI->setMetadata(LLVMContext::MD_memprof, nullptr);
6512 CI->setMetadata(LLVMContext::MD_callsite, nullptr);
6513 Changed = true;
6514 }
6515 }
6516 }
6517 if (!Changed)
6518 return PreservedAnalyses::all();
6519 return PreservedAnalyses::none();
6520}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
aarch64 promote const
Unify divergent function exit nodes
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
#define DEBUG_TYPE
Module.h This file contains the declarations for the Module class.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
Machine Check Debug Module
This file implements a map that provides insertion order iteration.
static cl::opt< unsigned > TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5), cl::Hidden, cl::desc("Max depth to recursively search for missing " "frames through tail calls."))
uint64_t ComputeHash(const FunctionSummary *FS, unsigned I)
static cl::opt< DotScope > DotGraphScope("memprof-dot-scope", cl::desc("Scope of graph to export to dot"), cl::Hidden, cl::init(DotScope::All), cl::values(clEnumValN(DotScope::All, "all", "Export full callsite graph"), clEnumValN(DotScope::Alloc, "alloc", "Export only nodes with contexts feeding given " "-memprof-dot-alloc-id"), clEnumValN(DotScope::Context, "context", "Export only nodes with given -memprof-dot-context-id")))
static cl::opt< bool > DoMergeIteration("memprof-merge-iteration", cl::init(true), cl::Hidden, cl::desc("Iteratively apply merging on a node to catch new callers"))
static bool isMemProfClone(const Function &F)
static cl::opt< unsigned > AllocIdForDot("memprof-dot-alloc-id", cl::init(0), cl::Hidden, cl::desc("Id of alloc to export if -memprof-dot-scope=alloc " "or to highlight if -memprof-dot-scope=all"))
static cl::opt< unsigned > ContextIdForDot("memprof-dot-context-id", cl::init(0), cl::Hidden, cl::desc("Id of context to export if -memprof-dot-scope=context or to " "highlight otherwise"))
static cl::opt< bool > ExportToDot("memprof-export-to-dot", cl::init(false), cl::Hidden, cl::desc("Export graph to dot files."))
static void checkEdge(const std::shared_ptr< ContextEdge< DerivedCCG, FuncTy, CallTy > > &Edge)
static cl::opt< bool > AllowRecursiveCallsites("memprof-allow-recursive-callsites", cl::init(true), cl::Hidden, cl::desc("Allow cloning of callsites involved in recursive cycles"))
bool checkColdOrNotCold(uint8_t AllocType)
static ValueInfo findValueInfoForFunc(const Function &F, const Module &M, const ModuleSummaryIndex *ImportSummary, const Function *CallingFunc=nullptr)
static cl::opt< bool > CloneRecursiveContexts("memprof-clone-recursive-contexts", cl::init(true), cl::Hidden, cl::desc("Allow cloning of contexts through recursive cycles"))
static std::string getAllocTypeString(uint8_t AllocTypes)
bool DOTGraphTraits< constCallsiteContextGraph< DerivedCCG, FuncTy, CallTy > * >::DoHighlight
static unsigned getMemProfCloneNum(const Function &F)
static cl::opt< unsigned > MemProfICPNoInlineThreshold("memprof-icp-noinline-threshold", cl::init(0), cl::Hidden, cl::desc("Minimum absolute count for promoted target to be inlinable"))
static SmallVector< std::unique_ptr< ValueToValueMapTy >, 4 > createFunctionClones(Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE, std::map< const Function *, SmallPtrSet< const GlobalAlias *, 1 > > &FuncToAliasMap, FunctionSummary *FS)
static cl::opt< bool > VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden, cl::desc("Perform verification checks on CallingContextGraph."))
static void checkNode(const ContextNode< DerivedCCG, FuncTy, CallTy > *Node, bool CheckEdges=true)
static cl::opt< bool > MergeClones("memprof-merge-clones", cl::init(true), cl::Hidden, cl::desc("Merge clones before assigning functions"))
static std::string getMemProfFuncName(Twine Base, unsigned CloneNo)
static cl::opt< std::string > MemProfImportSummary("memprof-import-summary", cl::desc("Import summary to use for testing the ThinLTO backend via opt"), cl::Hidden)
static const std::string MemProfCloneSuffix
static void updateSubprogramLinkageName(Function *NewFunc, StringRef Name)
static cl::opt< bool > AllowRecursiveContexts("memprof-allow-recursive-contexts", cl::init(true), cl::Hidden, cl::desc("Allow cloning of contexts having recursive cycles"))
static cl::opt< std::string > DotFilePathPrefix("memprof-dot-file-path-prefix", cl::init(""), cl::Hidden, cl::value_desc("filename"), cl::desc("Specify the path prefix of the MemProf dot files."))
static cl::opt< bool > VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden, cl::desc("Perform frequent verification checks on nodes."))
static void checkAllocContextIds(const AllocInfo &AllocNode, const MDNode *MemProfMD, const CallStack< MDNode, MDNode::op_iterator > &CallsiteContext, const ModuleSummaryIndex *ImportSummary)
static cl::opt< bool > DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden, cl::desc("Dump CallingContextGraph to stdout after each stage."))
AllocType
This is the interface to build a ModuleSummaryIndex for a module.
ModuleSummaryIndex.h This file contains the declarations the classes that hold the module index and s...
#define P(N)
if(auto Err=PB.parsePassPipeline(MPM, Passes)) return wrap(std MPM run * Mod
FunctionAnalysisManager FAM
if(PassOpts->AAPipeline)
std::pair< BasicBlock *, BasicBlock * > Edge
This file defines generic set operations that may be used on set's of different types,...
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:119
void print(OutputBuffer &OB) const
ValueInfo getAliaseeVI() const
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
iterator end() const
Definition ArrayRef.h:130
const_pointer iterator
Definition ArrayRef.h:47
iterator begin() const
Definition ArrayRef.h:129
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
static LLVM_ABI Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
void addFnAttr(Attribute::AttrKind Kind)
Adds the attribute to the function.
void setCalledOperand(Value *V)
Subprogram description. Uses SubclassData1.
ValueT lookup(const_arg_type_t< KeyT > Val) const
Return the entry for the specified key, or a default constructed value if no such entry exists.
Definition DenseMap.h:250
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:223
unsigned size() const
Definition DenseMap.h:172
bool empty() const
Definition DenseMap.h:171
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition DenseMap.h:219
iterator end()
Definition DenseMap.h:141
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:214
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:284
void reserve(size_type NumEntries)
Grow the densemap so that it can contain at least NumEntries items before resizing again.
Definition DenseMap.h:176
Implements a dense probed hash-table based set.
Definition DenseSet.h:281
Function summary information to aid decisions and implementation of importing.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:211
DISubprogram * getSubprogram() const
Get the attached subprogram.
const Function & getFunction() const
Definition Function.h:166
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:353
static LLVM_ABI GlobalAlias * create(Type *Ty, unsigned AddressSpace, LinkageTypes Linkage, const Twine &Name, Constant *Aliasee, Module *Parent)
If a parent module is specified, the alias is automatically inserted into the end of the specified mo...
Definition Globals.cpp:630
Function and variable summary information to aid decisions and implementation of importing.
static LLVM_ABI GUID getGUIDAssumingExternalLinkage(StringRef GlobalName)
Return a 64-bit global unique ID constructed from the name of a global symbol.
Definition Globals.cpp:80
static bool isLocalLinkage(LinkageTypes Linkage)
LLVM_ABI bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition Globals.cpp:346
uint64_t GUID
Declare a type to represent a global unique identifier for a global value.
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI void eraseFromParent()
This method unlinks 'this' from the containing module and deletes it.
Definition Globals.cpp:96
static LLVM_ABI std::string getGlobalIdentifier(StringRef Name, GlobalValue::LinkageTypes Linkage, StringRef FileName)
Return the modified name for a global value suitable to be used as the key for a global lookup (e....
Definition Globals.cpp:172
bool isWeakForLinker() const
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Metadata node.
Definition Metadata.h:1069
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1426
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1424
unsigned getNumOperands() const
Return number of MDNode operands.
Definition Metadata.h:1432
LLVM_ABI TempMDNode clone() const
Create a (temporary) clone of this.
Definition Metadata.cpp:683
static std::enable_if_t< std::is_base_of< MDNode, T >::value, T * > replaceWithUniqued(std::unique_ptr< T, TempMDNodeDeleter > N)
Replace a temporary node with a uniqued one.
Definition Metadata.h:1301
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:614
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:38
size_type count(const KeyT &Key) const
Definition MapVector.h:152
LLVM_ABI MemProfContextDisambiguation(const ModuleSummaryIndex *Summary=nullptr, bool isSamplePGO=false)
LLVM_ABI PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
LLVM_ABI PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
static ErrorOr< std::unique_ptr< MemoryBuffer > > getFile(const Twine &Filename, bool IsText=false, bool RequiresNullTerminator=true, bool IsVolatile=false, std::optional< Align > Alignment=std::nullopt)
Open the specified file as a MemoryBuffer, returning a new MemoryBuffer if successful,...
Class to hold module path string table and global value map, and encapsulate methods for operating on...
static StringRef getOriginalNameBeforePromote(StringRef Name)
Helper to obtain the unpromoted name for a global value (or the original name if not promoted).
ValueInfo getValueInfo(const GlobalValueSummaryMapTy::value_type &R) const
Return a ValueInfo for the index value_type (convenient when iterating index).
uint64_t getStackIdAtIndex(unsigned Index) const
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
LLVMContext & getContext() const
Get the global data context.
Definition Module.h:287
A NodeSet contains a set of SUnit DAG nodes with additional information that assigns a priority to th...
unsigned size() const
bool insert(SUnit *SU)
The optimization diagnostic interface.
bool allowExtraAnalysis(StringRef PassName) const
Whether we allow for extra compile-time budget to perform more analysis to produce fewer false positi...
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
A discriminated union of two or more pointer types, with the discriminator in the low bits of the poi...
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition Analysis.h:115
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
A class that wraps the SHA1 algorithm.
Definition SHA1.h:27
LLVM_ABI void update(ArrayRef< uint8_t > Data)
Digest more data.
Definition SHA1.cpp:208
LLVM_ABI std::array< uint8_t, 20 > result()
Return the current raw 160-bits SHA1 for the digested data since the last call to init().
Definition SHA1.cpp:288
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:319
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:209
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:182
bool erase(const ValueT &V)
Definition DenseSet.h:97
void insert_range(Range &&R)
Definition DenseSet.h:235
void swap(DenseSetImpl &RHS)
Definition DenseSet.h:109
size_type size() const
Definition DenseSet.h:84
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:187
void reserve(size_t Size)
Grow the DenseSet so that it can contain at least NumEntries items before resizing again.
Definition DenseSet.h:93
An efficient, type-erasing, non-owning reference to a callable.
Helper class to iterate through stack ids in both metadata (memprof MIB and callsite) and the corresp...
CallStackIterator beginAfterSharedPrefix(const CallStack &Other)
CallStackIterator end() const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
CallInst * Call
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Entry
Definition COFF.h:862
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ CE
Windows NT (Windows on ARM)
Definition MCAsmInfo.h:50
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > dyn_extract(Y &&MD)
Extract a Value from Metadata, if any.
Definition Metadata.h:696
LLVM_ABI AllocationType getMIBAllocType(const MDNode *MIB)
Returns the allocation type from an MIB metadata node.
LLVM_ABI bool metadataMayIncludeContextSizeInfo()
Whether the alloc memprof metadata may include context size info for some MIBs (but possibly not all)...
LLVM_ABI bool hasSingleAllocType(uint8_t AllocTypes)
True if the AllocTypes bitmask contains just a single type.
LLVM_ABI std::string getAllocTypeAttributeString(AllocationType Type)
Returns the string to use in attributes with the given type.
LLVM_ABI MDNode * getMIBStackNode(const MDNode *MIB)
Returns the stack node from an MIB metadata node.
LLVM_ABI void removeAnyExistingAmbiguousAttribute(CallBase *CB)
Removes any existing "ambiguous" memprof attribute.
DiagnosticInfoOptimizationBase::Argument NV
LLVM_ABI CallBase & promoteIndirectCall(CallBase &CB, Function *F, uint64_t Count, uint64_t TotalCount, bool AttachProfToDirectCall, OptimizationRemarkEmitter *ORE)
uint32_t NodeId
Definition RDFGraph.h:262
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
bool empty() const
Definition BasicBlock.h:101
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
uint64_t read64le(const void *P)
Definition Endian.h:435
void write32le(void *P, uint32_t V)
Definition Endian.h:475
This is an optimization pass for GlobalISel generic memory operations.
cl::opt< unsigned > MinClonedColdBytePercent("memprof-cloning-cold-threshold", cl::init(100), cl::Hidden, cl::desc("Min percent of cold bytes to hint alloc cold during cloning"))
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI void logAllUnhandledErrors(Error E, raw_ostream &OS, Twine ErrorBanner={})
Log all errors (if any) in E to OS.
Definition Error.cpp:61
void stable_sort(R &&Range)
Definition STLExtras.h:2116
cl::opt< bool > MemProfReportHintedSizes("memprof-report-hinted-sizes", cl::init(false), cl::Hidden, cl::desc("Report total allocation sizes of hinted allocations"))
LLVM_ABI bool isLegalToPromote(const CallBase &CB, Function *Callee, const char **FailureReason=nullptr)
Return true if the given indirect call site can be made to call Callee.
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI bool mayHaveMemprofSummary(const CallBase *CB)
Returns true if the instruction could have memprof metadata, used to ensure consistency between summa...
constexpr from_range_t from_range
static cl::opt< bool > MemProfRequireDefinitionForPromotion("memprof-require-definition-for-promotion", cl::init(false), cl::Hidden, cl::desc("Require target function definition when promoting indirect calls"))
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
cl::opt< unsigned > MemProfTopNImportant("memprof-top-n-important", cl::init(10), cl::Hidden, cl::desc("Number of largest cold contexts to consider important"))
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
void set_subtract(S1Ty &S1, const S2Ty &S2)
set_subtract(A, B) - Compute A := A - B
InnerAnalysisManagerProxy< FunctionAnalysisManager, Module > FunctionAnalysisManagerModuleProxy
Provide the FunctionAnalysisManager to Module proxy.
raw_ostream & WriteGraph(raw_ostream &O, const GraphType &G, bool ShortNames=false, const Twine &Title="")
bool set_intersects(const S1Ty &S1, const S2Ty &S2)
set_intersects(A, B) - Return true iff A ^ B is non empty
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1151
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Value
Definition InstrProf.h:143
LLVM_ABI Expected< std::unique_ptr< ModuleSummaryIndex > > getModuleSummaryIndex(MemoryBufferRef Buffer)
Parse the specified bitcode buffer, returning the module summary index.
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI void annotateValueSite(Module &M, Instruction &Inst, const InstrProfRecord &InstrProfR, InstrProfValueKind ValueKind, uint32_t SiteIndx, uint32_t MaxMDCount=3)
Get the value profile data for value site SiteIndx from InstrProfR and annotate the instruction Inst w...
cl::opt< unsigned > MaxSummaryIndirectEdges("module-summary-max-indirect-edges", cl::init(0), cl::Hidden, cl::desc("Max number of summary edges added from " "indirect call profile metadata"))
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
bool set_union(S1Ty &S1, const S2Ty &S2)
set_union(A, B) - Compute A := A u B, return whether A changed.
cl::opt< bool > SupportsHotColdNew
Indicate we are linking with an allocator that supports hot/cold operator new interfaces.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
S1Ty set_intersection(const S1Ty &S1, const S2Ty &S2)
set_intersection(A, B) - Return A ^ B
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
cl::opt< bool > EnableMemProfContextDisambiguation
Enable MemProf context disambiguation for thin link.
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Count
Definition InstrProf.h:145
S1Ty set_difference(const S1Ty &S1, const S2Ty &S2)
set_difference(A, B) - Return A - B
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
Expected< T > errorOrToExpected(ErrorOr< T > &&EO)
Convert an ErrorOr<T> to an Expected<T>.
Definition Error.h:1261
ArrayRef(const T &OneElt) -> ArrayRef< T >
std::string toString(const APInt &I, unsigned Radix, bool Signed, bool formatAsCLiteral=false, bool UpperCase=true, bool InsertSeparators=false)
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
LLVM_ABI Function * CloneFunction(Function *F, ValueToValueMapTy &VMap, ClonedCodeInfo *CodeInfo=nullptr)
Return a copy of the specified function and add it to that function's module.
AnalysisManager< Module > ModuleAnalysisManager
Convenience typedef for the Module analysis manager.
Definition MIRParser.h:39
cl::opt< bool > MemProfFixupImportant("memprof-fixup-important", cl::init(true), cl::Hidden, cl::desc("Enables edge fixup for important contexts"))
#define N
static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter, GraphType G)
static const ContextNode< DerivedCCG, FuncTy, CallTy > * GetCallee(const EdgePtrTy &P)
std::unique_ptr< ContextNode< DerivedCCG, FuncTy, CallTy > > NodePtrTy
mapped_iterator< typename std::vector< std::shared_ptr< ContextEdge< DerivedCCG, FuncTy, CallTy > > >::const_iterator, decltype(&GetCallee)> ChildIteratorType
mapped_iterator< typename std::vector< NodePtrTy >::const_iterator, decltype(&getNode)> nodes_iterator
std::shared_ptr< ContextEdge< DerivedCCG, FuncTy, CallTy > > EdgePtrTy
Summary of memprof metadata on allocations.
std::vector< MIBInfo > MIBs
SmallVector< unsigned > StackIdIndices
SmallVector< unsigned > Clones
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits(bool simple=false)
An information struct used to provide DenseMap with the various necessary components for a given valu...
typename GraphType::UnknownGraphTypeError NodeRef
Definition GraphTraits.h:95
Struct that holds a reference to a particular GUID in a global value summary.
ArrayRef< std::unique_ptr< GlobalValueSummary > > getSummaryList() const
GlobalValue::GUID getGUID() const
PointerUnion< CallsiteInfo *, AllocInfo * > SimpleType
static SimpleType getSimplifiedValue(IndexCall &Val)
const PointerUnion< CallsiteInfo *, AllocInfo * > SimpleType
static SimpleType getSimplifiedValue(const IndexCall &Val)
Define a template that can be specialized by smart pointers to reflect the fact that they are automat...
Definition Casting.h:34