1//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements support for context disambiguation of allocation
10// calls for profile guided heap optimization. Specifically, it uses Memprof
11// profiles which indicate context specific allocation behavior (currently
12// distinguishing cold vs hot memory allocations). Cloning is performed to
13// expose the cold allocation call contexts, and the allocation calls are
14// subsequently annotated with an attribute for later transformation.
15//
16// The transformations can be performed either directly on IR (regular LTO), or
17// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18// Both types of LTO operate on the same base graph representation, which
19// uses CRTP to support either IR or Index formats.
20//
21//===----------------------------------------------------------------------===//
22
23#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/ADT/DenseSet.h"
26#include "llvm/ADT/MapVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
37#include "llvm/IR/Module.h"
39#include "llvm/Pass.h"
43#include "llvm/Transforms/IPO.h"
47#include <deque>
48#include <sstream>
49#include <unordered_map>
50#include <vector>
51using namespace llvm;
52using namespace llvm::memprof;
53
54#define DEBUG_TYPE "memprof-context-disambiguation"
55
56STATISTIC(FunctionClonesAnalysis,
57 "Number of function clones created during whole program analysis");
58STATISTIC(FunctionClonesThinBackend,
59 "Number of function clones created during ThinLTO backend");
60STATISTIC(FunctionsClonedThinBackend,
61 "Number of functions that had clones created during ThinLTO backend");
62STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
63 "cloned) during whole program analysis");
64STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
65 "during whole program analysis");
66STATISTIC(AllocTypeNotColdThinBackend,
67 "Number of not cold static allocations (possibly cloned) during "
68 "ThinLTO backend");
69STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
70 "(possibly cloned) during ThinLTO backend");
71STATISTIC(OrigAllocsThinBackend,
72 "Number of original (not cloned) allocations with memprof profiles "
73 "during ThinLTO backend");
74STATISTIC(
75 AllocVersionsThinBackend,
76 "Number of allocation versions (including clones) during ThinLTO backend");
77STATISTIC(MaxAllocVersionsThinBackend,
78 "Maximum number of allocation versions created for an original "
79 "allocation during ThinLTO backend");
80STATISTIC(UnclonableAllocsThinBackend,
81 "Number of unclonable ambigous allocations during ThinLTO backend");
82STATISTIC(RemovedEdgesWithMismatchedCallees,
83 "Number of edges removed due to mismatched callees (profiled vs IR)");
84STATISTIC(FoundProfiledCalleeCount,
85 "Number of profiled callees found via tail calls");
86STATISTIC(FoundProfiledCalleeDepth,
87 "Aggregate depth of profiled callees found via tail calls");
88STATISTIC(FoundProfiledCalleeMaxDepth,
89 "Maximum depth of profiled callees found via tail calls");
90STATISTIC(FoundProfiledCalleeNonUniquelyCount,
91 "Number of profiled callees found via multiple tail call chains");
92
94 "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
95 cl::value_desc("filename"),
96 cl::desc("Specify the path prefix of the MemProf dot files."));
97
98static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
99 cl::Hidden,
100 cl::desc("Export graph to dot files."));
101
102static cl::opt<bool>
103 DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
104 cl::desc("Dump CallingContextGraph to stdout after each stage."));
105
106static cl::opt<bool>
107 VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
108 cl::desc("Perform verification checks on CallingContextGraph."));
109
110static cl::opt<bool>
111 VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
112 cl::desc("Perform frequent verification checks on nodes."));
113
115 "memprof-import-summary",
116 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
117 cl::Hidden);
118
119static cl::opt<unsigned>
120 TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
121 cl::Hidden,
122 cl::desc("Max depth to recursively search for missing "
123 "frames through tail calls."));
124
125// By default, enable cloning of callsites involved in recursive cycles.
126static cl::opt<bool> AllowRecursiveCallsites(
127 "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden,
128 cl::desc("Allow cloning of callsites involved in recursive cycles"));
129
130// When disabled, try to detect and prevent cloning of recursive contexts.
131// This is only necessary until we support cloning through recursive cycles.
132// Leave on by default for now; disabling adds a small amount of compile-time
133// overhead and doesn't affect correctness, it will just inflate the cold
134// hinted bytes reported a bit when -memprof-report-hinted-sizes is enabled.
135static cl::opt<bool> AllowRecursiveContexts(
136 "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden,
137 cl::desc("Allow cloning of contexts through recursive cycles"));
138
139namespace llvm {
141 "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
142 cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
143
144// Indicate we are linking with an allocator that supports hot/cold operator
145// new interfaces.
147 "supports-hot-cold-new", cl::init(false), cl::Hidden,
148 cl::desc("Linking with hot/cold operator new interfaces"));
149
151 "memprof-require-definition-for-promotion", cl::init(false), cl::Hidden,
152 cl::desc(
153 "Require target function definition when promoting indirect calls"));
154} // namespace llvm
155
156extern cl::opt<bool> MemProfReportHintedSizes;
157extern cl::opt<unsigned> MinClonedColdBytePercent;
158
159namespace {
160/// CRTP base for graphs built from either IR or ThinLTO summary index.
161///
162/// The graph represents the call contexts in all memprof metadata on allocation
163/// calls, with nodes for the allocations themselves, as well as for the calls
164/// in each context. The graph is initially built from the allocation memprof
165/// metadata (or summary) MIBs. It is then updated to match calls with callsite
166/// metadata onto the nodes, reflecting any inlining performed on
167/// those calls.
168///
169/// Each MIB (representing an allocation's call context with allocation
170/// behavior) is assigned a unique context id during the graph build. The edges
171/// and nodes in the graph are decorated with the context ids they carry. This
172/// is used to correctly update the graph when cloning is performed so that we
173/// can uniquify the context for a single (possibly cloned) allocation.
174template <typename DerivedCCG, typename FuncTy, typename CallTy>
175class CallsiteContextGraph {
176public:
177 CallsiteContextGraph() = default;
178 CallsiteContextGraph(const CallsiteContextGraph &) = default;
179 CallsiteContextGraph(CallsiteContextGraph &&) = default;
180
181 /// Main entry point to perform analysis and transformations on the graph.
182 bool process();
183
184 /// Perform cloning on the graph necessary to uniquely identify the allocation
185 /// behavior of an allocation based on its context.
186 void identifyClones();
187
188 /// Assign callsite clones to functions, cloning functions as needed to
189 /// accommodate the combinations of their callsite clones reached by callers.
190 /// For regular LTO this clones functions and callsites in the IR, but for
191 /// ThinLTO the cloning decisions are noted in the summaries and later applied
192 /// in applyImport.
193 bool assignFunctions();
194
195 void dump() const;
196 void print(raw_ostream &OS) const;
197 void printTotalSizes(raw_ostream &OS) const;
198
199 friend raw_ostream &operator<<(raw_ostream &OS,
200 const CallsiteContextGraph &CCG) {
201 CCG.print(OS);
202 return OS;
203 }
204
205 friend struct GraphTraits<
206 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
207 friend struct DOTGraphTraits<
208 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
209
210 void exportToDot(std::string Label) const;
211
212 /// Represents a function clone via FuncTy pointer and clone number pair.
213 struct FuncInfo final
214 : public std::pair<FuncTy *, unsigned /*Clone number*/> {
215 using Base = std::pair<FuncTy *, unsigned>;
216 FuncInfo(const Base &B) : Base(B) {}
217 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
218 explicit operator bool() const { return this->first != nullptr; }
219 FuncTy *func() const { return this->first; }
220 unsigned cloneNo() const { return this->second; }
221 };
222
223 /// Represents a callsite clone via CallTy and clone number pair.
224 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
225 using Base = std::pair<CallTy, unsigned>;
226 CallInfo(const Base &B) : Base(B) {}
227 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
228 : Base(Call, CloneNo) {}
229 explicit operator bool() const { return (bool)this->first; }
230 CallTy call() const { return this->first; }
231 unsigned cloneNo() const { return this->second; }
232 void setCloneNo(unsigned N) { this->second = N; }
233 void print(raw_ostream &OS) const {
234 if (!operator bool()) {
235 assert(!cloneNo());
236 OS << "null Call";
237 return;
238 }
239 call()->print(OS);
240 OS << "\t(clone " << cloneNo() << ")";
241 }
242 void dump() const {
243 print(dbgs());
244 dbgs() << "\n";
245 }
246 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
247 Call.print(OS);
248 return OS;
249 }
250 };
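// Editorial note (not part of the original source): FuncInfo and CallInfo pair
// a function or call with a clone number, so e.g. CallInfo{CB, 2} denotes the
// copy of call CB that lives in clone number 2 of its enclosing function
// (clone 0 being the original). The concrete clone naming (e.g. a ".memprof"
// suffix) is only applied later when function clones are materialized.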
251
252 struct ContextEdge;
253
254 /// Node in the Callsite Context Graph
255 struct ContextNode {
256 // Keep this for now since in the IR case, where we have an Instruction*, it
257 // is not as immediately discoverable. Used for printing richer information
258 // when dumping the graph.
259 bool IsAllocation;
260
261 // Keeps track of when the Call was reset to null because there was
262 // recursion.
263 bool Recursive = false;
264
265 // This will be formed by ORing together the AllocationType enum values
266 // for contexts including this node.
267 uint8_t AllocTypes = 0;
268
269 // The corresponding allocation or interior call. This is the primary call
270 // for which we have created this node.
271 CallInfo Call;
272
273 // List of other calls that can be treated the same as the primary call
274 // through cloning. I.e. located in the same function and have the same
275 // (possibly pruned) stack ids. They will be updated the same way as the
276 // primary call when assigning to function clones.
277 SmallVector<CallInfo, 0> MatchingCalls;
278
279 // For alloc nodes this is a unique id assigned when constructed, and for
280 // callsite stack nodes it is the original stack id when the node is
281 // constructed from the memprof MIB metadata on the alloc nodes. Note that
282 // this is only used when matching callsite metadata onto the stack nodes
283 // created when processing the allocation memprof MIBs, and for labeling
284 // nodes in the dot graph. Therefore we don't bother to assign a value for
285 // clones.
286 uint64_t OrigStackOrAllocId = 0;
287
288 // Edges to all callees in the profiled call stacks.
289 // TODO: Should this be a map (from Callee node) for more efficient lookup?
290 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
291
292 // Edges to all callers in the profiled call stacks.
293 // TODO: Should this be a map (from Caller node) for more efficient lookup?
294 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
295
296 // Get the list of edges from which we can compute allocation information
297 // such as the context ids and allocation type of this node.
298 const std::vector<std::shared_ptr<ContextEdge>> *
299 getEdgesWithAllocInfo() const {
300 // If node has any callees, compute from those, otherwise compute from
301 // callers (i.e. if this is the leaf allocation node).
302 if (!CalleeEdges.empty())
303 return &CalleeEdges;
304 if (!CallerEdges.empty()) {
305 // A node with caller edges but no callee edges must be the allocation
306 // node.
307 assert(IsAllocation);
308 return &CallerEdges;
309 }
310 return nullptr;
311 }
312
313 // Compute the context ids for this node from the union of its edge context
314 // ids.
315 DenseSet<uint32_t> getContextIds() const {
316 DenseSet<uint32_t> ContextIds;
317 auto *Edges = getEdgesWithAllocInfo();
318 if (!Edges)
319 return {};
320 unsigned Count = 0;
321 for (auto &Edge : *Edges)
322 Count += Edge->getContextIds().size();
323 ContextIds.reserve(Count);
324 for (auto &Edge : *Edges)
325 ContextIds.insert(Edge->getContextIds().begin(),
326 Edge->getContextIds().end());
327 return ContextIds;
328 }
329
330 // Compute the allocation type for this node from the OR of its edge
331 // allocation types.
332 uint8_t computeAllocType() const {
333 auto *Edges = getEdgesWithAllocInfo();
334 if (!Edges)
335 return (uint8_t)AllocationType::None;
336 uint8_t BothTypes =
337 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
338 uint8_t AllocType = (uint8_t)AllocationType::None;
339 for (auto &Edge : *Edges) {
340 AllocType |= Edge->AllocTypes;
341 // Bail early if alloc type reached both, no further refinement.
342 if (AllocType == BothTypes)
343 return AllocType;
344 }
345 return AllocType;
346 }
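// Illustrative sketch (editorial, assuming the AllocationType bit encoding
// NotCold = 1 and Cold = 2): a node reached only by cold contexts computes
// 0b10 (Cold), one reached only by not-cold contexts computes 0b01 (NotCold),
// and a node on both kinds of contexts computes 0b11, at which point the loop
// above bails out early since no further refinement is possible.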
347
348 // The context ids set for this node is empty if its edge context ids are
349 // also all empty.
350 bool emptyContextIds() const {
351 auto *Edges = getEdgesWithAllocInfo();
352 if (!Edges)
353 return true;
354 for (auto &Edge : *Edges) {
355 if (!Edge->getContextIds().empty())
356 return false;
357 }
358 return true;
359 }
360
361 // List of clones of this ContextNode, initially empty.
362 std::vector<ContextNode *> Clones;
363
364 // If a clone, points to the original uncloned node.
365 ContextNode *CloneOf = nullptr;
366
367 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
368
369 ContextNode(bool IsAllocation, CallInfo C)
370 : IsAllocation(IsAllocation), Call(C) {}
371
372 void addClone(ContextNode *Clone) {
373 if (CloneOf) {
374 CloneOf->Clones.push_back(Clone);
375 Clone->CloneOf = CloneOf;
376 } else {
377 Clones.push_back(Clone);
378 assert(!Clone->CloneOf);
379 Clone->CloneOf = this;
380 }
381 }
382
383 ContextNode *getOrigNode() {
384 if (!CloneOf)
385 return this;
386 return CloneOf;
387 }
388
389 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
390 unsigned int ContextId);
391
392 ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
393 ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
394 void eraseCalleeEdge(const ContextEdge *Edge);
395 void eraseCallerEdge(const ContextEdge *Edge);
396
397 void setCall(CallInfo C) { Call = C; }
398
399 bool hasCall() const { return (bool)Call.call(); }
400
401 void printCall(raw_ostream &OS) const { Call.print(OS); }
402
403 // True if this node was effectively removed from the graph, in which case
404 // it should have an allocation type of None and empty context ids.
405 bool isRemoved() const {
406 assert((AllocTypes == (uint8_t)AllocationType::None) ==
407 emptyContextIds());
408 return AllocTypes == (uint8_t)AllocationType::None;
409 }
410
411 void dump() const;
412 void print(raw_ostream &OS) const;
413
414 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
415 Node.print(OS);
416 return OS;
417 }
418 };
419
420 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
421 /// callee.
422 struct ContextEdge {
423 ContextNode *Callee;
424 ContextNode *Caller;
425
426 // This will be formed by ORing together the AllocationType enum values
427 // for contexts including this edge.
428 uint8_t AllocTypes = 0;
429
430 // The set of IDs for contexts including this edge.
431 DenseSet<uint32_t> ContextIds;
432
433 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
434 DenseSet<uint32_t> ContextIds)
435 : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
436 ContextIds(std::move(ContextIds)) {}
437
438 DenseSet<uint32_t> &getContextIds() { return ContextIds; }
439
440 // Helper to clear the fields of this edge when we are removing it from the
441 // graph.
442 inline void clear() {
443 ContextIds.clear();
444 AllocTypes = (uint8_t)AllocationType::None;
445 Caller = nullptr;
446 Callee = nullptr;
447 }
448
449 // Check if edge was removed from the graph. This is useful while iterating
450 // over a copy of edge lists when performing operations that mutate the
451 // graph in ways that might remove one of the edges.
452 inline bool isRemoved() const {
453 if (Callee || Caller)
454 return false;
455 // Any edges that have been removed from the graph but are still in a
456 // shared_ptr somewhere should have all fields null'ed out by clear()
457 // above.
458 assert(AllocTypes == (uint8_t)AllocationType::None);
459 assert(ContextIds.empty());
460 return true;
461 }
462
463 void dump() const;
464 void print(raw_ostream &OS) const;
465
466 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
467 Edge.print(OS);
468 return OS;
469 }
470 };
471
472 /// Helpers to remove edges that have allocation type None (due to not
473 /// carrying any context ids) after transformations.
474 void removeNoneTypeCalleeEdges(ContextNode *Node);
475 void removeNoneTypeCallerEdges(ContextNode *Node);
476 void
477 recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
478 DenseSet<const ContextNode *> &Visited);
479
480protected:
481 /// Get a list of nodes corresponding to the stack ids in the given callsite
482 /// context.
483 template <class NodeT, class IteratorT>
484 std::vector<uint64_t>
485 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
486
487 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
488 /// metadata (or summary).
489 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
490
491 /// Adds nodes for the given MIB stack ids.
492 template <class NodeT, class IteratorT>
493 void addStackNodesForMIB(ContextNode *AllocNode,
494 CallStack<NodeT, IteratorT> &StackContext,
495 CallStack<NodeT, IteratorT> &CallsiteContext,
496 AllocationType AllocType,
497 ArrayRef<ContextTotalSize> ContextSizeInfo);
498
499 /// Matches all callsite metadata (or summary) to the nodes created for
500 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
501 /// inlining performed on those callsite instructions.
502 void updateStackNodes();
503
504 /// Update graph to conservatively handle any callsite stack nodes that target
505 /// multiple different callee target functions.
506 void handleCallsitesWithMultipleTargets();
507
508 // Try to partition calls on the given node (already placed into the AllCalls
509 // array) by callee function, creating new copies of Node as needed to hold
510 // calls with different callees, and moving the callee edges appropriately.
511 // Returns true if partitioning was successful.
512 bool partitionCallsByCallee(
513 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
514 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode);
515
516 /// Save lists of calls with MemProf metadata in each function, for faster
517 /// iteration.
518 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
519
520 /// Map from callsite node to the enclosing caller function.
521 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
522
523private:
524 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
525
526 // Structure to keep track of information for each call as we are matching
527 // non-allocation callsites onto context nodes created from the allocation
528 // call metadata / summary contexts.
529 struct CallContextInfo {
530 // The callsite we're trying to match.
531 CallTy Call;
532 // The callsite's stack ids that have a context node in the graph.
533 std::vector<uint64_t> StackIds;
534 // The function containing this callsite.
535 const FuncTy *Func;
536 // Initially empty, if needed this will be updated to contain the context
537 // ids for use in a new context node created for this callsite.
538 DenseSet<uint32_t> ContextIds;
539 };
540
541 /// Helper to remove edge from graph, updating edge iterator if it is provided
542 /// (in which case CalleeIter indicates which edge list is being iterated).
543 /// This will also perform the necessary clearing of the ContextEdge members
544 /// to enable later checking if the edge has been removed (since we may have
545 /// other copies of the shared_ptr in existence, and in fact rely on this to
546 /// enable removal while iterating over a copy of a node's edge list).
547 void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr,
548 bool CalleeIter = true);
549
550 /// Assigns the given Node to calls at or inlined into the location with
551 /// the Node's stack id, after post order traversing and processing its
552 /// caller nodes. Uses the call information recorded in the given
553 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
554 /// as needed. Called by updateStackNodes which sets up the given
555 /// StackIdToMatchingCalls map.
556 void assignStackNodesPostOrder(
557 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
558 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
559 DenseMap<CallInfo, CallInfo> &CallToMatchingCall);
560
561 /// Duplicates the given set of context ids, updating the provided
562 /// map from each original id with the newly generated context ids,
563 /// and returning the new duplicated id set.
564 DenseSet<uint32_t> duplicateContextIds(
565 const DenseSet<uint32_t> &StackSequenceContextIds,
566 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
567
568 /// Propagates all duplicated context ids across the graph.
569 void propagateDuplicateContextIds(
570 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
571
572 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
573 /// else to its callers. Also updates OrigNode's edges to remove any context
574 /// ids moved to the newly created edge.
575 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
576 bool TowardsCallee,
577 DenseSet<uint32_t> RemainingContextIds);
578
579 /// Get the stack id corresponding to the given Id or Index (for IR this will
580 /// return itself, for a summary index this will return the id recorded in the
581 /// index for that stack id index value).
582 uint64_t getStackId(uint64_t IdOrIndex) const {
583 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
584 }
585
586 /// Returns true if the given call targets the callee of the given edge, or if
587 /// we were able to identify the call chain through intermediate tail calls.
588 /// In the latter case new context nodes are added to the graph for the
589 /// identified tail calls, and their synthesized nodes are added to
590 /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
591 /// the updated edges and to prepare it for an increment in the caller.
592 bool
593 calleesMatch(CallTy Call, EdgeIter &EI,
594 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
595
596 // Return the callee function of the given call, or nullptr if it can't be
597 // determined
598 const FuncTy *getCalleeFunc(CallTy Call) {
599 return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call);
600 }
601
602 /// Returns true if the given call targets the given function, or if we were
603 /// able to identify the call chain through intermediate tail calls (in which
604 /// case FoundCalleeChain will be populated).
605 bool calleeMatchesFunc(
606 CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
607 std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
608 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
609 Call, Func, CallerFunc, FoundCalleeChain);
610 }
611
612 /// Returns true if both call instructions have the same callee.
613 bool sameCallee(CallTy Call1, CallTy Call2) {
614 return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2);
615 }
616
617 /// Get a list of nodes corresponding to the stack ids in the given
618 /// callsite's context.
619 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
620 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
621 Call);
622 }
623
624 /// Get the last stack id in the context for callsite.
625 uint64_t getLastStackId(CallTy Call) {
626 return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
627 }
628
629 /// Update the allocation call to record type of allocated memory.
630 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
631 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
632 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
633 }
634
635 /// Get the AllocationType assigned to the given allocation instruction clone.
636 AllocationType getAllocationCallType(const CallInfo &Call) const {
637 return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
638 }
639
640 /// Update non-allocation call to invoke (possibly cloned) function
641 /// CalleeFunc.
642 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
643 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
644 }
645
646 /// Clone the given function for the given callsite, recording the mapping of
647 /// all of the function's tracked calls to their new versions in the CallMap.
648 /// Assigns new clones to clone number CloneNo.
649 FuncInfo cloneFunctionForCallsite(
650 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
651 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
652 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
653 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
654 }
655
656 /// Gets a label to use in the dot graph for the given call clone in the given
657 /// function.
658 std::string getLabel(const FuncTy *Func, const CallTy Call,
659 unsigned CloneNo) const {
660 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
661 }
662
663 // Create and return a new ContextNode.
664 ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr,
665 CallInfo C = CallInfo()) {
666 NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C));
667 auto *NewNode = NodeOwner.back().get();
668 if (F)
669 NodeToCallingFunc[NewNode] = F;
670 return NewNode;
671 }
672
673 /// Helpers to find the node corresponding to the given call or stackid.
674 ContextNode *getNodeForInst(const CallInfo &C);
675 ContextNode *getNodeForAlloc(const CallInfo &C);
676 ContextNode *getNodeForStackId(uint64_t StackId);
677
678 /// Computes the alloc type corresponding to the given context ids, by
679 /// unioning their recorded alloc types.
680 uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds);
681
682 /// Returns the allocation type of the intersection of the contexts of two
683 /// nodes (based on their provided context id sets), optimized for the case
684 /// when Node1Ids is smaller than Node2Ids.
685 uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
686 const DenseSet<uint32_t> &Node2Ids);
687
688 /// Returns the allocation type of the intersection of the contexts of two
689 /// nodes (based on their provided context id sets).
690 uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
691 const DenseSet<uint32_t> &Node2Ids);
692
693 /// Create a clone of Edge's callee and move Edge to that new callee node,
694 /// performing the necessary context id and allocation type updates.
695 /// If callee's caller edge iterator is supplied, it is updated when removing
696 /// the edge from that list. If ContextIdsToMove is non-empty, only that
697 /// subset of Edge's ids are moved to an edge to the new callee.
698 ContextNode *
699 moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
700 EdgeIter *CallerEdgeI = nullptr,
701 DenseSet<uint32_t> ContextIdsToMove = {});
702
703 /// Change the callee of Edge to existing callee clone NewCallee, performing
704 /// the necessary context id and allocation type updates.
705 /// If callee's caller edge iterator is supplied, it is updated when removing
706 /// the edge from that list. If ContextIdsToMove is non-empty, only that
707 /// subset of Edge's ids are moved to an edge to the new callee.
708 void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
709 ContextNode *NewCallee,
710 EdgeIter *CallerEdgeI = nullptr,
711 bool NewClone = false,
712 DenseSet<uint32_t> ContextIdsToMove = {});
713
714 /// Change the caller of the edge at the given callee edge iterator to be
715 /// NewCaller, performing the necessary context id and allocation type
716 /// updates. The iterator is updated as the edge is removed from the list of
717 /// callee edges in the original caller. This is similar to the above
718 /// moveEdgeToExistingCalleeClone, but a simplified version of it as we always
719 /// move the given edge and all of its context ids.
720 void moveCalleeEdgeToNewCaller(EdgeIter &CalleeEdgeI, ContextNode *NewCaller);
721
722 /// Recursively perform cloning on the graph for the given Node and its
723 /// callers, in order to uniquely identify the allocation behavior of an
724 /// allocation given its context. The context ids of the allocation being
725 /// processed are given in AllocContextIds.
726 void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
727 const DenseSet<uint32_t> &AllocContextIds);
728
729 /// Map from each context ID to the AllocationType assigned to that context.
730 DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
731
732 /// Map from each contextID to the profiled full contexts and their total
733 /// sizes (there may be more than one due to context trimming),
734 /// optionally populated when requested (via MemProfReportHintedSizes or
735 /// MinClonedColdBytePercent).
736 DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;
737
738 /// Identifies the context node created for a stack id when adding the MIB
739 /// contexts to the graph. This is used to locate the context nodes when
740 /// trying to assign the corresponding callsites with those stack ids to these
741 /// nodes.
742 DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
743
744 /// Maps to track the calls to their corresponding nodes in the graph.
745 MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
746 MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
747
748 /// Owner of all ContextNode unique_ptrs.
749 std::vector<std::unique_ptr<ContextNode>> NodeOwner;
750
751 /// Perform sanity checks on graph when requested.
752 void check() const;
753
754 /// Keeps track of the last unique context id assigned.
755 unsigned int LastContextId = 0;
756};
757
758template <typename DerivedCCG, typename FuncTy, typename CallTy>
759using ContextNode =
760 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
761template <typename DerivedCCG, typename FuncTy, typename CallTy>
762using ContextEdge =
763 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
764template <typename DerivedCCG, typename FuncTy, typename CallTy>
765using FuncInfo =
766 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
767template <typename DerivedCCG, typename FuncTy, typename CallTy>
768using CallInfo =
769 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
770
771/// CRTP derived class for graphs built from IR (regular LTO).
772class ModuleCallsiteContextGraph
773 : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
774 Instruction *> {
775public:
776 ModuleCallsiteContextGraph(
777 Module &M,
778 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
779
780private:
781 friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
782 Instruction *>;
783
784 uint64_t getStackId(uint64_t IdOrIndex) const;
785 const Function *getCalleeFunc(Instruction *Call);
786 bool calleeMatchesFunc(
787 Instruction *Call, const Function *Func, const Function *CallerFunc,
788 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
789 bool sameCallee(Instruction *Call1, Instruction *Call2);
790 bool findProfiledCalleeThroughTailCalls(
791 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
792 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
793 bool &FoundMultipleCalleeChains);
794 uint64_t getLastStackId(Instruction *Call);
795 std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
796 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
797 AllocationType getAllocationCallType(const CallInfo &Call) const;
798 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
799 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
800 Instruction *>::FuncInfo
801 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
802 std::map<CallInfo, CallInfo> &CallMap,
803 std::vector<CallInfo> &CallsWithMetadataInFunc,
804 unsigned CloneNo);
805 std::string getLabel(const Function *Func, const Instruction *Call,
806 unsigned CloneNo) const;
807
808 const Module &Mod;
809 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
810};
811
812/// Represents a call in the summary index graph, which can either be an
813/// allocation or an interior callsite node in an allocation's context.
814/// Holds a pointer to the corresponding data structure in the index.
815struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
816 IndexCall() : PointerUnion() {}
817 IndexCall(std::nullptr_t) : IndexCall() {}
818 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
819 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
820 IndexCall(PointerUnion PT) : PointerUnion(PT) {}
821
822 IndexCall *operator->() { return this; }
823
824 PointerUnion<CallsiteInfo *, AllocInfo *> getBase() const { return *this; }
825
826 void print(raw_ostream &OS) const {
827 if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(getBase())) {
828 OS << *AI;
829 } else {
830 auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(getBase());
831 assert(CI);
832 OS << *CI;
833 }
834 }
835};
836
837/// CRTP derived class for graphs built from summary index (ThinLTO).
838class IndexCallsiteContextGraph
839 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
840 IndexCall> {
841public:
842 IndexCallsiteContextGraph(
843 ModuleSummaryIndex &Index,
844 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
845 isPrevailing);
846
847 ~IndexCallsiteContextGraph() {
848 // Now that we are done with the graph it is safe to add the new
849 // CallsiteInfo structs to the function summary vectors. The graph nodes
850 // point into locations within these vectors, so we don't want to add them
851 // any earlier.
852 for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
853 auto *FS = I.first;
854 for (auto &Callsite : I.second)
855 FS->addCallsite(*Callsite.second);
856 }
857 }
858
859private:
860 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
861 IndexCall>;
862
863 uint64_t getStackId(uint64_t IdOrIndex) const;
864 const FunctionSummary *getCalleeFunc(IndexCall &Call);
865 bool calleeMatchesFunc(
866 IndexCall &Call, const FunctionSummary *Func,
867 const FunctionSummary *CallerFunc,
868 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
869 bool sameCallee(IndexCall &Call1, IndexCall &Call2);
870 bool findProfiledCalleeThroughTailCalls(
871 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
872 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
873 bool &FoundMultipleCalleeChains);
874 uint64_t getLastStackId(IndexCall &Call);
875 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
876 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
877 AllocationType getAllocationCallType(const CallInfo &Call) const;
878 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
879 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
880 IndexCall>::FuncInfo
881 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
882 std::map<CallInfo, CallInfo> &CallMap,
883 std::vector<CallInfo> &CallsWithMetadataInFunc,
884 unsigned CloneNo);
885 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
886 unsigned CloneNo) const;
887
888 // Saves mapping from function summaries containing memprof records back to
889 // their VI, for use in checking and debugging.
890 std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
891
892 const ModuleSummaryIndex &Index;
893 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
894 isPrevailing;
895
896 // Saves/owns the callsite info structures synthesized for missing tail call
897 // frames that we discover while building the graph.
898 // It maps from the summary of the function making the tail call, to a map
899 // of callee ValueInfo to corresponding synthesized callsite info.
900 std::unordered_map<FunctionSummary *,
901 std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
902 FunctionCalleesToSynthesizedCallsiteInfos;
903};
904} // namespace
905
906namespace llvm {
907template <>
908struct DenseMapInfo<typename CallsiteContextGraph<
909 ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
910 : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
911template <>
912struct DenseMapInfo<typename CallsiteContextGraph<
913 IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
914 : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
915template <>
916struct DenseMapInfo<IndexCall>
917 : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
918} // end namespace llvm
919
920namespace {
921
922// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
923// type we should actually use on the corresponding allocation.
924// If we can't clone a node that has NotCold+Cold alloc type, we will fall
925// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
926// from NotCold.
927AllocationType allocTypeToUse(uint8_t AllocTypes) {
928 assert(AllocTypes != (uint8_t)AllocationType::None);
929 if (AllocTypes ==
930 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
931 return AllocationType::NotCold;
932 else
933 return (AllocationType)AllocTypes;
934}
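// Editorial example: allocTypeToUse(NotCold|Cold) returns NotCold (an
// ambiguous allocation falls back to the default behavior), while a pure
// value such as allocTypeToUse(Cold) is returned unchanged.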
935
936// Helper to check if the alloc types for all edges recorded in the
937// InAllocTypes vector match the alloc types for all edges in the Edges
938// vector.
939template <typename DerivedCCG, typename FuncTy, typename CallTy>
940bool allocTypesMatch(
941 const std::vector<uint8_t> &InAllocTypes,
942 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
943 &Edges) {
944 // This should be called only when the InAllocTypes vector was computed for
945 // this set of Edges. Make sure the sizes are the same.
946 assert(InAllocTypes.size() == Edges.size());
947 return std::equal(
948 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(),
949 [](const uint8_t &l,
950 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
951 // Can share if one of the edges is None type - don't
952 // care about the type along that edge as it doesn't
953 // exist for those context ids.
954 if (l == (uint8_t)AllocationType::None ||
955 r->AllocTypes == (uint8_t)AllocationType::None)
956 return true;
957 return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
958 });
959}
960
961// Helper to check if the alloc types for all edges recorded in the
962// InAllocTypes vector match the alloc types for callee edges in the given
963// clone. Because the InAllocTypes were computed from the original node's callee
964// edges, and other cloning could have happened after this clone was created, we
965// need to find the matching clone callee edge, which may or may not exist.
966template <typename DerivedCCG, typename FuncTy, typename CallTy>
967bool allocTypesMatchClone(
968 const std::vector<uint8_t> &InAllocTypes,
969 const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) {
970 const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf;
971 assert(Node);
972 // InAllocTypes should have been computed for the original node's callee
973 // edges.
974 assert(InAllocTypes.size() == Node->CalleeEdges.size());
975 // First create a map of the clone callee edge callees to the edge alloc type.
976 DenseMap<const ContextNode<DerivedCCG, FuncTy, CallTy> *, uint8_t>
977 EdgeCalleeMap;
978 for (const auto &E : Clone->CalleeEdges) {
979 assert(!EdgeCalleeMap.contains(E->Callee));
980 EdgeCalleeMap[E->Callee] = E->AllocTypes;
981 }
982 // Next, walk the original node's callees, and look for the corresponding
983 // clone edge to that callee.
984 for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) {
985 auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee);
986 // Not found is ok, we will simply add an edge if we use this clone.
987 if (Iter == EdgeCalleeMap.end())
988 continue;
989 // Can share if one of the edges is None type - don't
990 // care about the type along that edge as it doesn't
991 // exist for those context ids.
992 if (InAllocTypes[I] == (uint8_t)AllocationType::None ||
993 Iter->second == (uint8_t)AllocationType::None)
994 continue;
995 if (allocTypeToUse(Iter->second) != allocTypeToUse(InAllocTypes[I]))
996 return false;
997 }
998 return true;
999}
1000
1001} // end anonymous namespace
1002
1003template <typename DerivedCCG, typename FuncTy, typename CallTy>
1004typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1005CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
1006 const CallInfo &C) {
1007 ContextNode *Node = getNodeForAlloc(C);
1008 if (Node)
1009 return Node;
1010
1011 return NonAllocationCallToContextNodeMap.lookup(C);
1012}
1013
1014template <typename DerivedCCG, typename FuncTy, typename CallTy>
1015typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1016CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
1017 const CallInfo &C) {
1018 return AllocationCallToContextNodeMap.lookup(C);
1019}
1020
1021template <typename DerivedCCG, typename FuncTy, typename CallTy>
1022typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1023CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
1024 uint64_t StackId) {
1025 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
1026 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
1027 return StackEntryNode->second;
1028 return nullptr;
1029}
1030
1031template <typename DerivedCCG, typename FuncTy, typename CallTy>
1032void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1033 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
1034 unsigned int ContextId) {
1035 for (auto &Edge : CallerEdges) {
1036 if (Edge->Caller == Caller) {
1037 Edge->AllocTypes |= (uint8_t)AllocType;
1038 Edge->getContextIds().insert(ContextId);
1039 return;
1040 }
1041 }
1042 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
1043 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
1044 CallerEdges.push_back(Edge);
1045 Caller->CalleeEdges.push_back(Edge);
1046}
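// Editorial note: e.g. adding context id 7 with type Cold from allocation
// node A to caller node F either extends an existing A<-F edge (ORing in Cold
// and inserting 7 into its ContextIds) or creates a new shared_ptr edge that
// is registered in both A's CallerEdges and F's CalleeEdges.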
1047
1048template <typename DerivedCCG, typename FuncTy, typename CallTy>
1049void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph(
1050 ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) {
1051 assert(!EI || (*EI)->get() == Edge);
1052 // Save the Caller and Callee pointers so we can erase Edge from their edge
1053 // lists after clearing Edge below. We do the clearing first in case it is
1054 // destructed after removing from the edge lists (if those were the last
1055 // shared_ptr references to Edge).
1056 auto *Callee = Edge->Callee;
1057 auto *Caller = Edge->Caller;
1058
1059 // Make sure the edge fields are cleared out so we can properly detect
1060 // removed edges if Edge is not destructed because there is still a shared_ptr
1061 // reference.
1062 Edge->clear();
1063
1064 if (!EI) {
1065 Callee->eraseCallerEdge(Edge);
1066 Caller->eraseCalleeEdge(Edge);
1067 } else if (CalleeIter) {
1068 Callee->eraseCallerEdge(Edge);
1069 *EI = Caller->CalleeEdges.erase(*EI);
1070 } else {
1071 Caller->eraseCalleeEdge(Edge);
1072 *EI = Callee->CallerEdges.erase(*EI);
1073 }
1074}
1075
1076template <typename DerivedCCG, typename FuncTy, typename CallTy>
1077void CallsiteContextGraph<
1078 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
1079 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1080 auto Edge = *EI;
1081 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1082 assert(Edge->ContextIds.empty());
1083 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
1084 } else
1085 ++EI;
1086 }
1087}
1088
1089template <typename DerivedCCG, typename FuncTy, typename CallTy>
1090void CallsiteContextGraph<
1091 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) {
1092 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
1093 auto Edge = *EI;
1094 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1095 assert(Edge->ContextIds.empty());
1096 Edge->Caller->eraseCalleeEdge(Edge.get());
1097 EI = Node->CallerEdges.erase(EI);
1098 } else
1099 ++EI;
1100 }
1101}
1102
1103template <typename DerivedCCG, typename FuncTy, typename CallTy>
1104typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1105CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1106 findEdgeFromCallee(const ContextNode *Callee) {
1107 for (const auto &Edge : CalleeEdges)
1108 if (Edge->Callee == Callee)
1109 return Edge.get();
1110 return nullptr;
1111}
1112
1113template <typename DerivedCCG, typename FuncTy, typename CallTy>
1114typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1115CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1116 findEdgeFromCaller(const ContextNode *Caller) {
1117 for (const auto &Edge : CallerEdges)
1118 if (Edge->Caller == Caller)
1119 return Edge.get();
1120 return nullptr;
1121}
1122
1123template <typename DerivedCCG, typename FuncTy, typename CallTy>
1124void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1125 eraseCalleeEdge(const ContextEdge *Edge) {
1126 auto EI = llvm::find_if(
1127 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
1128 return CalleeEdge.get() == Edge;
1129 });
1130 assert(EI != CalleeEdges.end());
1131 CalleeEdges.erase(EI);
1132}
1133
1134template <typename DerivedCCG, typename FuncTy, typename CallTy>
1135void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1136 eraseCallerEdge(const ContextEdge *Edge) {
1137 auto EI = llvm::find_if(
1138 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
1139 return CallerEdge.get() == Edge;
1140 });
1141 assert(EI != CallerEdges.end());
1142 CallerEdges.erase(EI);
1143}
1144
1145template <typename DerivedCCG, typename FuncTy, typename CallTy>
1146uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
1147 DenseSet<uint32_t> &ContextIds) {
1148 uint8_t BothTypes =
1149 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1150 uint8_t AllocType = (uint8_t)AllocationType::None;
1151 for (auto Id : ContextIds) {
1152 AllocType |= (uint8_t)ContextIdToAllocationType[Id];
1153 // Bail early if alloc type reached both, no further refinement.
1154 if (AllocType == BothTypes)
1155 return AllocType;
1156 }
1157 return AllocType;
1158}
1159
1160template <typename DerivedCCG, typename FuncTy, typename CallTy>
1161uint8_t
1162CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
1163 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
1164 uint8_t BothTypes =
1165 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1166 uint8_t AllocType = (uint8_t)AllocationType::None;
1167 for (auto Id : Node1Ids) {
1168 if (!Node2Ids.count(Id))
1169 continue;
1170 AllocType |= (uint8_t)ContextIdToAllocationType[Id];
1171 // Bail early if alloc type reached both, no further refinement.
1172 if (AllocType == BothTypes)
1173 return AllocType;
1174 }
1175 return AllocType;
1176}
1177
1178template <typename DerivedCCG, typename FuncTy, typename CallTy>
1179uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
1180 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
1181 if (Node1Ids.size() < Node2Ids.size())
1182 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
1183 else
1184 return intersectAllocTypesImpl(Node2Ids, Node1Ids);
1185}
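// Editorial note: dispatching on the smaller set keeps the intersection cost
// proportional to min(|Node1Ids|, |Node2Ids|) hash lookups, since
// intersectAllocTypesImpl iterates its first argument and probes the second.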
1186
1187template <typename DerivedCCG, typename FuncTy, typename CallTy>
1188typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1189CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
1190 CallInfo Call, const FuncTy *F) {
1191 assert(!getNodeForAlloc(Call));
1192 ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, Call);
1193 AllocationCallToContextNodeMap[Call] = AllocNode;
1194 // Use LastContextId as a unique id for MIB allocation nodes.
1195 AllocNode->OrigStackOrAllocId = LastContextId;
1196 // Alloc type should be updated as we add in the MIBs. We should assert
1197 // afterwards that it is not still None.
1198 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1199
1200 return AllocNode;
1201}
1202
1203static std::string getAllocTypeString(uint8_t AllocTypes) {
1204 if (!AllocTypes)
1205 return "None";
1206 std::string Str;
1207 if (AllocTypes & (uint8_t)AllocationType::NotCold)
1208 Str += "NotCold";
1209 if (AllocTypes & (uint8_t)AllocationType::Cold)
1210 Str += "Cold";
1211 return Str;
1212}
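// Editorial example (assuming AllocationType::NotCold == 1 and
// AllocationType::Cold == 2): getAllocTypeString(1) == "NotCold",
// getAllocTypeString(2) == "Cold", and getAllocTypeString(3) == "NotColdCold".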
1213
1214template <typename DerivedCCG, typename FuncTy, typename CallTy>
1215template <class NodeT, class IteratorT>
1216void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
1217 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
1218 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
1219 ArrayRef<ContextTotalSize> ContextSizeInfo) {
1220 // Treat the hot alloc type as NotCold until the disambiguation for "hot"
1221 // is done.
1222 if (AllocType == AllocationType::Hot)
1223 AllocType = AllocationType::NotCold;
1224
1225 ContextIdToAllocationType[++LastContextId] = AllocType;
1226
1227 if (!ContextSizeInfo.empty()) {
1228 auto &Entry = ContextIdToContextSizeInfos[LastContextId];
1229 Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
1230 }
1231
1232 // Update alloc type and context ids for this MIB.
1233 AllocNode->AllocTypes |= (uint8_t)AllocType;
1234
1235 // Now add or update nodes for each stack id in alloc's context.
1236 // Later when processing the stack ids on non-alloc callsites we will adjust
1237 // for any inlining in the context.
1238 ContextNode *PrevNode = AllocNode;
1239 // Look for recursion (direct recursion should have been collapsed by
1240 // module summary analysis, here we should just be detecting mutual
1241 // recursion). Mark these nodes so we don't try to clone.
1242 SmallSet<uint64_t, 8> StackIdSet;
1243 // Skip any stack ids already on the allocation call itself (due to inlining).
1244 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
1245 ContextIter != StackContext.end(); ++ContextIter) {
1246 auto StackId = getStackId(*ContextIter);
1247 ContextNode *StackNode = getNodeForStackId(StackId);
1248 if (!StackNode) {
1249 StackNode = createNewNode(/*IsAllocation=*/false);
1250 StackEntryIdToContextNodeMap[StackId] = StackNode;
1251 StackNode->OrigStackOrAllocId = StackId;
1252 }
1253 // Marking a node recursive will prevent its cloning completely, even for
1254 // non-recursive contexts flowing through it.
1255 if (!AllowRecursiveCallsites) {
1256 auto Ins = StackIdSet.insert(StackId);
1257 if (!Ins.second)
1258 StackNode->Recursive = true;
1259 }
1260 StackNode->AllocTypes |= (uint8_t)AllocType;
1261 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1262 PrevNode = StackNode;
1263 }
1264}
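// Editorial walk-through: for a hypothetical MIB whose (leaf-to-root) context
// is alloc <- bar <- foo <- main with AllocType Cold, a fresh context id is
// assigned, and stack nodes are created (or reused) for bar, foo and main in
// that order, each one linked as the caller of the previous node by an edge
// carrying the new context id and the Cold type.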
1265
1266template <typename DerivedCCG, typename FuncTy, typename CallTy>
1267DenseSet<uint32_t>
1268CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1269 const DenseSet<uint32_t> &StackSequenceContextIds,
1270 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1271 DenseSet<uint32_t> NewContextIds;
1272 for (auto OldId : StackSequenceContextIds) {
1273 NewContextIds.insert(++LastContextId);
1274 OldToNewContextIds[OldId].insert(LastContextId);
1275 assert(ContextIdToAllocationType.count(OldId));
1276 // The new context has the same allocation type as original.
1277 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1278 }
1279 return NewContextIds;
1280}
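// Editorial example: duplicating the set {5, 9} when LastContextId is 100
// produces {101, 102} (which old id receives which new id depends on the
// unspecified DenseSet iteration order), records each old id's new ids in
// OldToNewContextIds, and copies the original id's AllocationType to its
// duplicate in ContextIdToAllocationType.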
1281
1282template <typename DerivedCCG, typename FuncTy, typename CallTy>
1283void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1284 propagateDuplicateContextIds(
1285 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1286 // Build a set of duplicated context ids corresponding to the input id set.
1287 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1288 DenseSet<uint32_t> NewIds;
1289 for (auto Id : ContextIds)
1290 if (auto NewId = OldToNewContextIds.find(Id);
1291 NewId != OldToNewContextIds.end())
1292 NewIds.insert(NewId->second.begin(), NewId->second.end());
1293 return NewIds;
1294 };
1295
1296 // Recursively update context ids sets along caller edges.
1297 auto UpdateCallers = [&](ContextNode *Node,
1298 DenseSet<const ContextEdge *> &Visited,
1299 auto &&UpdateCallers) -> void {
1300 for (const auto &Edge : Node->CallerEdges) {
1301 auto Inserted = Visited.insert(Edge.get());
1302 if (!Inserted.second)
1303 continue;
1304 ContextNode *NextNode = Edge->Caller;
1305 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1306 // Only need to recursively iterate to NextNode via this caller edge if
1307 // it resulted in any added ids to NextNode.
1308 if (!NewIdsToAdd.empty()) {
1309 Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
1310 UpdateCallers(NextNode, Visited, UpdateCallers);
1311 }
1312 }
1313 };
1314
1315 DenseSet<const ContextEdge *> Visited;
1316 for (auto &Entry : AllocationCallToContextNodeMap) {
1317 auto *Node = Entry.second;
1318 UpdateCallers(Node, Visited, UpdateCallers);
1319 }
1320}
1321
1322template <typename DerivedCCG, typename FuncTy, typename CallTy>
1323void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1324 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
1325 // This must be passed by value to make a copy since it will be adjusted
1326 // as ids are moved.
1327 DenseSet<uint32_t> RemainingContextIds) {
1328 auto &OrigEdges =
1329 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1330 // Increment iterator in loop so that we can remove edges as needed.
1331 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1332 auto Edge = *EI;
1333 // Remove any matching context ids from Edge and return the set that were
1334 // found and removed; these become the new edge's context ids. Also update
1335 // the remaining (not found) ids.
1336 DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds;
1337 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1338 NotFoundContextIds);
1339 RemainingContextIds.swap(NotFoundContextIds);
1340 // If no matching context ids for this edge, skip it.
1341 if (NewEdgeContextIds.empty()) {
1342 ++EI;
1343 continue;
1344 }
1345 if (TowardsCallee) {
1346 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1347 auto NewEdge = std::make_shared<ContextEdge>(
1348 Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
1349 NewNode->CalleeEdges.push_back(NewEdge);
1350 NewEdge->Callee->CallerEdges.push_back(NewEdge);
1351 } else {
1352 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1353 auto NewEdge = std::make_shared<ContextEdge>(
1354 NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
1355 NewNode->CallerEdges.push_back(NewEdge);
1356 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1357 }
1358 // Remove old edge if context ids empty.
1359 if (Edge->getContextIds().empty()) {
1360 removeEdgeFromGraph(Edge.get(), &EI, TowardsCallee);
1361 continue;
1362 }
1363 ++EI;
1364 }
1365}
1366
1367template <typename DerivedCCG, typename FuncTy, typename CallTy>
1368static void checkEdge(
1369 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1370 // Confirm that alloc type is not None and that we have at least one context
1371 // id.
1372 assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1373 assert(!Edge->ContextIds.empty());
1374}
1375
1376template <typename DerivedCCG, typename FuncTy, typename CallTy>
1377static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
1378 bool CheckEdges = true) {
1379 if (Node->isRemoved())
1380 return;
1381#ifndef NDEBUG
1382 // Compute node's context ids once for use in asserts.
1383 auto NodeContextIds = Node->getContextIds();
1384#endif
1385 // Node's context ids should be the union of both its callee and caller edge
1386 // context ids.
1387 if (Node->CallerEdges.size()) {
1388 DenseSet<uint32_t> CallerEdgeContextIds(
1389 Node->CallerEdges.front()->ContextIds);
1390 for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
1391 if (CheckEdges)
1392 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1393 set_union(CallerEdgeContextIds, Edge->ContextIds);
1394 }
1395 // Node can have more context ids than callers if some contexts terminate at
1396 // node and some are longer. If we are allowing recursive callsites but
1397 // haven't disabled recursive contexts, this will be violated for
1398 // incompletely cloned recursive cycles, so skip the checking in that case.
1399 assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
1400 NodeContextIds == CallerEdgeContextIds ||
1401 set_is_subset(CallerEdgeContextIds, NodeContextIds));
1402 }
1403 if (Node->CalleeEdges.size()) {
1404 DenseSet<uint32_t> CalleeEdgeContextIds(
1405 Node->CalleeEdges.front()->ContextIds);
1406 for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
1407 if (CheckEdges)
1408 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1409 set_union(CalleeEdgeContextIds, Edge->getContextIds());
1410 }
1411 assert(NodeContextIds == CalleeEdgeContextIds);
1412 }
1413 // FIXME: Since this checking is only invoked under an option, we should
1414 // change the error checking from using assert to something that will trigger
1415 // an error on a release build.
1416#ifndef NDEBUG
1417 // Make sure we don't end up with duplicate edges between the same caller and
1418 // callee.
1419 DenseSet<const ContextNode *> NodeSet;
1420 for (const auto &E : Node->CalleeEdges)
1421 NodeSet.insert(E->Callee);
1422 assert(NodeSet.size() == Node->CalleeEdges.size());
1423#endif
1424}
1425
1426template <typename DerivedCCG, typename FuncTy, typename CallTy>
1427void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1428 assignStackNodesPostOrder(
1429 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
1430 DenseMap<uint64_t, std::vector<CallContextInfo>>
1431 &StackIdToMatchingCalls,
1432 DenseMap<CallInfo, CallInfo> &CallToMatchingCall) {
1433 auto Inserted = Visited.insert(Node);
1434 if (!Inserted.second)
1435 return;
1436 // Post order traversal. Iterate over a copy since we may add nodes and
1437 // therefore new callers during the recursive call, invalidating any
1438 // iterator over the original edge vector. We don't need to process these
1439 // new nodes as they were already processed on creation.
1440 auto CallerEdges = Node->CallerEdges;
1441 for (auto &Edge : CallerEdges) {
1442 // Skip any that have been removed during the recursion.
1443 if (Edge->isRemoved()) {
1444 assert(!is_contained(Node->CallerEdges, Edge));
1445 continue;
1446 }
1447 assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
1448 CallToMatchingCall);
1449 }
1450
1451 // If this node's stack id is in the map, update the graph to contain new
1452 // nodes representing any inlining at interior callsites. Note we move the
1453 // associated context ids over to the new nodes.
1454
1455 // Ignore this node if it is for an allocation or we didn't record any
1456 // stack id lists ending at it.
1457 if (Node->IsAllocation ||
1458 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1459 return;
1460
1461 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1462 // Handle the simple case first. A single call with a single stack id.
1463 // In this case there is no need to create any new context nodes, simply
1464 // assign the context node for stack id to this Call.
1465 if (Calls.size() == 1) {
1466 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1467 if (Ids.size() == 1) {
1468 assert(SavedContextIds.empty());
1469 // It should be this Node
1470 assert(Node == getNodeForStackId(Ids[0]));
1471 if (Node->Recursive)
1472 return;
1473 Node->setCall(Call);
1474 NonAllocationCallToContextNodeMap[Call] = Node;
1475 NodeToCallingFunc[Node] = Func;
1476 return;
1477 }
1478 }
1479
1480#ifndef NDEBUG
1481 // Find the node for the last stack id, which should be the same
1482 // across all calls recorded for this id, and is this node's id.
1483 uint64_t LastId = Node->OrigStackOrAllocId;
1484 ContextNode *LastNode = getNodeForStackId(LastId);
1485 // We should only have kept stack ids that had nodes.
1486 assert(LastNode);
1487 assert(LastNode == Node);
1488#else
1489 ContextNode *LastNode = Node;
1490#endif
1491
1492 // Compute the last node's context ids once, as it is shared by all calls in
1493 // this entry.
1494 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1495
1496 bool PrevIterCreatedNode = false;
1497 bool CreatedNode = false;
1498 for (unsigned I = 0; I < Calls.size();
1499 I++, PrevIterCreatedNode = CreatedNode) {
1500 CreatedNode = false;
1501 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1502 // Skip any for which we didn't assign any ids, these don't get a node in
1503 // the graph.
1504 if (SavedContextIds.empty()) {
1505 // If this call has a matching call (located in the same function and
1506 // having the same stack ids), simply add it to the context node created
1507 // for its matching call earlier. These can be treated the same through
1508 // cloning and get updated at the same time.
1509 if (!CallToMatchingCall.contains(Call))
1510 continue;
1511 auto MatchingCall = CallToMatchingCall[Call];
1512 if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
1513 // This should only happen if we had a prior iteration, and it didn't
1514 // create a node because of the below recomputation of context ids
1515 // finding none remaining and continuing early.
1516 assert(I > 0 && !PrevIterCreatedNode);
1517 continue;
1518 }
1519 NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
1520 Call);
1521 continue;
1522 }
1523
1524 assert(LastId == Ids.back());
1525
1526 // Recompute the context ids for this stack id sequence (the
1527 // intersection of the context ids of the corresponding nodes).
1528 // Start with the ids we saved in the map for this call, which could be
1529 // duplicated context ids. We have to recompute as we might have overlap
1530 // between the saved context ids for different last nodes, and removed them
1531 // already during the post order traversal.
1532 set_intersect(SavedContextIds, LastNodeContextIds);
1533 ContextNode *PrevNode = LastNode;
1534 bool Skip = false;
1535 // Iterate backwards through the stack Ids, starting after the last Id
1536 // in the list, which was handled once outside for all Calls.
1537 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1538 auto Id = *IdIter;
1539 ContextNode *CurNode = getNodeForStackId(Id);
1540 // We should only have kept stack ids that had nodes and weren't
1541 // recursive.
1542 assert(CurNode);
1543 assert(!CurNode->Recursive);
1544
1545 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1546 if (!Edge) {
1547 Skip = true;
1548 break;
1549 }
1550 PrevNode = CurNode;
1551
1552 // Update the context ids, which is the intersection of the ids along
1553 // all edges in the sequence.
1554 set_intersect(SavedContextIds, Edge->getContextIds());
1555
1556 // If we now have no context ids for clone, skip this call.
1557 if (SavedContextIds.empty()) {
1558 Skip = true;
1559 break;
1560 }
1561 }
1562 if (Skip)
1563 continue;
1564
1565 // Create new context node.
1566 ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, Func, Call);
1567 NonAllocationCallToContextNodeMap[Call] = NewNode;
1568 CreatedNode = true;
1569 NewNode->AllocTypes = computeAllocType(SavedContextIds);
1570
1571 ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1572 assert(FirstNode);
1573
1574 // Connect to callees of innermost stack frame in inlined call chain.
1575 // This updates context ids for FirstNode's callees to reflect those
1576 // moved to NewNode.
1577 connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);
1578
1579 // Connect to callers of outermost stack frame in inlined call chain.
1580 // This updates context ids for LastNode's callers to reflect those
1581 // moved to NewNode.
1582 connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
1583
1584 // Now we need to remove context ids from edges/nodes between First and
1585 // Last Node.
1586 PrevNode = nullptr;
1587 for (auto Id : Ids) {
1588 ContextNode *CurNode = getNodeForStackId(Id);
1589 // We should only have kept stack ids that had nodes.
1590 assert(CurNode);
1591
1592 // Remove the context ids moved to NewNode from CurNode, and the
1593 // edge from the prior node.
1594 if (PrevNode) {
1595 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1596 assert(PrevEdge);
1597 set_subtract(PrevEdge->getContextIds(), SavedContextIds);
1598 if (PrevEdge->getContextIds().empty())
1599 removeEdgeFromGraph(PrevEdge);
1600 }
1601 // Since we update the edges from leaf to tail, only look at the callee
1602 // edges. This isn't an alloc node, so if there are no callee edges, the
1603 // alloc type is None.
1604 CurNode->AllocTypes = CurNode->CalleeEdges.empty()
1605 ? (uint8_t)AllocationType::None
1606 : CurNode->computeAllocType();
1607 PrevNode = CurNode;
1608 }
1609 if (VerifyNodes) {
1610 checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
1611 for (auto Id : Ids) {
1612 ContextNode *CurNode = getNodeForStackId(Id);
1613 // We should only have kept stack ids that had nodes.
1614 assert(CurNode);
1615 checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
1616 }
1617 }
1618 }
1619}
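// Example (illustrative; the stack ids are made up): if inlining left a call
// whose callsite metadata lists stack ids [A, B] (innermost to outermost), and
// both A and B have context nodes, the loop above creates a single new node
// for the call, assigns it the context ids common to the A->B edge, connects
// it to A's callees and to B's callers via connectNewNode, and subtracts those
// ids from the edge between the original A and B nodes.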
1620
1621template <typename DerivedCCG, typename FuncTy, typename CallTy>
1622void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
1623 // Map of stack id to all calls with that as the last (outermost caller)
1624 // callsite id that has a context node (some might not due to pruning
1625 // performed during matching of the allocation profile contexts).
1626 // The CallContextInfo contains the Call and a list of its stack ids with
1627 // ContextNodes, the function containing Call, and the set of context ids
1628 // the analysis will eventually identify for use in any new node created
1629 // for that callsite.
1630 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
1631 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1632 for (auto &Call : CallsWithMetadata) {
1633 // Ignore allocations, already handled.
1634 if (AllocationCallToContextNodeMap.count(Call))
1635 continue;
1636 auto StackIdsWithContextNodes =
1637 getStackIdsWithContextNodesForCall(Call.call());
1638 // If there were no nodes created for MIBs on allocs (maybe this was in
1639 // the unambiguous part of the MIB stack that was pruned), ignore.
1640 if (StackIdsWithContextNodes.empty())
1641 continue;
1642 // Otherwise, record this Call along with the list of ids for the last
1643 // (outermost caller) stack id with a node.
1644 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1645 {Call.call(), StackIdsWithContextNodes, Func, {}});
1646 }
1647 }
1648
1649 // First make a pass through all stack ids that correspond to a call,
1650 // as identified in the above loop. Compute the context ids corresponding to
1651 // each of these calls when they correspond to multiple stack ids due to
1652 // inlining. Perform any duplication of context ids required when
1653 // there is more than one call with the same stack ids. Their (possibly newly
1654 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1655 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1656 // Save a map from each call to any that are found to match it. I.e. located
1657 // in the same function and have the same (possibly pruned) stack ids. We use
1658 // this to avoid creating extra graph nodes as they can be treated the same.
1659 DenseMap<CallInfo, CallInfo> CallToMatchingCall;
1660 for (auto &It : StackIdToMatchingCalls) {
1661 auto &Calls = It.getSecond();
1662 // Skip single calls with a single stack id. These don't need a new node.
1663 if (Calls.size() == 1) {
1664 auto &Ids = Calls[0].StackIds;
1665 if (Ids.size() == 1)
1666 continue;
1667 }
1668 // In order to do the best and maximal matching of inlined calls to context
1669 // node sequences we will sort the vectors of stack ids in descending order
1670 // of length, and within each length, lexicographically by stack id. The
1671 // latter is so that we can specially handle calls that have identical stack
1672 // id sequences (either due to cloning or artificially because of the MIB
1673 // context pruning). Those with the same Ids are then sorted by function to
1674 // facilitate efficiently mapping them to the same context node.
1675 // Because the functions are pointers, to ensure a stable sort first assign
1676 // each function pointer to its first index in the Calls array, and then use
1677 // that to sort by.
1678 DenseMap<const FuncTy *, unsigned> FuncToIndex;
1679 for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
1680 FuncToIndex.insert({CallCtxInfo.Func, Idx});
1681 std::stable_sort(
1682 Calls.begin(), Calls.end(),
1683 [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
1684 return A.StackIds.size() > B.StackIds.size() ||
1685 (A.StackIds.size() == B.StackIds.size() &&
1686 (A.StackIds < B.StackIds ||
1687 (A.StackIds == B.StackIds &&
1688 FuncToIndex[A.Func] < FuncToIndex[B.Func])));
1689 });
1690
1691 // Find the node for the last stack id, which should be the same
1692 // across all calls recorded for this id, and is the id for this
1693 // entry in the StackIdToMatchingCalls map.
1694 uint64_t LastId = It.getFirst();
1695 ContextNode *LastNode = getNodeForStackId(LastId);
1696 // We should only have kept stack ids that had nodes.
1697 assert(LastNode);
1698
1699 if (LastNode->Recursive)
1700 continue;
1701
1702 // Initialize the context ids with the last node's. We will subsequently
1703 // refine the context ids by computing the intersection along all edges.
1704 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1705 assert(!LastNodeContextIds.empty());
1706
1707#ifndef NDEBUG
1708 // Save the set of functions seen for a particular set of the same stack
1709 // ids. This is used to ensure that they have been correctly sorted to be
1710 // adjacent in the Calls list, since we rely on that to efficiently place
1711 // all such matching calls onto the same context node.
1712 DenseSet<const FuncTy *> MatchingIdsFuncSet;
1713#endif
1714
1715 for (unsigned I = 0; I < Calls.size(); I++) {
1716 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1717 assert(SavedContextIds.empty());
1718 assert(LastId == Ids.back());
1719
1720#ifndef NDEBUG
1721 // If this call has a different set of ids than the last one, clear the
1722 // set used to ensure they are sorted properly.
1723 if (I > 0 && Ids != Calls[I - 1].StackIds)
1724 MatchingIdsFuncSet.clear();
1725#endif
1726
1727 // First compute the context ids for this stack id sequence (the
1728 // intersection of the context ids of the corresponding nodes).
1729 // Start with the remaining saved ids for the last node.
1730 assert(!LastNodeContextIds.empty());
1731 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
1732
1733 ContextNode *PrevNode = LastNode;
1734 ContextNode *CurNode = LastNode;
1735 bool Skip = false;
1736
1737 // Iterate backwards through the stack Ids, starting after the last Id
1738 // in the list, which was handled once outside for all Calls.
1739 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1740 auto Id = *IdIter;
1741 CurNode = getNodeForStackId(Id);
1742 // We should only have kept stack ids that had nodes.
1743 assert(CurNode);
1744
1745 if (CurNode->Recursive) {
1746 Skip = true;
1747 break;
1748 }
1749
1750 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1751 // If there is no edge then the nodes belong to different MIB contexts,
1752 // and we should skip this inlined context sequence. For example, this
1753 // particular inlined context may include stack ids A->B, and we may
1754 // indeed have nodes for both A and B, but it is possible that they were
1755 // never profiled in sequence in a single MIB for any allocation (i.e.
1756 // we might have profiled an allocation that involves the callsite A,
1757 // but through a different one of its callee callsites, and we might
1758 // have profiled an allocation that involves callsite B, but reached
1759 // from a different caller callsite).
1760 if (!Edge) {
1761 Skip = true;
1762 break;
1763 }
1764 PrevNode = CurNode;
1765
1766 // Update the context ids, which is the intersection of the ids along
1767 // all edges in the sequence.
1768 set_intersect(StackSequenceContextIds, Edge->getContextIds());
1769
1770 // If we now have no context ids for clone, skip this call.
1771 if (StackSequenceContextIds.empty()) {
1772 Skip = true;
1773 break;
1774 }
1775 }
1776 if (Skip)
1777 continue;
1778
1779 // If some of this call's stack ids did not have corresponding nodes (due
1780 // to pruning), don't include any context ids for contexts that extend
1781 // beyond these nodes. Otherwise we would be matching part of unrelated /
1782 // not fully matching stack contexts. To do this, subtract any context ids
1783 // found in caller nodes of the last node found above.
1784 if (Ids.back() != getLastStackId(Call)) {
1785 for (const auto &PE : LastNode->CallerEdges) {
1786 set_subtract(StackSequenceContextIds, PE->getContextIds());
1787 if (StackSequenceContextIds.empty())
1788 break;
1789 }
1790 // If we now have no context ids for clone, skip this call.
1791 if (StackSequenceContextIds.empty())
1792 continue;
1793 }
1794
1795#ifndef NDEBUG
1796 // If the prior call had the same stack ids this set would not be empty.
1797 // Check if we already have a call that "matches" because it is located
1798 // in the same function. If the Calls list was sorted properly we should
1799 // not encounter this situation as all such entries should be adjacent
1800 // and processed in bulk further below.
1801 assert(!MatchingIdsFuncSet.contains(Func));
1802
1803 MatchingIdsFuncSet.insert(Func);
1804#endif
1805
1806 // Check if the next set of stack ids is the same (since the Calls vector
1807 // of tuples is sorted by the stack ids we can just look at the next one).
1808 // If so, save them in the CallToMatchingCall map so that they get
1809 // assigned to the same context node, and skip them.
1810 bool DuplicateContextIds = false;
1811 for (unsigned J = I + 1; J < Calls.size(); J++) {
1812 auto &CallCtxInfo = Calls[J];
1813 auto &NextIds = CallCtxInfo.StackIds;
1814 if (NextIds != Ids)
1815 break;
1816 auto *NextFunc = CallCtxInfo.Func;
1817 if (NextFunc != Func) {
1818 // We have another Call with the same ids but that cannot share this
1819 // node, must duplicate ids for it.
1820 DuplicateContextIds = true;
1821 break;
1822 }
1823 auto &NextCall = CallCtxInfo.Call;
1824 CallToMatchingCall[NextCall] = Call;
1825 // Update I so that it gets incremented correctly to skip this call.
1826 I = J;
1827 }
1828
1829 // If we don't have duplicate context ids, then we can assign all the
1830 // context ids computed for the original node sequence to this call.
1831 // If there are duplicate calls with the same stack ids then we synthesize
1832 // new context ids that are duplicates of the originals. These are
1833 // assigned to SavedContextIds, which is a reference into the map entry
1834 // for this call, allowing us to access these ids later on.
1835 OldToNewContextIds.reserve(OldToNewContextIds.size() +
1836 StackSequenceContextIds.size());
1837 SavedContextIds =
1838 DuplicateContextIds
1839 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
1840 : StackSequenceContextIds;
1841 assert(!SavedContextIds.empty());
1842
1843 if (!DuplicateContextIds) {
1844 // Update saved last node's context ids to remove those that are
1845 // assigned to other calls, so that it is ready for the next call at
1846 // this stack id.
1847 set_subtract(LastNodeContextIds, StackSequenceContextIds);
1848 if (LastNodeContextIds.empty())
1849 break;
1850 }
1851 }
1852 }
1853
1854 // Propagate the duplicate context ids over the graph.
1855 propagateDuplicateContextIds(OldToNewContextIds);
1856
1857 if (VerifyCCG)
1858 check();
1859
1860 // Now perform a post-order traversal over the graph, starting with the
1861 // allocation nodes, essentially processing nodes from callers to callees.
1862 // For any that contains an id in the map, update the graph to contain new
1863 // nodes representing any inlining at interior callsites. Note we move the
1864 // associated context ids over to the new nodes.
1865 DenseSet<const ContextNode *> Visited;
1866 for (auto &Entry : AllocationCallToContextNodeMap)
1867 assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
1868 CallToMatchingCall);
1869 if (VerifyCCG)
1870 check();
1871}
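// Example (illustrative): if calls in two different functions share an
// identical (possibly pruned) stack id sequence, the sort above makes their
// entries adjacent and one of them is given freshly duplicated context ids via
// duplicateContextIds, so each callsite can later be cloned independently.
// Calls with the same ids within the same function are instead recorded in
// CallToMatchingCall and end up sharing one context node.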
1872
1873uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
1874 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1875 Call->getMetadata(LLVMContext::MD_callsite));
1876 return CallsiteContext.back();
1877}
1878
1879uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
1880 assert(isa<CallsiteInfo *>(Call.getBase()));
1881 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1882 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1883 // Need to convert index into stack id.
1884 return Index.getStackIdAtIndex(CallsiteContext.back());
1885}
1886
1887static const std::string MemProfCloneSuffix = ".memprof.";
1888
1889static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
1890 // We use CloneNo == 0 to refer to the original version, which doesn't get
1891 // renamed with a suffix.
1892 if (!CloneNo)
1893 return Base.str();
1894 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
1895}
1896
1897static bool isMemProfClone(const Function &F) {
1898 return F.getName().contains(MemProfCloneSuffix);
1899}
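// Example (illustrative): getMemProfFuncName("foo", 0) yields "foo" (the
// original version is never renamed), while getMemProfFuncName("foo", 2)
// yields "foo.memprof.2"; isMemProfClone() identifies such clones by the
// presence of the ".memprof." substring in the function name.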
1900
1901std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
1902 const Instruction *Call,
1903 unsigned CloneNo) const {
1904 return (Twine(Call->getFunction()->getName()) + " -> " +
1905 cast<CallBase>(Call)->getCalledFunction()->getName())
1906 .str();
1907}
1908
1909std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
1910 const IndexCall &Call,
1911 unsigned CloneNo) const {
1912 auto VI = FSToVIMap.find(Func);
1913 assert(VI != FSToVIMap.end());
1914 if (isa<AllocInfo *>(Call.getBase()))
1915 return (VI->second.name() + " -> alloc").str();
1916 else {
1917 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call.getBase());
1918 return (VI->second.name() + " -> " +
1919 getMemProfFuncName(Callsite->Callee.name(),
1920 Callsite->Clones[CloneNo]))
1921 .str();
1922 }
1923}
1924
1925std::vector<uint64_t>
1926ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
1927 Instruction *Call) {
1928 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1929 Call->getMetadata(LLVMContext::MD_callsite));
1930 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
1931 CallsiteContext);
1932}
1933
1934std::vector<uint64_t>
1935IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
1936 assert(isa<CallsiteInfo *>(Call.getBase()));
1937 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1938 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1939 return getStackIdsWithContextNodes<CallsiteInfo,
1940 SmallVector<unsigned>::const_iterator>(
1941 CallsiteContext);
1942}
1943
1944template <typename DerivedCCG, typename FuncTy, typename CallTy>
1945template <class NodeT, class IteratorT>
1946std::vector<uint64_t>
1947CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
1948 CallStack<NodeT, IteratorT> &CallsiteContext) {
1949 std::vector<uint64_t> StackIds;
1950 for (auto IdOrIndex : CallsiteContext) {
1951 auto StackId = getStackId(IdOrIndex);
1952 ContextNode *Node = getNodeForStackId(StackId);
1953 if (!Node)
1954 break;
1955 StackIds.push_back(StackId);
1956 }
1957 return StackIds;
1958}
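// Note: the walk above proceeds from the innermost frame outward and stops at
// the first stack id without a context node, so the returned vector is the
// longest prefix of the callsite's stack ids that is represented in the graph
// (possibly empty if even the innermost frame was pruned).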
1959
1960ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
1961 Module &M,
1962 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
1963 : Mod(M), OREGetter(OREGetter) {
1964 for (auto &F : M) {
1965 std::vector<CallInfo> CallsWithMetadata;
1966 for (auto &BB : F) {
1967 for (auto &I : BB) {
1968 if (!isa<CallBase>(I))
1969 continue;
1970 if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
1971 CallsWithMetadata.push_back(&I);
1972 auto *AllocNode = addAllocNode(&I, &F);
1973 auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
1974 assert(CallsiteMD);
1975 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
1976 // Add all of the MIBs and their stack nodes.
1977 for (auto &MDOp : MemProfMD->operands()) {
1978 auto *MIBMD = cast<const MDNode>(MDOp);
1979 std::vector<ContextTotalSize> ContextSizeInfo;
1980 // Collect the context size information if it exists.
1981 if (MIBMD->getNumOperands() > 2) {
1982 for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
1983 MDNode *ContextSizePair =
1984 dyn_cast<MDNode>(MIBMD->getOperand(I));
1985 assert(ContextSizePair->getNumOperands() == 2);
1986 uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
1987 ContextSizePair->getOperand(0))
1988 ->getZExtValue();
1989 uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>(
1990 ContextSizePair->getOperand(1))
1991 ->getZExtValue();
1992 ContextSizeInfo.push_back({FullStackId, TotalSize});
1993 }
1994 }
1995 MDNode *StackNode = getMIBStackNode(MIBMD);
1996 assert(StackNode);
1997 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
1998 addStackNodesForMIB<MDNode, MDNode::op_iterator>(
1999 AllocNode, StackContext, CallsiteContext,
2000 getMIBAllocType(MIBMD), ContextSizeInfo);
2001 }
2002 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2003 // Memprof and callsite metadata on memory allocations no longer
2004 // needed.
2005 I.setMetadata(LLVMContext::MD_memprof, nullptr);
2006 I.setMetadata(LLVMContext::MD_callsite, nullptr);
2007 }
2008 // For callsite metadata, add to list for this function for later use.
2009 else if (I.getMetadata(LLVMContext::MD_callsite)) {
2010 CallsWithMetadata.push_back(&I);
2011 }
2012 }
2013 }
2014 if (!CallsWithMetadata.empty())
2015 FuncToCallsWithMetadata[&F] = CallsWithMetadata;
2016 }
2017
2018 if (DumpCCG) {
2019 dbgs() << "CCG before updating call stack chains:\n";
2020 dbgs() << *this;
2021 }
2022
2023 if (ExportToDot)
2024 exportToDot("prestackupdate");
2025
2026 updateStackNodes();
2027
2028 handleCallsitesWithMultipleTargets();
2029
2030 // Strip off remaining callsite metadata, no longer needed.
2031 for (auto &FuncEntry : FuncToCallsWithMetadata)
2032 for (auto &Call : FuncEntry.second)
2033 Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
2034}
2035
2036IndexCallsiteContextGraph::IndexCallsiteContextGraph(
2037 ModuleSummaryIndex &Index,
2038 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
2039 isPrevailing)
2040 : Index(Index), isPrevailing(isPrevailing) {
2041 for (auto &I : Index) {
2042 auto VI = Index.getValueInfo(I);
2043 for (auto &S : VI.getSummaryList()) {
2044 // We should only add the prevailing nodes. Otherwise we may try to clone
2045 // in a weak copy that won't be linked (and may be different than the
2046 // prevailing version).
2047 // We only keep the memprof summary on the prevailing copy now when
2048 // building the combined index, as a space optimization; however, don't
2049 // rely on this optimization. The linker doesn't resolve local linkage
2050 // values, so don't check whether those are prevailing.
2051 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2052 !isPrevailing(VI.getGUID(), S.get()))
2053 continue;
2054 auto *FS = dyn_cast<FunctionSummary>(S.get());
2055 if (!FS)
2056 continue;
2057 std::vector<CallInfo> CallsWithMetadata;
2058 if (!FS->allocs().empty()) {
2059 for (auto &AN : FS->mutableAllocs()) {
2060 // This can happen because of recursion elimination handling that
2061 // currently exists in ModuleSummaryAnalysis. Skip these for now.
2062 // We still added them to the summary because we need to be able to
2063 // correlate properly in applyImport in the backends.
2064 if (AN.MIBs.empty())
2065 continue;
2066 IndexCall AllocCall(&AN);
2067 CallsWithMetadata.push_back(AllocCall);
2068 auto *AllocNode = addAllocNode(AllocCall, FS);
2069 // Pass an empty CallStack to the CallsiteContext (second)
2070 // parameter, since for ThinLTO we already collapsed out the inlined
2071 // stack ids on the allocation call during ModuleSummaryAnalysis.
2072 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2073 EmptyContext;
2074 unsigned I = 0;
2075 assert(
2077 AN.ContextSizeInfos.size() == AN.MIBs.size());
2078 // Now add all of the MIBs and their stack nodes.
2079 for (auto &MIB : AN.MIBs) {
2080 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2081 StackContext(&MIB);
2082 std::vector<ContextTotalSize> ContextSizeInfo;
2083 if (!AN.ContextSizeInfos.empty()) {
2084 for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
2085 ContextSizeInfo.push_back({FullStackId, TotalSize});
2086 }
2087 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
2088 AllocNode, StackContext, EmptyContext, MIB.AllocType,
2089 ContextSizeInfo);
2090 I++;
2091 }
2092 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2093 // Initialize version 0 on the summary alloc node to the current alloc
2094 // type, unless it has both types in which case make it default, so
2095 // that in the case where we aren't able to clone, the original version
2096 // always ends up with the default allocation behavior.
2097 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
2098 }
2099 }
2100 // For callsite metadata, add to list for this function for later use.
2101 if (!FS->callsites().empty())
2102 for (auto &SN : FS->mutableCallsites()) {
2103 IndexCall StackNodeCall(&SN);
2104 CallsWithMetadata.push_back(StackNodeCall);
2105 }
2106
2107 if (!CallsWithMetadata.empty())
2108 FuncToCallsWithMetadata[FS] = CallsWithMetadata;
2109
2110 if (!FS->allocs().empty() || !FS->callsites().empty())
2111 FSToVIMap[FS] = VI;
2112 }
2113 }
2114
2115 if (DumpCCG) {
2116 dbgs() << "CCG before updating call stack chains:\n";
2117 dbgs() << *this;
2118 }
2119
2120 if (ExportToDot)
2121 exportToDot("prestackupdate");
2122
2123 updateStackNodes();
2124
2125 handleCallsitesWithMultipleTargets();
2126}
2127
2128template <typename DerivedCCG, typename FuncTy, typename CallTy>
2129void CallsiteContextGraph<DerivedCCG, FuncTy,
2130 CallTy>::handleCallsitesWithMultipleTargets() {
2131 // Look for and workaround callsites that call multiple functions.
2132 // This can happen for indirect calls, which need better handling, and in
2133 // more rare cases (e.g. macro expansion).
2134 // TODO: To fix this for indirect calls we will want to perform speculative
2135 // devirtualization using either the normal PGO info with ICP, or using the
2136 // information in the profiled MemProf contexts. We can do this prior to
2137 // this transformation for regular LTO, and for ThinLTO we can simulate that
2138 // effect in the summary and perform the actual speculative devirtualization
2139 // while cloning in the ThinLTO backend.
2140
2141 // Keep track of the new nodes synthesized for discovered tail calls missing
2142 // from the profiled contexts.
2143 MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
2144
2145 std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode;
2146 for (auto &Entry : NonAllocationCallToContextNodeMap) {
2147 auto *Node = Entry.second;
2148 assert(Node->Clones.empty());
2149 // Check all node callees and see if in the same function.
2150 // We need to check all of the calls recorded in this Node, because in some
2151 // cases we may have had multiple calls with the same debug info calling
2152 // different callees. This can happen, for example, when an object is
2153 // constructed in the parameter list - the destructor call of the object has
2154 // the same debug info (line/col) as the call the object was passed to.
2155 // Here we will prune any that don't match all callee nodes.
2156 std::vector<CallInfo> AllCalls;
2157 AllCalls.reserve(Node->MatchingCalls.size() + 1);
2158 AllCalls.push_back(Node->Call);
2159 AllCalls.insert(AllCalls.end(), Node->MatchingCalls.begin(),
2160 Node->MatchingCalls.end());
2161
2162 // First see if we can partition the calls by callee function, creating new
2163 // nodes to host each set of calls calling the same callees. This is
2164 // necessary to support indirect calls with ThinLTO, for which we
2165 // synthesized CallsiteInfo records for each target. They will all have the
2166 // same callsite stack ids and would be sharing a context node at this
2167 // point. We need to perform separate cloning for each, which will be
2168 // applied along with speculative devirtualization in the ThinLTO backends
2169 // as needed. Note this does not currently support looking through tail
2170 // calls, it is unclear if we need that for indirect call targets.
2171 // First partition calls by callee func. Map indexed by func, value is
2172 // struct with list of matching calls, assigned node.
2173 if (partitionCallsByCallee(Node, AllCalls, NewCallToNode))
2174 continue;
2175
2176 auto It = AllCalls.begin();
2177 // Iterate through the calls until we find the first that matches.
2178 for (; It != AllCalls.end(); ++It) {
2179 auto ThisCall = *It;
2180 bool Match = true;
2181 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
2182 ++EI) {
2183 auto Edge = *EI;
2184 if (!Edge->Callee->hasCall())
2185 continue;
2186 assert(NodeToCallingFunc.count(Edge->Callee));
2187 // Check if the called function matches that of the callee node.
2188 if (!calleesMatch(ThisCall.call(), EI, TailCallToContextNodeMap)) {
2189 Match = false;
2190 break;
2191 }
2192 }
2193 // Found a call that matches the callee nodes, we can quit now.
2194 if (Match) {
2195 // If the first match is not the primary call on the Node, update it
2196 // now. We will update the list of matching calls further below.
2197 if (Node->Call != ThisCall) {
2198 Node->setCall(ThisCall);
2199 // We need to update the NonAllocationCallToContextNodeMap, but don't
2200 // want to do this during iteration over that map, so save the calls
2201 // that need updated entries.
2202 NewCallToNode.push_back({ThisCall, Node});
2203 }
2204 break;
2205 }
2206 }
2207 // We will update this list below (or leave it cleared if there was no
2208 // match found above).
2209 Node->MatchingCalls.clear();
2210 // If we hit the end of the AllCalls vector, no call matching the callee
2211 // nodes was found, clear the call information in the node.
2212 if (It == AllCalls.end()) {
2213 RemovedEdgesWithMismatchedCallees++;
2214 // Work around by setting Node to have a null call, so it gets
2215 // skipped during cloning. Otherwise assignFunctions will assert
2216 // because its data structures are not designed to handle this case.
2217 Node->setCall(CallInfo());
2218 continue;
2219 }
2220 // Now add back any matching calls that call the same function as the
2221 // matching primary call on Node.
2222 for (++It; It != AllCalls.end(); ++It) {
2223 auto ThisCall = *It;
2224 if (!sameCallee(Node->Call.call(), ThisCall.call()))
2225 continue;
2226 Node->MatchingCalls.push_back(ThisCall);
2227 }
2228 }
2229
2230 // Remove all mismatched nodes identified in the above loop from the node map
2231 // (checking whether they have a null call which is set above). For a
2232 // MapVector like NonAllocationCallToContextNodeMap it is much more efficient
2233 // to do the removal via remove_if than by individually erasing entries above.
2234 // Also remove any entries if we updated the node's primary call above.
2235 NonAllocationCallToContextNodeMap.remove_if([](const auto &it) {
2236 return !it.second->hasCall() || it.second->Call != it.first;
2237 });
2238
2239 // Add entries for any new primary calls recorded above.
2240 for (auto &[Call, Node] : NewCallToNode)
2241 NonAllocationCallToContextNodeMap[Call] = Node;
2242
2243 // Add the new nodes after the above loop so that the iteration is not
2244 // invalidated.
2245 for (auto &[Call, Node] : TailCallToContextNodeMap)
2246 NonAllocationCallToContextNodeMap[Call] = Node;
2247}
2248
2249template <typename DerivedCCG, typename FuncTy, typename CallTy>
2250bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee(
2251 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
2252 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) {
2253 // Struct to keep track of all the calls having the same callee function,
2254 // and the node we eventually assign to them. Eventually we will record the
2255 // context node assigned to this group of calls.
2256 struct CallsWithSameCallee {
2257 std::vector<CallInfo> Calls;
2258 ContextNode *Node = nullptr;
2259 };
2260
2261 // First partition calls by callee function. Build map from each function
2262 // to the list of matching calls.
2263 DenseMap<const FuncTy *, CallsWithSameCallee> CalleeFuncToCallInfo;
2264 for (auto ThisCall : AllCalls) {
2265 auto *F = getCalleeFunc(ThisCall.call());
2266 if (F)
2267 CalleeFuncToCallInfo[F].Calls.push_back(ThisCall);
2268 }
2269
2270 // Next, walk through all callee edges. For each callee node, get its
2271 // containing function and see if it was recorded in the above map (meaning we
2272 // have at least one matching call). Build another map from each callee node
2273 // with a matching call to the structure instance created above containing all
2274 // the calls.
2275 DenseMap<const ContextNode *, CallsWithSameCallee *> CalleeNodeToCallInfo;
2276 for (const auto &Edge : Node->CalleeEdges) {
2277 if (!Edge->Callee->hasCall())
2278 continue;
2279 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2280 if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc))
2281 CalleeNodeToCallInfo[Edge->Callee] =
2282 &CalleeFuncToCallInfo[ProfiledCalleeFunc];
2283 }
2284
2285 // If there are no entries in the second map, then there were no matching
2286 // calls/callees; nothing to do here. Return so we can go to the handling that
2287 // looks through tail calls.
2288 if (CalleeNodeToCallInfo.empty())
2289 return false;
2290
2291 // Walk through all callee edges again. Any and all callee edges that didn't
2292 // match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a
2293 // new caller node (UnmatchedCalleesNode) which gets a null call so that it is
2294 // ignored during cloning. If it is in the map, then we use the node recorded
2295 // in that entry (creating it if needed), and move the callee edge to it.
2296 // The first callee will use the original node instead of creating a new one.
2297 // Note that any of the original calls on this node (in AllCalls) that didn't
2298 // have a callee function automatically get dropped from the node as part of
2299 // this process.
2300 ContextNode *UnmatchedCalleesNode = nullptr;
2301 // Track whether we already assigned original node to a callee.
2302 bool UsedOrigNode = false;
2303 assert(NodeToCallingFunc[Node]);
2304 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
2305 auto Edge = *EI;
2306 if (!Edge->Callee->hasCall()) {
2307 ++EI;
2308 continue;
2309 }
2310
2311 // Will be updated below to point to whatever (caller) node this callee edge
2312 // should be moved to.
2313 ContextNode *CallerNodeToUse = nullptr;
2314
2315 // Handle the case where there were no matching calls first. Move this
2316 // callee edge to the UnmatchedCalleesNode, creating it if needed.
2317 if (!CalleeNodeToCallInfo.contains(Edge->Callee)) {
2318 if (!UnmatchedCalleesNode)
2319 UnmatchedCalleesNode =
2320 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2321 CallerNodeToUse = UnmatchedCalleesNode;
2322 } else {
2323 // Look up the information recorded for this callee node, and use the
2324 // recorded caller node (creating it if needed).
2325 auto *Info = CalleeNodeToCallInfo[Edge->Callee];
2326 if (!Info->Node) {
2327 // If we haven't assigned any callees to the original node use it.
2328 if (!UsedOrigNode) {
2329 Info->Node = Node;
2330 // Clear the set of matching calls which will be updated below.
2331 Node->MatchingCalls.clear();
2332 UsedOrigNode = true;
2333 } else
2334 Info->Node =
2335 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2336 assert(!Info->Calls.empty());
2337 // The first call becomes the primary call for this caller node, and the
2338 // rest go in the matching calls list.
2339 Info->Node->setCall(Info->Calls.front());
2340 Info->Node->MatchingCalls.insert(Info->Node->MatchingCalls.end(),
2341 Info->Calls.begin() + 1,
2342 Info->Calls.end());
2343 // Save the primary call to node correspondence so that we can update
2344 // the NonAllocationCallToContextNodeMap, which is being iterated in the
2345 // caller of this function.
2346 NewCallToNode.push_back({Info->Node->Call, Info->Node});
2347 }
2348 CallerNodeToUse = Info->Node;
2349 }
2350
2351 // Don't need to move edge if we are using the original node;
2352 if (CallerNodeToUse == Node) {
2353 ++EI;
2354 continue;
2355 }
2356
2357 moveCalleeEdgeToNewCaller(EI, CallerNodeToUse);
2358 }
2359 // Now that we are done moving edges, clean up any caller edges that ended
2360 // up with no type or context ids. During moveCalleeEdgeToNewCaller all
2361 // caller edges from Node are replicated onto the new callers, and it
2362 // simplifies the handling to leave them until we have moved all
2363 // edges/context ids.
2364 for (auto &I : CalleeNodeToCallInfo)
2365 removeNoneTypeCallerEdges(I.second->Node);
2366 if (UnmatchedCalleesNode)
2367 removeNoneTypeCallerEdges(UnmatchedCalleesNode);
2368 removeNoneTypeCallerEdges(Node);
2369
2370 return true;
2371}
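// Example (illustrative): if Node carries an indirect call for which the
// summary synthesized CallsiteInfo records for targets T1 and T2, the logic
// above splits Node so that the callee edge into T1's node stays on one caller
// node and the edge into T2's node moves to another, each node keeping only
// the calls that resolve to that target; callee edges matching none of the
// calls are parked on UnmatchedCalleesNode, which keeps a null call so cloning
// ignores it.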
2372
2373uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2374 // In the Module (IR) case this is already the Id.
2375 return IdOrIndex;
2376}
2377
2378uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2379 // In the Index case this is an index into the stack id list in the summary
2380 // index, convert it to an Id.
2381 return Index.getStackIdAtIndex(IdOrIndex);
2382}
2383
2384template <typename DerivedCCG, typename FuncTy, typename CallTy>
2385bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
2386 CallTy Call, EdgeIter &EI,
2387 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
2388 auto Edge = *EI;
2389 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2390 const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
2391 // Will be populated in order of callee to caller if we find a chain of tail
2392 // calls between the profiled caller and callee.
2393 std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
2394 if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
2395 FoundCalleeChain))
2396 return false;
2397
2398 // The usual case where the profiled callee matches that of the IR/summary.
2399 if (FoundCalleeChain.empty())
2400 return true;
2401
2402 auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
2403 auto *CurEdge = Callee->findEdgeFromCaller(Caller);
2404 // If there is already an edge between these nodes, simply update it and
2405 // return.
2406 if (CurEdge) {
2407 CurEdge->ContextIds.insert(Edge->ContextIds.begin(),
2408 Edge->ContextIds.end());
2409 CurEdge->AllocTypes |= Edge->AllocTypes;
2410 return;
2411 }
2412 // Otherwise, create a new edge and insert it into the caller and callee
2413 // lists.
2414 auto NewEdge = std::make_shared<ContextEdge>(
2415 Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
2416 Callee->CallerEdges.push_back(NewEdge);
2417 if (Caller == Edge->Caller) {
2418 // If we are inserting the new edge into the current edge's caller, insert
2419 // the new edge before the current iterator position, and then increment
2420 // back to the current edge.
2421 EI = Caller->CalleeEdges.insert(EI, NewEdge);
2422 ++EI;
2423 assert(*EI == Edge &&
2424 "Iterator position not restored after insert and increment");
2425 } else
2426 Caller->CalleeEdges.push_back(NewEdge);
2427 };
2428
2429 // Create new nodes for each found callee and connect in between the profiled
2430 // caller and callee.
2431 auto *CurCalleeNode = Edge->Callee;
2432 for (auto &[NewCall, Func] : FoundCalleeChain) {
2433 ContextNode *NewNode = nullptr;
2434 // First check if we have already synthesized a node for this tail call.
2435 if (TailCallToContextNodeMap.count(NewCall)) {
2436 NewNode = TailCallToContextNodeMap[NewCall];
2437 NewNode->AllocTypes |= Edge->AllocTypes;
2438 } else {
2439 FuncToCallsWithMetadata[Func].push_back({NewCall});
2440 // Create Node and record node info.
2441 NewNode = createNewNode(/*IsAllocation=*/false, Func, NewCall);
2442 TailCallToContextNodeMap[NewCall] = NewNode;
2443 NewNode->AllocTypes = Edge->AllocTypes;
2444 }
2445
2446 // Hook up node to its callee node
2447 AddEdge(NewNode, CurCalleeNode);
2448
2449 CurCalleeNode = NewNode;
2450 }
2451
2452 // Hook up edge's original caller to new callee node.
2453 AddEdge(Edge->Caller, CurCalleeNode);
2454
2455#ifndef NDEBUG
2456 // Save this because Edge's fields get cleared below when removed.
2457 auto *Caller = Edge->Caller;
2458#endif
2459
2460 // Remove old edge
2461 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
2462
2463 // To simplify the increment of EI in the caller, subtract one from EI.
2464 // In the final AddEdge call we would have either added a new callee edge,
2465 // to Edge->Caller, or found an existing one. Either way we are guaranteed
2466 // that there is at least one callee edge.
2467 assert(!Caller->CalleeEdges.empty());
2468 --EI;
2469
2470 return true;
2471}
2472
2473bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2474 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
2475 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
2476 bool &FoundMultipleCalleeChains) {
2477 // Stop recursive search if we have already explored the maximum specified
2478 // depth.
2479 if (Depth > TailCallSearchDepth)
2480 return false;
2481
2482 auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
2483 FoundCalleeChain.push_back({Callsite, F});
2484 };
2485
2486 auto *CalleeFunc = dyn_cast<Function>(CurCallee);
2487 if (!CalleeFunc) {
2488 auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
2489 assert(Alias);
2490 CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
2491 assert(CalleeFunc);
2492 }
2493
2494 // Look for tail calls in this function, and check if they either call the
2495 // profiled callee directly, or indirectly (via a recursive search).
2496 // Only succeed if there is a single unique tail call chain found between the
2497 // profiled caller and callee, otherwise we could perform incorrect cloning.
2498 bool FoundSingleCalleeChain = false;
2499 for (auto &BB : *CalleeFunc) {
2500 for (auto &I : BB) {
2501 auto *CB = dyn_cast<CallBase>(&I);
2502 if (!CB || !CB->isTailCall())
2503 continue;
2504 auto *CalledValue = CB->getCalledOperand();
2505 auto *CalledFunction = CB->getCalledFunction();
2506 if (CalledValue && !CalledFunction) {
2507 CalledValue = CalledValue->stripPointerCasts();
2508 // Stripping pointer casts can reveal a called function.
2509 CalledFunction = dyn_cast<Function>(CalledValue);
2510 }
2511 // Check if this is an alias to a function. If so, get the
2512 // called aliasee for the checks below.
2513 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
2514 assert(!CalledFunction &&
2515 "Expected null called function in callsite for alias");
2516 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
2517 }
2518 if (!CalledFunction)
2519 continue;
2520 if (CalledFunction == ProfiledCallee) {
2521 if (FoundSingleCalleeChain) {
2522 FoundMultipleCalleeChains = true;
2523 return false;
2524 }
2525 FoundSingleCalleeChain = true;
2526 FoundProfiledCalleeCount++;
2527 FoundProfiledCalleeDepth += Depth;
2528 if (Depth > FoundProfiledCalleeMaxDepth)
2529 FoundProfiledCalleeMaxDepth = Depth;
2530 SaveCallsiteInfo(&I, CalleeFunc);
2531 } else if (findProfiledCalleeThroughTailCalls(
2532 ProfiledCallee, CalledFunction, Depth + 1,
2533 FoundCalleeChain, FoundMultipleCalleeChains)) {
2534 // findProfiledCalleeThroughTailCalls should not have returned
2535 // true if FoundMultipleCalleeChains.
2536 assert(!FoundMultipleCalleeChains);
2537 if (FoundSingleCalleeChain) {
2538 FoundMultipleCalleeChains = true;
2539 return false;
2540 }
2541 FoundSingleCalleeChain = true;
2542 SaveCallsiteInfo(&I, CalleeFunc);
2543 } else if (FoundMultipleCalleeChains)
2544 return false;
2545 }
2546 }
2547
2548 return FoundSingleCalleeChain;
2549}
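// Example (illustrative): if the profile recorded caller A invoking allocation
// wrapper C, but in the IR A calls B and B tail-calls C, the search above
// (invoked with CurCallee = B) finds the unique chain through B and appends
// B's tail callsite to FoundCalleeChain; calleesMatch() then synthesizes a
// context node for that callsite so the graph models A -> B -> C. If two
// distinct tail-call chains could reach C, the match is rejected instead.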
2550
2551const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) {
2552 auto *CB = dyn_cast<CallBase>(Call);
2553 if (!CB->getCalledOperand() || CB->isIndirectCall())
2554 return nullptr;
2555 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2556 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2557 if (Alias)
2558 return dyn_cast<Function>(Alias->getAliasee());
2559 return dyn_cast<Function>(CalleeVal);
2560}
2561
2562bool ModuleCallsiteContextGraph::calleeMatchesFunc(
2563 Instruction *Call, const Function *Func, const Function *CallerFunc,
2564 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
2565 auto *CB = dyn_cast<CallBase>(Call);
2566 if (!CB->getCalledOperand() || CB->isIndirectCall())
2567 return false;
2568 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2569 auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
2570 if (CalleeFunc == Func)
2571 return true;
2572 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2573 if (Alias && Alias->getAliasee() == Func)
2574 return true;
2575
2576 // Recursively search for the profiled callee through tail calls starting with
2577 // the actual Callee. The discovered tail call chain is saved in
2578 // FoundCalleeChain, and we will fixup the graph to include these callsites
2579 // after returning.
2580 // FIXME: We will currently redo the same recursive walk if we find the same
2581 // mismatched callee from another callsite. We can improve this with more
2582 // bookkeeping of the created chain of new nodes for each mismatch.
2583 unsigned Depth = 1;
2584 bool FoundMultipleCalleeChains = false;
2585 if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
2586 FoundCalleeChain,
2587 FoundMultipleCalleeChains)) {
2588 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
2589 << Func->getName() << " from " << CallerFunc->getName()
2590 << " that actually called " << CalleeVal->getName()
2591 << (FoundMultipleCalleeChains
2592 ? " (found multiple possible chains)"
2593 : "")
2594 << "\n");
2595 if (FoundMultipleCalleeChains)
2596 FoundProfiledCalleeNonUniquelyCount++;
2597 return false;
2598 }
2599
2600 return true;
2601}
2602
2603bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1,
2604 Instruction *Call2) {
2605 auto *CB1 = cast<CallBase>(Call1);
2606 if (!CB1->getCalledOperand() || CB1->isIndirectCall())
2607 return false;
2608 auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts();
2609 auto *CalleeFunc1 = dyn_cast<Function>(CalleeVal1);
2610 auto *CB2 = cast<CallBase>(Call2);
2611 if (!CB2->getCalledOperand() || CB2->isIndirectCall())
2612 return false;
2613 auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts();
2614 auto *CalleeFunc2 = dyn_cast<Function>(CalleeVal2);
2615 return CalleeFunc1 == CalleeFunc2;
2616}
2617
2618bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2619 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
2620 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
2621 bool &FoundMultipleCalleeChains) {
2622 // Stop recursive search if we have already explored the maximum specified
2623 // depth.
2624 if (Depth > TailCallSearchDepth)
2625 return false;
2626
2627 auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
2628 // Make a CallsiteInfo for each discovered callee, if one hasn't already
2629 // been synthesized.
2630 if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
2631 !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
2632 // StackIds is empty (we don't have debug info available in the index for
2633 // these callsites)
2634 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
2635 std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
2636 CallsiteInfo *NewCallsiteInfo =
2637 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
2638 FoundCalleeChain.push_back({NewCallsiteInfo, FS});
2639 };
2640
2641 // Look for tail calls in this function, and check if they either call the
2642 // profiled callee directly, or indirectly (via a recursive search).
2643 // Only succeed if there is a single unique tail call chain found between the
2644 // profiled caller and callee, otherwise we could perform incorrect cloning.
2645 bool FoundSingleCalleeChain = false;
2646 for (auto &S : CurCallee.getSummaryList()) {
2647 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2648 !isPrevailing(CurCallee.getGUID(), S.get()))
2649 continue;
2650 auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
2651 if (!FS)
2652 continue;
2653 auto FSVI = CurCallee;
2654 auto *AS = dyn_cast<AliasSummary>(S.get());
2655 if (AS)
2656 FSVI = AS->getAliaseeVI();
2657 for (auto &CallEdge : FS->calls()) {
2658 if (!CallEdge.second.hasTailCall())
2659 continue;
2660 if (CallEdge.first == ProfiledCallee) {
2661 if (FoundSingleCalleeChain) {
2662 FoundMultipleCalleeChains = true;
2663 return false;
2664 }
2665 FoundSingleCalleeChain = true;
2666 FoundProfiledCalleeCount++;
2667 FoundProfiledCalleeDepth += Depth;
2668 if (Depth > FoundProfiledCalleeMaxDepth)
2669 FoundProfiledCalleeMaxDepth = Depth;
2670 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2671 // Add FS to FSToVIMap in case it isn't already there.
2672 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2673 FSToVIMap[FS] = FSVI;
2674 } else if (findProfiledCalleeThroughTailCalls(
2675 ProfiledCallee, CallEdge.first, Depth + 1,
2676 FoundCalleeChain, FoundMultipleCalleeChains)) {
2677 // findProfiledCalleeThroughTailCalls should not have returned
2678 // true if FoundMultipleCalleeChains.
2679 assert(!FoundMultipleCalleeChains);
2680 if (FoundSingleCalleeChain) {
2681 FoundMultipleCalleeChains = true;
2682 return false;
2683 }
2684 FoundSingleCalleeChain = true;
2685 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2686 // Add FS to FSToVIMap in case it isn't already there.
2687 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2688 FSToVIMap[FS] = FSVI;
2689 } else if (FoundMultipleCalleeChains)
2690 return false;
2691 }
2692 }
2693
2694 return FoundSingleCalleeChain;
2695}
2696
2697const FunctionSummary *
2698IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) {
2699 ValueInfo Callee =
2700 dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee;
2701 if (Callee.getSummaryList().empty())
2702 return nullptr;
2703 return dyn_cast<FunctionSummary>(Callee.getSummaryList()[0]->getBaseObject());
2704}
2705
2706bool IndexCallsiteContextGraph::calleeMatchesFunc(
2707 IndexCall &Call, const FunctionSummary *Func,
2708 const FunctionSummary *CallerFunc,
2709 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
2710 ValueInfo Callee =
2711 dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee;
2712 // If there is no summary list then this is a call to an externally defined
2713 // symbol.
2714 AliasSummary *Alias =
2715 Callee.getSummaryList().empty()
2716 ? nullptr
2717 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
2718 assert(FSToVIMap.count(Func));
2719 auto FuncVI = FSToVIMap[Func];
2720 if (Callee == FuncVI ||
2721 // If callee is an alias, check the aliasee, since only function
2722 // summary base objects will contain the stack node summaries and thus
2723 // get a context node.
2724 (Alias && Alias->getAliaseeVI() == FuncVI))
2725 return true;
2726
2727 // Recursively search for the profiled callee through tail calls starting with
2728 // the actual Callee. The discovered tail call chain is saved in
2729 // FoundCalleeChain, and we will fixup the graph to include these callsites
2730 // after returning.
2731 // FIXME: We will currently redo the same recursive walk if we find the same
2732 // mismatched callee from another callsite. We can improve this with more
2733 // bookkeeping of the created chain of new nodes for each mismatch.
2734 unsigned Depth = 1;
2735 bool FoundMultipleCalleeChains = false;
2736 if (!findProfiledCalleeThroughTailCalls(
2737 FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
2738 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
2739 << " from " << FSToVIMap[CallerFunc]
2740 << " that actually called " << Callee
2741 << (FoundMultipleCalleeChains
2742 ? " (found multiple possible chains)"
2743 : "")
2744 << "\n");
2745 if (FoundMultipleCalleeChains)
2746 FoundProfiledCalleeNonUniquelyCount++;
2747 return false;
2748 }
2749
2750 return true;
2751}
2752
2753bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) {
2754 ValueInfo Callee1 =
2755 dyn_cast_if_present<CallsiteInfo *>(Call1.getBase())->Callee;
2756 ValueInfo Callee2 =
2757 dyn_cast_if_present<CallsiteInfo *>(Call2.getBase())->Callee;
2758 return Callee1 == Callee2;
2759}
2760
2761template <typename DerivedCCG, typename FuncTy, typename CallTy>
2762void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
2763 const {
2764 print(dbgs());
2765 dbgs() << "\n";
2766}
2767
2768template <typename DerivedCCG, typename FuncTy, typename CallTy>
2769void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
2770 raw_ostream &OS) const {
2771 OS << "Node " << this << "\n";
2772 OS << "\t";
2773 printCall(OS);
2774 if (Recursive)
2775 OS << " (recursive)";
2776 OS << "\n";
2777 if (!MatchingCalls.empty()) {
2778 OS << "\tMatchingCalls:\n";
2779 for (auto &MatchingCall : MatchingCalls) {
2780 OS << "\t";
2781 MatchingCall.print(OS);
2782 OS << "\n";
2783 }
2784 }
2785 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
2786 OS << "\tContextIds:";
2787 // Make a copy of the computed context ids that we can sort for stability.
2788 auto ContextIds = getContextIds();
2789 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2790 std::sort(SortedIds.begin(), SortedIds.end());
2791 for (auto Id : SortedIds)
2792 OS << " " << Id;
2793 OS << "\n";
2794 OS << "\tCalleeEdges:\n";
2795 for (auto &Edge : CalleeEdges)
2796 OS << "\t\t" << *Edge << "\n";
2797 OS << "\tCallerEdges:\n";
2798 for (auto &Edge : CallerEdges)
2799 OS << "\t\t" << *Edge << "\n";
2800 if (!Clones.empty()) {
2801 OS << "\tClones: ";
2802 ListSeparator LS;
2803 for (auto *Clone : Clones)
2804 OS << LS << Clone;
2805 OS << "\n";
2806 } else if (CloneOf) {
2807 OS << "\tClone of " << CloneOf << "\n";
2808 }
2809}
2810
2811template <typename DerivedCCG, typename FuncTy, typename CallTy>
2812void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
2813 const {
2814 print(dbgs());
2815 dbgs() << "\n";
2816}
2817
2818template <typename DerivedCCG, typename FuncTy, typename CallTy>
2819void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
2820 raw_ostream &OS) const {
2821 OS << "Edge from Callee " << Callee << " to Caller: " << Caller
2822 << " AllocTypes: " << getAllocTypeString(AllocTypes);
2823 OS << " ContextIds:";
2824 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2825 std::sort(SortedIds.begin(), SortedIds.end());
2826 for (auto Id : SortedIds)
2827 OS << " " << Id;
2828}
2829
2830template <typename DerivedCCG, typename FuncTy, typename CallTy>
2831void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
2832 print(dbgs());
2833}
2834
2835template <typename DerivedCCG, typename FuncTy, typename CallTy>
2836void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
2837 raw_ostream &OS) const {
2838 OS << "Callsite Context Graph:\n";
2839 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2840 for (const auto Node : nodes<GraphType>(this)) {
2841 if (Node->isRemoved())
2842 continue;
2843 Node->print(OS);
2844 OS << "\n";
2845 }
2846}
2847
2848template <typename DerivedCCG, typename FuncTy, typename CallTy>
2849void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
2850 raw_ostream &OS) const {
2851 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2852 for (const auto Node : nodes<GraphType>(this)) {
2853 if (Node->isRemoved())
2854 continue;
2855 if (!Node->IsAllocation)
2856 continue;
2857 DenseSet<uint32_t> ContextIds = Node->getContextIds();
2858 auto AllocTypeFromCall = getAllocationCallType(Node->Call);
2859 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2860 std::sort(SortedIds.begin(), SortedIds.end());
2861 for (auto Id : SortedIds) {
2862 auto TypeI = ContextIdToAllocationType.find(Id);
2863 assert(TypeI != ContextIdToAllocationType.end());
2864 auto CSI = ContextIdToContextSizeInfos.find(Id);
2865 if (CSI != ContextIdToContextSizeInfos.end()) {
2866 for (auto &Info : CSI->second) {
2867 OS << "MemProf hinting: "
2868 << getAllocTypeString((uint8_t)TypeI->second)
2869 << " full allocation context " << Info.FullStackId
2870 << " with total size " << Info.TotalSize << " is "
2871 << getAllocTypeString(Node->AllocTypes) << " after cloning";
2872 if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
2873 OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall)
2874 << " due to cold byte percent";
2875 OS << "\n";
2876 }
2877 }
2878 }
2879 }
2880}
2881
2882template <typename DerivedCCG, typename FuncTy, typename CallTy>
2883void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
2884 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2885 for (const auto Node : nodes<GraphType>(this)) {
2886 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2887 for (auto &Edge : Node->CallerEdges)
2888 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
2889 }
2890}
2891
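// GraphTraits specialization so that the context graph can be traversed with
// LLVM's generic graph utilities, e.g. the nodes<GraphType>() ranges used by
// print()/check() above and the GraphWriter machinery used by exportToDot().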
2892template <typename DerivedCCG, typename FuncTy, typename CallTy>
2893struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
2894 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2895 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
2896
2897 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
2898 static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
2899
2900 using nodes_iterator =
2901 mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
2902 decltype(&getNode)>;
2903
2904 static nodes_iterator nodes_begin(GraphType G) {
2905 return nodes_iterator(G->NodeOwner.begin(), &getNode);
2906 }
2907
2908 static nodes_iterator nodes_end(GraphType G) {
2909 return nodes_iterator(G->NodeOwner.end(), &getNode);
2910 }
2911
2912 static NodeRef getEntryNode(GraphType G) {
2913 return G->NodeOwner.begin()->get();
2914 }
2915
2916 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
2917 static const ContextNode<DerivedCCG, FuncTy, CallTy> *
2918 GetCallee(const EdgePtrTy &P) {
2919 return P->Callee;
2920 }
2921
2922 using ChildIteratorType =
2923 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
2924 DerivedCCG, FuncTy, CallTy>>>::const_iterator,
2925 decltype(&GetCallee)>;
2926
2927 static ChildIteratorType child_begin(NodeRef N) {
2928 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
2929 }
2930
2931 static ChildIteratorType child_end(NodeRef N) {
2932 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
2933 }
2934};
2935
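// DOTGraphTraits specialization controlling how the graph is rendered to dot:
// node labels show the original stack/allocation id and the matched call,
// tooltips list the context ids, and fill colors encode the allocation types.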
2936template <typename DerivedCCG, typename FuncTy, typename CallTy>
2937struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
2938 : public DefaultDOTGraphTraits {
2939 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
2940
2941 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2942 using GTraits = GraphTraits<GraphType>;
2943 using NodeRef = typename GTraits::NodeRef;
2944 using ChildIteratorType = typename GTraits::ChildIteratorType;
2945
2946 static std::string getNodeLabel(NodeRef Node, GraphType G) {
2947 std::string LabelString =
2948 (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
2949 Twine(Node->OrigStackOrAllocId))
2950 .str();
2951 LabelString += "\n";
2952 if (Node->hasCall()) {
2953 auto Func = G->NodeToCallingFunc.find(Node);
2954 assert(Func != G->NodeToCallingFunc.end());
2955 LabelString +=
2956 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
2957 } else {
2958 LabelString += "null call";
2959 if (Node->Recursive)
2960 LabelString += " (recursive)";
2961 else
2962 LabelString += " (external)";
2963 }
2964 return LabelString;
2965 }
2966
2967 static std::string getNodeAttributes(NodeRef Node, GraphType) {
2968 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
2969 getContextIds(Node->getContextIds()) + "\"")
2970 .str();
2971 AttributeString +=
2972 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str();
2973 AttributeString += ",style=\"filled\"";
2974 if (Node->CloneOf) {
2975 AttributeString += ",color=\"blue\"";
2976 AttributeString += ",style=\"filled,bold,dashed\"";
2977 } else
2978 AttributeString += ",style=\"filled\"";
2979 return AttributeString;
2980 }
2981
2982 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
2983 GraphType) {
2984 auto &Edge = *(ChildIter.getCurrent());
2985 return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
2986 Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"")
2987 .str();
2988 }
2989
2990 // Since the NodeOwner list includes nodes that are no longer connected to
2991 // the graph, skip them here.
2992 static bool isNodeHidden(NodeRef Node, GraphType) {
2993 return Node->isRemoved();
2994 }
2995
2996private:
2997 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
2998 std::string IdString = "ContextIds:";
2999 if (ContextIds.size() < 100) {
3000 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3001 std::sort(SortedIds.begin(), SortedIds.end());
3002 for (auto Id : SortedIds)
3003 IdString += (" " + Twine(Id)).str();
3004 } else {
3005 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
3006 }
3007 return IdString;
3008 }
3009
3010 static std::string getColor(uint8_t AllocTypes) {
3011 if (AllocTypes == (uint8_t)AllocationType::NotCold)
3012 // Color "brown1" actually looks like a lighter red.
3013 return "brown1";
3014 if (AllocTypes == (uint8_t)AllocationType::Cold)
3015 return "cyan";
3016 if (AllocTypes ==
3017 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
3018 // Lighter purple.
3019 return "mediumorchid1";
3020 return "gray";
3021 }
3022
3023 static std::string getNodeId(NodeRef Node) {
3024 std::stringstream SStream;
3025 SStream << std::hex << "N0x" << (unsigned long long)Node;
3026 std::string Result = SStream.str();
3027 return Result;
3028 }
3029};
3030
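// Write the graph to a dot file named <DotFilePathPrefix>ccg.<Label>.dot,
// using the GraphTraits/DOTGraphTraits specializations above.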
3031template <typename DerivedCCG, typename FuncTy, typename CallTy>
3032void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
3033 std::string Label) const {
3034 WriteGraph(this, "", false, Label,
3035 DotFilePathPrefix + "ccg." + Label + ".dot");
3036}
3037
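// Create a new clone of Edge's callee node and move the given context ids
// (or all of Edge's ids, if ContextIdsToMove is empty) from Edge onto an edge
// to the new clone.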
3038template <typename DerivedCCG, typename FuncTy, typename CallTy>
3039typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
3040CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
3041 const std::shared_ptr<ContextEdge> &Edge, EdgeIter *CallerEdgeI,
3042 DenseSet<uint32_t> ContextIdsToMove) {
3043 ContextNode *Node = Edge->Callee;
3044 assert(NodeToCallingFunc.count(Node));
3045 ContextNode *Clone =
3046 createNewNode(Node->IsAllocation, NodeToCallingFunc[Node], Node->Call);
3047 Node->addClone(Clone);
3048 Clone->MatchingCalls = Node->MatchingCalls;
3049 moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true,
3050 ContextIdsToMove);
3051 return Clone;
3052}
3053
3054template <typename DerivedCCG, typename FuncTy, typename CallTy>
3055void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3056 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
3057 ContextNode *NewCallee, EdgeIter *CallerEdgeI,
3058 bool NewClone,
3059 DenseSet<uint32_t> ContextIdsToMove) {
3060 // NewCallee and Edge's current callee must be clones of the same original
3061 // node (Edge's current callee may be the original node too).
3062 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
3063
3064 ContextNode *OldCallee = Edge->Callee;
3065
3066 // We might already have an edge to the new callee from earlier cloning for a
3067 // different allocation. If one exists we will reuse it.
3068 auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
3069
3070 // Callers will pass an empty ContextIdsToMove set when they want to move the
3071 // edge. Copy in Edge's ids for simplicity.
3072 if (ContextIdsToMove.empty())
3073 ContextIdsToMove = Edge->getContextIds();
3074
3075 // If we are moving all of Edge's ids, then just move the whole Edge.
3076 // Otherwise only move the specified subset, to a new edge if needed.
3077 if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
3078 // First, update the alloc types on New Callee from Edge.
3079 // Do this before we potentially clear Edge's fields below!
3080 NewCallee->AllocTypes |= Edge->AllocTypes;
3081 // Moving the whole Edge.
3082 if (ExistingEdgeToNewCallee) {
3083 // Since we already have an edge to NewCallee, simply move the ids
3084 // onto it, and remove the existing Edge.
3085 ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),
3086 ContextIdsToMove.end());
3087 ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
3088 assert(Edge->ContextIds == ContextIdsToMove);
3089 removeEdgeFromGraph(Edge.get(), CallerEdgeI, /*CalleeIter=*/false);
3090 } else {
3091 // Otherwise just reconnect Edge to NewCallee.
3092 Edge->Callee = NewCallee;
3093 NewCallee->CallerEdges.push_back(Edge);
3094 // Remove it from callee where it was previously connected.
3095 if (CallerEdgeI)
3096 *CallerEdgeI = OldCallee->CallerEdges.erase(*CallerEdgeI);
3097 else
3098 OldCallee->eraseCallerEdge(Edge.get());
3099 // Don't need to update Edge's context ids since we are simply
3100 // reconnecting it.
3101 }
3102 } else {
3103 // Only moving a subset of Edge's ids.
3104 if (CallerEdgeI)
3105 ++CallerEdgeI;
3106 // Compute the alloc type of the subset of ids being moved.
3107 auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
3108 if (ExistingEdgeToNewCallee) {
3109 // Since we already have an edge to NewCallee, simply move the ids
3110 // onto it.
3111 ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),
3112 ContextIdsToMove.end());
3113 ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
3114 } else {
3115 // Otherwise, create a new edge to NewCallee for the ids being moved.
3116 auto NewEdge = std::make_shared<ContextEdge>(
3117 NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
3118 Edge->Caller->CalleeEdges.push_back(NewEdge);
3119 NewCallee->CallerEdges.push_back(NewEdge);
3120 }
3121 // In either case, need to update the alloc types on NewCallee, and remove
3122 // those ids and update the alloc type on the original Edge.
3123 NewCallee->AllocTypes |= CallerEdgeAllocType;
3124 set_subtract(Edge->ContextIds, ContextIdsToMove);
3125 Edge->AllocTypes = computeAllocType(Edge->ContextIds);
3126 }
3127 // Now walk the old callee node's callee edges and move Edge's context ids
3128 // over to the corresponding edge into the clone (which is created here if
3129 // this is a newly created clone).
3130 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
3131 // The context ids moving to the new callee are the subset of this edge's
3132 // context ids and the context ids on the caller edge being moved.
3133 DenseSet<uint32_t> EdgeContextIdsToMove =
3134 set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
3135 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
3136 OldCalleeEdge->AllocTypes =
3137 computeAllocType(OldCalleeEdge->getContextIds());
3138 if (!NewClone) {
3139 // Update context ids / alloc type on corresponding edge to NewCallee.
3140 // There is a chance this may not exist if we are reusing an existing
3141 // clone, specifically during function assignment, where we would have
3142 // removed none type edges after creating the clone. If we can't find
3143 // a corresponding edge there, fall through to the cloning below.
3144 if (auto *NewCalleeEdge =
3145 NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) {
3146 NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
3147 EdgeContextIdsToMove.end());
3148 NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
3149 continue;
3150 }
3151 }
3152 auto NewEdge = std::make_shared<ContextEdge>(
3153 OldCalleeEdge->Callee, NewCallee,
3154 computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
3155 NewCallee->CalleeEdges.push_back(NewEdge);
3156 NewEdge->Callee->CallerEdges.push_back(NewEdge);
3157 }
3158 // Recompute the node alloc type now that its callee edges have been
3159 // updated (since we will compute from those edges).
3160 OldCallee->AllocTypes = OldCallee->computeAllocType();
3161 // OldCallee alloc type should be None iff its context id set is now empty.
3162 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
3163 OldCallee->emptyContextIds());
3164 if (VerifyCCG) {
3165 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
3166 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
3167 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
3168 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
3169 /*CheckEdges=*/false);
3170 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
3171 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
3172 /*CheckEdges=*/false);
3173 }
3174}
3175
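// Move the callee edge at CalleeEdgeI from its current caller node to
// NewCaller, merging into an existing edge between the same nodes if one is
// present, and mirror the moved context ids onto NewCaller's caller edges.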
3176template <typename DerivedCCG, typename FuncTy, typename CallTy>
3177void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3178 moveCalleeEdgeToNewCaller(EdgeIter &CalleeEdgeI, ContextNode *NewCaller) {
3179 auto Edge = *CalleeEdgeI;
3180
3181 ContextNode *OldCaller = Edge->Caller;
3182
3183 // We might already have an edge to the new caller. If one exists we will
3184 // reuse it.
3185 auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(Edge->Callee);
3186
3187 CalleeEdgeI = OldCaller->CalleeEdges.erase(CalleeEdgeI);
3188 if (ExistingEdgeToNewCaller) {
3189 // Since we already have an edge to NewCaller, simply move the ids
3190 // onto it, and remove the existing Edge.
3191 ExistingEdgeToNewCaller->getContextIds().insert(
3192 Edge->getContextIds().begin(), Edge->getContextIds().end());
3193 ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes;
3194 Edge->ContextIds.clear();
3195 Edge->AllocTypes = (uint8_t)AllocationType::None;
3196 Edge->Callee->eraseCallerEdge(Edge.get());
3197 } else {
3198 // Otherwise just reconnect Edge to NewCaller.
3199 Edge->Caller = NewCaller;
3200 NewCaller->CalleeEdges.push_back(Edge);
3201 // Don't need to update Edge's context ids since we are simply
3202 // reconnecting it.
3203 }
3204 // In either case, need to update the alloc types on New Caller.
3205 NewCaller->AllocTypes |= Edge->AllocTypes;
3206
3207 // Now walk the old caller node's caller edges and move Edge's context ids
3208 // over to the corresponding edge into the node (which is created here if
3209 // this is a newly created node). We can tell whether this is a newly created
3210 // node by seeing if it has any caller edges yet.
3211#ifndef NDEBUG
3212 bool IsNewNode = NewCaller->CallerEdges.empty();
3213#endif
3214 for (auto &OldCallerEdge : OldCaller->CallerEdges) {
3215 // The context ids moving to the new caller are the subset of this edge's
3216 // context ids and the context ids on the callee edge being moved.
3217 DenseSet<uint32_t> EdgeContextIdsToMove =
3218 set_intersection(OldCallerEdge->getContextIds(), Edge->getContextIds());
3219 set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
3220 OldCallerEdge->AllocTypes =
3221 computeAllocType(OldCallerEdge->getContextIds());
3222 // In this function we expect that any pre-existing node already has edges
3223 // from the same callers as the old node. That should be true in the current
3224 // use case, where we will remove None-type edges after copying over all
3225 // caller edges from the callee.
3226 auto *ExistingCallerEdge =
3227 NewCaller->findEdgeFromCaller(OldCallerEdge->Caller);
3228 assert(IsNewNode || ExistingCallerEdge);
3229 if (ExistingCallerEdge) {
3230 ExistingCallerEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
3231 EdgeContextIdsToMove.end());
3232 ExistingCallerEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
3233 continue;
3234 }
3235 auto NewEdge = std::make_shared<ContextEdge>(
3236 NewCaller, OldCallerEdge->Caller,
3237 computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
3238 NewCaller->CallerEdges.push_back(NewEdge);
3239 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
3240 }
3241 // Recompute the node alloc type now that its caller edges have been
3242 // updated (since we will compute from those edges).
3243 OldCaller->AllocTypes = OldCaller->computeAllocType();
3244 // OldCaller alloc type should be None iff its context id set is now empty.
3245 assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) ==
3246 OldCaller->emptyContextIds());
3247 if (VerifyCCG) {
3248 checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false);
3249 checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false);
3250 for (const auto &OldCallerEdge : OldCaller->CallerEdges)
3251 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller,
3252 /*CheckEdges=*/false);
3253 for (const auto &NewCallerEdge : NewCaller->CallerEdges)
3254 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller,
3255 /*CheckEdges=*/false);
3256 }
3257}
3258
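// Remove callee edges that have become None type (all of their context ids
// were moved elsewhere during cloning), recursing through Node's clones and
// callers.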
3259template <typename DerivedCCG, typename FuncTy, typename CallTy>
3260void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3261 recursivelyRemoveNoneTypeCalleeEdges(
3262 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
3263 auto Inserted = Visited.insert(Node);
3264 if (!Inserted.second)
3265 return;
3266
3267 removeNoneTypeCalleeEdges(Node);
3268
3269 for (auto *Clone : Node->Clones)
3270 recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
3271
3272 // The recursive call may remove some of this Node's caller edges.
3273 // Iterate over a copy and skip any that were removed.
3274 auto CallerEdges = Node->CallerEdges;
3275 for (auto &Edge : CallerEdges) {
3276 // Skip any that have been removed by an earlier recursive call.
3277 if (Edge->isRemoved()) {
3278 assert(!is_contained(Node->CallerEdges, Edge));
3279 continue;
3280 }
3281 recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
3282 }
3283}
3284
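// Driver for cloning: identify clones starting from each allocation node,
// then prune the None type callee edges left behind by the context id moves.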
3285template <typename DerivedCCG, typename FuncTy, typename CallTy>
3286void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
3287 DenseSet<const ContextNode *> Visited;
3288 for (auto &Entry : AllocationCallToContextNodeMap) {
3289 Visited.clear();
3290 identifyClones(Entry.second, Visited, Entry.second->getContextIds());
3291 }
3292 Visited.clear();
3293 for (auto &Entry : AllocationCallToContextNodeMap)
3294 recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
3295 if (VerifyCCG)
3296 check();
3297}
3298
3299// Helper function to check whether an AllocType is cold or notcold or both.
3300bool checkColdOrNotCold(uint8_t AllocType) {
3301 return (AllocType == (uint8_t)AllocationType::Cold) ||
3302 (AllocType == (uint8_t)AllocationType::NotCold) ||
3303 (AllocType ==
3304 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
3305}
3306
3307template <typename DerivedCCG, typename FuncTy, typename CallTy>
3308void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
3309 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3310 const DenseSet<uint32_t> &AllocContextIds) {
3311 if (VerifyNodes)
3312 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3313 assert(!Node->CloneOf);
3314
3315 // If Node has a null call, then either it wasn't found in the module (regular
3316 // LTO) or summary index (ThinLTO), or there were other conditions blocking
3317 // cloning (e.g. recursion, calls multiple targets, etc).
3318 // Do this here so that we don't try to recursively clone callers below, which
3319 // isn't useful at least for this node.
3320 if (!Node->hasCall())
3321 return;
3322
3323#ifndef NDEBUG
3324 auto Insert =
3325#endif
3326 Visited.insert(Node);
3327 // We should not have visited this node yet.
3328 assert(Insert.second);
3329 // The recursive call to identifyClones may delete the current edge from the
3330 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
3331 // in an iterator and having recursive call erase from it. Other edges may
3332 // also get removed during the recursion, which will have null Callee and
3333 // Caller pointers (and are deleted later), so we skip those below.
3334 {
3335 auto CallerEdges = Node->CallerEdges;
3336 for (auto &Edge : CallerEdges) {
3337 // Skip any that have been removed by an earlier recursive call.
3338 if (Edge->isRemoved()) {
3339 assert(!is_contained(Node->CallerEdges, Edge));
3340 continue;
3341 }
3342 // Ignore any caller we previously visited via another edge.
3343 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
3344 identifyClones(Edge->Caller, Visited, AllocContextIds);
3345 }
3346 }
3347 }
3348
3349 // Check if we reached an unambiguous call or have only a single caller.
3350 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3351 return;
3352
3353 // We need to clone.
3354
3355 // Try to keep the original version as alloc type NotCold. This will make
3356 // cases with indirect calls or any other situation with an unknown call to
3357 // the original function get the default behavior. We do this by sorting the
3358 // CallerEdges of the Node we will clone by alloc type.
3359 //
3360 // Give NotCold edge the lowest sort priority so those edges are at the end of
3361 // the caller edges vector, and stay on the original version (since the below
3362 // code clones greedily until it finds all remaining edges have the same type
3363 // and leaves the remaining ones on the original Node).
3364 //
3365 // We shouldn't actually have any None type edges, so the sorting priority for
3366 // that is arbitrary, and we assert in that case below.
3367 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
3368 /*Cold*/ 1,
3369 /*NotColdCold*/ 2};
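// For example, a node with Cold, NotColdCold and NotCold caller edges will
// visit them in that order below, peeling the Cold and then mixed callers off
// onto clones while the NotCold callers stay on the original node.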
3370 std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(),
3371 [&](const std::shared_ptr<ContextEdge> &A,
3372 const std::shared_ptr<ContextEdge> &B) {
3373 // Nodes with non-empty context ids should be sorted before
3374 // those with empty context ids.
3375 if (A->ContextIds.empty())
3376 // Either B ContextIds are non-empty (in which case we
3377 // should return false because B < A), or B ContextIds
3378 // are empty, in which case they are equal, and we should
3379 // maintain the original relative ordering.
3380 return false;
3381 if (B->ContextIds.empty())
3382 return true;
3383
3384 if (A->AllocTypes == B->AllocTypes)
3385 // Use the first context id for each edge as a
3386 // tie-breaker.
3387 return *A->ContextIds.begin() < *B->ContextIds.begin();
3388 return AllocTypeCloningPriority[A->AllocTypes] <
3389 AllocTypeCloningPriority[B->AllocTypes];
3390 });
3391
3392 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3393
3394 DenseSet<uint32_t> RecursiveContextIds;
3395 // If we are allowing recursive callsites, but have also disabled recursive
3396 // contexts, look for context ids that show up in multiple caller edges.
3397 if (AllowRecursiveCallsites && !AllowRecursiveContexts) {
3398 DenseSet<uint32_t> AllCallerContextIds;
3399 for (auto &CE : Node->CallerEdges) {
3400 // Reserve to the largest set of caller context ids, since we know the
3401 // final set will be at least that large.
3402 AllCallerContextIds.reserve(CE->getContextIds().size());
3403 for (auto Id : CE->getContextIds())
3404 if (!AllCallerContextIds.insert(Id).second)
3405 RecursiveContextIds.insert(Id);
3406 }
3407 }
3408
3409 // Iterate until we find no more opportunities for disambiguating the alloc
3410 // types via cloning. In most cases this loop will terminate once the Node
3411 // has a single allocation type, in which case no more cloning is needed.
3412 // We need to be able to remove Edge from CallerEdges, so need to adjust
3413 // iterator inside the loop.
3414 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
3415 auto CallerEdge = *EI;
3416
3417 // See if cloning the prior caller edge left this node with a single alloc
3418 // type or a single caller. In that case no more cloning of Node is needed.
3419 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3420 break;
3421
3422 // If the caller was not successfully matched to a call in the IR/summary,
3423 // there is no point in trying to clone for it as we can't update that call.
3424 if (!CallerEdge->Caller->hasCall()) {
3425 ++EI;
3426 continue;
3427 }
3428
3429 // Only need to process the ids along this edge pertaining to the given
3430 // allocation.
3431 auto CallerEdgeContextsForAlloc =
3432 set_intersection(CallerEdge->getContextIds(), AllocContextIds);
3433 if (!RecursiveContextIds.empty())
3434 CallerEdgeContextsForAlloc =
3435 set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds);
3436 if (CallerEdgeContextsForAlloc.empty()) {
3437 ++EI;
3438 continue;
3439 }
3440 auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
3441
3442 // Compute the node callee edge alloc types corresponding to the context ids
3443 // for this caller edge.
3444 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
3445 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
3446 for (auto &CalleeEdge : Node->CalleeEdges)
3447 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
3448 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
3449
3450 // Don't clone if doing so will not disambiguate any alloc types amongst
3451 // caller edges (including the callee edges that would be cloned).
3452 // Otherwise we will simply move all edges to the clone.
3453 //
3454 // First check if by cloning we will disambiguate the caller allocation
3455 // type from node's allocation type. Query allocTypeToUse so that we don't
3456 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
3457 // neither of these should be None type.
3458 //
3459 // Then check if by cloning node at least one of the callee edges will be
3460 // disambiguated by splitting out different context ids.
3461 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
3462 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3463 if (allocTypeToUse(CallerAllocTypeForAlloc) ==
3464 allocTypeToUse(Node->AllocTypes) &&
3465 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
3466 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
3467 ++EI;
3468 continue;
3469 }
3470
3471 // First see if we can use an existing clone. Check each clone and its
3472 // callee edges for matching alloc types.
3473 ContextNode *Clone = nullptr;
3474 for (auto *CurClone : Node->Clones) {
3475 if (allocTypeToUse(CurClone->AllocTypes) !=
3476 allocTypeToUse(CallerAllocTypeForAlloc))
3477 continue;
3478
3479 bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) &&
3480 hasSingleAllocType(CallerAllocTypeForAlloc);
3481 // The above check should mean that if both have single alloc types, then
3482 // they should be equal.
3483 assert(!BothSingleAlloc ||
3484 CurClone->AllocTypes == CallerAllocTypeForAlloc);
3485
3486 // If either both have a single alloc type (which are the same), or if the
3487 // clone's callee edges have the same alloc types as those for the current
3488 // allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge),
3489 // then we can reuse this clone.
3490 if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>(
3491 CalleeEdgeAllocTypesForCallerEdge, CurClone)) {
3492 Clone = CurClone;
3493 break;
3494 }
3495 }
3496
3497 // The edge iterator is adjusted when we move the CallerEdge to the clone.
3498 if (Clone)
3499 moveEdgeToExistingCalleeClone(CallerEdge, Clone, &EI, /*NewClone=*/false,
3500 CallerEdgeContextsForAlloc);
3501 else
3502 Clone =
3503 moveEdgeToNewCalleeClone(CallerEdge, &EI, CallerEdgeContextsForAlloc);
3504
3505 assert(EI == Node->CallerEdges.end() ||
3506 Node->AllocTypes != (uint8_t)AllocationType::None);
3507 // Sanity check that no alloc types on clone or its edges are None.
3508 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
3509 }
3510
3511 // We should still have some context ids on the original Node.
3512 assert(!Node->emptyContextIds());
3513
3514 // Sanity check that no alloc types on node or edges are None.
3515 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3516
3517 if (VerifyNodes)
3518 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3519}
3520
3521void ModuleCallsiteContextGraph::updateAllocationCall(
3522 CallInfo &Call, AllocationType AllocType) {
3523 std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
3524 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
3525 "memprof", AllocTypeString);
3526 cast<CallBase>(Call.call())->addFnAttr(A);
3527 OREGetter(Call.call()->getFunction())
3528 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
3529 << ore::NV("AllocationCall", Call.call()) << " in clone "
3530 << ore::NV("Caller", Call.call()->getFunction())
3531 << " marked with memprof allocation attribute "
3532 << ore::NV("Attribute", AllocTypeString));
3533}
3534
3535void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
3536 AllocationType AllocType) {
3537 auto *AI = Call.call().dyn_cast<AllocInfo *>();
3538 assert(AI);
3539 assert(AI->Versions.size() > Call.cloneNo());
3540 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
3541}
3542
3543AllocationType
3544ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
3545 const auto *CB = cast<CallBase>(Call.call());
3546 if (!CB->getAttributes().hasFnAttr("memprof"))
3547 return AllocationType::None;
3548 return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
3549 ? AllocationType::Cold
3550 : AllocationType::NotCold;
3551}
3552
3553AllocationType
3554IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
3555 const auto *AI = Call.call().dyn_cast<AllocInfo *>();
3556 assert(AI->Versions.size() > Call.cloneNo());
3557 return (AllocationType)AI->Versions[Call.cloneNo()];
3558}
3559
3560void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
3561 FuncInfo CalleeFunc) {
3562 if (CalleeFunc.cloneNo() > 0)
3563 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
3564 OREGetter(CallerCall.call()->getFunction())
3565 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
3566 << ore::NV("Call", CallerCall.call()) << " in clone "
3567 << ore::NV("Caller", CallerCall.call()->getFunction())
3568 << " assigned to call function clone "
3569 << ore::NV("Callee", CalleeFunc.func()));
3570}
3571
3572void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
3573 FuncInfo CalleeFunc) {
3574 auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>();
3575 assert(CI &&
3576 "Caller cannot be an allocation which should not have profiled calls");
3577 assert(CI->Clones.size() > CallerCall.cloneNo());
3578 CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
3579}
3580
3581CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
3582 Instruction *>::FuncInfo
3583ModuleCallsiteContextGraph::cloneFunctionForCallsite(
3584 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
3585 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
3586 // Use existing LLVM facilities for cloning and obtaining Call in clone
3587 ValueToValueMapTy VMap;
3588 auto *NewFunc = CloneFunction(Func.func(), VMap);
3589 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
3590 assert(!Func.func()->getParent()->getFunction(Name));
3591 NewFunc->setName(Name);
3592 for (auto &Inst : CallsWithMetadataInFunc) {
3593 // This map always has the initial version in it.
3594 assert(Inst.cloneNo() == 0);
3595 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
3596 }
3597 OREGetter(Func.func())
3598 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
3599 << "created clone " << ore::NV("NewFunction", NewFunc));
3600 return {NewFunc, CloneNo};
3601}
3602
3603CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
3604 IndexCall>::FuncInfo
3605IndexCallsiteContextGraph::cloneFunctionForCallsite(
3606 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
3607 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
3608 // Check how many clones we have of Call (and therefore function).
3609 // The next clone number is the current size of versions array.
3610 // Confirm this matches the CloneNo provided by the caller, which is based on
3611 // the number of function clones we have.
3612 assert(CloneNo ==
3613 (Call.call().is<AllocInfo *>()
3614 ? Call.call().dyn_cast<AllocInfo *>()->Versions.size()
3615 : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size()));
3616 // Walk all the instructions in this function. Create a new version for
3617 // each (by adding an entry to the Versions/Clones summary array), and copy
3618 // over the version being called for the function clone being cloned here.
3619 // Additionally, add an entry to the CallMap for the new function clone,
3620 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
3621 // to the new call clone.
3622 for (auto &Inst : CallsWithMetadataInFunc) {
3623 // This map always has the initial version in it.
3624 assert(Inst.cloneNo() == 0);
3625 if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) {
3626 assert(AI->Versions.size() == CloneNo);
3627 // We assign the allocation type later (in updateAllocationCall), just add
3628 // an entry for it here.
3629 AI->Versions.push_back(0);
3630 } else {
3631 auto *CI = Inst.call().dyn_cast<CallsiteInfo *>();
3632 assert(CI && CI->Clones.size() == CloneNo);
3633 // We assign the clone number later (in updateCall), just add an entry for
3634 // it here.
3635 CI->Clones.push_back(0);
3636 }
3637 CallMap[Inst] = {Inst.call(), CloneNo};
3638 }
3639 return {Func.func(), CloneNo};
3640}
3641
3642// This method assigns cloned callsites to functions, cloning the functions as
3643// needed. The assignment is greedy and proceeds roughly as follows:
3644//
3645// For each function Func:
3646// For each call with graph Node having clones:
3647// Initialize ClonesWorklist to Node and its clones
3648// Initialize NodeCloneCount to 0
3649// While ClonesWorklist is not empty:
3650// Clone = pop front ClonesWorklist
3651// NodeCloneCount++
3652// If Func has been cloned less than NodeCloneCount times:
3653// If NodeCloneCount is 1:
3654// Assign Clone to original Func
3655// Continue
3656// Create a new function clone
3657// If other callers not assigned to call a function clone yet:
3658// Assign them to call new function clone
3659// Continue
3660// Assign any other caller calling the cloned version to new clone
3661//
3662// For each caller of Clone:
3663// If caller is assigned to call a specific function clone:
3664// If we cannot assign Clone to that function clone:
3665// Create new callsite Clone NewClone
3666// Add NewClone to ClonesWorklist
3667// Continue
3668// Assign Clone to existing caller's called function clone
3669// Else:
3670// If Clone not already assigned to a function clone:
3671// Assign to first function clone without assignment
3672// Assign caller to selected function clone
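// For example, when a callsite node has one clone and only the original copy
// of its function exists, the original node is assigned to the original
// function and the clone forces creation of a new function clone via
// cloneFunctionForCallsite above.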
3673template <typename DerivedCCG, typename FuncTy, typename CallTy>
3674bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
3675 bool Changed = false;
3676
3677 // Keep track of the assignment of nodes (callsites) to function clones they
3678 // call.
3679 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
3680
3681 // Update caller node to call function version CalleeFunc, by recording the
3682 // assignment in CallsiteToCalleeFuncCloneMap.
3683 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
3684 const FuncInfo &CalleeFunc) {
3685 assert(Caller->hasCall());
3686 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
3687 };
3688
3689 // Walk all functions for which we saw calls with memprof metadata, and handle
3690 // cloning for each such call.
3691 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
3692 FuncInfo OrigFunc(Func);
3693 // Map from each clone of OrigFunc to a map of remappings of each call of
3694 // interest (from original uncloned call to the corresponding cloned call in
3695 // that function clone).
3696 std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
3697 for (auto &Call : CallsWithMetadata) {
3698 ContextNode *Node = getNodeForInst(Call);
3699 // Skip call if we do not have a node for it (all uses of its stack ids
3700 // were either on inlined chains or pruned from the MIBs), or if we did
3701 // not create any clones for it.
3702 if (!Node || Node->Clones.empty())
3703 continue;
3704 assert(Node->hasCall() &&
3705 "Not having a call should have prevented cloning");
3706
3707 // Track the assignment of function clones to clones of the current
3708 // callsite Node being handled.
3709 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
3710
3711 // Assign callsite version CallsiteClone to function version FuncClone,
3712 // and also assign (possibly cloned) Call to CallsiteClone.
3713 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
3714 CallInfo &Call,
3715 ContextNode *CallsiteClone,
3716 bool IsAlloc) {
3717 // Record the clone of callsite node assigned to this function clone.
3718 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
3719
3720 assert(FuncClonesToCallMap.count(FuncClone));
3721 std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
3722 CallInfo CallClone(Call);
3723 if (CallMap.count(Call))
3724 CallClone = CallMap[Call];
3725 CallsiteClone->setCall(CallClone);
3726 // Need to do the same for all matching calls.
3727 for (auto &MatchingCall : Node->MatchingCalls) {
3728 CallInfo CallClone(MatchingCall);
3729 if (CallMap.count(MatchingCall))
3730 CallClone = CallMap[MatchingCall];
3731 // Updates the call in the list.
3732 MatchingCall = CallClone;
3733 }
3734 };
3735
3736 // Keep track of the clones of callsite Node that need to be assigned to
3737 // function clones. This list may be expanded in the loop body below if we
3738 // find additional cloning is required.
3739 std::deque<ContextNode *> ClonesWorklist;
3740 // Ignore original Node if we moved all of its contexts to clones.
3741 if (!Node->emptyContextIds())
3742 ClonesWorklist.push_back(Node);
3743 ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(),
3744 Node->Clones.end());
3745
3746 // Now walk through all of the clones of this callsite Node that we need,
3747 // and determine the assignment to a corresponding clone of the current
3748 // function (creating new function clones as needed).
3749 unsigned NodeCloneCount = 0;
3750 while (!ClonesWorklist.empty()) {
3751 ContextNode *Clone = ClonesWorklist.front();
3752 ClonesWorklist.pop_front();
3753 NodeCloneCount++;
3754 if (VerifyNodes)
3755 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
3756
3757 // Need to create a new function clone if we have more callsite clones
3758 // than existing function clones, which would have been assigned to an
3759 // earlier clone in the list (we assign callsite clones to function
3760 // clones greedily).
3761 if (FuncClonesToCallMap.size() < NodeCloneCount) {
3762 // If this is the first callsite copy, assign to original function.
3763 if (NodeCloneCount == 1) {
3764 // Since FuncClonesToCallMap is empty in this case, no clones have
3765 // been created for this function yet, and no callers should have
3766 // been assigned a function clone for this callee node yet.
3767 assert(llvm::none_of(
3768 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
3769 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
3770 }));
3771 // Initialize with empty call map, assign Clone to original function
3772 // and its callers, and skip to the next clone.
3773 FuncClonesToCallMap[OrigFunc] = {};
3774 AssignCallsiteCloneToFuncClone(
3775 OrigFunc, Call, Clone,
3776 AllocationCallToContextNodeMap.count(Call));
3777 for (auto &CE : Clone->CallerEdges) {
3778 // Ignore any caller that does not have a recorded callsite Call.
3779 if (!CE->Caller->hasCall())
3780 continue;
3781 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
3782 }
3783 continue;
3784 }
3785
3786 // First locate which copy of OrigFunc to clone again. If a caller
3787 // of this callsite clone was already assigned to call a particular
3788 // function clone, we need to redirect all of those callers to the
3789 // new function clone, and update their other callees within this
3790 // function.
3791 FuncInfo PreviousAssignedFuncClone;
3792 auto EI = llvm::find_if(
3793 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
3794 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
3795 });
3796 bool CallerAssignedToCloneOfFunc = false;
3797 if (EI != Clone->CallerEdges.end()) {
3798 const std::shared_ptr<ContextEdge> &Edge = *EI;
3799 PreviousAssignedFuncClone =
3800 CallsiteToCalleeFuncCloneMap[Edge->Caller];
3801 CallerAssignedToCloneOfFunc = true;
3802 }
3803
3804 // Clone function and save it along with the CallInfo map created
3805 // during cloning in the FuncClonesToCallMap.
3806 std::map<CallInfo, CallInfo> NewCallMap;
3807 unsigned CloneNo = FuncClonesToCallMap.size();
3808 assert(CloneNo > 0 && "Clone 0 is the original function, which "
3809 "should already exist in the map");
3810 FuncInfo NewFuncClone = cloneFunctionForCallsite(
3811 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
3812 FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
3813 FunctionClonesAnalysis++;
3814 Changed = true;
3815
3816 // If no caller callsites were already assigned to a clone of this
3817 // function, we can simply assign this clone to the new func clone
3818 // and update all callers to it, then skip to the next clone.
3819 if (!CallerAssignedToCloneOfFunc) {
3820 AssignCallsiteCloneToFuncClone(
3821 NewFuncClone, Call, Clone,
3822 AllocationCallToContextNodeMap.count(Call));
3823 for (auto &CE : Clone->CallerEdges) {
3824 // Ignore any caller that does not have a recorded callsite Call.
3825 if (!CE->Caller->hasCall())
3826 continue;
3827 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
3828 }
3829 continue;
3830 }
3831
3832 // We may need to do additional node cloning in this case.
3833 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
3834 // that were previously assigned to call PreviousAssignedFuncClone,
3835 // to record that they now call NewFuncClone.
3836 // The none type edge removal may remove some of this Clone's caller
3837 // edges, if it is reached via another of its caller's callees.
3838 // Iterate over a copy and skip any that were removed.
3839 auto CallerEdges = Clone->CallerEdges;
3840 for (auto CE : CallerEdges) {
3841 // Skip any that have been removed on an earlier iteration.
3842 if (CE->isRemoved()) {
3843 assert(!is_contained(Clone->CallerEdges, CE));
3844 continue;
3845 }
3846 assert(CE);
3847 // Ignore any caller that does not have a recorded callsite Call.
3848 if (!CE->Caller->hasCall())
3849 continue;
3850
3851 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
3852 // We subsequently fall through to later handling that
3853 // will perform any additional cloning required for
3854 // callers that were calling other function clones.
3855 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
3856 PreviousAssignedFuncClone)
3857 continue;
3858
3859 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
3860
3861 // If we are cloning a function that was already assigned to some
3862 // callers, then essentially we are creating new callsite clones
3863 // of the other callsites in that function that are reached by those
3864 // callers. Clone the other callees of the current callsite's caller
3865 // that were already assigned to PreviousAssignedFuncClone
3866 // accordingly. This is important since we subsequently update the
3867 // calls from the nodes in the graph and their assignments to callee
3868 // functions recorded in CallsiteToCalleeFuncCloneMap.
3869 // The none type edge removal may remove some of this caller's
3870 // callee edges, if it is reached via another of its callees.
3871 // Iterate over a copy and skip any that were removed.
3872 auto CalleeEdges = CE->Caller->CalleeEdges;
3873 for (auto CalleeEdge : CalleeEdges) {
3874 // Skip any that have been removed on an earlier iteration when
3875 // cleaning up newly None type callee edges.
3876 if (CalleeEdge->isRemoved()) {
3877 assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge));
3878 continue;
3879 }
3880 assert(CalleeEdge);
3881 ContextNode *Callee = CalleeEdge->Callee;
3882 // Skip the current callsite, we are looking for other
3883 // callsites Caller calls, as well as any that does not have a
3884 // recorded callsite Call.
3885 if (Callee == Clone || !Callee->hasCall())
3886 continue;
3887 ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
3888 removeNoneTypeCalleeEdges(NewClone);
3889 // Moving the edge may have resulted in some none type
3890 // callee edges on the original Callee.
3891 removeNoneTypeCalleeEdges(Callee);
3892 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
3893 // If the Callee node was already assigned to call a specific
3894 // function version, make sure its new clone is assigned to call
3895 // that same function clone.
3896 if (CallsiteToCalleeFuncCloneMap.count(Callee))
3897 RecordCalleeFuncOfCallsite(
3898 NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
3899 // Update NewClone with the new Call clone of this callsite's Call
3900 // created for the new function clone created earlier.
3901 // Recall that we have already ensured when building the graph
3902 // that each caller can only call callsites within the same
3903 // function, so we are guaranteed that Callee Call is in the
3904 // current OrigFunc.
3905 // CallMap is set up as indexed by original Call at clone 0.
3906 CallInfo OrigCall(Callee->getOrigNode()->Call);
3907 OrigCall.setCloneNo(0);
3908 std::map<CallInfo, CallInfo> &CallMap =
3909 FuncClonesToCallMap[NewFuncClone];
3910 assert(CallMap.count(OrigCall));
3911 CallInfo NewCall(CallMap[OrigCall]);
3912 assert(NewCall);
3913 NewClone->setCall(NewCall);
3914 // Need to do the same for all matching calls.
3915 for (auto &MatchingCall : NewClone->MatchingCalls) {
3916 CallInfo OrigMatchingCall(MatchingCall);
3917 OrigMatchingCall.setCloneNo(0);
3918 assert(CallMap.count(OrigMatchingCall));
3919 CallInfo NewCall(CallMap[OrigMatchingCall]);
3920 assert(NewCall);
3921 // Updates the call in the list.
3922 MatchingCall = NewCall;
3923 }
3924 }
3925 }
3926 // Fall through to handling below to perform the recording of the
3927 // function for this callsite clone. This enables handling of cases
3928 // where the callers were assigned to different clones of a function.
3929 }
3930
3931 // See if we can use existing function clone. Walk through
3932 // all caller edges to see if any have already been assigned to
3933 // a clone of this callsite's function. If we can use it, do so. If not,
3934 // because that function clone is already assigned to a different clone
3935 // of this callsite, then we need to clone again.
3936 // Basically, this checking is needed to handle the case where different
3937 // caller functions/callsites may need versions of this function
3938 // containing different mixes of callsite clones across the different
3939 // callsites within the function. If that happens, we need to create
3940 // additional function clones to handle the various combinations.
3941 //
3942 // Keep track of any new clones of this callsite created by the
3943 // following loop, as well as any existing clone that we decided to
3944 // assign this clone to.
3945 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
3946 FuncInfo FuncCloneAssignedToCurCallsiteClone;
3947 // We need to be able to remove Edge from CallerEdges, so need to adjust
3948 // iterator in the loop.
3949 for (auto EI = Clone->CallerEdges.begin();
3950 EI != Clone->CallerEdges.end();) {
3951 auto Edge = *EI;
3952 // Ignore any caller that does not have a recorded callsite Call.
3953 if (!Edge->Caller->hasCall()) {
3954 EI++;
3955 continue;
3956 }
3957 // If this caller already assigned to call a version of OrigFunc, need
3958 // to ensure we can assign this callsite clone to that function clone.
3959 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
3960 FuncInfo FuncCloneCalledByCaller =
3961 CallsiteToCalleeFuncCloneMap[Edge->Caller];
3962 // First we need to confirm that this function clone is available
3963 // for use by this callsite node clone.
3964 //
3965 // While FuncCloneToCurNodeCloneMap is built only for this Node and
3966 // its callsite clones, one of those callsite clones X could have
3967 // been assigned to the same function clone called by Edge's caller
3968 // - if Edge's caller calls another callsite within Node's original
3969 // function, and that callsite has another caller reaching clone X.
3970 // We need to clone Node again in this case.
3971 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
3972 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
3973 Clone) ||
3974 // Detect when we have multiple callers of this callsite that
3975 // have already been assigned to specific, and different, clones
3976 // of OrigFunc (due to other unrelated callsites in Func they
3977 // reach via call contexts). Is this Clone of callsite Node
3978 // assigned to a different clone of OrigFunc? If so, clone Node
3979 // again.
3980 (FuncCloneAssignedToCurCallsiteClone &&
3981 FuncCloneAssignedToCurCallsiteClone !=
3982 FuncCloneCalledByCaller)) {
3983 // We need to use a different newly created callsite clone, in
3984 // order to assign it to another new function clone on a
3985 // subsequent iteration over the Clones array (adjusted below).
3986 // Note we specifically do not reset the
3987 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
3988 // when this new clone is processed later we know which version of
3989 // the function to copy (so that other callsite clones we have
3990 // assigned to that function clone are properly cloned over). See
3991 // comments in the function cloning handling earlier.
3992
3993 // Check if we already have cloned this callsite again while
3994 // walking through caller edges, for a caller calling the same
3995 // function clone. If so, we can move this edge to that new clone
3996 // rather than creating yet another new clone.
3997 if (FuncCloneToNewCallsiteCloneMap.count(
3998 FuncCloneCalledByCaller)) {
3999 ContextNode *NewClone =
4000 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
4001 moveEdgeToExistingCalleeClone(Edge, NewClone, &EI);
4002 // Cleanup any none type edges cloned over.
4003 removeNoneTypeCalleeEdges(NewClone);
4004 } else {
4005 // Create a new callsite clone.
4006 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI);
4007 removeNoneTypeCalleeEdges(NewClone);
4008 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
4009 NewClone;
4010 // Add to list of clones and process later.
4011 ClonesWorklist.push_back(NewClone);
4012 assert(EI == Clone->CallerEdges.end() ||
4013 Clone->AllocTypes != (uint8_t)AllocationType::None);
4014 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
4015 }
4016 // Moving the caller edge may have resulted in some none type
4017 // callee edges.
4018 removeNoneTypeCalleeEdges(Clone);
4019 // We will handle the newly created callsite clone in a subsequent
4020 // iteration over this Node's Clones. Continue here since we
4021 // already adjusted iterator EI while moving the edge.
4022 continue;
4023 }
4024
4025 // Otherwise, we can use the function clone already assigned to this
4026 // caller.
4027 if (!FuncCloneAssignedToCurCallsiteClone) {
4028 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
4029 // Assign Clone to FuncCloneCalledByCaller
4030 AssignCallsiteCloneToFuncClone(
4031 FuncCloneCalledByCaller, Call, Clone,
4032 AllocationCallToContextNodeMap.count(Call));
4033 } else
4034 // Don't need to do anything - callsite is already calling this
4035 // function clone.
4036 assert(FuncCloneAssignedToCurCallsiteClone ==
4037 FuncCloneCalledByCaller);
4038
4039 } else {
4040 // We have not already assigned this caller to a version of
4041 // OrigFunc. Do the assignment now.
4042
4043 // First check if we have already assigned this callsite clone to a
4044 // clone of OrigFunc for another caller during this iteration over
4045 // its caller edges.
4046 if (!FuncCloneAssignedToCurCallsiteClone) {
4047 // Find first function in FuncClonesToCallMap without an assigned
4048 // clone of this callsite Node. We should always have one
4049 // available at this point due to the earlier cloning when the
4050 // FuncClonesToCallMap size was smaller than the clone number.
4051 for (auto &CF : FuncClonesToCallMap) {
4052 if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
4053 FuncCloneAssignedToCurCallsiteClone = CF.first;
4054 break;
4055 }
4056 }
4057 assert(FuncCloneAssignedToCurCallsiteClone);
4058 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
4059 AssignCallsiteCloneToFuncClone(
4060 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
4061 AllocationCallToContextNodeMap.count(Call));
4062 } else
4063 assert(FuncCloneToCurNodeCloneMap
4064 [FuncCloneAssignedToCurCallsiteClone] == Clone);
4065 // Update callers to record function version called.
4066 RecordCalleeFuncOfCallsite(Edge->Caller,
4067 FuncCloneAssignedToCurCallsiteClone);
4068 }
4069
4070 EI++;
4071 }
4072 }
4073 if (VerifyCCG) {
4074 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
4075 for (const auto &PE : Node->CalleeEdges)
4076 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
4077 for (const auto &CE : Node->CallerEdges)
4078 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
4079 for (auto *Clone : Node->Clones) {
4080 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
4081 for (const auto &PE : Clone->CalleeEdges)
4082 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
4083 for (const auto &CE : Clone->CallerEdges)
4084 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
4085 }
4086 }
4087 }
4088 }
4089
4090 uint8_t BothTypes =
4091      (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
4092
4093 auto UpdateCalls = [&](ContextNode *Node,
4094                         DenseSet<const ContextNode *> &Visited,
4095                         auto &&UpdateCalls) {
4096 auto Inserted = Visited.insert(Node);
4097 if (!Inserted.second)
4098 return;
4099
4100 for (auto *Clone : Node->Clones)
4101 UpdateCalls(Clone, Visited, UpdateCalls);
4102
4103 for (auto &Edge : Node->CallerEdges)
4104 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
4105
4106 // Skip if either no call to update, or if we ended up with no context ids
4107 // (we moved all edges onto other clones).
4108 if (!Node->hasCall() || Node->emptyContextIds())
4109 return;
4110
4111 if (Node->IsAllocation) {
4112 auto AT = allocTypeToUse(Node->AllocTypes);
4113 // If the allocation type is ambiguous, and more aggressive hinting
4114 // has been enabled via the MinClonedColdBytePercent flag, see if this
4115      // allocation should be hinted cold anyway because the fraction of its bytes
4116      // allocated cold is at least the given threshold.
4117 if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
4118 !ContextIdToContextSizeInfos.empty()) {
4119 uint64_t TotalCold = 0;
4120 uint64_t Total = 0;
4121 for (auto Id : Node->getContextIds()) {
4122 auto TypeI = ContextIdToAllocationType.find(Id);
4123 assert(TypeI != ContextIdToAllocationType.end());
4124 auto CSI = ContextIdToContextSizeInfos.find(Id);
4125 if (CSI != ContextIdToContextSizeInfos.end()) {
4126 for (auto &Info : CSI->second) {
4127 Total += Info.TotalSize;
4128 if (TypeI->second == AllocationType::Cold)
4129 TotalCold += Info.TotalSize;
4130 }
4131 }
4132 }
4133 if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
4134          AT = AllocationType::Cold;
4135      }
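      // Illustrative example (hypothetical numbers, not taken from any profile):
      // with MinClonedColdBytePercent == 80, an ambiguous allocation whose
      // contexts cover Total = 1000 bytes of which TotalCold = 850 were allocated
      // cold satisfies 850 * 100 >= 1000 * 80, so AT is overridden to Cold; with
      // TotalCold = 700 the check fails and the ambiguous allocation keeps the
      // default NotCold hint chosen by allocTypeToUse.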
4136 updateAllocationCall(Node->Call, AT);
4137 assert(Node->MatchingCalls.empty());
4138 return;
4139 }
4140
4141 if (!CallsiteToCalleeFuncCloneMap.count(Node))
4142 return;
4143
4144 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
4145 updateCall(Node->Call, CalleeFunc);
4146 // Update all the matching calls as well.
4147 for (auto &Call : Node->MatchingCalls)
4148 updateCall(Call, CalleeFunc);
4149 };
4150
4151 // Performs DFS traversal starting from allocation nodes to update calls to
4152 // reflect cloning decisions recorded earlier. For regular LTO this will
4153 // update the actual calls in the IR to call the appropriate function clone
4154 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
4155 // are recorded in the summary entries.
4156  DenseSet<const ContextNode *> Visited;
4157  for (auto &Entry : AllocationCallToContextNodeMap)
4158 UpdateCalls(Entry.second, Visited, UpdateCalls);
4159
4160 return Changed;
4161}
4162
4163static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
4164    Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
4165    std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
4166        &FuncToAliasMap) {
4167  // The first "clone" is the original copy; we should only call this if we
4168 // needed to create new clones.
4169 assert(NumClones > 1);
4170  SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
4171  VMaps.reserve(NumClones - 1);
4172 FunctionsClonedThinBackend++;
4173 for (unsigned I = 1; I < NumClones; I++) {
4174 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
4175 auto *NewF = CloneFunction(&F, *VMaps.back());
4176 FunctionClonesThinBackend++;
4177 // Strip memprof and callsite metadata from clone as they are no longer
4178 // needed.
4179 for (auto &BB : *NewF) {
4180 for (auto &Inst : BB) {
4181 Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
4182 Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
4183 }
4184 }
4185 std::string Name = getMemProfFuncName(F.getName(), I);
4186 auto *PrevF = M.getFunction(Name);
4187 if (PrevF) {
4188      // We might have created this when adjusting a callsite in another
4189 // function. It should be a declaration.
4190 assert(PrevF->isDeclaration());
4191 NewF->takeName(PrevF);
4192 PrevF->replaceAllUsesWith(NewF);
4193 PrevF->eraseFromParent();
4194 } else
4195 NewF->setName(Name);
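    // For illustration (assuming the ".memprof." clone-name suffix used by
    // getMemProfFuncName): cloning a function "foo" with NumClones == 3 yields
    // "foo" (the original), "foo.memprof.1" and "foo.memprof.2". The PrevF
    // handling above takes over any same-named declaration that was created
    // earlier while redirecting callsites in another function.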
4196 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
4197 << "created clone " << ore::NV("NewFunction", NewF));
4198
4199 // Now handle aliases to this function, and clone those as well.
4200 if (!FuncToAliasMap.count(&F))
4201 continue;
4202 for (auto *A : FuncToAliasMap[&F]) {
4203 std::string Name = getMemProfFuncName(A->getName(), I);
4204 auto *PrevA = M.getNamedAlias(Name);
4205 auto *NewA = GlobalAlias::create(A->getValueType(),
4206 A->getType()->getPointerAddressSpace(),
4207 A->getLinkage(), Name, NewF);
4208 NewA->copyAttributesFrom(A);
4209 if (PrevA) {
4210        // We might have created this when adjusting a callsite in another
4211 // function. It should be a declaration.
4212 assert(PrevA->isDeclaration());
4213 NewA->takeName(PrevA);
4214 PrevA->replaceAllUsesWith(NewA);
4215 PrevA->eraseFromParent();
4216 }
4217 }
4218 }
4219 return VMaps;
4220}
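// Sketch of the indexing convention expected by callers of the returned VMaps
// (this mirrors the uses further below; CB stands for a hypothetical call
// being updated):
//   CallBase *CBClone =
//       J == 0 ? CB                                   // original copy
//              : cast<CallBase>((*VMaps[J - 1])[CB]); // clone number J
// Clone J maps to VMaps[J - 1] because the original copy has no value map.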
4221
4222// Locate the summary for F. This is complicated by the fact that it might
4223// have been internalized or promoted.
4224static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
4225                                      const ModuleSummaryIndex *ImportSummary) {
4226 // FIXME: Ideally we would retain the original GUID in some fashion on the
4227 // function (e.g. as metadata), but for now do our best to locate the
4228 // summary without that information.
4229 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
4230 if (!TheFnVI)
4231 // See if theFn was internalized, by checking index directly with
4232 // original name (this avoids the name adjustment done by getGUID() for
4233 // internal symbols).
4234 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName()));
4235 if (TheFnVI)
4236 return TheFnVI;
4237 // Now query with the original name before any promotion was performed.
4238 StringRef OrigName =
4239      ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
4240  std::string OrigId = GlobalValue::getGlobalIdentifier(
4241 OrigName, GlobalValue::InternalLinkage, M.getSourceFileName());
4242 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
4243 if (TheFnVI)
4244 return TheFnVI;
4245 // Could be a promoted local imported from another module. We need to pass
4246 // down more info here to find the original module id. For now, try with
4247 // the OrigName which might have been stored in the OidGuidMap in the
4248 // index. This would not work if there were same-named locals in multiple
4249 // modules, however.
4250 auto OrigGUID =
4251 ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName));
4252 if (OrigGUID)
4253 TheFnVI = ImportSummary->getValueInfo(OrigGUID);
4254 return TheFnVI;
4255}
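// Lookup fallback order, illustrated for a hypothetical local function "bar"
// defined in source file "a.cpp" (names are for illustration only):
//   1) the GUID of the current (possibly promoted/renamed) name;
//   2) the GUID of the plain IR name, in case "bar" was internalized;
//   3) the GUID of the global identifier built from the original
//      pre-promotion name, InternalLinkage and M.getSourceFileName();
//   4) the OidGuidMap via getGUIDFromOriginalID as a last resort.
// The first query that yields a ValueInfo is returned.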
4256
4257bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo(
4258 Module &M) {
4259 ICallAnalysis = std::make_unique<ICallPromotionAnalysis>();
4260 Symtab = std::make_unique<InstrProfSymtab>();
4261  // Don't add canonical names, to avoid adding multiple functions to the
4262  // symtab when they share the same root name once "." suffixes are stripped.
4263 // If we pick the wrong one then this could lead to incorrect ICP and calling
4264 // a memprof clone that we don't actually create (resulting in linker unsats).
4265 // What this means is that the GUID of the function (or its PGOFuncName
4266 // metadata) *must* match that in the VP metadata to allow promotion.
4267 // In practice this should not be a limitation, since local functions should
4268 // have PGOFuncName metadata and global function names shouldn't need any
4269 // special handling (they should not get the ".llvm.*" suffix that the
4270 // canonicalization handling is attempting to strip).
4271 if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) {
4272 std::string SymtabFailure = toString(std::move(E));
4273 M.getContext().emitError("Failed to create symtab: " + SymtabFailure);
4274 return false;
4275 }
4276 return true;
4277}
4278
4279bool MemProfContextDisambiguation::applyImport(Module &M) {
4280 assert(ImportSummary);
4281 bool Changed = false;
4282
4283 // We also need to clone any aliases that reference cloned functions, because
4284 // the modified callsites may invoke via the alias. Keep track of the aliases
4285 // for each function.
4286 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
4287 FuncToAliasMap;
4288 for (auto &A : M.aliases()) {
4289 auto *Aliasee = A.getAliaseeObject();
4290 if (auto *F = dyn_cast<Function>(Aliasee))
4291 FuncToAliasMap[F].insert(&A);
4292 }
4293
4294 if (!initializeIndirectCallPromotionInfo(M))
4295 return false;
4296
4297 for (auto &F : M) {
4298 if (F.isDeclaration() || isMemProfClone(F))
4299 continue;
4300
4301    OptimizationRemarkEmitter ORE(&F);
4302
4303    SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
4304 bool ClonesCreated = false;
4305 unsigned NumClonesCreated = 0;
4306 auto CloneFuncIfNeeded = [&](unsigned NumClones) {
4307 // We should at least have version 0 which is the original copy.
4308 assert(NumClones > 0);
4309 // If only one copy needed use original.
4310 if (NumClones == 1)
4311 return;
4312 // If we already performed cloning of this function, confirm that the
4313 // requested number of clones matches (the thin link should ensure the
4314 // number of clones for each constituent callsite is consistent within
4315 // each function), before returning.
4316 if (ClonesCreated) {
4317 assert(NumClonesCreated == NumClones);
4318 return;
4319 }
4320 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
4321 // The first "clone" is the original copy, which doesn't have a VMap.
4322 assert(VMaps.size() == NumClones - 1);
4323 Changed = true;
4324 ClonesCreated = true;
4325 NumClonesCreated = NumClones;
4326 };
4327
4328 auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
4329 Function *CalledFunction) {
4330 // Perform cloning if not yet done.
4331 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
4332
4333 assert(!isMemProfClone(*CalledFunction));
4334
4335 // Update the calls per the summary info.
4336 // Save orig name since it gets updated in the first iteration
4337 // below.
4338 auto CalleeOrigName = CalledFunction->getName();
4339 for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
4340 // Do nothing if this version calls the original version of its
4341 // callee.
4342 if (!StackNode.Clones[J])
4343 continue;
4344 auto NewF = M.getOrInsertFunction(
4345 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
4346 CalledFunction->getFunctionType());
4347 CallBase *CBClone;
4348 // Copy 0 is the original function.
4349 if (!J)
4350 CBClone = CB;
4351 else
4352 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
4353 CBClone->setCalledFunction(NewF);
4354 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
4355 << ore::NV("Call", CBClone) << " in clone "
4356 << ore::NV("Caller", CBClone->getFunction())
4357 << " assigned to call function clone "
4358 << ore::NV("Callee", NewF.getCallee()));
4359 }
4360 };
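    // Example of the summary-driven update performed by CloneCallsite, with
    // hypothetical clone numbers: StackNode.Clones == {0, 2, 1} means the
    // original copy of this callsite keeps calling the original callee, the
    // callsite in function clone 1 is redirected to clone 2 of the callee
    // (e.g. "<callee>.memprof.2" under the naming sketched earlier), and the
    // one in clone 2 to clone 1 of the callee; missing declarations are
    // created on demand via getOrInsertFunction.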
4361
4362 // Locate the summary for F.
4363 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
4364 // If not found, this could be an imported local (see comment in
4365 // findValueInfoForFunc). Skip for now as it will be cloned in its original
4366 // module (where it would have been promoted to global scope so should
4367 // satisfy any reference in this module).
4368 if (!TheFnVI)
4369 continue;
4370
4371 auto *GVSummary =
4372 ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
4373 if (!GVSummary) {
4374      // Must have been imported, use the summary which matches the definition
4375 // (might be multiple if this was a linkonce_odr).
4376 auto SrcModuleMD = F.getMetadata("thinlto_src_module");
4377 assert(SrcModuleMD &&
4378 "enable-import-metadata is needed to emit thinlto_src_module");
4379 StringRef SrcModule =
4380 dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
4381 for (auto &GVS : TheFnVI.getSummaryList()) {
4382 if (GVS->modulePath() == SrcModule) {
4383 GVSummary = GVS.get();
4384 break;
4385 }
4386 }
4387 assert(GVSummary && GVSummary->modulePath() == SrcModule);
4388 }
4389
4390 // If this was an imported alias skip it as we won't have the function
4391 // summary, and it should be cloned in the original module.
4392 if (isa<AliasSummary>(GVSummary))
4393 continue;
4394
4395 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
4396
4397 if (FS->allocs().empty() && FS->callsites().empty())
4398 continue;
4399
4400 auto SI = FS->callsites().begin();
4401 auto AI = FS->allocs().begin();
4402
4403 // To handle callsite infos synthesized for tail calls which have missing
4404 // frames in the profiled context, map callee VI to the synthesized callsite
4405 // info.
4406 DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
4407 // Iterate the callsites for this function in reverse, since we place all
4408 // those synthesized for tail calls at the end.
4409 for (auto CallsiteIt = FS->callsites().rbegin();
4410 CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
4411 auto &Callsite = *CallsiteIt;
4412 // Stop as soon as we see a non-synthesized callsite info (see comment
4413 // above loop). All the entries added for discovered tail calls have empty
4414 // stack ids.
4415 if (!Callsite.StackIdIndices.empty())
4416 break;
4417 MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
4418 }
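    // Example: a tail call in this function to some callee "C" whose frame was
    // missing from the profiled context gets a synthesized CallsiteInfo with
    // empty StackIdIndices, appended at the end of FS->callsites(). The reverse
    // walk above gathers those records into MapTailCallCalleeVIToCallsite,
    // keyed by the callee ValueInfo, so the tail-call handling further below
    // can look one up when it reaches the corresponding CallBase. ("C" is an
    // illustrative name only.)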
4419
4420 // Keeps track of needed ICP for the function.
4421 SmallVector<ICallAnalysisData> ICallAnalysisInfo;
4422
4423 // Assume for now that the instructions are in the exact same order
4424 // as when the summary was created, but confirm this is correct by
4425 // matching the stack ids.
4426 for (auto &BB : F) {
4427 for (auto &I : BB) {
4428 auto *CB = dyn_cast<CallBase>(&I);
4429 // Same handling as when creating module summary.
4430 if (!mayHaveMemprofSummary(CB))
4431 continue;
4432
4433 auto *CalledValue = CB->getCalledOperand();
4434 auto *CalledFunction = CB->getCalledFunction();
4435 if (CalledValue && !CalledFunction) {
4436 CalledValue = CalledValue->stripPointerCasts();
4437 // Stripping pointer casts can reveal a called function.
4438 CalledFunction = dyn_cast<Function>(CalledValue);
4439 }
4440 // Check if this is an alias to a function. If so, get the
4441 // called aliasee for the checks below.
4442 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
4443 assert(!CalledFunction &&
4444 "Expected null called function in callsite for alias");
4445 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
4446 }
4447
4448        CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
4449            I.getMetadata(LLVMContext::MD_callsite));
4450 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
4451
4452 // Include allocs that were already assigned a memprof function
4453 // attribute in the statistics.
4454 if (CB->getAttributes().hasFnAttr("memprof")) {
4455 assert(!MemProfMD);
4456 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
4457 ? AllocTypeColdThinBackend++
4458 : AllocTypeNotColdThinBackend++;
4459 OrigAllocsThinBackend++;
4460 AllocVersionsThinBackend++;
4461 if (!MaxAllocVersionsThinBackend)
4462 MaxAllocVersionsThinBackend = 1;
4463 continue;
4464 }
4465
4466 if (MemProfMD) {
4467 // Consult the next alloc node.
4468 assert(AI != FS->allocs().end());
4469 auto &AllocNode = *(AI++);
4470
4471 // Sanity check that the MIB stack ids match between the summary and
4472 // instruction metadata.
4473 auto MIBIter = AllocNode.MIBs.begin();
4474 for (auto &MDOp : MemProfMD->operands()) {
4475 assert(MIBIter != AllocNode.MIBs.end());
4476 LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter =
4477 MIBIter->StackIdIndices.begin();
4478 auto *MIBMD = cast<const MDNode>(MDOp);
4479 MDNode *StackMDNode = getMIBStackNode(MIBMD);
4480 assert(StackMDNode);
4481 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
4482 auto ContextIterBegin =
4483 StackContext.beginAfterSharedPrefix(CallsiteContext);
4484 // Skip the checking on the first iteration.
4485 uint64_t LastStackContextId =
4486 (ContextIterBegin != StackContext.end() &&
4487 *ContextIterBegin == 0)
4488 ? 1
4489 : 0;
4490 for (auto ContextIter = ContextIterBegin;
4491 ContextIter != StackContext.end(); ++ContextIter) {
4492 // If this is a direct recursion, simply skip the duplicate
4493 // entries, to be consistent with how the summary ids were
4494 // generated during ModuleSummaryAnalysis.
4495 if (LastStackContextId == *ContextIter)
4496 continue;
4497 LastStackContextId = *ContextIter;
4498 assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
4499 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
4500 *ContextIter);
4501 StackIdIndexIter++;
4502 }
4503 MIBIter++;
4504 }
4505
4506 // Perform cloning if not yet done.
4507 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());
4508
4509 OrigAllocsThinBackend++;
4510 AllocVersionsThinBackend += AllocNode.Versions.size();
4511 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
4512 MaxAllocVersionsThinBackend = AllocNode.Versions.size();
4513
4514          // If there is only one version, that means we didn't end up
4515          // considering this function for cloning, and in that case the alloc
4516          // will still be None type or should have gotten the default NotCold.
4517          // Skip it, but only after calling the clone helper above, since that
4518          // performs some sanity checks confirming we haven't yet decided that
4519          // cloning is needed. Note that we might have a single version that is
4520          // cold due to the MinClonedColdBytePercent heuristic, so make sure we
4521          // don't skip in that case.
4522 if (AllocNode.Versions.size() == 1 &&
4523 (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
4524            assert((AllocationType)AllocNode.Versions[0] ==
4525                       AllocationType::NotCold ||
4526                   (AllocationType)AllocNode.Versions[0] ==
4527                       AllocationType::None);
4528            UnclonableAllocsThinBackend++;
4529 continue;
4530 }
4531
4532 // All versions should have a singular allocation type.
4533 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
4534 return Type == ((uint8_t)AllocationType::NotCold |
4535 (uint8_t)AllocationType::Cold);
4536 }));
4537
4538 // Update the allocation types per the summary info.
4539 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
4540 // Ignore any that didn't get an assigned allocation type.
4541 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
4542 continue;
4543 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
4544 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
4545 : AllocTypeNotColdThinBackend++;
4546 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
4547 auto A = llvm::Attribute::get(F.getContext(), "memprof",
4548 AllocTypeString);
4549 CallBase *CBClone;
4550 // Copy 0 is the original function.
4551 if (!J)
4552 CBClone = CB;
4553 else
4554 // Since VMaps are only created for new clones, we index with
4555 // clone J-1 (J==0 is the original clone and does not have a VMaps
4556 // entry).
4557 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
4558 CBClone->addFnAttr(A);
4559 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
4560 << ore::NV("AllocationCall", CBClone) << " in clone "
4561 << ore::NV("Caller", CBClone->getFunction())
4562 << " marked with memprof allocation attribute "
4563 << ore::NV("Attribute", AllocTypeString));
4564 }
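          // For illustration, with hypothetical AllocNode.Versions == {NotCold,
          // Cold}: the original allocation call gets the attribute
          // "memprof"="notcold" while its counterpart in clone 1 (looked up
          // through VMaps[0]) gets "memprof"="cold"; any version left as None is
          // skipped entirely. (The attribute strings are whatever
          // getAllocTypeAttributeString returns for the respective types.)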
4565 } else if (!CallsiteContext.empty()) {
4566 if (!CalledFunction) {
4567#ifndef NDEBUG
4568 // We should have skipped inline assembly calls.
4569 auto *CI = dyn_cast<CallInst>(CB);
4570 assert(!CI || !CI->isInlineAsm());
4571#endif
4572 // We should have skipped direct calls via a Constant.
4573 assert(CalledValue && !isa<Constant>(CalledValue));
4574
4575 // This is an indirect call, see if we have profile information and
4576 // whether any clones were recorded for the profiled targets (that
4577 // we synthesized CallsiteInfo summary records for when building the
4578 // index).
4579 auto NumClones =
4580 recordICPInfo(CB, FS->callsites(), SI, ICallAnalysisInfo);
4581
4582 // Perform cloning if not yet done. This is done here in case
4583 // we don't need to do ICP, but might need to clone this
4584 // function as it is the target of other cloned calls.
4585 if (NumClones)
4586 CloneFuncIfNeeded(NumClones);
4587 }
4588
4589 else {
4590 // Consult the next callsite node.
4591 assert(SI != FS->callsites().end());
4592 auto &StackNode = *(SI++);
4593
4594#ifndef NDEBUG
4595 // Sanity check that the stack ids match between the summary and
4596 // instruction metadata.
4597 auto StackIdIndexIter = StackNode.StackIdIndices.begin();
4598 for (auto StackId : CallsiteContext) {
4599 assert(StackIdIndexIter != StackNode.StackIdIndices.end());
4600 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
4601 StackId);
4602 StackIdIndexIter++;
4603 }
4604#endif
4605
4606 CloneCallsite(StackNode, CB, CalledFunction);
4607 }
4608 } else if (CB->isTailCall() && CalledFunction) {
4609 // Locate the synthesized callsite info for the callee VI, if any was
4610 // created, and use that for cloning.
4611 ValueInfo CalleeVI =
4612 findValueInfoForFunc(*CalledFunction, M, ImportSummary);
4613 if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
4614 auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
4615 assert(Callsite != MapTailCallCalleeVIToCallsite.end());
4616 CloneCallsite(Callsite->second, CB, CalledFunction);
4617 }
4618 }
4619 }
4620 }
4621
4622 // Now do any promotion required for cloning.
4623 performICP(M, FS->callsites(), VMaps, ICallAnalysisInfo, ORE);
4624 }
4625
4626 // We skip some of the functions and instructions above, so remove all the
4627 // metadata in a single sweep here.
4628 for (auto &F : M) {
4629 // We can skip memprof clones because createFunctionClones already strips
4630 // the metadata from the newly created clones.
4631 if (F.isDeclaration() || isMemProfClone(F))
4632 continue;
4633 for (auto &BB : F) {
4634 for (auto &I : BB) {
4635 if (!isa<CallBase>(I))
4636 continue;
4637 I.setMetadata(LLVMContext::MD_memprof, nullptr);
4638 I.setMetadata(LLVMContext::MD_callsite, nullptr);
4639 }
4640 }
4641 }
4642
4643 return Changed;
4644}
4645
4646unsigned MemProfContextDisambiguation::recordICPInfo(
4647 CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
4648    ArrayRef<CallsiteInfo>::iterator &SI,
4649    SmallVector<ICallAnalysisData> &ICallAnalysisInfo) {
4650 // First see if we have profile information for this indirect call.
4651 uint32_t NumCandidates;
4652 uint64_t TotalCount;
4653 auto CandidateProfileData =
4654 ICallAnalysis->getPromotionCandidatesForInstruction(CB, TotalCount,
4655 NumCandidates);
4656 if (CandidateProfileData.empty())
4657 return 0;
4658
4659 // Iterate through all of the candidate profiled targets along with the
4660 // CallsiteInfo summary records synthesized for them when building the index,
4661 // and see if any are cloned and/or refer to clones.
4662 bool ICPNeeded = false;
4663 unsigned NumClones = 0;
4664 size_t CallsiteInfoStartIndex = std::distance(AllCallsites.begin(), SI);
4665 for (const auto &Candidate : CandidateProfileData) {
4666#ifndef NDEBUG
4667 auto CalleeValueInfo =
4668#endif
4669 ImportSummary->getValueInfo(Candidate.Value);
4670 // We might not have a ValueInfo if this is a distributed
4671    // ThinLTO backend and we decided not to import that function.
4672 assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
4673 assert(SI != AllCallsites.end());
4674 auto &StackNode = *(SI++);
4675 // See if any of the clones of the indirect callsite for this
4676 // profiled target should call a cloned version of the profiled
4677 // target. We only need to do the ICP here if so.
4678 ICPNeeded |= llvm::any_of(StackNode.Clones,
4679 [](unsigned CloneNo) { return CloneNo != 0; });
4680 // Every callsite in the same function should have been cloned the same
4681 // number of times.
4682 assert(!NumClones || NumClones == StackNode.Clones.size());
4683 NumClones = StackNode.Clones.size();
4684 }
4685 if (!ICPNeeded)
4686 return NumClones;
4687 // Save information for ICP, which is performed later to avoid messing up the
4688 // current function traversal.
4689 ICallAnalysisInfo.push_back({CB, CandidateProfileData.vec(), NumCandidates,
4690 TotalCount, CallsiteInfoStartIndex});
4691 return NumClones;
4692}
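// Sketch of the decision made above, with hypothetical data: for an indirect
// callsite whose two profiled targets have consecutive CallsiteInfo records
// carrying Clones == {0, 1} and Clones == {0, 0}, ICPNeeded becomes true
// (at least one clone number is nonzero), NumClones is 2, and the candidate
// list plus the starting callsite index are queued in ICallAnalysisInfo so
// the promotion itself can run after the function walk completes.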
4693
4694void MemProfContextDisambiguation::performICP(
4695 Module &M, ArrayRef<CallsiteInfo> AllCallsites,
4696 ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
4697 ArrayRef<ICallAnalysisData> ICallAnalysisInfo,
4698    OptimizationRemarkEmitter &ORE) {
4699  // Now do any promotion required for cloning. Specifically, for each
4700 // recorded ICP candidate (which was only recorded because one clone of that
4701 // candidate should call a cloned target), we perform ICP (speculative
4702 // devirtualization) for each clone of the callsite, and update its callee
4703 // to the appropriate clone. Note that the ICP compares against the original
4704 // version of the target, which is what is in the vtable.
4705 for (auto &Info : ICallAnalysisInfo) {
4706 auto *CB = Info.CB;
4707 auto CallsiteIndex = Info.CallsiteInfoStartIndex;
4708 auto TotalCount = Info.TotalCount;
4709 unsigned NumPromoted = 0;
4710 unsigned NumClones = 0;
4711
4712 for (auto &Candidate : Info.CandidateProfileData) {
4713 auto &StackNode = AllCallsites[CallsiteIndex++];
4714
4715 // All calls in the same function must have the same number of clones.
4716 assert(!NumClones || NumClones == StackNode.Clones.size());
4717 NumClones = StackNode.Clones.size();
4718
4719 // See if the target is in the module. If it wasn't imported, it is
4720 // possible that this profile could have been collected on a different
4721 // target (or version of the code), and we need to be conservative
4722 // (similar to what is done in the ICP pass).
4723 Function *TargetFunction = Symtab->getFunction(Candidate.Value);
4724 if (TargetFunction == nullptr ||
4725 // Any ThinLTO global dead symbol removal should have already
4726 // occurred, so it should be safe to promote when the target is a
4727 // declaration.
4728 // TODO: Remove internal option once more fully tested.
4729          (MemProfRequireDefinitionForPromotion &&
4730           TargetFunction->isDeclaration())) {
4731 ORE.emit([&]() {
4732 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB)
4733 << "Memprof cannot promote indirect call: target with md5sum "
4734 << ore::NV("target md5sum", Candidate.Value) << " not found";
4735 });
4736 // FIXME: See if we can use the new declaration importing support to
4737 // at least get the declarations imported for this case. Hot indirect
4738 // targets should have been imported normally, however.
4739 continue;
4740 }
4741
4742 // Check if legal to promote
4743 const char *Reason = nullptr;
4744 if (!isLegalToPromote(*CB, TargetFunction, &Reason)) {
4745 ORE.emit([&]() {
4746 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
4747 << "Memprof cannot promote indirect call to "
4748 << ore::NV("TargetFunction", TargetFunction)
4749 << " with count of " << ore::NV("TotalCount", TotalCount)
4750 << ": " << Reason;
4751 });
4752 continue;
4753 }
4754
4755 assert(!isMemProfClone(*TargetFunction));
4756
4757 // Handle each call clone, applying ICP so that each clone directly
4758 // calls the specified callee clone, guarded by the appropriate ICP
4759 // check.
4760 CallBase *CBClone = CB;
4761 for (unsigned J = 0; J < NumClones; J++) {
4762 // Copy 0 is the original function.
4763 if (J > 0)
4764 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
4765 // We do the promotion using the original name, so that the comparison
4766 // is against the name in the vtable. Then just below, change the new
4767 // direct call to call the cloned function.
4768 auto &DirectCall =
4769 pgo::promoteIndirectCall(*CBClone, TargetFunction, Candidate.Count,
4770 TotalCount, isSamplePGO, &ORE);
4771 auto *TargetToUse = TargetFunction;
4772 // Call original if this version calls the original version of its
4773 // callee.
4774 if (StackNode.Clones[J]) {
4775 TargetToUse =
4776 cast<Function>(M.getOrInsertFunction(
4777 getMemProfFuncName(TargetFunction->getName(),
4778 StackNode.Clones[J]),
4779 TargetFunction->getFunctionType())
4780 .getCallee());
4781 }
4782 DirectCall.setCalledFunction(TargetToUse);
4783 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
4784 << ore::NV("Call", CBClone) << " in clone "
4785 << ore::NV("Caller", CBClone->getFunction())
4786 << " promoted and assigned to call function clone "
4787 << ore::NV("Callee", TargetToUse));
4788 }
4789
4790 // Update TotalCount (all clones should get same count above)
4791 TotalCount -= Candidate.Count;
4792 NumPromoted++;
4793 }
4794 // Adjust the MD.prof metadata for all clones, now that we have the new
4795 // TotalCount and the number promoted.
4796 CallBase *CBClone = CB;
4797 for (unsigned J = 0; J < NumClones; J++) {
4798 // Copy 0 is the original function.
4799 if (J > 0)
4800 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
4801 // First delete the old one.
4802 CBClone->setMetadata(LLVMContext::MD_prof, nullptr);
4803 // If all promoted, we don't need the MD.prof metadata.
4804        // Otherwise we need to update it with the un-promoted records.
4805 if (TotalCount != 0)
4806        annotateValueSite(
4807            M, *CBClone, ArrayRef(Info.CandidateProfileData).slice(NumPromoted),
4808 TotalCount, IPVK_IndirectCallTarget, Info.NumCandidates);
4809 }
4810 }
4811}
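// Net effect on one clone, sketched with illustrative names: an indirect call
// through %fp with profiled target "T" becomes, after promoteIndirectCall plus
// the setCalledFunction call above, roughly
//   if (%fp == @T)                ; compare against the original vtable symbol
//     call @T.memprof.1(...)      ; direct call redirected to the assigned clone
//   else
//     call %fp(...)               ; fallthrough keeps the indirect call
// with !prof metadata on the remaining indirect call rebuilt from the
// un-promoted candidates whenever TotalCount is still nonzero.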
4812
4813template <typename DerivedCCG, typename FuncTy, typename CallTy>
4814bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
4815 if (DumpCCG) {
4816 dbgs() << "CCG before cloning:\n";
4817 dbgs() << *this;
4818 }
4819 if (ExportToDot)
4820 exportToDot("postbuild");
4821
4822 if (VerifyCCG) {
4823 check();
4824 }
4825
4826 identifyClones();
4827
4828 if (VerifyCCG) {
4829 check();
4830 }
4831
4832 if (DumpCCG) {
4833 dbgs() << "CCG after cloning:\n";
4834 dbgs() << *this;
4835 }
4836 if (ExportToDot)
4837 exportToDot("cloned");
4838
4839 bool Changed = assignFunctions();
4840
4841 if (DumpCCG) {
4842 dbgs() << "CCG after assigning function clones:\n";
4843 dbgs() << *this;
4844 }
4845 if (ExportToDot)
4846 exportToDot("clonefuncassign");
4847
4848  if (MemProfReportHintedSizes)
4849    printTotalSizes(errs());
4850
4851 return Changed;
4852}
4853
4854bool MemProfContextDisambiguation::processModule(
4855 Module &M,
4856    llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
4857
4858 // If we have an import summary, then the cloning decisions were made during
4859 // the thin link on the index. Apply them and return.
4860 if (ImportSummary)
4861 return applyImport(M);
4862
4863 // TODO: If/when other types of memprof cloning are enabled beyond just for
4864 // hot and cold, we will need to change this to individually control the
4865 // AllocationType passed to addStackNodesForMIB during CCG construction.
4866 // Note that we specifically check this after applying imports above, so that
4867 // the option isn't needed to be passed to distributed ThinLTO backend
4868 // clang processes, which won't necessarily have visibility into the linker
4869 // dependences. Instead the information is communicated from the LTO link to
4870 // the backends via the combined summary index.
4871 if (!SupportsHotColdNew)
4872 return false;
4873
4874 ModuleCallsiteContextGraph CCG(M, OREGetter);
4875 return CCG.process();
4876}
4877
4878MemProfContextDisambiguation::MemProfContextDisambiguation(
4879    const ModuleSummaryIndex *Summary, bool isSamplePGO)
4880 : ImportSummary(Summary), isSamplePGO(isSamplePGO) {
4881 if (ImportSummary) {
4882 // The MemProfImportSummary should only be used for testing ThinLTO
4883 // distributed backend handling via opt, in which case we don't have a
4884 // summary from the pass pipeline.
4885    assert(MemProfImportSummary.empty());
4886    return;
4887 }
4888 if (MemProfImportSummary.empty())
4889 return;
4890
4891 auto ReadSummaryFile =
4892      errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
4893  if (!ReadSummaryFile) {
4894 logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
4895 "Error loading file '" + MemProfImportSummary +
4896 "': ");
4897 return;
4898 }
4899 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
4900 if (!ImportSummaryForTestingOrErr) {
4901 logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
4902 "Error parsing file '" + MemProfImportSummary +
4903 "': ");
4904 return;
4905 }
4906 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
4907 ImportSummary = ImportSummaryForTesting.get();
4908}
4909
4910PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
4911                                                    ModuleAnalysisManager &AM) {
4912  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
4913 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
4914    return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
4915  };
4916 if (!processModule(M, OREGetter))
4917 return PreservedAnalyses::all();
4918 return PreservedAnalyses::none();
4919}
4920
4921void MemProfContextDisambiguation::run(
4922    ModuleSummaryIndex &Index,
4923    llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
4924        isPrevailing) {
4925 // TODO: If/when other types of memprof cloning are enabled beyond just for
4926 // hot and cold, we will need to change this to individually control the
4927 // AllocationType passed to addStackNodesForMIB during CCG construction.
4928 // The index was set from the option, so these should be in sync.
4929 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
4930 if (!SupportsHotColdNew)
4931 return;
4932
4933 IndexCallsiteContextGraph CCG(Index, isPrevailing);
4934 CCG.process();
4935}