1//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements support for context disambiguation of allocation
10// calls for profile guided heap optimization. Specifically, it uses Memprof
11// profiles which indicate context specific allocation behavior (currently
12// distinguishing cold vs hot memory allocations). Cloning is performed to
13// expose the cold allocation call contexts, and the allocation calls are
14// subsequently annotated with an attribute for later transformation.
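//
// For example (an illustrative sketch only, not tied to a specific profile):
// if the MemProf profile shows that the allocation below is cold only when
// reached from bar(), cloning can give that context its own copy of makeBuf()
// whose allocation call is annotated with a cold hint for a hot/cold-aware
// allocator:
//
//   char *makeBuf() { return new char[1024]; } // hot via foo(), cold via bar()
//   void foo() { consumeNow(makeBuf()); }      // keeps calling makeBuf()
//   void bar() { stash(makeBuf()); }           // redirected to the cold clone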
15//
16// The transformations can be performed either directly on IR (regular LTO), or
17// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18// Both types of LTO operate on the same base graph representation, which
19// uses CRTP to support either IR or Index formats.
20//
21//===----------------------------------------------------------------------===//
22
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/ADT/DenseSet.h"
26#include "llvm/ADT/MapVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
37#include "llvm/IR/Module.h"
39#include "llvm/Pass.h"
43#include "llvm/Transforms/IPO.h"
47#include <deque>
48#include <sstream>
49#include <unordered_map>
50#include <vector>
51using namespace llvm;
52using namespace llvm::memprof;
53
54#define DEBUG_TYPE "memprof-context-disambiguation"
55
56STATISTIC(FunctionClonesAnalysis,
57 "Number of function clones created during whole program analysis");
58STATISTIC(FunctionClonesThinBackend,
59 "Number of function clones created during ThinLTO backend");
60STATISTIC(FunctionsClonedThinBackend,
61 "Number of functions that had clones created during ThinLTO backend");
62STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
63 "cloned) during whole program analysis");
64STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
65 "during whole program analysis");
66STATISTIC(AllocTypeNotColdThinBackend,
67 "Number of not cold static allocations (possibly cloned) during "
68 "ThinLTO backend");
69STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
70 "(possibly cloned) during ThinLTO backend");
71STATISTIC(OrigAllocsThinBackend,
72 "Number of original (not cloned) allocations with memprof profiles "
73 "during ThinLTO backend");
74STATISTIC(
75 AllocVersionsThinBackend,
76 "Number of allocation versions (including clones) during ThinLTO backend");
77STATISTIC(MaxAllocVersionsThinBackend,
78 "Maximum number of allocation versions created for an original "
79 "allocation during ThinLTO backend");
80STATISTIC(UnclonableAllocsThinBackend,
81 "Number of unclonable ambigous allocations during ThinLTO backend");
82STATISTIC(RemovedEdgesWithMismatchedCallees,
83 "Number of edges removed due to mismatched callees (profiled vs IR)");
84STATISTIC(FoundProfiledCalleeCount,
85 "Number of profiled callees found via tail calls");
86STATISTIC(FoundProfiledCalleeDepth,
87 "Aggregate depth of profiled callees found via tail calls");
88STATISTIC(FoundProfiledCalleeMaxDepth,
89 "Maximum depth of profiled callees found via tail calls");
90STATISTIC(FoundProfiledCalleeNonUniquelyCount,
91 "Number of profiled callees found via multiple tail call chains");
92
94 "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
95 cl::value_desc("filename"),
96 cl::desc("Specify the path prefix of the MemProf dot files."));
97
98static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
100 cl::desc("Export graph to dot files."));
101
102static cl::opt<bool>
103 DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
104 cl::desc("Dump CallingContextGraph to stdout after each stage."));
105
106static cl::opt<bool>
107 VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
108 cl::desc("Perform verification checks on CallingContextGraph."));
109
110static cl::opt<bool>
111 VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
112 cl::desc("Perform frequent verification checks on nodes."));
113
115 "memprof-import-summary",
116 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
117 cl::Hidden);
118
119static cl::opt<unsigned>
120 TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
121 cl::Hidden,
122 cl::desc("Max depth to recursively search for missing "
123 "frames through tail calls."));
124
125// Optionally enable cloning of callsites involved in recursive cycles.
126static cl::opt<bool> AllowRecursiveCallsites(
127 "memprof-allow-recursive-callsites", cl::init(false), cl::Hidden,
128 cl::desc("Allow cloning of callsites involved in recursive cycles"));
129
130// When disabled, try to detect and prevent cloning of recursive contexts.
131// This is only necessary until we support cloning through recursive cycles.
132// Leave on by default for now, as disabling requires a little bit of compile
133 // time overhead. Keeping it on does not affect correctness; it just inflates
134 // the cold hinted bytes reported a bit when -memprof-report-hinted-sizes is enabled.
136 "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden,
137 cl::desc("Allow cloning of contexts through recursive cycles"));
138
139namespace llvm {
141 "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
142 cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
143
144// Indicate we are linking with an allocator that supports hot/cold operator
145// new interfaces.
147 "supports-hot-cold-new", cl::init(false), cl::Hidden,
148 cl::desc("Linking with hot/cold operator new interfaces"));
149
151 "memprof-require-definition-for-promotion", cl::init(false), cl::Hidden,
152 cl::desc(
153 "Require target function definition when promoting indirect calls"));
154} // namespace llvm
155
156extern cl::opt<bool> MemProfReportHintedSizes;
157extern cl::opt<unsigned> MinClonedColdBytePercent;
158
159namespace {
160/// CRTP base for graphs built from either IR or ThinLTO summary index.
161///
162/// The graph represents the call contexts in all memprof metadata on allocation
163/// calls, with nodes for the allocations themselves, as well as for the calls
164/// in each context. The graph is initially built from the allocation memprof
165/// metadata (or summary) MIBs. It is then updated to match calls that have
166/// callsite metadata onto the corresponding nodes, reflecting any inlining
167/// performed on those calls.
168///
169/// Each MIB (representing an allocation's call context with allocation
170/// behavior) is assigned a unique context id during the graph build. The edges
171/// and nodes in the graph are decorated with the context ids they carry. This
172/// is used to correctly update the graph when cloning is performed so that we
173/// can uniquify the context for a single (possibly cloned) allocation.
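///
/// Illustrative sketch (hypothetical contexts, not from a real profile): for
/// an allocation A reached via two contexts main->B->A (cold, assigned context
/// id 1) and main->C->A (notcold, assigned context id 2), the graph contains
/// an allocation node for A, stack nodes for B, C and main, and caller edges
///
///   [A] <--{1}-- [B] <--{1}-- [main]
///   [A] <--{2}-- [C] <--{2}-- [main]
///
/// where main is a single node whose two incident edges carry {1} and {2}
/// respectively. Cloning later separates the id 1 paths from the id 2 paths so
/// that the cold context gets its own clone of the allocation.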
174template <typename DerivedCCG, typename FuncTy, typename CallTy>
175class CallsiteContextGraph {
176public:
177 CallsiteContextGraph() = default;
178 CallsiteContextGraph(const CallsiteContextGraph &) = default;
179 CallsiteContextGraph(CallsiteContextGraph &&) = default;
180
181 /// Main entry point to perform analysis and transformations on graph.
182 bool process();
183
184 /// Perform cloning on the graph necessary to uniquely identify the allocation
185 /// behavior of an allocation based on its context.
186 void identifyClones();
187
188 /// Assign callsite clones to functions, cloning functions as needed to
189 /// accommodate the combinations of their callsite clones reached by callers.
190 /// For regular LTO this clones functions and callsites in the IR, but for
191 /// ThinLTO the cloning decisions are noted in the summaries and later applied
192 /// in applyImport.
193 bool assignFunctions();
194
195 void dump() const;
196 void print(raw_ostream &OS) const;
197 void printTotalSizes(raw_ostream &OS) const;
198
199 friend raw_ostream &operator<<(raw_ostream &OS,
200 const CallsiteContextGraph &CCG) {
201 CCG.print(OS);
202 return OS;
203 }
204
205 friend struct GraphTraits<
206 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
207 friend struct DOTGraphTraits<
208 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
209
210 void exportToDot(std::string Label) const;
211
212 /// Represents a function clone via FuncTy pointer and clone number pair.
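  /// For example (illustrative only): FuncInfo(&F, 2) identifies the second
  /// clone of F (clone number 0 is the original function), while a default
  /// constructed FuncInfo holds a null function and converts to false:
  ///   FuncInfo FI(&F, /*CloneNo=*/2);
  ///   if (FI) { use(FI.func(), FI.cloneNo()); }  // use() is hypothetical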
213 struct FuncInfo final
214 : public std::pair<FuncTy *, unsigned /*Clone number*/> {
215 using Base = std::pair<FuncTy *, unsigned>;
216 FuncInfo(const Base &B) : Base(B) {}
217 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
218 explicit operator bool() const { return this->first != nullptr; }
219 FuncTy *func() const { return this->first; }
220 unsigned cloneNo() const { return this->second; }
221 };
222
223 /// Represents a callsite clone via CallTy and clone number pair.
224 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
225 using Base = std::pair<CallTy, unsigned>;
226 CallInfo(const Base &B) : Base(B) {}
227 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
228 : Base(Call, CloneNo) {}
229 explicit operator bool() const { return (bool)this->first; }
230 CallTy call() const { return this->first; }
231 unsigned cloneNo() const { return this->second; }
232 void setCloneNo(unsigned N) { this->second = N; }
233 void print(raw_ostream &OS) const {
234 if (!operator bool()) {
235 assert(!cloneNo());
236 OS << "null Call";
237 return;
238 }
239 call()->print(OS);
240 OS << "\t(clone " << cloneNo() << ")";
241 }
242 void dump() const {
243 print(dbgs());
244 dbgs() << "\n";
245 }
246 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
247 Call.print(OS);
248 return OS;
249 }
250 };
251
252 struct ContextEdge;
253
254 /// Node in the Callsite Context Graph
255 struct ContextNode {
256 // Keep this for now since in the IR case, where we have an Instruction*, it
257 // is not as immediately discoverable. Used for printing richer information
258 // when dumping the graph.
259 bool IsAllocation;
260
261 // Keeps track of when the Call was reset to null because there was
262 // recursion.
263 bool Recursive = false;
264
265 // This will be formed by ORing together the AllocationType enum values
266 // for contexts including this node.
267 uint8_t AllocTypes = 0;
268
269 // The corresponding allocation or interior call. This is the primary call
270 // for which we have created this node.
271 CallInfo Call;
272
273 // List of other calls that can be treated the same as the primary call
274 // through cloning. I.e. located in the same function and have the same
275 // (possibly pruned) stack ids. They will be updated the same way as the
276 // primary call when assigning to function clones.
277 SmallVector<CallInfo, 0> MatchingCalls;
278
279 // For alloc nodes this is a unique id assigned when constructed, and for
280 // callsite stack nodes it is the original stack id when the node is
281 // constructed from the memprof MIB metadata on the alloc nodes. Note that
282 // this is only used when matching callsite metadata onto the stack nodes
283 // created when processing the allocation memprof MIBs, and for labeling
284 // nodes in the dot graph. Therefore we don't bother to assign a value for
285 // clones.
286 uint64_t OrigStackOrAllocId = 0;
287
288 // Edges to all callees in the profiled call stacks.
289 // TODO: Should this be a map (from Callee node) for more efficient lookup?
290 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
291
292 // Edges to all callers in the profiled call stacks.
293 // TODO: Should this be a map (from Caller node) for more efficient lookup?
294 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
295
296 // Get the list of edges from which we can compute allocation information
297 // such as the context ids and allocation type of this node.
298 const std::vector<std::shared_ptr<ContextEdge>> *
299 getEdgesWithAllocInfo() const {
300 // If node has any callees, compute from those, otherwise compute from
301 // callers (i.e. if this is the leaf allocation node).
302 if (!CalleeEdges.empty())
303 return &CalleeEdges;
304 if (!CallerEdges.empty()) {
305 // A node with caller edges but no callee edges must be the allocation
306 // node.
307 assert(IsAllocation);
308 return &CallerEdges;
309 }
310 return nullptr;
311 }
312
313 // Compute the context ids for this node from the union of its edge context
314 // ids.
315 DenseSet<uint32_t> getContextIds() const {
316 DenseSet<uint32_t> ContextIds;
317 auto *Edges = getEdgesWithAllocInfo();
318 if (!Edges)
319 return {};
320 unsigned Count = 0;
321 for (auto &Edge : *Edges)
322 Count += Edge->getContextIds().size();
323 ContextIds.reserve(Count);
324 for (auto &Edge : *Edges)
325 ContextIds.insert(Edge->getContextIds().begin(),
326 Edge->getContextIds().end());
327 return ContextIds;
328 }
329
330 // Compute the allocation type for this node from the OR of its edge
331 // allocation types.
332 uint8_t computeAllocType() const {
333 auto *Edges = getEdgesWithAllocInfo();
334 if (!Edges)
335 return (uint8_t)AllocationType::None;
336 uint8_t BothTypes =
337 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
338 uint8_t AllocType = (uint8_t)AllocationType::None;
339 for (auto &Edge : *Edges) {
340 AllocType |= Edge->AllocTypes;
341 // Bail early if alloc type reached both, no further refinement.
342 if (AllocType == BothTypes)
343 return AllocType;
344 }
345 return AllocType;
346 }
347
348 // The context ids set for this node is empty if its edge context ids are
349 // also all empty.
350 bool emptyContextIds() const {
351 auto *Edges = getEdgesWithAllocInfo();
352 if (!Edges)
353 return true;
354 for (auto &Edge : *Edges) {
355 if (!Edge->getContextIds().empty())
356 return false;
357 }
358 return true;
359 }
360
361 // List of clones of this ContextNode, initially empty.
362 std::vector<ContextNode *> Clones;
363
364 // If a clone, points to the original uncloned node.
365 ContextNode *CloneOf = nullptr;
366
367 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
368
369 ContextNode(bool IsAllocation, CallInfo C)
370 : IsAllocation(IsAllocation), Call(C) {}
371
372 void addClone(ContextNode *Clone) {
373 if (CloneOf) {
374 CloneOf->Clones.push_back(Clone);
375 Clone->CloneOf = CloneOf;
376 } else {
377 Clones.push_back(Clone);
378 assert(!Clone->CloneOf);
379 Clone->CloneOf = this;
380 }
381 }
382
383 ContextNode *getOrigNode() {
384 if (!CloneOf)
385 return this;
386 return CloneOf;
387 }
388
389 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
390 unsigned int ContextId);
391
392 ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
393 ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
394 void eraseCalleeEdge(const ContextEdge *Edge);
395 void eraseCallerEdge(const ContextEdge *Edge);
396
397 void setCall(CallInfo C) { Call = C; }
398
399 bool hasCall() const { return (bool)Call.call(); }
400
401 void printCall(raw_ostream &OS) const { Call.print(OS); }
402
403 // True if this node was effectively removed from the graph, in which case
404 // it should have an allocation type of None and empty context ids.
405 bool isRemoved() const {
406 assert((AllocTypes == (uint8_t)AllocationType::None) ==
407 emptyContextIds());
408 return AllocTypes == (uint8_t)AllocationType::None;
409 }
410
411 void dump() const;
412 void print(raw_ostream &OS) const;
413
414 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
415 Node.print(OS);
416 return OS;
417 }
418 };
419
420 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
421 /// callee.
422 struct ContextEdge {
423 ContextNode *Callee;
424 ContextNode *Caller;
425
426 // This will be formed by ORing together the AllocationType enum values
427 // for contexts including this edge.
428 uint8_t AllocTypes = 0;
429
430 // The set of IDs for contexts including this edge.
431 DenseSet<uint32_t> ContextIds;
432
433 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
434 DenseSet<uint32_t> ContextIds)
435 : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
436 ContextIds(std::move(ContextIds)) {}
437
438 DenseSet<uint32_t> &getContextIds() { return ContextIds; }
439
440 // Helper to clear the fields of this edge when we are removing it from the
441 // graph.
442 inline void clear() {
443 ContextIds.clear();
444 AllocTypes = (uint8_t)AllocationType::None;
445 Caller = nullptr;
446 Callee = nullptr;
447 }
448
449 // Check if edge was removed from the graph. This is useful while iterating
450 // over a copy of edge lists when performing operations that mutate the
451 // graph in ways that might remove one of the edges.
452 inline bool isRemoved() const {
453 if (Callee || Caller)
454 return false;
455 // Any edges that have been removed from the graph but are still in a
456 // shared_ptr somewhere should have all fields null'ed out by clear()
457 // above.
458 assert(AllocTypes == (uint8_t)AllocationType::None);
459 assert(ContextIds.empty());
460 return true;
461 }
462
463 void dump() const;
464 void print(raw_ostream &OS) const;
465
466 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
467 Edge.print(OS);
468 return OS;
469 }
470 };
471
472 /// Helpers to remove edges that have allocation type None (due to not
473 /// carrying any context ids) after transformations.
474 void removeNoneTypeCalleeEdges(ContextNode *Node);
475 void removeNoneTypeCallerEdges(ContextNode *Node);
476 void
477 recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
478 DenseSet<const ContextNode *> &Visited);
479
480protected:
481 /// Get a list of nodes corresponding to the stack ids in the given callsite
482 /// context.
483 template <class NodeT, class IteratorT>
484 std::vector<uint64_t>
485 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
486
487 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
488 /// metadata (or summary).
489 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
490
491 /// Adds nodes for the given MIB stack ids.
492 template <class NodeT, class IteratorT>
493 void addStackNodesForMIB(ContextNode *AllocNode,
494 CallStack<NodeT, IteratorT> &StackContext,
495 CallStack<NodeT, IteratorT> &CallsiteContext,
496 AllocationType AllocType,
497 ArrayRef<ContextTotalSize> ContextSizeInfo);
498
499 /// Matches all callsite metadata (or summary) to the nodes created for
500 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
501 /// inlining performed on those callsite instructions.
502 void updateStackNodes();
503
504 /// Update graph to conservatively handle any callsite stack nodes that target
505 /// multiple different callee target functions.
506 void handleCallsitesWithMultipleTargets();
507
508 // Try to partition calls on the given node (already placed into the AllCalls
509 // array) by callee function, creating new copies of Node as needed to hold
510 // calls with different callees, and moving the callee edges appropriately.
511 // Returns true if partitioning was successful.
512 bool partitionCallsByCallee(
513 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
514 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode);
515
516 /// Save lists of calls with MemProf metadata in each function, for faster
517 /// iteration.
518 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
519
520 /// Map from callsite node to the enclosing caller function.
521 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
522
523private:
524 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
525
526 // Structure to keep track of information for each call as we are matching
527 // non-allocation callsites onto context nodes created from the allocation
528 // call metadata / summary contexts.
529 struct CallContextInfo {
530 // The callsite we're trying to match.
531 CallTy Call;
532 // The callsite's stack ids that have a context node in the graph.
533 std::vector<uint64_t> StackIds;
534 // The function containing this callsite.
535 const FuncTy *Func;
536 // Initially empty, if needed this will be updated to contain the context
537 // ids for use in a new context node created for this callsite.
538 DenseSet<uint32_t> ContextIds;
539 };
540
541 /// Helper to remove edge from graph, updating edge iterator if it is provided
542 /// (in which case CalleeIter indicates which edge list is being iterated).
543 /// This will also perform the necessary clearing of the ContextEdge members
544 /// to enable later checking if the edge has been removed (since we may have
545 /// other copies of the shared_ptr in existence, and in fact rely on this to
546 /// enable removal while iterating over a copy of a node's edge list).
547 void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr,
548 bool CalleeIter = true);
549
550 /// Assigns the given Node to calls at or inlined into the location with
551 /// the Node's stack id, after post order traversing and processing its
552 /// caller nodes. Uses the call information recorded in the given
553 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
554 /// as needed. Called by updateStackNodes which sets up the given
555 /// StackIdToMatchingCalls map.
556 void assignStackNodesPostOrder(
557 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
558 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
559 DenseMap<CallInfo, CallInfo> &CallToMatchingCall);
560
561 /// Duplicates the given set of context ids, updating the provided
562 /// map from each original id with the newly generated context ids,
563 /// and returning the new duplicated id set.
564 DenseSet<uint32_t> duplicateContextIds(
565 const DenseSet<uint32_t> &StackSequenceContextIds,
566 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
567
568 /// Propagates all duplicated context ids across the graph.
569 void propagateDuplicateContextIds(
570 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
571
572 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
573 /// else to its callers. Also updates OrigNode's edges to remove any context
574 /// ids moved to the newly created edge.
575 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
576 bool TowardsCallee,
577 DenseSet<uint32_t> RemainingContextIds);
578
579 /// Get the stack id corresponding to the given Id or Index (for IR this will
580 /// return itself, for a summary index this will return the id recorded in the
581 /// index for that stack id index value).
582 uint64_t getStackId(uint64_t IdOrIndex) const {
583 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
584 }
585
586 /// Returns true if the given call targets the callee of the given edge, or if
587 /// we were able to identify the call chain through intermediate tail calls.
588 /// In the latter case new context nodes are added to the graph for the
589 /// identified tail calls, and their synthesized nodes are added to
590 /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
591 /// the updated edges and to prepare it for an increment in the caller.
592 bool
593 calleesMatch(CallTy Call, EdgeIter &EI,
594 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
595
596 // Return the callee function of the given call, or nullptr if it can't be
597 // determined
598 const FuncTy *getCalleeFunc(CallTy Call) {
599 return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call);
600 }
601
602 /// Returns true if the given call targets the given function, or if we were
603 /// able to identify the call chain through intermediate tail calls (in which
604 /// case FoundCalleeChain will be populated).
605 bool calleeMatchesFunc(
606 CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
607 std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
608 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
609 Call, Func, CallerFunc, FoundCalleeChain);
610 }
611
612 /// Returns true if both call instructions have the same callee.
613 bool sameCallee(CallTy Call1, CallTy Call2) {
614 return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2);
615 }
616
617 /// Get a list of nodes corresponding to the stack ids in the given
618 /// callsite's context.
619 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
620 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
621 Call);
622 }
623
624 /// Get the last stack id in the context for callsite.
625 uint64_t getLastStackId(CallTy Call) {
626 return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
627 }
628
629 /// Update the allocation call to record type of allocated memory.
630 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
631 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
632 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
633 }
634
635 /// Get the AllocationType assigned to the given allocation instruction clone.
636 AllocationType getAllocationCallType(const CallInfo &Call) const {
637 return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
638 }
639
640 /// Update non-allocation call to invoke (possibly cloned) function
641 /// CalleeFunc.
642 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
643 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
644 }
645
646 /// Clone the given function for the given callsite, recording mapping of all
647 /// of the function's tracked calls to their new versions in the CallMap.
648 /// Assigns new clones to clone number CloneNo.
649 FuncInfo cloneFunctionForCallsite(
650 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
651 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
652 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
653 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
654 }
655
656 /// Gets a label to use in the dot graph for the given call clone in the given
657 /// function.
658 std::string getLabel(const FuncTy *Func, const CallTy Call,
659 unsigned CloneNo) const {
660 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
661 }
662
663 // Create and return a new ContextNode.
664 ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr,
665 CallInfo C = CallInfo()) {
666 NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C));
667 auto *NewNode = NodeOwner.back().get();
668 if (F)
669 NodeToCallingFunc[NewNode] = F;
670 return NewNode;
671 }
672
673 /// Helpers to find the node corresponding to the given call or stackid.
674 ContextNode *getNodeForInst(const CallInfo &C);
675 ContextNode *getNodeForAlloc(const CallInfo &C);
676 ContextNode *getNodeForStackId(uint64_t StackId);
677
678 /// Computes the alloc type corresponding to the given context ids, by
679 /// unioning their recorded alloc types.
680 uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds);
681
682 /// Returns the allocation type of the intersection of the contexts of two
683 /// nodes (based on their provided context id sets), optimized for the case
684 /// when Node1Ids is smaller than Node2Ids.
685 uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
686 const DenseSet<uint32_t> &Node2Ids);
687
688 /// Returns the allocation type of the intersection of the contexts of two
689 /// nodes (based on their provided context id sets).
690 uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
691 const DenseSet<uint32_t> &Node2Ids);
692
693 /// Create a clone of Edge's callee and move Edge to that new callee node,
694 /// performing the necessary context id and allocation type updates.
695 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
696 /// moved to an edge to the new callee.
697 ContextNode *
698 moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
699 DenseSet<uint32_t> ContextIdsToMove = {});
700
701 /// Change the callee of Edge to existing callee clone NewCallee, performing
702 /// the necessary context id and allocation type updates.
703 /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
704 /// moved to an edge to the new callee.
705 void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
706 ContextNode *NewCallee,
707 bool NewClone = false,
708 DenseSet<uint32_t> ContextIdsToMove = {});
709
710 /// Change the caller of the edge at the given callee edge iterator to be
711 /// NewCaller, performing the necessary context id and allocation type
712 /// updates. This is similar to the above moveEdgeToExistingCalleeClone, but
713 /// a simplified version of it as we always move the given edge and all of its
714 /// context ids.
715 void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
716 ContextNode *NewCaller);
717
718 /// Recursively perform cloning on the graph for the given Node and its
719 /// callers, in order to uniquely identify the allocation behavior of an
720 /// allocation given its context. The context ids of the allocation being
721 /// processed are given in AllocContextIds.
722 void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
723 const DenseSet<uint32_t> &AllocContextIds);
724
725 /// Map from each context ID to the AllocationType assigned to that context.
726 DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
727
728 /// Map from each contextID to the profiled full contexts and their total
729 /// sizes (there may be more than one due to context trimming),
730 /// optionally populated when requested (via MemProfReportHintedSizes or
731 /// MinClonedColdBytePercent).
732 DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;
733
734 /// Identifies the context node created for a stack id when adding the MIB
735 /// contexts to the graph. This is used to locate the context nodes when
736 /// trying to assign the corresponding callsites with those stack ids to these
737 /// nodes.
738 DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
739
740 /// Maps to track the calls to their corresponding nodes in the graph.
741 MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
742 MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
743
744 /// Owner of all ContextNode unique_ptrs.
745 std::vector<std::unique_ptr<ContextNode>> NodeOwner;
746
747 /// Perform sanity checks on graph when requested.
748 void check() const;
749
750 /// Keeps track of the last unique context id assigned.
751 unsigned int LastContextId = 0;
752};
753
754template <typename DerivedCCG, typename FuncTy, typename CallTy>
755using ContextNode =
756 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
757template <typename DerivedCCG, typename FuncTy, typename CallTy>
758using ContextEdge =
759 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
760template <typename DerivedCCG, typename FuncTy, typename CallTy>
761using FuncInfo =
762 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
763template <typename DerivedCCG, typename FuncTy, typename CallTy>
764using CallInfo =
765 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
766
767/// CRTP derived class for graphs built from IR (regular LTO).
768class ModuleCallsiteContextGraph
769 : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
770 Instruction *> {
771public:
772 ModuleCallsiteContextGraph(
773 Module &M,
774 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
775
776private:
777 friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
778 Instruction *>;
779
780 uint64_t getStackId(uint64_t IdOrIndex) const;
781 const Function *getCalleeFunc(Instruction *Call);
782 bool calleeMatchesFunc(
783 Instruction *Call, const Function *Func, const Function *CallerFunc,
784 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
785 bool sameCallee(Instruction *Call1, Instruction *Call2);
786 bool findProfiledCalleeThroughTailCalls(
787 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
788 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
789 bool &FoundMultipleCalleeChains);
790 uint64_t getLastStackId(Instruction *Call);
791 std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
792 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
793 AllocationType getAllocationCallType(const CallInfo &Call) const;
794 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
795 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
796 Instruction *>::FuncInfo
797 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
798 std::map<CallInfo, CallInfo> &CallMap,
799 std::vector<CallInfo> &CallsWithMetadataInFunc,
800 unsigned CloneNo);
801 std::string getLabel(const Function *Func, const Instruction *Call,
802 unsigned CloneNo) const;
803
804 const Module &Mod;
805 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
806};
807
808/// Represents a call in the summary index graph, which can either be an
809/// allocation or an interior callsite node in an allocation's context.
810/// Holds a pointer to the corresponding data structure in the index.
811struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
812 IndexCall() : PointerUnion() {}
813 IndexCall(std::nullptr_t) : IndexCall() {}
814 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
815 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
816 IndexCall(PointerUnion PT) : PointerUnion(PT) {}
817
818 IndexCall *operator->() { return this; }
819
820 void print(raw_ostream &OS) const {
821 PointerUnion<CallsiteInfo *, AllocInfo *> Base = *this;
822 if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(Base)) {
823 OS << *AI;
824 } else {
825 auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(Base);
826 assert(CI);
827 OS << *CI;
828 }
829 }
830};
831} // namespace
832
833namespace llvm {
834template <> struct simplify_type<IndexCall> {
835 using SimpleType = PointerUnion<CallsiteInfo *, AllocInfo *>;
836 static SimpleType getSimplifiedValue(IndexCall &Val) { return Val; }
837};
838template <> struct simplify_type<const IndexCall> {
839 using SimpleType = const PointerUnion<CallsiteInfo *, AllocInfo *>;
840 static SimpleType getSimplifiedValue(const IndexCall &Val) { return Val; }
841};
842} // namespace llvm
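// Illustrative use (a sketch): the simplify_type specializations above let an
// IndexCall be queried directly for the underlying summary structure, e.g.:
//   IndexCall IC(SomeAllocInfoPtr); // SomeAllocInfoPtr is hypothetical
//   if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(IC)) { /* alloc */ }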
843
844namespace {
845/// CRTP derived class for graphs built from summary index (ThinLTO).
846class IndexCallsiteContextGraph
847 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
848 IndexCall> {
849public:
850 IndexCallsiteContextGraph(
851 ModuleSummaryIndex &Index,
852 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
853 isPrevailing);
854
855 ~IndexCallsiteContextGraph() {
856 // Now that we are done with the graph it is safe to add the new
857 // CallsiteInfo structs to the function summary vectors. The graph nodes
858 // point into locations within these vectors, so we don't want to add them
859 // any earlier.
860 for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
861 auto *FS = I.first;
862 for (auto &Callsite : I.second)
863 FS->addCallsite(*Callsite.second);
864 }
865 }
866
867private:
868 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
869 IndexCall>;
870
871 uint64_t getStackId(uint64_t IdOrIndex) const;
872 const FunctionSummary *getCalleeFunc(IndexCall &Call);
873 bool calleeMatchesFunc(
874 IndexCall &Call, const FunctionSummary *Func,
875 const FunctionSummary *CallerFunc,
876 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
877 bool sameCallee(IndexCall &Call1, IndexCall &Call2);
878 bool findProfiledCalleeThroughTailCalls(
879 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
880 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
881 bool &FoundMultipleCalleeChains);
882 uint64_t getLastStackId(IndexCall &Call);
883 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
884 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
885 AllocationType getAllocationCallType(const CallInfo &Call) const;
886 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
887 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
888 IndexCall>::FuncInfo
889 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
890 std::map<CallInfo, CallInfo> &CallMap,
891 std::vector<CallInfo> &CallsWithMetadataInFunc,
892 unsigned CloneNo);
893 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
894 unsigned CloneNo) const;
895
896 // Saves mapping from function summaries containing memprof records back to
897 // their corresponding VI, for use in checking and debugging.
898 std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
899
901 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
902 isPrevailing;
903
904 // Saves/owns the callsite info structures synthesized for missing tail call
905 // frames that we discover while building the graph.
906 // It maps from the summary of the function making the tail call, to a map
907 // of callee ValueInfo to corresponding synthesized callsite info.
908 std::unordered_map<FunctionSummary *,
909 std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
910 FunctionCalleesToSynthesizedCallsiteInfos;
911};
912} // namespace
913
914namespace llvm {
915template <>
916struct DenseMapInfo<typename CallsiteContextGraph<
917 ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
918 : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
919template <>
920struct DenseMapInfo<typename CallsiteContextGraph<
921 IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
922 : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
923template <>
924struct DenseMapInfo<IndexCall>
925 : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
926} // end namespace llvm
927
928namespace {
929
930// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
931// type we should actually use on the corresponding allocation.
932// If we can't clone a node that has NotCold+Cold alloc type, we will fall
933// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
934// from NotCold.
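// For example: allocTypeToUse((uint8_t)AllocationType::Cold) is Cold, while
// allocTypeToUse((uint8_t)AllocationType::NotCold |
//                (uint8_t)AllocationType::Cold) falls back to NotCold.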
935AllocationType allocTypeToUse(uint8_t AllocTypes) {
936 assert(AllocTypes != (uint8_t)AllocationType::None);
937 if (AllocTypes ==
938 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
939 return AllocationType::NotCold;
940 else
941 return (AllocationType)AllocTypes;
942}
943
944// Helper to check if the alloc types for all edges recorded in the
945// InAllocTypes vector match the alloc types for all edges in the Edges
946// vector.
947template <typename DerivedCCG, typename FuncTy, typename CallTy>
948bool allocTypesMatch(
949 const std::vector<uint8_t> &InAllocTypes,
950 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
951 &Edges) {
952 // This should be called only when the InAllocTypes vector was computed for
953 // this set of Edges. Make sure the sizes are the same.
954 assert(InAllocTypes.size() == Edges.size());
955 return std::equal(
956 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(),
957 [](const uint8_t &l,
958 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
959 // Can share if one of the edges is None type - don't
960 // care about the type along that edge as it doesn't
961 // exist for those context ids.
962 if (l == (uint8_t)AllocationType::None ||
963 r->AllocTypes == (uint8_t)AllocationType::None)
964 return true;
965 return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
966 });
967}
968
969// Helper to check if the alloc types for all edges recorded in the
970// InAllocTypes vector match the alloc types for callee edges in the given
971// clone. Because the InAllocTypes were computed from the original node's callee
972// edges, and other cloning could have happened after this clone was created, we
973// need to find the matching clone callee edge, which may or may not exist.
974template <typename DerivedCCG, typename FuncTy, typename CallTy>
975bool allocTypesMatchClone(
976 const std::vector<uint8_t> &InAllocTypes,
977 const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) {
978 const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf;
979 assert(Node);
980 // InAllocTypes should have been computed for the original node's callee
981 // edges.
982 assert(InAllocTypes.size() == Node->CalleeEdges.size());
983 // First create a map of the clone callee edge callees to the edge alloc type.
984 DenseMap<const ContextNode<DerivedCCG, FuncTy, CallTy> *, uint8_t>
985 EdgeCalleeMap;
986 for (const auto &E : Clone->CalleeEdges) {
987 assert(!EdgeCalleeMap.contains(E->Callee));
988 EdgeCalleeMap[E->Callee] = E->AllocTypes;
989 }
990 // Next, walk the original node's callees, and look for the corresponding
991 // clone edge to that callee.
992 for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) {
993 auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee);
994 // Not found is ok, we will simply add an edge if we use this clone.
995 if (Iter == EdgeCalleeMap.end())
996 continue;
997 // Can share if one of the edges is None type - don't
998 // care about the type along that edge as it doesn't
999 // exist for those context ids.
1000 if (InAllocTypes[I] == (uint8_t)AllocationType::None ||
1001 Iter->second == (uint8_t)AllocationType::None)
1002 continue;
1003 if (allocTypeToUse(Iter->second) != allocTypeToUse(InAllocTypes[I]))
1004 return false;
1005 }
1006 return true;
1007}
1008
1009} // end anonymous namespace
1010
1011template <typename DerivedCCG, typename FuncTy, typename CallTy>
1012typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1013CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
1014 const CallInfo &C) {
1015 ContextNode *Node = getNodeForAlloc(C);
1016 if (Node)
1017 return Node;
1018
1019 return NonAllocationCallToContextNodeMap.lookup(C);
1020}
1021
1022template <typename DerivedCCG, typename FuncTy, typename CallTy>
1023typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1024CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
1025 const CallInfo &C) {
1026 return AllocationCallToContextNodeMap.lookup(C);
1027}
1028
1029template <typename DerivedCCG, typename FuncTy, typename CallTy>
1030typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1031CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
1032 uint64_t StackId) {
1033 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
1034 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
1035 return StackEntryNode->second;
1036 return nullptr;
1037}
1038
1039template <typename DerivedCCG, typename FuncTy, typename CallTy>
1040void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1041 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
1042 unsigned int ContextId) {
1043 for (auto &Edge : CallerEdges) {
1044 if (Edge->Caller == Caller) {
1045 Edge->AllocTypes |= (uint8_t)AllocType;
1046 Edge->getContextIds().insert(ContextId);
1047 return;
1048 }
1049 }
1050 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
1051 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
1052 CallerEdges.push_back(Edge);
1053 Caller->CalleeEdges.push_back(Edge);
1054}
1055
1056template <typename DerivedCCG, typename FuncTy, typename CallTy>
1057void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph(
1058 ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) {
1059 assert(!EI || (*EI)->get() == Edge);
1060 // Save the Caller and Callee pointers so we can erase Edge from their edge
1061 // lists after clearing Edge below. We do the clearing first in case it is
1062 // destructed after removing from the edge lists (if those were the last
1063 // shared_ptr references to Edge).
1064 auto *Callee = Edge->Callee;
1065 auto *Caller = Edge->Caller;
1066
1067 // Make sure the edge fields are cleared out so we can properly detect
1068 // removed edges if Edge is not destructed because there is still a shared_ptr
1069 // reference.
1070 Edge->clear();
1071
1072 if (!EI) {
1073 Callee->eraseCallerEdge(Edge);
1074 Caller->eraseCalleeEdge(Edge);
1075 } else if (CalleeIter) {
1076 Callee->eraseCallerEdge(Edge);
1077 *EI = Caller->CalleeEdges.erase(*EI);
1078 } else {
1079 Caller->eraseCalleeEdge(Edge);
1080 *EI = Callee->CallerEdges.erase(*EI);
1081 }
1082}
1083
1084template <typename DerivedCCG, typename FuncTy, typename CallTy>
1085void CallsiteContextGraph<
1086 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
1087 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1088 auto Edge = *EI;
1089 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1090 assert(Edge->ContextIds.empty());
1091 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
1092 } else
1093 ++EI;
1094 }
1095}
1096
1097template <typename DerivedCCG, typename FuncTy, typename CallTy>
1098void CallsiteContextGraph<
1099 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) {
1100 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
1101 auto Edge = *EI;
1102 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1103 assert(Edge->ContextIds.empty());
1104 Edge->Caller->eraseCalleeEdge(Edge.get());
1105 EI = Node->CallerEdges.erase(EI);
1106 } else
1107 ++EI;
1108 }
1109}
1110
1111template <typename DerivedCCG, typename FuncTy, typename CallTy>
1112typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1113CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1114 findEdgeFromCallee(const ContextNode *Callee) {
1115 for (const auto &Edge : CalleeEdges)
1116 if (Edge->Callee == Callee)
1117 return Edge.get();
1118 return nullptr;
1119}
1120
1121template <typename DerivedCCG, typename FuncTy, typename CallTy>
1122typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1123CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1124 findEdgeFromCaller(const ContextNode *Caller) {
1125 for (const auto &Edge : CallerEdges)
1126 if (Edge->Caller == Caller)
1127 return Edge.get();
1128 return nullptr;
1129}
1130
1131template <typename DerivedCCG, typename FuncTy, typename CallTy>
1132void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1133 eraseCalleeEdge(const ContextEdge *Edge) {
1134 auto EI = llvm::find_if(
1135 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
1136 return CalleeEdge.get() == Edge;
1137 });
1138 assert(EI != CalleeEdges.end());
1139 CalleeEdges.erase(EI);
1140}
1141
1142template <typename DerivedCCG, typename FuncTy, typename CallTy>
1143void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1144 eraseCallerEdge(const ContextEdge *Edge) {
1145 auto EI = llvm::find_if(
1146 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
1147 return CallerEdge.get() == Edge;
1148 });
1149 assert(EI != CallerEdges.end());
1150 CallerEdges.erase(EI);
1151}
1152
1153template <typename DerivedCCG, typename FuncTy, typename CallTy>
1154uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
1155 DenseSet<uint32_t> &ContextIds) {
1156 uint8_t BothTypes =
1157 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1158 uint8_t AllocType = (uint8_t)AllocationType::None;
1159 for (auto Id : ContextIds) {
1160 AllocType |= (uint8_t)ContextIdToAllocationType[Id];
1161 // Bail early if alloc type reached both, no further refinement.
1162 if (AllocType == BothTypes)
1163 return AllocType;
1164 }
1165 return AllocType;
1166}
1167
1168template <typename DerivedCCG, typename FuncTy, typename CallTy>
1169uint8_t
1170CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
1171 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
1172 uint8_t BothTypes =
1173 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1174 uint8_t AllocType = (uint8_t)AllocationType::None;
1175 for (auto Id : Node1Ids) {
1176 if (!Node2Ids.count(Id))
1177 continue;
1178 AllocType |= (uint8_t)ContextIdToAllocationType[Id];
1179 // Bail early if alloc type reached both, no further refinement.
1180 if (AllocType == BothTypes)
1181 return AllocType;
1182 }
1183 return AllocType;
1184}
1185
1186template <typename DerivedCCG, typename FuncTy, typename CallTy>
1187uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
1188 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
1189 if (Node1Ids.size() < Node2Ids.size())
1190 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
1191 else
1192 return intersectAllocTypesImpl(Node2Ids, Node1Ids);
1193}
1194
1195template <typename DerivedCCG, typename FuncTy, typename CallTy>
1196typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1197CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
1198 CallInfo Call, const FuncTy *F) {
1199 assert(!getNodeForAlloc(Call));
1200 ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, Call);
1201 AllocationCallToContextNodeMap[Call] = AllocNode;
1202 // Use LastContextId as a unique id for MIB allocation nodes.
1203 AllocNode->OrigStackOrAllocId = LastContextId;
1204 // Alloc type should be updated as we add in the MIBs. We should assert
1205 // afterwards that it is not still None.
1206 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1207
1208 return AllocNode;
1209}
1210
1211static std::string getAllocTypeString(uint8_t AllocTypes) {
1212 if (!AllocTypes)
1213 return "None";
1214 std::string Str;
1215 if (AllocTypes & (uint8_t)AllocationType::NotCold)
1216 Str += "NotCold";
1217 if (AllocTypes & (uint8_t)AllocationType::Cold)
1218 Str += "Cold";
1219 return Str;
1220}
1221
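// Illustrative sketch (hypothetical stack ids): for a MIB with AllocType Cold
// whose context, after the prefix shared with the allocation's own callsite
// metadata, is the stack <B, C, main>, addStackNodesForMIB assigns a fresh
// context id N, records ContextIdToAllocationType[N] = Cold, and adds caller
// edges carrying {N} along
//   [AllocNode] <- [B] <- [C] <- [main]
// reusing any stack nodes already created for those stack ids by other MIBs.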
1222template <typename DerivedCCG, typename FuncTy, typename CallTy>
1223template <class NodeT, class IteratorT>
1224void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
1225 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
1226 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
1227 ArrayRef<ContextTotalSize> ContextSizeInfo) {
1228 // Treat the hot alloc type as NotCold until the disambiguation for "hot"
1229 // is done.
1230 if (AllocType == AllocationType::Hot)
1231 AllocType = AllocationType::NotCold;
1232
1233 ContextIdToAllocationType[++LastContextId] = AllocType;
1234
1235 if (!ContextSizeInfo.empty()) {
1236 auto &Entry = ContextIdToContextSizeInfos[LastContextId];
1237 Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
1238 }
1239
1240 // Update alloc type and context ids for this MIB.
1241 AllocNode->AllocTypes |= (uint8_t)AllocType;
1242
1243 // Now add or update nodes for each stack id in alloc's context.
1244 // Later when processing the stack ids on non-alloc callsites we will adjust
1245 // for any inlining in the context.
1246 ContextNode *PrevNode = AllocNode;
1247 // Look for recursion (direct recursion should have been collapsed by
1248 // module summary analysis; here we should just be detecting mutual
1249 // recursion). Mark these nodes so we don't try to clone.
1250 SmallSet<uint64_t, 8> StackIdSet;
1251 // Skip any frames already on the allocation call itself (due to inlining).
1252 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
1253 ContextIter != StackContext.end(); ++ContextIter) {
1254 auto StackId = getStackId(*ContextIter);
1255 ContextNode *StackNode = getNodeForStackId(StackId);
1256 if (!StackNode) {
1257 StackNode = createNewNode(/*IsAllocation=*/false);
1258 StackEntryIdToContextNodeMap[StackId] = StackNode;
1259 StackNode->OrigStackOrAllocId = StackId;
1260 }
1261 // Marking a node recursive will prevent its cloning completely, even for
1262 // non-recursive contexts flowing through it.
1263 if (!AllowRecursiveCallsites) {
1264 auto Ins = StackIdSet.insert(StackId);
1265 if (!Ins.second)
1266 StackNode->Recursive = true;
1267 }
1268 StackNode->AllocTypes |= (uint8_t)AllocType;
1269 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1270 PrevNode = StackNode;
1271 }
1272}
1273
1274template <typename DerivedCCG, typename FuncTy, typename CallTy>
1275DenseSet<uint32_t>
1276CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1277 const DenseSet<uint32_t> &StackSequenceContextIds,
1278 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1279 DenseSet<uint32_t> NewContextIds;
1280 for (auto OldId : StackSequenceContextIds) {
1281 NewContextIds.insert(++LastContextId);
1282 OldToNewContextIds[OldId].insert(LastContextId);
1283 assert(ContextIdToAllocationType.count(OldId));
1284 // The new context has the same allocation type as original.
1285 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1286 }
1287 return NewContextIds;
1288}
1289
1290template <typename DerivedCCG, typename FuncTy, typename CallTy>
1291void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1292 propagateDuplicateContextIds(
1293 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1294 // Build a set of duplicated context ids corresponding to the input id set.
1295 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1296 DenseSet<uint32_t> NewIds;
1297 for (auto Id : ContextIds)
1298 if (auto NewId = OldToNewContextIds.find(Id);
1299 NewId != OldToNewContextIds.end())
1300 NewIds.insert(NewId->second.begin(), NewId->second.end());
1301 return NewIds;
1302 };
1303
1304 // Recursively update context ids sets along caller edges.
1305 auto UpdateCallers = [&](ContextNode *Node,
1307 auto &&UpdateCallers) -> void {
1308 for (const auto &Edge : Node->CallerEdges) {
1309 auto Inserted = Visited.insert(Edge.get());
1310 if (!Inserted.second)
1311 continue;
1312 ContextNode *NextNode = Edge->Caller;
1313 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1314 // Only need to recursively iterate to NextNode via this caller edge if
1315 // it resulted in any added ids to NextNode.
1316 if (!NewIdsToAdd.empty()) {
1317 Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
1318 UpdateCallers(NextNode, Visited, UpdateCallers);
1319 }
1320 }
1321 };
1322
1324 for (auto &Entry : AllocationCallToContextNodeMap) {
1325 auto *Node = Entry.second;
1326 UpdateCallers(Node, Visited, UpdateCallers);
1327 }
1328}
1329
1330template <typename DerivedCCG, typename FuncTy, typename CallTy>
1331void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1332 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
1333 // This must be passed by value to make a copy since it will be adjusted
1334 // as ids are moved.
1335 DenseSet<uint32_t> RemainingContextIds) {
1336 auto &OrigEdges =
1337 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1338 // Increment iterator in loop so that we can remove edges as needed.
1339 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1340 auto Edge = *EI;
1341 // Remove any matching context ids from Edge and return the set that was
1342 // found and removed; these become the new edge's context ids. Also update
1343 // the remaining (not found) ids.
1344 DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds;
1345 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1346 NotFoundContextIds);
1347 RemainingContextIds.swap(NotFoundContextIds);
1348 // If no matching context ids for this edge, skip it.
1349 if (NewEdgeContextIds.empty()) {
1350 ++EI;
1351 continue;
1352 }
1353 if (TowardsCallee) {
1354 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1355 auto NewEdge = std::make_shared<ContextEdge>(
1356 Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
1357 NewNode->CalleeEdges.push_back(NewEdge);
1358 NewEdge->Callee->CallerEdges.push_back(NewEdge);
1359 } else {
1360 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1361 auto NewEdge = std::make_shared<ContextEdge>(
1362 NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
1363 NewNode->CallerEdges.push_back(NewEdge);
1364 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1365 }
1366 // Remove old edge if context ids empty.
1367 if (Edge->getContextIds().empty()) {
1368 removeEdgeFromGraph(Edge.get(), &EI, TowardsCallee);
1369 continue;
1370 }
1371 ++EI;
1372 }
1373}
1374
1375template <typename DerivedCCG, typename FuncTy, typename CallTy>
1376static void checkEdge(
1377 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1378 // Confirm that alloc type is not None and that we have at least one context
1379 // id.
1380 assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1381 assert(!Edge->ContextIds.empty());
1382}
1383
1384template <typename DerivedCCG, typename FuncTy, typename CallTy>
1385static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
1386 bool CheckEdges = true) {
1387 if (Node->isRemoved())
1388 return;
1389#ifndef NDEBUG
1390 // Compute node's context ids once for use in asserts.
1391 auto NodeContextIds = Node->getContextIds();
1392#endif
1393 // Node's context ids should be the union of both its callee and caller edge
1394 // context ids.
1395 if (Node->CallerEdges.size()) {
1396 DenseSet<uint32_t> CallerEdgeContextIds(
1397 Node->CallerEdges.front()->ContextIds);
1398 for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
1399 if (CheckEdges)
1400 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1401 set_union(CallerEdgeContextIds, Edge->ContextIds);
1402 }
1403 // Node can have more context ids than callers if some contexts terminate at
1404 // node and some are longer. If we are allowing recursive callsites but
1405 // haven't disabled recursive contexts, this will be violated for
1406 // incompletely cloned recursive cycles, so skip the checking in that case.
1407 assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
1408 NodeContextIds == CallerEdgeContextIds ||
1409 set_is_subset(CallerEdgeContextIds, NodeContextIds));
1410 }
1411 if (Node->CalleeEdges.size()) {
1412 DenseSet<uint32_t> CalleeEdgeContextIds(
1413 Node->CalleeEdges.front()->ContextIds);
1414 for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
1415 if (CheckEdges)
1416 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1417 set_union(CalleeEdgeContextIds, Edge->getContextIds());
1418 }
1419 assert(NodeContextIds == CalleeEdgeContextIds);
1420 }
1421 // FIXME: Since this checking is only invoked under an option, we should
1422 // change the error checking from using assert to something that will trigger
1423 // an error on a release build.
1424#ifndef NDEBUG
1425 // Make sure we don't end up with duplicate edges between the same caller and
1426 // callee.
1427 DenseSet<ContextNode<DerivedCCG, FuncTy, CallTy> *> NodeSet;
1428 for (const auto &E : Node->CalleeEdges)
1429 NodeSet.insert(E->Callee);
1430 assert(NodeSet.size() == Node->CalleeEdges.size());
1431#endif
1432}
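// Example of the invariant verified above (hypothetical ids): a node with
// caller edges carrying {1, 2} and {2, 3} and callee edges carrying {1, 2}
// and {3} has context ids {1, 2, 3}. The union over callee edges must equal
// the node's ids exactly, while the union over caller edges may be a proper
// subset (e.g. just {1, 2}) when some contexts terminate at this node.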
1433
1434template <typename DerivedCCG, typename FuncTy, typename CallTy>
1435void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1436 assignStackNodesPostOrder(
1437 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
1438 DenseMap<uint64_t, std::vector<CallContextInfo>>
1439 &StackIdToMatchingCalls,
1440 DenseMap<CallInfo, CallInfo> &CallToMatchingCall) {
1441 auto Inserted = Visited.insert(Node);
1442 if (!Inserted.second)
1443 return;
1444 // Post order traversal. Iterate over a copy since we may add nodes and
1445 // therefore new callers during the recursive call, invalidating any
1446 // iterator over the original edge vector. We don't need to process these
1447 // new nodes as they were already processed on creation.
1448 auto CallerEdges = Node->CallerEdges;
1449 for (auto &Edge : CallerEdges) {
1450 // Skip any that have been removed during the recursion.
1451 if (Edge->isRemoved()) {
1452 assert(!is_contained(Node->CallerEdges, Edge));
1453 continue;
1454 }
1455 assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
1456 CallToMatchingCall);
1457 }
1458
1459 // If this node's stack id is in the map, update the graph to contain new
1460 // nodes representing any inlining at interior callsites. Note we move the
1461 // associated context ids over to the new nodes.
1462
1463 // Ignore this node if it is for an allocation or we didn't record any
1464 // stack id lists ending at it.
1465 if (Node->IsAllocation ||
1466 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1467 return;
1468
1469 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1470 // Handle the simple case first. A single call with a single stack id.
1471 // In this case there is no need to create any new context nodes; simply
1472 // assign the context node for the stack id to this Call.
1473 if (Calls.size() == 1) {
1474 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1475 if (Ids.size() == 1) {
1476 assert(SavedContextIds.empty());
1477 // It should be this Node
1478 assert(Node == getNodeForStackId(Ids[0]));
1479 if (Node->Recursive)
1480 return;
1481 Node->setCall(Call);
1482 NonAllocationCallToContextNodeMap[Call] = Node;
1483 NodeToCallingFunc[Node] = Func;
1484 return;
1485 }
1486 }
1487
1488#ifndef NDEBUG
1489 // Find the node for the last stack id, which should be the same
1490 // across all calls recorded for this id, and is this node's id.
1491 uint64_t LastId = Node->OrigStackOrAllocId;
1492 ContextNode *LastNode = getNodeForStackId(LastId);
1493 // We should only have kept stack ids that had nodes.
1494 assert(LastNode);
1495 assert(LastNode == Node);
1496#else
1497 ContextNode *LastNode = Node;
1498#endif
1499
1500 // Compute the last node's context ids once, as it is shared by all calls in
1501 // this entry.
1502 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1503
1504 bool PrevIterCreatedNode = false;
1505 bool CreatedNode = false;
1506 for (unsigned I = 0; I < Calls.size();
1507 I++, PrevIterCreatedNode = CreatedNode) {
1508 CreatedNode = false;
1509 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1510 // Skip any for which we didn't assign any ids, these don't get a node in
1511 // the graph.
1512 if (SavedContextIds.empty()) {
1513 // If this call has a matching call (located in the same function and
1514 // having the same stack ids), simply add it to the context node created
1515 // for its matching call earlier. These can be treated the same through
1516 // cloning and get updated at the same time.
1517 if (!CallToMatchingCall.contains(Call))
1518 continue;
1519 auto MatchingCall = CallToMatchingCall[Call];
1520 if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
1521 // This should only happen if we had a prior iteration, and it didn't
1522 // create a node because of the below recomputation of context ids
1523 // finding none remaining and continuing early.
1524 assert(I > 0 && !PrevIterCreatedNode);
1525 continue;
1526 }
1527 NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
1528 Call);
1529 continue;
1530 }
1531
1532 assert(LastId == Ids.back());
1533
1534 // Recompute the context ids for this stack id sequence (the
1535 // intersection of the context ids of the corresponding nodes).
1536 // Start with the ids we saved in the map for this call, which could be
1537 // duplicated context ids. We have to recompute as we might have overlap
1538 // between the saved context ids for different last nodes, and removed them
1539 // already during the post order traversal.
1540 set_intersect(SavedContextIds, LastNodeContextIds);
1541 ContextNode *PrevNode = LastNode;
1542 bool Skip = false;
1543 // Iterate backwards through the stack Ids, starting after the last Id
1544 // in the list, which was handled once outside for all Calls.
1545 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1546 auto Id = *IdIter;
1547 ContextNode *CurNode = getNodeForStackId(Id);
1548 // We should only have kept stack ids that had nodes and weren't
1549 // recursive.
1550 assert(CurNode);
1551 assert(!CurNode->Recursive);
1552
1553 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1554 if (!Edge) {
1555 Skip = true;
1556 break;
1557 }
1558 PrevNode = CurNode;
1559
1560 // Update the context ids, which is the intersection of the ids along
1561 // all edges in the sequence.
1562 set_intersect(SavedContextIds, Edge->getContextIds());
1563
1564 // If we now have no context ids for clone, skip this call.
1565 if (SavedContextIds.empty()) {
1566 Skip = true;
1567 break;
1568 }
1569 }
1570 if (Skip)
1571 continue;
1572
1573 // Create new context node.
1574 ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, Func, Call);
1575 NonAllocationCallToContextNodeMap[Call] = NewNode;
1576 CreatedNode = true;
1577 NewNode->AllocTypes = computeAllocType(SavedContextIds);
1578
1579 ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1580 assert(FirstNode);
1581
1582 // Connect to callees of innermost stack frame in inlined call chain.
1583 // This updates context ids for FirstNode's callee edges to reflect those
1584 // moved to NewNode.
1585 connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);
1586
1587 // Connect to callers of outermost stack frame in inlined call chain.
1588 // This updates context ids for LastNode's caller edges to reflect those
1589 // moved to NewNode.
1590 connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
1591
1592 // Now we need to remove context ids from edges/nodes between First and
1593 // Last Node.
1594 PrevNode = nullptr;
1595 for (auto Id : Ids) {
1596 ContextNode *CurNode = getNodeForStackId(Id);
1597 // We should only have kept stack ids that had nodes.
1598 assert(CurNode);
1599
1600 // Remove the context ids moved to NewNode from CurNode, and the
1601 // edge from the prior node.
1602 if (PrevNode) {
1603 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1604 assert(PrevEdge);
1605 set_subtract(PrevEdge->getContextIds(), SavedContextIds);
1606 if (PrevEdge->getContextIds().empty())
1607 removeEdgeFromGraph(PrevEdge);
1608 }
1609 // Since we update the edges from leaf to tail, only look at the callee
1610 // edges. This isn't an alloc node, so if there are no callee edges, the
1611 // alloc type is None.
1612 CurNode->AllocTypes = CurNode->CalleeEdges.empty()
1613 ? (uint8_t)AllocationType::None
1614 : CurNode->computeAllocType();
1615 PrevNode = CurNode;
1616 }
1617 if (VerifyNodes) {
1618 checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
1619 for (auto Id : Ids) {
1620 ContextNode *CurNode = getNodeForStackId(Id);
1621 // We should only have kept stack ids that had nodes.
1622 assert(CurNode);
1623 checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
1624 }
1625 }
1626 }
1627}
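// Sketch of the transformation performed above (hypothetical stack ids): a
// call whose callsite metadata lists ids [A, B, C] because of inlining
// initially corresponds to three separate stack nodes. A single new node is
// created for the call, its context ids are the intersection of the ids on
// the edges between consecutive frames, and it is connected to A's callees
// and to C's callers; the moved ids are then subtracted from the interior
// nodes and edges between A and C.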
1628
1629template <typename DerivedCCG, typename FuncTy, typename CallTy>
1630void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
1631 // Map of stack id to all calls with that as the last (outermost caller)
1632 // callsite id that has a context node (some might not due to pruning
1633 // performed during matching of the allocation profile contexts).
1634 // The CallContextInfo contains the Call and a list of its stack ids with
1635 // ContextNodes, the function containing Call, and the set of context ids
1636 // the analysis will eventually identify for use in any new node created
1637 // for that callsite.
1638 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
1639 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1640 for (auto &Call : CallsWithMetadata) {
1641 // Ignore allocations, already handled.
1642 if (AllocationCallToContextNodeMap.count(Call))
1643 continue;
1644 auto StackIdsWithContextNodes =
1645 getStackIdsWithContextNodesForCall(Call.call());
1646 // If there were no nodes created for MIBs on allocs (maybe this was in
1647 // the unambiguous part of the MIB stack that was pruned), ignore.
1648 if (StackIdsWithContextNodes.empty())
1649 continue;
1650 // Otherwise, record this Call along with the list of ids for the last
1651 // (outermost caller) stack id with a node.
1652 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1653 {Call.call(), StackIdsWithContextNodes, Func, {}});
1654 }
1655 }
1656
1657 // First make a pass through all stack ids that correspond to a call,
1658 // as identified in the above loop. Compute the context ids corresponding to
1659 // each of these calls when they correspond to multiple stack ids due to
1660 // inlining. Perform any duplication of context ids required when
1661 // there is more than one call with the same stack ids. Their (possibly newly
1662 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1663 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1664 // Save a map from each call to any that are found to match it. I.e. located
1665 // in the same function and have the same (possibly pruned) stack ids. We use
1666 // this to avoid creating extra graph nodes as they can be treated the same.
1667 DenseMap<CallInfo, CallInfo> CallToMatchingCall;
1668 for (auto &It : StackIdToMatchingCalls) {
1669 auto &Calls = It.getSecond();
1670 // Skip single calls with a single stack id. These don't need a new node.
1671 if (Calls.size() == 1) {
1672 auto &Ids = Calls[0].StackIds;
1673 if (Ids.size() == 1)
1674 continue;
1675 }
1676 // In order to do the best and maximal matching of inlined calls to context
1677 // node sequences we will sort the vectors of stack ids in descending order
1678 // of length, and within each length, lexicographically by stack id. The
1679 // latter is so that we can specially handle calls that have identical stack
1680 // id sequences (either due to cloning or artificially because of the MIB
1681 // context pruning). Those with the same Ids are then sorted by function to
1682 // facilitate efficiently mapping them to the same context node.
1683 // Because the functions are pointers, to ensure a stable sort first assign
1684 // each function pointer to its first index in the Calls array, and then use
1685 // that to sort by.
1686 DenseMap<const FuncTy *, unsigned> FuncToIndex;
1687 for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
1688 FuncToIndex.insert({CallCtxInfo.Func, Idx});
1689 std::stable_sort(
1690 Calls.begin(), Calls.end(),
1691 [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
1692 return A.StackIds.size() > B.StackIds.size() ||
1693 (A.StackIds.size() == B.StackIds.size() &&
1694 (A.StackIds < B.StackIds ||
1695 (A.StackIds == B.StackIds &&
1696 FuncToIndex[A.Func] < FuncToIndex[B.Func])));
1697 });
1698
1699 // Find the node for the last stack id, which should be the same
1700 // across all calls recorded for this id, and is the id for this
1701 // entry in the StackIdToMatchingCalls map.
1702 uint64_t LastId = It.getFirst();
1703 ContextNode *LastNode = getNodeForStackId(LastId);
1704 // We should only have kept stack ids that had nodes.
1705 assert(LastNode);
1706
1707 if (LastNode->Recursive)
1708 continue;
1709
1710 // Initialize the context ids with the last node's. We will subsequently
1711 // refine the context ids by computing the intersection along all edges.
1712 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1713 assert(!LastNodeContextIds.empty());
1714
1715#ifndef NDEBUG
1716 // Save the set of functions seen for a particular set of the same stack
1717 // ids. This is used to ensure that they have been correctly sorted to be
1718 // adjacent in the Calls list, since we rely on that to efficiently place
1719 // all such matching calls onto the same context node.
1720 DenseSet<const FuncTy *> MatchingIdsFuncSet;
1721#endif
1722
1723 for (unsigned I = 0; I < Calls.size(); I++) {
1724 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1725 assert(SavedContextIds.empty());
1726 assert(LastId == Ids.back());
1727
1728#ifndef NDEBUG
1729 // If this call has a different set of ids than the last one, clear the
1730 // set used to ensure they are sorted properly.
1731 if (I > 0 && Ids != Calls[I - 1].StackIds)
1732 MatchingIdsFuncSet.clear();
1733#endif
1734
1735 // First compute the context ids for this stack id sequence (the
1736 // intersection of the context ids of the corresponding nodes).
1737 // Start with the remaining saved ids for the last node.
1738 assert(!LastNodeContextIds.empty());
1739 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
1740
1741 ContextNode *PrevNode = LastNode;
1742 ContextNode *CurNode = LastNode;
1743 bool Skip = false;
1744
1745 // Iterate backwards through the stack Ids, starting after the last Id
1746 // in the list, which was handled once outside for all Calls.
1747 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1748 auto Id = *IdIter;
1749 CurNode = getNodeForStackId(Id);
1750 // We should only have kept stack ids that had nodes.
1751 assert(CurNode);
1752
1753 if (CurNode->Recursive) {
1754 Skip = true;
1755 break;
1756 }
1757
1758 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1759 // If there is no edge then the nodes belong to different MIB contexts,
1760 // and we should skip this inlined context sequence. For example, this
1761 // particular inlined context may include stack ids A->B, and we may
1762 // indeed have nodes for both A and B, but it is possible that they were
1763 // never profiled in sequence in a single MIB for any allocation (i.e.
1764 // we might have profiled an allocation that involves the callsite A,
1765 // but through a different one of its callee callsites, and we might
1766 // have profiled an allocation that involves callsite B, but reached
1767 // from a different caller callsite).
1768 if (!Edge) {
1769 Skip = true;
1770 break;
1771 }
1772 PrevNode = CurNode;
1773
1774 // Update the context ids, which is the intersection of the ids along
1775 // all edges in the sequence.
1776 set_intersect(StackSequenceContextIds, Edge->getContextIds());
1777
1778 // If we now have no context ids for clone, skip this call.
1779 if (StackSequenceContextIds.empty()) {
1780 Skip = true;
1781 break;
1782 }
1783 }
1784 if (Skip)
1785 continue;
1786
1787 // If some of this call's stack ids did not have corresponding nodes (due
1788 // to pruning), don't include any context ids for contexts that extend
1789 // beyond these nodes. Otherwise we would be matching part of unrelated /
1790 // not fully matching stack contexts. To do this, subtract any context ids
1791 // found in caller nodes of the last node found above.
1792 if (Ids.back() != getLastStackId(Call)) {
1793 for (const auto &PE : LastNode->CallerEdges) {
1794 set_subtract(StackSequenceContextIds, PE->getContextIds());
1795 if (StackSequenceContextIds.empty())
1796 break;
1797 }
1798 // If we now have no context ids for clone, skip this call.
1799 if (StackSequenceContextIds.empty())
1800 continue;
1801 }
1802
1803#ifndef NDEBUG
1804 // If the prior call had the same stack ids this set would not be empty.
1805 // Check if we already have a call that "matches" because it is located
1806 // in the same function. If the Calls list was sorted properly we should
1807 // not encounter this situation as all such entries should be adjacent
1808 // and processed in bulk further below.
1809 assert(!MatchingIdsFuncSet.contains(Func));
1810
1811 MatchingIdsFuncSet.insert(Func);
1812#endif
1813
1814 // Check if the next set of stack ids is the same (since the Calls vector
1815 // of tuples is sorted by the stack ids we can just look at the next one).
1816 // If so, save them in the CallToMatchingCall map so that they get
1817 // assigned to the same context node, and skip them.
1818 bool DuplicateContextIds = false;
1819 for (unsigned J = I + 1; J < Calls.size(); J++) {
1820 auto &CallCtxInfo = Calls[J];
1821 auto &NextIds = CallCtxInfo.StackIds;
1822 if (NextIds != Ids)
1823 break;
1824 auto *NextFunc = CallCtxInfo.Func;
1825 if (NextFunc != Func) {
1826 // We have another Call with the same ids but that cannot share this
1827 // node, must duplicate ids for it.
1828 DuplicateContextIds = true;
1829 break;
1830 }
1831 auto &NextCall = CallCtxInfo.Call;
1832 CallToMatchingCall[NextCall] = Call;
1833 // Update I so that it gets incremented correctly to skip this call.
1834 I = J;
1835 }
1836
1837 // If we don't have duplicate context ids, then we can assign all the
1838 // context ids computed for the original node sequence to this call.
1839 // If there are duplicate calls with the same stack ids then we synthesize
1840 // new context ids that are duplicates of the originals. These are
1841 // assigned to SavedContextIds, which is a reference into the map entry
1842 // for this call, allowing us to access these ids later on.
1843 OldToNewContextIds.reserve(OldToNewContextIds.size() +
1844 StackSequenceContextIds.size());
1845 SavedContextIds =
1846 DuplicateContextIds
1847 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
1848 : StackSequenceContextIds;
1849 assert(!SavedContextIds.empty());
1850
1851 if (!DuplicateContextIds) {
1852 // Update saved last node's context ids to remove those that are
1853 // assigned to other calls, so that it is ready for the next call at
1854 // this stack id.
1855 set_subtract(LastNodeContextIds, StackSequenceContextIds);
1856 if (LastNodeContextIds.empty())
1857 break;
1858 }
1859 }
1860 }
1861
1862 // Propagate the duplicate context ids over the graph.
1863 propagateDuplicateContextIds(OldToNewContextIds);
1864
1865 if (VerifyCCG)
1866 check();
1867
1868 // Now perform a post-order traversal over the graph, starting with the
1869 // allocation nodes, essentially processing nodes from callers to callees.
1870 // For any that contains an id in the map, update the graph to contain new
1871 // nodes representing any inlining at interior callsites. Note we move the
1872 // associated context ids over to the new nodes.
1873 DenseSet<const ContextNode *> Visited;
1874 for (auto &Entry : AllocationCallToContextNodeMap)
1875 assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
1876 CallToMatchingCall);
1877 if (VerifyCCG)
1878 check();
1879}
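// Example of the ordering produced by the stable_sort above (hypothetical
// entries): calls ending at the same last stack id with id sequences
// [7, 3, 9], [3, 9] and [5, 9] are visited as [7, 3, 9] first (longest),
// then [3, 9] before [5, 9] (lexicographic), and calls sharing an identical
// sequence are kept adjacent and grouped by function so that they can be
// assigned to the same context node.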
1880
1881uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
1882 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1883 Call->getMetadata(LLVMContext::MD_callsite));
1884 return CallsiteContext.back();
1885}
1886
1887uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
1888 assert(isa<CallsiteInfo *>(Call));
1889 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1890 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
1891 // Need to convert index into stack id.
1892 return Index.getStackIdAtIndex(CallsiteContext.back());
1893}
1894
1895static const std::string MemProfCloneSuffix = ".memprof.";
1896
1897static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
1898 // We use CloneNo == 0 to refer to the original version, which doesn't get
1899 // renamed with a suffix.
1900 if (!CloneNo)
1901 return Base.str();
1902 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
1903}
1904
1905static bool isMemProfClone(const Function &F) {
1906 return F.getName().contains(MemProfCloneSuffix);
1907}
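// For illustration (hypothetical names), the clone naming scheme above gives:
//   getMemProfFuncName("foo", 0) == "foo"
//   getMemProfFuncName("foo", 2) == "foo.memprof.2"
// and isMemProfClone() detects clones by looking for the ".memprof." infix.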
1908
1909std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
1910 const Instruction *Call,
1911 unsigned CloneNo) const {
1912 return (Twine(Call->getFunction()->getName()) + " -> " +
1913 cast<CallBase>(Call)->getCalledFunction()->getName())
1914 .str();
1915}
1916
1917std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
1918 const IndexCall &Call,
1919 unsigned CloneNo) const {
1920 auto VI = FSToVIMap.find(Func);
1921 assert(VI != FSToVIMap.end());
1922 if (isa<AllocInfo *>(Call))
1923 return (VI->second.name() + " -> alloc").str();
1924 else {
1925 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call);
1926 return (VI->second.name() + " -> " +
1927 getMemProfFuncName(Callsite->Callee.name(),
1928 Callsite->Clones[CloneNo]))
1929 .str();
1930 }
1931}
1932
1933std::vector<uint64_t>
1934ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
1935 Instruction *Call) {
1936 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1937 Call->getMetadata(LLVMContext::MD_callsite));
1938 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
1939 CallsiteContext);
1940}
1941
1942std::vector<uint64_t>
1943IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
1944 assert(isa<CallsiteInfo *>(Call));
1945 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1946 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
1947 return getStackIdsWithContextNodes<CallsiteInfo,
1948 SmallVector<unsigned>::const_iterator>(
1949 CallsiteContext);
1950}
1951
1952template <typename DerivedCCG, typename FuncTy, typename CallTy>
1953template <class NodeT, class IteratorT>
1954std::vector<uint64_t>
1955CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
1956 CallStack<NodeT, IteratorT> &CallsiteContext) {
1957 std::vector<uint64_t> StackIds;
1958 for (auto IdOrIndex : CallsiteContext) {
1959 auto StackId = getStackId(IdOrIndex);
1960 ContextNode *Node = getNodeForStackId(StackId);
1961 if (!Node)
1962 break;
1963 StackIds.push_back(StackId);
1964 }
1965 return StackIds;
1966}
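// Example (hypothetical ids): for a callsite whose context lists stack ids
// [10, 11, 12], if only 10 and 11 have context nodes (12 fell in the pruned,
// unambiguous part of the allocation contexts), the returned vector is
// [10, 11]; the walk stops at the first id with no node.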
1967
1968ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
1969 Module &M,
1970 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
1971 : Mod(M), OREGetter(OREGetter) {
1972 for (auto &F : M) {
1973 std::vector<CallInfo> CallsWithMetadata;
1974 for (auto &BB : F) {
1975 for (auto &I : BB) {
1976 if (!isa<CallBase>(I))
1977 continue;
1978 if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
1979 CallsWithMetadata.push_back(&I);
1980 auto *AllocNode = addAllocNode(&I, &F);
1981 auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
1982 assert(CallsiteMD);
1983 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
1984 // Add all of the MIBs and their stack nodes.
1985 for (auto &MDOp : MemProfMD->operands()) {
1986 auto *MIBMD = cast<const MDNode>(MDOp);
1987 std::vector<ContextTotalSize> ContextSizeInfo;
1988 // Collect the context size information if it exists.
1989 if (MIBMD->getNumOperands() > 2) {
1990 for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
1991 MDNode *ContextSizePair =
1992 dyn_cast<MDNode>(MIBMD->getOperand(I));
1993 assert(ContextSizePair->getNumOperands() == 2);
1994 uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
1995 ContextSizePair->getOperand(0))
1996 ->getZExtValue();
1997 uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>(
1998 ContextSizePair->getOperand(1))
1999 ->getZExtValue();
2000 ContextSizeInfo.push_back({FullStackId, TotalSize});
2001 }
2002 }
2003 MDNode *StackNode = getMIBStackNode(MIBMD);
2004 assert(StackNode);
2005 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
2006 addStackNodesForMIB<MDNode, MDNode::op_iterator>(
2007 AllocNode, StackContext, CallsiteContext,
2008 getMIBAllocType(MIBMD), ContextSizeInfo);
2009 }
2010 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2011 // Memprof and callsite metadata on memory allocations no longer
2012 // needed.
2013 I.setMetadata(LLVMContext::MD_memprof, nullptr);
2014 I.setMetadata(LLVMContext::MD_callsite, nullptr);
2015 }
2016 // For callsite metadata, add to list for this function for later use.
2017 else if (I.getMetadata(LLVMContext::MD_callsite)) {
2018 CallsWithMetadata.push_back(&I);
2019 }
2020 }
2021 }
2022 if (!CallsWithMetadata.empty())
2023 FuncToCallsWithMetadata[&F] = CallsWithMetadata;
2024 }
2025
2026 if (DumpCCG) {
2027 dbgs() << "CCG before updating call stack chains:\n";
2028 dbgs() << *this;
2029 }
2030
2031 if (ExportToDot)
2032 exportToDot("prestackupdate");
2033
2034 updateStackNodes();
2035
2036 handleCallsitesWithMultipleTargets();
2037
2038 // Strip off remaining callsite metadata, no longer needed.
2039 for (auto &FuncEntry : FuncToCallsWithMetadata)
2040 for (auto &Call : FuncEntry.second)
2041 Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
2042}
2043
2044IndexCallsiteContextGraph::IndexCallsiteContextGraph(
2045 ModuleSummaryIndex &Index,
2046 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
2047 isPrevailing)
2048 : Index(Index), isPrevailing(isPrevailing) {
2049 for (auto &I : Index) {
2050 auto VI = Index.getValueInfo(I);
2051 for (auto &S : VI.getSummaryList()) {
2052 // We should only add the prevailing nodes. Otherwise we may try to clone
2053 // in a weak copy that won't be linked (and may be different than the
2054 // prevailing version).
2055 // We only keep the memprof summary on the prevailing copy now when
2056 // building the combined index, as a space optimization, however don't
2057 // rely on this optimization. The linker doesn't resolve local linkage
2058 // values so don't check whether those are prevailing.
2059 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2060 !isPrevailing(VI.getGUID(), S.get()))
2061 continue;
2062 auto *FS = dyn_cast<FunctionSummary>(S.get());
2063 if (!FS)
2064 continue;
2065 std::vector<CallInfo> CallsWithMetadata;
2066 if (!FS->allocs().empty()) {
2067 for (auto &AN : FS->mutableAllocs()) {
2068 // This can happen because of recursion elimination handling that
2069 // currently exists in ModuleSummaryAnalysis. Skip these for now.
2070 // We still added them to the summary because we need to be able to
2071 // correlate properly in applyImport in the backends.
2072 if (AN.MIBs.empty())
2073 continue;
2074 IndexCall AllocCall(&AN);
2075 CallsWithMetadata.push_back(AllocCall);
2076 auto *AllocNode = addAllocNode(AllocCall, FS);
2077 // Pass an empty CallStack to the CallsiteContext (second)
2078 // parameter, since for ThinLTO we already collapsed out the inlined
2079 // stack ids on the allocation call during ModuleSummaryAnalysis.
2080 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2081 EmptyContext;
2082 unsigned I = 0;
2083 assert(
2084 AN.ContextSizeInfos.empty() ||
2085 AN.ContextSizeInfos.size() == AN.MIBs.size());
2086 // Now add all of the MIBs and their stack nodes.
2087 for (auto &MIB : AN.MIBs) {
2088 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2089 StackContext(&MIB);
2090 std::vector<ContextTotalSize> ContextSizeInfo;
2091 if (!AN.ContextSizeInfos.empty()) {
2092 for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
2093 ContextSizeInfo.push_back({FullStackId, TotalSize});
2094 }
2095 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
2096 AllocNode, StackContext, EmptyContext, MIB.AllocType,
2097 ContextSizeInfo);
2098 I++;
2099 }
2100 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2101 // Initialize version 0 on the summary alloc node to the current alloc
2102 // type, unless it has both types in which case make it default, so
2103 // that in the case where we aren't able to clone, the original version
2104 // always ends up with the default allocation behavior.
2105 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
2106 }
2107 }
2108 // For callsite metadata, add to list for this function for later use.
2109 if (!FS->callsites().empty())
2110 for (auto &SN : FS->mutableCallsites()) {
2111 IndexCall StackNodeCall(&SN);
2112 CallsWithMetadata.push_back(StackNodeCall);
2113 }
2114
2115 if (!CallsWithMetadata.empty())
2116 FuncToCallsWithMetadata[FS] = CallsWithMetadata;
2117
2118 if (!FS->allocs().empty() || !FS->callsites().empty())
2119 FSToVIMap[FS] = VI;
2120 }
2121 }
2122
2123 if (DumpCCG) {
2124 dbgs() << "CCG before updating call stack chains:\n";
2125 dbgs() << *this;
2126 }
2127
2128 if (ExportToDot)
2129 exportToDot("prestackupdate");
2130
2131 updateStackNodes();
2132
2133 handleCallsitesWithMultipleTargets();
2134}
2135
2136template <typename DerivedCCG, typename FuncTy, typename CallTy>
2137void CallsiteContextGraph<DerivedCCG, FuncTy,
2138 CallTy>::handleCallsitesWithMultipleTargets() {
2139 // Look for and workaround callsites that call multiple functions.
2140 // This can happen for indirect calls, which need better handling, and in
2141 // rarer cases (e.g. macro expansion).
2142 // TODO: To fix this for indirect calls we will want to perform speculative
2143 // devirtualization using either the normal PGO info with ICP, or using the
2144 // information in the profiled MemProf contexts. We can do this prior to
2145 // this transformation for regular LTO, and for ThinLTO we can simulate that
2146 // effect in the summary and perform the actual speculative devirtualization
2147 // while cloning in the ThinLTO backend.
2148
2149 // Keep track of the new nodes synthesized for discovered tail calls missing
2150 // from the profiled contexts.
2151 MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
2152
2153 std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode;
2154 for (auto &Entry : NonAllocationCallToContextNodeMap) {
2155 auto *Node = Entry.second;
2156 assert(Node->Clones.empty());
2157 // Check all node callees and see if in the same function.
2158 // We need to check all of the calls recorded in this Node, because in some
2159 // cases we may have had multiple calls with the same debug info calling
2160 // different callees. This can happen, for example, when an object is
2161 // constructed in the parameter list - the destructor call of the object has
2162 // the same debug info (line/col) as the call the object was passed to.
2163 // Here we will prune any that don't match all callee nodes.
2164 std::vector<CallInfo> AllCalls;
2165 AllCalls.reserve(Node->MatchingCalls.size() + 1);
2166 AllCalls.push_back(Node->Call);
2167 AllCalls.insert(AllCalls.end(), Node->MatchingCalls.begin(),
2168 Node->MatchingCalls.end());
2169
2170 // First see if we can partition the calls by callee function, creating new
2171 // nodes to host each set of calls calling the same callees. This is
2172 // necessary to support indirect calls with ThinLTO, for which we
2173 // synthesized CallsiteInfo records for each target. They will all have the
2174 // same callsite stack ids and would be sharing a context node at this
2175 // point. We need to perform separate cloning for each, which will be
2176 // applied along with speculative devirtualization in the ThinLTO backends
2177 // as needed. Note this does not currently support looking through tail
2178 // calls, it is unclear if we need that for indirect call targets.
2179 // First partition calls by callee func. Map indexed by func, value is
2180 // struct with list of matching calls, assigned node.
2181 if (partitionCallsByCallee(Node, AllCalls, NewCallToNode))
2182 continue;
2183
2184 auto It = AllCalls.begin();
2185 // Iterate through the calls until we find the first that matches.
2186 for (; It != AllCalls.end(); ++It) {
2187 auto ThisCall = *It;
2188 bool Match = true;
2189 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
2190 ++EI) {
2191 auto Edge = *EI;
2192 if (!Edge->Callee->hasCall())
2193 continue;
2194 assert(NodeToCallingFunc.count(Edge->Callee));
2195 // Check if the called function matches that of the callee node.
2196 if (!calleesMatch(ThisCall.call(), EI, TailCallToContextNodeMap)) {
2197 Match = false;
2198 break;
2199 }
2200 }
2201 // Found a call that matches the callee nodes, we can quit now.
2202 if (Match) {
2203 // If the first match is not the primary call on the Node, update it
2204 // now. We will update the list of matching calls further below.
2205 if (Node->Call != ThisCall) {
2206 Node->setCall(ThisCall);
2207 // We need to update the NonAllocationCallToContextNodeMap, but don't
2208 // want to do this during iteration over that map, so save the calls
2209 // that need updated entries.
2210 NewCallToNode.push_back({ThisCall, Node});
2211 }
2212 break;
2213 }
2214 }
2215 // We will update this list below (or leave it cleared if there was no
2216 // match found above).
2217 Node->MatchingCalls.clear();
2218 // If we hit the end of the AllCalls vector, no call matching the callee
2219 // nodes was found, clear the call information in the node.
2220 if (It == AllCalls.end()) {
2221 RemovedEdgesWithMismatchedCallees++;
2222 // Work around by setting Node to have a null call, so it gets
2223 // skipped during cloning. Otherwise assignFunctions will assert
2224 // because its data structures are not designed to handle this case.
2225 Node->setCall(CallInfo());
2226 continue;
2227 }
2228 // Now add back any matching calls that call the same function as the
2229 // matching primary call on Node.
2230 for (++It; It != AllCalls.end(); ++It) {
2231 auto ThisCall = *It;
2232 if (!sameCallee(Node->Call.call(), ThisCall.call()))
2233 continue;
2234 Node->MatchingCalls.push_back(ThisCall);
2235 }
2236 }
2237
2238 // Remove all mismatched nodes identified in the above loop from the node map
2239 // (checking whether they have a null call which is set above). For a
2240 // MapVector like NonAllocationCallToContextNodeMap it is much more efficient
2241 // to do the removal via remove_if than by individually erasing entries above.
2242 // Also remove any entries if we updated the node's primary call above.
2243 NonAllocationCallToContextNodeMap.remove_if([](const auto &it) {
2244 return !it.second->hasCall() || it.second->Call != it.first;
2245 });
2246
2247 // Add entries for any new primary calls recorded above.
2248 for (auto &[Call, Node] : NewCallToNode)
2249 NonAllocationCallToContextNodeMap[Call] = Node;
2250
2251 // Add the new nodes after the above loop so that the iteration is not
2252 // invalidated.
2253 for (auto &[Call, Node] : TailCallToContextNodeMap)
2254 NonAllocationCallToContextNodeMap[Call] = Node;
2255}
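// Example of the ambiguity handled above (hypothetical source): for a call
// such as bar(std::string("x")) written on one line, the call to bar and the
// implicit destructor call for the temporary can end up with identical
// line/column debug info, so a single context node initially matches calls to
// two different callees; the loop above keeps the call whose callee matches
// the profiled callee nodes and prunes the rest.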
2256
2257template <typename DerivedCCG, typename FuncTy, typename CallTy>
2258bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee(
2259 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
2260 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) {
2261 // Struct to keep track of all the calls having the same callee function,
2262 // and the node we eventually assign to them. The context node assigned to
2263 // this group of calls is recorded here once it is created.
2264 struct CallsWithSameCallee {
2265 std::vector<CallInfo> Calls;
2266 ContextNode *Node = nullptr;
2267 };
2268
2269 // First partition calls by callee function. Build map from each function
2270 // to the list of matching calls.
2271 DenseMap<const FuncTy *, CallsWithSameCallee> CalleeFuncToCallInfo;
2272 for (auto ThisCall : AllCalls) {
2273 auto *F = getCalleeFunc(ThisCall.call());
2274 if (F)
2275 CalleeFuncToCallInfo[F].Calls.push_back(ThisCall);
2276 }
2277
2278 // Next, walk through all callee edges. For each callee node, get its
2279 // containing function and see if it was recorded in the above map (meaning we
2280 // have at least one matching call). Build another map from each callee node
2281 // with a matching call to the structure instance created above containing all
2282 // the calls.
2283 DenseMap<ContextNode *, CallsWithSameCallee *> CalleeNodeToCallInfo;
2284 for (const auto &Edge : Node->CalleeEdges) {
2285 if (!Edge->Callee->hasCall())
2286 continue;
2287 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2288 if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc))
2289 CalleeNodeToCallInfo[Edge->Callee] =
2290 &CalleeFuncToCallInfo[ProfiledCalleeFunc];
2291 }
2292
2293 // If there are no entries in the second map, then there were no matching
2294 // calls/callees; nothing to do here. Return so we can go to the handling
2295 // that looks through tail calls.
2296 if (CalleeNodeToCallInfo.empty())
2297 return false;
2298
2299 // Walk through all callee edges again. Any and all callee edges that didn't
2300 // match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a
2301 // new caller node (UnmatchedCalleesNode) which gets a null call so that it is
2302 // ignored during cloning. If it is in the map, then we use the node recorded
2303 // in that entry (creating it if needed), and move the callee edge to it.
2304 // The first callee will use the original node instead of creating a new one.
2305 // Note that any of the original calls on this node (in AllCalls) that didn't
2306 // have a callee function automatically get dropped from the node as part of
2307 // this process.
2308 ContextNode *UnmatchedCalleesNode = nullptr;
2309 // Track whether we already assigned original node to a callee.
2310 bool UsedOrigNode = false;
2311 assert(NodeToCallingFunc[Node]);
2312 // Iterate over a copy of Node's callee edges, since we may need to remove
2313 // edges in moveCalleeEdgeToNewCaller, and this simplifies the handling and
2314 // makes it less error-prone.
2315 auto CalleeEdges = Node->CalleeEdges;
2316 for (auto &Edge : CalleeEdges) {
2317 if (!Edge->Callee->hasCall())
2318 continue;
2319
2320 // Will be updated below to point to whatever (caller) node this callee edge
2321 // should be moved to.
2322 ContextNode *CallerNodeToUse = nullptr;
2323
2324 // Handle the case where there were no matching calls first. Move this
2325 // callee edge to the UnmatchedCalleesNode, creating it if needed.
2326 if (!CalleeNodeToCallInfo.contains(Edge->Callee)) {
2327 if (!UnmatchedCalleesNode)
2328 UnmatchedCalleesNode =
2329 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2330 CallerNodeToUse = UnmatchedCalleesNode;
2331 } else {
2332 // Look up the information recorded for this callee node, and use the
2333 // recorded caller node (creating it if needed).
2334 auto *Info = CalleeNodeToCallInfo[Edge->Callee];
2335 if (!Info->Node) {
2336 // If we haven't assigned any callees to the original node use it.
2337 if (!UsedOrigNode) {
2338 Info->Node = Node;
2339 // Clear the set of matching calls which will be updated below.
2340 Node->MatchingCalls.clear();
2341 UsedOrigNode = true;
2342 } else
2343 Info->Node =
2344 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2345 assert(!Info->Calls.empty());
2346 // The first call becomes the primary call for this caller node, and the
2347 // rest go in the matching calls list.
2348 Info->Node->setCall(Info->Calls.front());
2349 Info->Node->MatchingCalls.insert(Info->Node->MatchingCalls.end(),
2350 Info->Calls.begin() + 1,
2351 Info->Calls.end());
2352 // Save the primary call to node correspondence so that we can update
2353 // the NonAllocationCallToContextNodeMap, which is being iterated in the
2354 // caller of this function.
2355 NewCallToNode.push_back({Info->Node->Call, Info->Node});
2356 }
2357 CallerNodeToUse = Info->Node;
2358 }
2359
2360 // Don't need to move edge if we are using the original node.
2361 if (CallerNodeToUse == Node)
2362 continue;
2363
2364 moveCalleeEdgeToNewCaller(Edge, CallerNodeToUse);
2365 }
2366 // Now that we are done moving edges, clean up any caller edges that ended
2367 // up with no type or context ids. During moveCalleeEdgeToNewCaller all
2368 // caller edges from Node are replicated onto the new callers, and it
2369 // simplifies the handling to leave them until we have moved all
2370 // edges/context ids.
2371 for (auto &I : CalleeNodeToCallInfo)
2372 removeNoneTypeCallerEdges(I.second->Node);
2373 if (UnmatchedCalleesNode)
2374 removeNoneTypeCallerEdges(UnmatchedCalleesNode);
2375 removeNoneTypeCallerEdges(Node);
2376
2377 return true;
2378}
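// Worked example for the partitioning above (hypothetical calls): if the node
// carries calls {c1 -> f, c2 -> f, c3 -> g} and has callee edges into nodes
// contained in f and g, then c1 becomes the primary call and c2 a matching
// call on the original node, which keeps the edge into f; c3 is moved to a
// newly created node that receives the edge into g. Any callee edge whose
// function matched none of the calls is parked on UnmatchedCalleesNode so it
// is ignored during cloning.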
2379
2380uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2381 // In the Module (IR) case this is already the Id.
2382 return IdOrIndex;
2383}
2384
2385uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2386 // In the Index case this is an index into the stack id list in the summary
2387 // index, convert it to an Id.
2388 return Index.getStackIdAtIndex(IdOrIndex);
2389}
2390
2391template <typename DerivedCCG, typename FuncTy, typename CallTy>
2392bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
2393 CallTy Call, EdgeIter &EI,
2394 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
2395 auto Edge = *EI;
2396 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2397 const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
2398 // Will be populated in order of callee to caller if we find a chain of tail
2399 // calls between the profiled caller and callee.
2400 std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
2401 if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
2402 FoundCalleeChain))
2403 return false;
2404
2405 // The usual case where the profiled callee matches that of the IR/summary.
2406 if (FoundCalleeChain.empty())
2407 return true;
2408
2409 auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
2410 auto *CurEdge = Callee->findEdgeFromCaller(Caller);
2411 // If there is already an edge between these nodes, simply update it and
2412 // return.
2413 if (CurEdge) {
2414 CurEdge->ContextIds.insert(Edge->ContextIds.begin(),
2415 Edge->ContextIds.end());
2416 CurEdge->AllocTypes |= Edge->AllocTypes;
2417 return;
2418 }
2419 // Otherwise, create a new edge and insert it into the caller and callee
2420 // lists.
2421 auto NewEdge = std::make_shared<ContextEdge>(
2422 Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
2423 Callee->CallerEdges.push_back(NewEdge);
2424 if (Caller == Edge->Caller) {
2425 // If we are inserting the new edge into the current edge's caller, insert
2426 // the new edge before the current iterator position, and then increment
2427 // back to the current edge.
2428 EI = Caller->CalleeEdges.insert(EI, NewEdge);
2429 ++EI;
2430 assert(*EI == Edge &&
2431 "Iterator position not restored after insert and increment");
2432 } else
2433 Caller->CalleeEdges.push_back(NewEdge);
2434 };
2435
2436 // Create new nodes for each found callee and connect in between the profiled
2437 // caller and callee.
2438 auto *CurCalleeNode = Edge->Callee;
2439 for (auto &[NewCall, Func] : FoundCalleeChain) {
2440 ContextNode *NewNode = nullptr;
2441 // First check if we have already synthesized a node for this tail call.
2442 if (TailCallToContextNodeMap.count(NewCall)) {
2443 NewNode = TailCallToContextNodeMap[NewCall];
2444 NewNode->AllocTypes |= Edge->AllocTypes;
2445 } else {
2446 FuncToCallsWithMetadata[Func].push_back({NewCall});
2447 // Create Node and record node info.
2448 NewNode = createNewNode(/*IsAllocation=*/false, Func, NewCall);
2449 TailCallToContextNodeMap[NewCall] = NewNode;
2450 NewNode->AllocTypes = Edge->AllocTypes;
2451 }
2452
2453 // Hook up node to its callee node
2454 AddEdge(NewNode, CurCalleeNode);
2455
2456 CurCalleeNode = NewNode;
2457 }
2458
2459 // Hook up edge's original caller to new callee node.
2460 AddEdge(Edge->Caller, CurCalleeNode);
2461
2462#ifndef NDEBUG
2463 // Save this because Edge's fields get cleared below when removed.
2464 auto *Caller = Edge->Caller;
2465#endif
2466
2467 // Remove old edge
2468 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
2469
2470 // To simplify the increment of EI in the caller, subtract one from EI.
2471 // In the final AddEdge call we would have either added a new callee edge,
2472 // to Edge->Caller, or found an existing one. Either way we are guaranteed
2473 // that there is at least one callee edge.
2474 assert(!Caller->CalleeEdges.empty());
2475 --EI;
2476
2477 return true;
2478}
2479
2480bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2481 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
2482 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
2483 bool &FoundMultipleCalleeChains) {
2484 // Stop recursive search if we have already explored the maximum specified
2485 // depth.
2486 if (Depth > TailCallSearchDepth)
2487 return false;
2488
2489 auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
2490 FoundCalleeChain.push_back({Callsite, F});
2491 };
2492
2493 auto *CalleeFunc = dyn_cast<Function>(CurCallee);
2494 if (!CalleeFunc) {
2495 auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
2496 assert(Alias);
2497 CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
2498 assert(CalleeFunc);
2499 }
2500
2501 // Look for tail calls in this function, and check if they either call the
2502 // profiled callee directly, or indirectly (via a recursive search).
2503 // Only succeed if there is a single unique tail call chain found between the
2504 // profiled caller and callee, otherwise we could perform incorrect cloning.
2505 bool FoundSingleCalleeChain = false;
2506 for (auto &BB : *CalleeFunc) {
2507 for (auto &I : BB) {
2508 auto *CB = dyn_cast<CallBase>(&I);
2509 if (!CB || !CB->isTailCall())
2510 continue;
2511 auto *CalledValue = CB->getCalledOperand();
2512 auto *CalledFunction = CB->getCalledFunction();
2513 if (CalledValue && !CalledFunction) {
2514 CalledValue = CalledValue->stripPointerCasts();
2515 // Stripping pointer casts can reveal a called function.
2516 CalledFunction = dyn_cast<Function>(CalledValue);
2517 }
2518 // Check if this is an alias to a function. If so, get the
2519 // called aliasee for the checks below.
2520 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
2521 assert(!CalledFunction &&
2522 "Expected null called function in callsite for alias");
2523 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
2524 }
2525 if (!CalledFunction)
2526 continue;
2527 if (CalledFunction == ProfiledCallee) {
2528 if (FoundSingleCalleeChain) {
2529 FoundMultipleCalleeChains = true;
2530 return false;
2531 }
2532 FoundSingleCalleeChain = true;
2533 FoundProfiledCalleeCount++;
2534 FoundProfiledCalleeDepth += Depth;
2535 if (Depth > FoundProfiledCalleeMaxDepth)
2536 FoundProfiledCalleeMaxDepth = Depth;
2537 SaveCallsiteInfo(&I, CalleeFunc);
2538 } else if (findProfiledCalleeThroughTailCalls(
2539 ProfiledCallee, CalledFunction, Depth + 1,
2540 FoundCalleeChain, FoundMultipleCalleeChains)) {
2541 // findProfiledCalleeThroughTailCalls should not have returned
2542 // true if FoundMultipleCalleeChains.
2543 assert(!FoundMultipleCalleeChains);
2544 if (FoundSingleCalleeChain) {
2545 FoundMultipleCalleeChains = true;
2546 return false;
2547 }
2548 FoundSingleCalleeChain = true;
2549 SaveCallsiteInfo(&I, CalleeFunc);
2550 } else if (FoundMultipleCalleeChains)
2551 return false;
2552 }
2553 }
2554
2555 return FoundSingleCalleeChain;
2556}
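// Example of the search above (hypothetical functions): the profile may
// record a frame for caller() directly invoking leaf(), while the IR shows
// caller() calling helper(), which only tail-calls leaf(). Starting the
// search at helper() finds leaf() at depth 1 and records the intervening
// callsite so the graph can be patched with a node for it. If leaf() were
// reachable from helper() through two distinct tail-call chains, the search
// reports multiple chains and fails, since cloning along an ambiguous chain
// would not be sound.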
2557
2558const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) {
2559 auto *CB = dyn_cast<CallBase>(Call);
2560 if (!CB->getCalledOperand() || CB->isIndirectCall())
2561 return nullptr;
2562 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2563 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2564 if (Alias)
2565 return dyn_cast<Function>(Alias->getAliasee());
2566 return dyn_cast<Function>(CalleeVal);
2567}
2568
2569bool ModuleCallsiteContextGraph::calleeMatchesFunc(
2570 Instruction *Call, const Function *Func, const Function *CallerFunc,
2571 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
2572 auto *CB = dyn_cast<CallBase>(Call);
2573 if (!CB->getCalledOperand() || CB->isIndirectCall())
2574 return false;
2575 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2576 auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
2577 if (CalleeFunc == Func)
2578 return true;
2579 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2580 if (Alias && Alias->getAliasee() == Func)
2581 return true;
2582
2583 // Recursively search for the profiled callee through tail calls starting with
2584 // the actual Callee. The discovered tail call chain is saved in
2585 // FoundCalleeChain, and we will fixup the graph to include these callsites
2586 // after returning.
2587 // FIXME: We will currently redo the same recursive walk if we find the same
2588 // mismatched callee from another callsite. We can improve this with more
2589 // bookkeeping of the created chain of new nodes for each mismatch.
2590 unsigned Depth = 1;
2591 bool FoundMultipleCalleeChains = false;
2592 if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
2593 FoundCalleeChain,
2594 FoundMultipleCalleeChains)) {
2595 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
2596 << Func->getName() << " from " << CallerFunc->getName()
2597 << " that actually called " << CalleeVal->getName()
2598 << (FoundMultipleCalleeChains
2599 ? " (found multiple possible chains)"
2600 : "")
2601 << "\n");
2602 if (FoundMultipleCalleeChains)
2603 FoundProfiledCalleeNonUniquelyCount++;
2604 return false;
2605 }
2606
2607 return true;
2608}
2609
2610bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1,
2611 Instruction *Call2) {
2612 auto *CB1 = cast<CallBase>(Call1);
2613 if (!CB1->getCalledOperand() || CB1->isIndirectCall())
2614 return false;
2615 auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts();
2616 auto *CalleeFunc1 = dyn_cast<Function>(CalleeVal1);
2617 auto *CB2 = cast<CallBase>(Call2);
2618 if (!CB2->getCalledOperand() || CB2->isIndirectCall())
2619 return false;
2620 auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts();
2621 auto *CalleeFunc2 = dyn_cast<Function>(CalleeVal2);
2622 return CalleeFunc1 == CalleeFunc2;
2623}
2624
2625bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2626 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
2627 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
2628 bool &FoundMultipleCalleeChains) {
2629 // Stop recursive search if we have already explored the maximum specified
2630 // depth.
2631 if (Depth > TailCallSearchDepth)
2632 return false;
2633
2634 auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
2635 // Make a CallsiteInfo for each discovered callee, if one hasn't already
2636 // been synthesized.
2637 if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
2638 !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
2639 // StackIds is empty (we don't have debug info available in the index for
2640 // these callsites)
2641 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
2642 std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
2643 CallsiteInfo *NewCallsiteInfo =
2644 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
2645 FoundCalleeChain.push_back({NewCallsiteInfo, FS});
2646 };
2647
2648 // Look for tail calls in this function, and check if they either call the
2649 // profiled callee directly, or indirectly (via a recursive search).
2650 // Only succeed if there is a single unique tail call chain found between the
2651 // profiled caller and callee, otherwise we could perform incorrect cloning.
2652 bool FoundSingleCalleeChain = false;
2653 for (auto &S : CurCallee.getSummaryList()) {
2654 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2655 !isPrevailing(CurCallee.getGUID(), S.get()))
2656 continue;
2657 auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
2658 if (!FS)
2659 continue;
2660 auto FSVI = CurCallee;
2661 auto *AS = dyn_cast<AliasSummary>(S.get());
2662 if (AS)
2663 FSVI = AS->getAliaseeVI();
2664 for (auto &CallEdge : FS->calls()) {
2665 if (!CallEdge.second.hasTailCall())
2666 continue;
2667 if (CallEdge.first == ProfiledCallee) {
2668 if (FoundSingleCalleeChain) {
2669 FoundMultipleCalleeChains = true;
2670 return false;
2671 }
2672 FoundSingleCalleeChain = true;
2673 FoundProfiledCalleeCount++;
2674 FoundProfiledCalleeDepth += Depth;
2675 if (Depth > FoundProfiledCalleeMaxDepth)
2676 FoundProfiledCalleeMaxDepth = Depth;
2677 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2678 // Add FS to FSToVIMap in case it isn't already there.
2679 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2680 FSToVIMap[FS] = FSVI;
2681 } else if (findProfiledCalleeThroughTailCalls(
2682 ProfiledCallee, CallEdge.first, Depth + 1,
2683 FoundCalleeChain, FoundMultipleCalleeChains)) {
2684 // findProfiledCalleeThroughTailCalls should not have returned
2685 // true if FoundMultipleCalleeChains.
2686 assert(!FoundMultipleCalleeChains);
2687 if (FoundSingleCalleeChain) {
2688 FoundMultipleCalleeChains = true;
2689 return false;
2690 }
2691 FoundSingleCalleeChain = true;
2692 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2693 // Add FS to FSToVIMap in case it isn't already there.
2694 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2695 FSToVIMap[FS] = FSVI;
2696 } else if (FoundMultipleCalleeChains)
2697 return false;
2698 }
2699 }
2700
2701 return FoundSingleCalleeChain;
2702}
2703
2704const FunctionSummary *
2705IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) {
2706 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
2707 if (Callee.getSummaryList().empty())
2708 return nullptr;
2709 return dyn_cast<FunctionSummary>(Callee.getSummaryList()[0]->getBaseObject());
2710}
2711
2712bool IndexCallsiteContextGraph::calleeMatchesFunc(
2713 IndexCall &Call, const FunctionSummary *Func,
2714 const FunctionSummary *CallerFunc,
2715 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
2716 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
2717 // If there is no summary list then this is a call to an externally defined
2718 // symbol.
2719 AliasSummary *Alias =
2720 Callee.getSummaryList().empty()
2721 ? nullptr
2722 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
2723 assert(FSToVIMap.count(Func));
2724 auto FuncVI = FSToVIMap[Func];
2725 if (Callee == FuncVI ||
2726 // If callee is an alias, check the aliasee, since only function
2727 // summary base objects will contain the stack node summaries and thus
2728 // get a context node.
2729 (Alias && Alias->getAliaseeVI() == FuncVI))
2730 return true;
2731
2732 // Recursively search for the profiled callee through tail calls starting with
2733 // the actual Callee. The discovered tail call chain is saved in
2734 // FoundCalleeChain, and we will fixup the graph to include these callsites
2735 // after returning.
2736 // FIXME: We will currently redo the same recursive walk if we find the same
2737 // mismatched callee from another callsite. We can improve this with more
2738 // bookkeeping of the created chain of new nodes for each mismatch.
2739 unsigned Depth = 1;
2740 bool FoundMultipleCalleeChains = false;
2741 if (!findProfiledCalleeThroughTailCalls(
2742 FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
2743 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
2744 << " from " << FSToVIMap[CallerFunc]
2745 << " that actually called " << Callee
2746 << (FoundMultipleCalleeChains
2747 ? " (found multiple possible chains)"
2748 : "")
2749 << "\n");
2750 if (FoundMultipleCalleeChains)
2751 FoundProfiledCalleeNonUniquelyCount++;
2752 return false;
2753 }
2754
2755 return true;
2756}
2757
2758bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) {
2759 ValueInfo Callee1 = dyn_cast_if_present<CallsiteInfo *>(Call1)->Callee;
2760 ValueInfo Callee2 = dyn_cast_if_present<CallsiteInfo *>(Call2)->Callee;
2761 return Callee1 == Callee2;
2762}
2763
2764template <typename DerivedCCG, typename FuncTy, typename CallTy>
2765void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
2766 const {
2767 print(dbgs());
2768 dbgs() << "\n";
2769}
2770
2771template <typename DerivedCCG, typename FuncTy, typename CallTy>
2772void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
2773 raw_ostream &OS) const {
2774 OS << "Node " << this << "\n";
2775 OS << "\t";
2776 printCall(OS);
2777 if (Recursive)
2778 OS << " (recursive)";
2779 OS << "\n";
2780 if (!MatchingCalls.empty()) {
2781 OS << "\tMatchingCalls:\n";
2782 for (auto &MatchingCall : MatchingCalls) {
2783 OS << "\t";
2784 MatchingCall.print(OS);
2785 OS << "\n";
2786 }
2787 }
2788 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
2789 OS << "\tContextIds:";
2790 // Make a copy of the computed context ids that we can sort for stability.
2791 auto ContextIds = getContextIds();
2792 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2793 std::sort(SortedIds.begin(), SortedIds.end());
2794 for (auto Id : SortedIds)
2795 OS << " " << Id;
2796 OS << "\n";
2797 OS << "\tCalleeEdges:\n";
2798 for (auto &Edge : CalleeEdges)
2799 OS << "\t\t" << *Edge << "\n";
2800 OS << "\tCallerEdges:\n";
2801 for (auto &Edge : CallerEdges)
2802 OS << "\t\t" << *Edge << "\n";
2803 if (!Clones.empty()) {
2804 OS << "\tClones: ";
2805 ListSeparator LS;
2806 for (auto *Clone : Clones)
2807 OS << LS << Clone;
2808 OS << "\n";
2809 } else if (CloneOf) {
2810 OS << "\tClone of " << CloneOf << "\n";
2811 }
2812}
2813
2814template <typename DerivedCCG, typename FuncTy, typename CallTy>
2815void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
2816 const {
2817 print(dbgs());
2818 dbgs() << "\n";
2819}
2820
2821template <typename DerivedCCG, typename FuncTy, typename CallTy>
2822void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
2823 raw_ostream &OS) const {
2824 OS << "Edge from Callee " << Callee << " to Caller: " << Caller
2825 << " AllocTypes: " << getAllocTypeString(AllocTypes);
2826 OS << " ContextIds:";
2827 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2828 std::sort(SortedIds.begin(), SortedIds.end());
2829 for (auto Id : SortedIds)
2830 OS << " " << Id;
2831}
2832
2833template <typename DerivedCCG, typename FuncTy, typename CallTy>
2834void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
2835 print(dbgs());
2836}
2837
2838template <typename DerivedCCG, typename FuncTy, typename CallTy>
2839void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
2840 raw_ostream &OS) const {
2841 OS << "Callsite Context Graph:\n";
2842 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2843 for (const auto Node : nodes<GraphType>(this)) {
2844 if (Node->isRemoved())
2845 continue;
2846 Node->print(OS);
2847 OS << "\n";
2848 }
2849}
2850
2851template <typename DerivedCCG, typename FuncTy, typename CallTy>
2852void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
2853 raw_ostream &OS) const {
2854 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2855 for (const auto Node : nodes<GraphType>(this)) {
2856 if (Node->isRemoved())
2857 continue;
2858 if (!Node->IsAllocation)
2859 continue;
2860 DenseSet<uint32_t> ContextIds = Node->getContextIds();
2861 auto AllocTypeFromCall = getAllocationCallType(Node->Call);
2862 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2863 std::sort(SortedIds.begin(), SortedIds.end());
2864 for (auto Id : SortedIds) {
2865 auto TypeI = ContextIdToAllocationType.find(Id);
2866 assert(TypeI != ContextIdToAllocationType.end());
2867 auto CSI = ContextIdToContextSizeInfos.find(Id);
2868 if (CSI != ContextIdToContextSizeInfos.end()) {
2869 for (auto &Info : CSI->second) {
2870 OS << "MemProf hinting: "
2871 << getAllocTypeString((uint8_t)TypeI->second)
2872 << " full allocation context " << Info.FullStackId
2873 << " with total size " << Info.TotalSize << " is "
2874 << getAllocTypeString(Node->AllocTypes) << " after cloning";
2875 if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
2876 OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall)
2877 << " due to cold byte percent";
2878 OS << "\n";
2879 }
2880 }
2881 }
2882 }
2883}
2884
2885template <typename DerivedCCG, typename FuncTy, typename CallTy>
2886void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
2887 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2888 for (const auto Node : nodes<GraphType>(this)) {
2889 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2890 for (auto &Edge : Node->CallerEdges)
2891 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
2892 }
2893}
2894
2895template <typename DerivedCCG, typename FuncTy, typename CallTy>
2896struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
2897 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2898 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
2899
2900 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
2901 static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
2902
2903 using nodes_iterator =
2904 mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
2905 decltype(&getNode)>;
2906
2907 static nodes_iterator nodes_begin(GraphType G) {
2908 return nodes_iterator(G->NodeOwner.begin(), &getNode);
2909 }
2910
2911 static nodes_iterator nodes_end(GraphType G) {
2912 return nodes_iterator(G->NodeOwner.end(), &getNode);
2913 }
2914
2915 static NodeRef getEntryNode(GraphType G) {
2916 return G->NodeOwner.begin()->get();
2917 }
2918
2919 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
2920 static const ContextNode<DerivedCCG, FuncTy, CallTy> *
2921 GetCallee(const EdgePtrTy &P) {
2922 return P->Callee;
2923 }
2924
2925 using ChildIteratorType =
2926 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
2927 DerivedCCG, FuncTy, CallTy>>>::const_iterator,
2928 decltype(&GetCallee)>;
2929
2930 static ChildIteratorType child_begin(NodeRef N) {
2931 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
2932 }
2933
2934 static ChildIteratorType child_end(NodeRef N) {
2935 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
2936 }
2937};
2938
2939template <typename DerivedCCG, typename FuncTy, typename CallTy>
2940struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
2941 : public DefaultDOTGraphTraits {
2942 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
2943
2944 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2945 using GTraits = GraphTraits<GraphType>;
2946 using NodeRef = typename GTraits::NodeRef;
2947 using ChildIteratorType = typename GTraits::ChildIteratorType;
2948
2949 static std::string getNodeLabel(NodeRef Node, GraphType G) {
2950 std::string LabelString =
2951 (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
2952 Twine(Node->OrigStackOrAllocId))
2953 .str();
2954 LabelString += "\n";
2955 if (Node->hasCall()) {
2956 auto Func = G->NodeToCallingFunc.find(Node);
2957 assert(Func != G->NodeToCallingFunc.end());
2958 LabelString +=
2959 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
2960 } else {
2961 LabelString += "null call";
2962 if (Node->Recursive)
2963 LabelString += " (recursive)";
2964 else
2965 LabelString += " (external)";
2966 }
2967 return LabelString;
2968 }
2969
2970 static std::string getNodeAttributes(NodeRef Node, GraphType) {
2971 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
2972 getContextIds(Node->getContextIds()) + "\"")
2973 .str();
2974 AttributeString +=
2975 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str();
2976 AttributeString += ",style=\"filled\"";
2977 if (Node->CloneOf) {
2978 AttributeString += ",color=\"blue\"";
2979 AttributeString += ",style=\"filled,bold,dashed\"";
2980 } else
2981 AttributeString += ",style=\"filled\"";
2982 return AttributeString;
2983 }
2984
2985 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
2986 GraphType) {
2987 auto &Edge = *(ChildIter.getCurrent());
2988 return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
2989 Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"")
2990 .str();
2991 }
2992
2993 // Since the NodeOwner list includes nodes that are no longer connected to
2994 // the graph, skip them here.
2995 static bool isNodeHidden(NodeRef Node, GraphType) {
2996 return Node->isRemoved();
2997 }
2998
2999private:
3000 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
3001 std::string IdString = "ContextIds:";
3002 if (ContextIds.size() < 100) {
3003 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3004 std::sort(SortedIds.begin(), SortedIds.end());
3005 for (auto Id : SortedIds)
3006 IdString += (" " + Twine(Id)).str();
3007 } else {
3008 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
3009 }
3010 return IdString;
3011 }
3012
3013 static std::string getColor(uint8_t AllocTypes) {
3014 if (AllocTypes == (uint8_t)AllocationType::NotCold)
3015 // Color "brown1" actually looks like a lighter red.
3016 return "brown1";
3017 if (AllocTypes == (uint8_t)AllocationType::Cold)
3018 return "cyan";
3019 if (AllocTypes ==
3020 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
3021 // Lighter purple.
3022 return "mediumorchid1";
3023 return "gray";
3024 }
3025
3026 static std::string getNodeId(NodeRef Node) {
3027 std::stringstream SStream;
3028 SStream << std::hex << "N0x" << (unsigned long long)Node;
3029 std::string Result = SStream.str();
3030 return Result;
3031 }
3032};
3033
3034template <typename DerivedCCG, typename FuncTy, typename CallTy>
3035void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
3036 std::string Label) const {
3037 WriteGraph(this, "", false, Label,
3038 DotFilePathPrefix + "ccg." + Label + ".dot");
3039}
3040
3041template <typename DerivedCCG, typename FuncTy, typename CallTy>
3042typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
3043CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
3044 const std::shared_ptr<ContextEdge> &Edge,
3045 DenseSet<uint32_t> ContextIdsToMove) {
3046 ContextNode *Node = Edge->Callee;
3047 assert(NodeToCallingFunc.count(Node));
3048 ContextNode *Clone =
3049 createNewNode(Node->IsAllocation, NodeToCallingFunc[Node], Node->Call);
3050 Node->addClone(Clone);
3051 Clone->MatchingCalls = Node->MatchingCalls;
3052 moveEdgeToExistingCalleeClone(Edge, Clone, /*NewClone=*/true,
3053 ContextIdsToMove);
3054 return Clone;
3055}
3056
3057template <typename DerivedCCG, typename FuncTy, typename CallTy>
3058void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3059 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
3060 ContextNode *NewCallee, bool NewClone,
3061 DenseSet<uint32_t> ContextIdsToMove) {
3062 // NewCallee and Edge's current callee must be clones of the same original
3063 // node (Edge's current callee may be the original node too).
3064 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
3065
3066 ContextNode *OldCallee = Edge->Callee;
3067
3068 // We might already have an edge to the new callee from earlier cloning for a
3069 // different allocation. If one exists we will reuse it.
3070 auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
3071
3072 // Callers will pass an empty ContextIdsToMove set when they want to move the
3073 // edge. Copy in Edge's ids for simplicity.
3074 if (ContextIdsToMove.empty())
3075 ContextIdsToMove = Edge->getContextIds();
3076
3077 // If we are moving all of Edge's ids, then just move the whole Edge.
3078 // Otherwise only move the specified subset, to a new edge if needed.
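// For example, if Edge carries context ids {1,2,3} and ContextIdsToMove is
// {1,2,3}, the whole Edge is redirected to NewCallee (or merged into an
// existing edge to it); if ContextIdsToMove is just {1}, only id 1 is split
// onto an edge to NewCallee and ids {2,3} remain on the original Edge.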
3079 if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
3080 // First, update the alloc types on New Callee from Edge.
3081 // Do this before we potentially clear Edge's fields below!
3082 NewCallee->AllocTypes |= Edge->AllocTypes;
3083 // Moving the whole Edge.
3084 if (ExistingEdgeToNewCallee) {
3085 // Since we already have an edge to NewCallee, simply move the ids
3086 // onto it, and remove the existing Edge.
3087 ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),
3088 ContextIdsToMove.end());
3089 ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
3090 assert(Edge->ContextIds == ContextIdsToMove);
3091 removeEdgeFromGraph(Edge.get());
3092 } else {
3093 // Otherwise just reconnect Edge to NewCallee.
3094 Edge->Callee = NewCallee;
3095 NewCallee->CallerEdges.push_back(Edge);
3096 // Remove it from callee where it was previously connected.
3097 OldCallee->eraseCallerEdge(Edge.get());
3098 // Don't need to update Edge's context ids since we are simply
3099 // reconnecting it.
3100 }
3101 } else {
3102 // Only moving a subset of Edge's ids.
3103 // Compute the alloc type of the subset of ids being moved.
3104 auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
3105 if (ExistingEdgeToNewCallee) {
3106 // Since we already have an edge to NewCallee, simply move the ids
3107 // onto it.
3108 ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),
3109 ContextIdsToMove.end());
3110 ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
3111 } else {
3112 // Otherwise, create a new edge to NewCallee for the ids being moved.
3113 auto NewEdge = std::make_shared<ContextEdge>(
3114 NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
3115 Edge->Caller->CalleeEdges.push_back(NewEdge);
3116 NewCallee->CallerEdges.push_back(NewEdge);
3117 }
3118 // In either case, need to update the alloc types on NewCallee, and remove
3119 // those ids and update the alloc type on the original Edge.
3120 NewCallee->AllocTypes |= CallerEdgeAllocType;
3121 set_subtract(Edge->ContextIds, ContextIdsToMove);
3122 Edge->AllocTypes = computeAllocType(Edge->ContextIds);
3123 }
3124 // Now walk the old callee node's callee edges and move Edge's context ids
3125 // over to the corresponding edge into the clone (which is created here if
3126 // this is a newly created clone).
3127 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
3128 // The context ids moving to the new callee are the subset of this edge's
3129 // context ids and the context ids on the caller edge being moved.
3130 DenseSet<uint32_t> EdgeContextIdsToMove =
3131 set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
3132 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
3133 OldCalleeEdge->AllocTypes =
3134 computeAllocType(OldCalleeEdge->getContextIds());
3135 if (!NewClone) {
3136 // Update context ids / alloc type on corresponding edge to NewCallee.
3137 // There is a chance this may not exist if we are reusing an existing
3138 // clone, specifically during function assignment, where we would have
3139 // removed none type edges after creating the clone. If we can't find
3140 // a corresponding edge there, fall through to the cloning below.
3141 if (auto *NewCalleeEdge =
3142 NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) {
3143 NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
3144 EdgeContextIdsToMove.end());
3145 NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
3146 continue;
3147 }
3148 }
3149 auto NewEdge = std::make_shared<ContextEdge>(
3150 OldCalleeEdge->Callee, NewCallee,
3151 computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
3152 NewCallee->CalleeEdges.push_back(NewEdge);
3153 NewEdge->Callee->CallerEdges.push_back(NewEdge);
3154 }
3155 // Recompute the node alloc type now that its callee edges have been
3156 // updated (since we will compute from those edges).
3157 OldCallee->AllocTypes = OldCallee->computeAllocType();
3158 // OldCallee alloc type should be None iff its context id set is now empty.
3159 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
3160 OldCallee->emptyContextIds());
3161 if (VerifyCCG) {
3162 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
3163 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
3164 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
3165 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
3166 /*CheckEdges=*/false);
3167 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
3168 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
3169 /*CheckEdges=*/false);
3170 }
3171}
3172
3173template <typename DerivedCCG, typename FuncTy, typename CallTy>
3174void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3175 moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
3176 ContextNode *NewCaller) {
3177
3178 ContextNode *OldCaller = Edge->Caller;
3179 OldCaller->eraseCalleeEdge(Edge.get());
3180
3181 // We might already have an edge to the new caller. If one exists we will
3182 // reuse it.
3183 auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(Edge->Callee);
3184
3185 if (ExistingEdgeToNewCaller) {
3186 // Since we already have an edge to NewCaller, simply move the ids
3187 // onto it, and remove the existing Edge.
3188 ExistingEdgeToNewCaller->getContextIds().insert(
3189 Edge->getContextIds().begin(), Edge->getContextIds().end());
3190 ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes;
3191 Edge->ContextIds.clear();
3192 Edge->AllocTypes = (uint8_t)AllocationType::None;
3193 Edge->Callee->eraseCallerEdge(Edge.get());
3194 } else {
3195 // Otherwise just reconnect Edge to NewCaller.
3196 Edge->Caller = NewCaller;
3197 NewCaller->CalleeEdges.push_back(Edge);
3198 // Don't need to update Edge's context ids since we are simply
3199 // reconnecting it.
3200 }
3201 // In either case, need to update the alloc types on New Caller.
3202 NewCaller->AllocTypes |= Edge->AllocTypes;
3203
3204 // Now walk the old caller node's caller edges and move Edge's context ids
3205 // over to the corresponding edge into the node (which is created here if
3206 // this is a newly created node). We can tell whether this is a newly created
3207 // node by seeing if it has any caller edges yet.
3208#ifndef NDEBUG
3209 bool IsNewNode = NewCaller->CallerEdges.empty();
3210#endif
3211 for (auto &OldCallerEdge : OldCaller->CallerEdges) {
3212 // The context ids moving to the new caller are the subset of this edge's
3213 // context ids and the context ids on the callee edge being moved.
3214 DenseSet<uint32_t> EdgeContextIdsToMove =
3215 set_intersection(OldCallerEdge->getContextIds(), Edge->getContextIds());
3216 set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
3217 OldCallerEdge->AllocTypes =
3218 computeAllocType(OldCallerEdge->getContextIds());
3219 // In this function we expect that any pre-existing node already has edges
3220 // from the same callers as the old node. That should be true in the current
3221 // use case, where we will remove None-type edges after copying over all
3222 // caller edges from the callee.
3223 auto *ExistingCallerEdge =
3224 NewCaller->findEdgeFromCaller(OldCallerEdge->Caller);
3225 assert(IsNewNode || ExistingCallerEdge);
3226 if (ExistingCallerEdge) {
3227 ExistingCallerEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
3228 EdgeContextIdsToMove.end());
3229 ExistingCallerEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
3230 continue;
3231 }
3232 auto NewEdge = std::make_shared<ContextEdge>(
3233 NewCaller, OldCallerEdge->Caller,
3234 computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
3235 NewCaller->CallerEdges.push_back(NewEdge);
3236 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
3237 }
3238 // Recompute the node alloc type now that its caller edges have been
3239 // updated (since we will compute from those edges).
3240 OldCaller->AllocTypes = OldCaller->computeAllocType();
3241 // OldCaller alloc type should be None iff its context id set is now empty.
3242 assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) ==
3243 OldCaller->emptyContextIds());
3244 if (VerifyCCG) {
3245 checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false);
3246 checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false);
3247 for (const auto &OldCallerEdge : OldCaller->CallerEdges)
3248 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller,
3249 /*CheckEdges=*/false);
3250 for (const auto &NewCallerEdge : NewCaller->CallerEdges)
3251 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller,
3252 /*CheckEdges=*/false);
3253 }
3254}
3255
3256template <typename DerivedCCG, typename FuncTy, typename CallTy>
3257void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3258 recursivelyRemoveNoneTypeCalleeEdges(
3259 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
3260 auto Inserted = Visited.insert(Node);
3261 if (!Inserted.second)
3262 return;
3263
3264 removeNoneTypeCalleeEdges(Node);
3265
3266 for (auto *Clone : Node->Clones)
3267 recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
3268
3269 // The recursive call may remove some of this Node's caller edges.
3270 // Iterate over a copy and skip any that were removed.
3271 auto CallerEdges = Node->CallerEdges;
3272 for (auto &Edge : CallerEdges) {
3273 // Skip any that have been removed by an earlier recursive call.
3274 if (Edge->isRemoved()) {
3275 assert(!is_contained(Node->CallerEdges, Edge));
3276 continue;
3277 }
3278 recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
3279 }
3280}
3281
3282template <typename DerivedCCG, typename FuncTy, typename CallTy>
3283 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
3284 DenseSet<const ContextNode *> Visited;
3285 for (auto &Entry : AllocationCallToContextNodeMap) {
3286 Visited.clear();
3287 identifyClones(Entry.second, Visited, Entry.second->getContextIds());
3288 }
3289 Visited.clear();
3290 for (auto &Entry : AllocationCallToContextNodeMap)
3291 recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
3292 if (VerifyCCG)
3293 check();
3294}
3295
3296 // Helper function to check if an AllocType is cold, notcold, or both.
3297bool checkColdOrNotCold(uint8_t AllocType) {
3298 return (AllocType == (uint8_t)AllocationType::Cold) ||
3299 (AllocType == (uint8_t)AllocationType::NotCold) ||
3300 (AllocType ==
3301 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
3302}
3303
3304template <typename DerivedCCG, typename FuncTy, typename CallTy>
3305void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
3306 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3307 const DenseSet<uint32_t> &AllocContextIds) {
3308 if (VerifyNodes)
3309 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3310 assert(!Node->CloneOf);
3311
3312 // If Node has a null call, then either it wasn't found in the module (regular
3313 // LTO) or summary index (ThinLTO), or there were other conditions blocking
3314 // cloning (e.g. recursion, calls multiple targets, etc).
3315 // Do this here so that we don't try to recursively clone callers below, which
3316 // isn't useful at least for this node.
3317 if (!Node->hasCall())
3318 return;
3319
3320#ifndef NDEBUG
3321 auto Insert =
3322#endif
3323 Visited.insert(Node);
3324 // We should not have visited this node yet.
3325 assert(Insert.second);
3326 // The recursive call to identifyClones may delete the current edge from the
3327 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
3328 // in an iterator and having recursive call erase from it. Other edges may
3329 // also get removed during the recursion, which will have null Callee and
3330 // Caller pointers (and are deleted later), so we skip those below.
3331 {
3332 auto CallerEdges = Node->CallerEdges;
3333 for (auto &Edge : CallerEdges) {
3334 // Skip any that have been removed by an earlier recursive call.
3335 if (Edge->isRemoved()) {
3336 assert(!is_contained(Node->CallerEdges, Edge));
3337 continue;
3338 }
3339 // Ignore any caller we previously visited via another edge.
3340 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
3341 identifyClones(Edge->Caller, Visited, AllocContextIds);
3342 }
3343 }
3344 }
3345
3346 // Check if we reached an unambiguous call or have only a single caller.
3347 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3348 return;
3349
3350 // We need to clone.
3351
3352 // Try to keep the original version as alloc type NotCold. This will make
3353 // cases with indirect calls or any other situation with an unknown call to
3354 // the original function get the default behavior. We do this by sorting the
3355 // CallerEdges of the Node we will clone by alloc type.
3356 //
3357 // Give NotCold edges the lowest sort priority so those edges are at the end of
3358 // the caller edges vector, and stay on the original version (since the below
3359 // code clones greedily until it finds all remaining edges have the same type
3360 // and leaves the remaining ones on the original Node).
3361 //
3362 // We shouldn't actually have any None type edges, so the sorting priority for
3363 // that is arbitrary, and we assert in that case below.
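// Note: the priority array below is indexed by the AllocTypes bit mask
// (None=0, NotCold=1, Cold=2, NotCold|Cold=3), so Cold-only edges sort
// first and NotCold-only edges sort last, keeping NotCold contexts on the
// original node.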
3364 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
3365 /*Cold*/ 1,
3366 /*NotColdCold*/ 2};
3367 std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(),
3368 [&](const std::shared_ptr<ContextEdge> &A,
3369 const std::shared_ptr<ContextEdge> &B) {
3370 // Nodes with non-empty context ids should be sorted before
3371 // those with empty context ids.
3372 if (A->ContextIds.empty())
3373 // Either B ContextIds are non-empty (in which case we
3374 // should return false because B < A), or B ContextIds
3375 // are empty, in which case they are equal, and we should
3376 // maintain the original relative ordering.
3377 return false;
3378 if (B->ContextIds.empty())
3379 return true;
3380
3381 if (A->AllocTypes == B->AllocTypes)
3382 // Use the first context id for each edge as a
3383 // tie-breaker.
3384 return *A->ContextIds.begin() < *B->ContextIds.begin();
3385 return AllocTypeCloningPriority[A->AllocTypes] <
3386 AllocTypeCloningPriority[B->AllocTypes];
3387 });
3388
3389 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3390
3391 DenseSet<uint32_t> RecursiveContextIds;
3392 // If we are allowing recursive callsites, but have also disabled recursive
3393 // contexts, look for context ids that show up in multiple caller edges.
3394 if (AllowRecursiveCallsites && !AllowRecursiveContexts) {
3395 DenseSet<uint32_t> AllCallerContextIds;
3396 for (auto &CE : Node->CallerEdges) {
3397 // Reserve space for the largest set of caller context ids, since we know the
3398 // final set will be at least that large.
3399 AllCallerContextIds.reserve(CE->getContextIds().size());
3400 for (auto Id : CE->getContextIds())
3401 if (!AllCallerContextIds.insert(Id).second)
3402 RecursiveContextIds.insert(Id);
3403 }
3404 }
3405
3406 // Iterate until we find no more opportunities for disambiguating the alloc
3407 // types via cloning. In most cases this loop will terminate once the Node
3408 // has a single allocation type, in which case no more cloning is needed.
3409 // Iterate over a copy of Node's caller edges, since we may need to remove
3410 // edges in the moveEdgeTo* methods, and this simplifies the handling and
3411 // makes it less error-prone.
3412 auto CallerEdges = Node->CallerEdges;
3413 for (auto &CallerEdge : CallerEdges) {
3414 // See if cloning the prior caller edge left this node with a single alloc
3415 // type or a single caller. In that case no more cloning of Node is needed.
3416 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3417 break;
3418
3419 // If the caller was not successfully matched to a call in the IR/summary,
3420 // there is no point in trying to clone for it as we can't update that call.
3421 if (!CallerEdge->Caller->hasCall())
3422 continue;
3423
3424 // Only need to process the ids along this edge pertaining to the given
3425 // allocation.
3426 auto CallerEdgeContextsForAlloc =
3427 set_intersection(CallerEdge->getContextIds(), AllocContextIds);
3428 if (!RecursiveContextIds.empty())
3429 CallerEdgeContextsForAlloc =
3430 set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds);
3431 if (CallerEdgeContextsForAlloc.empty())
3432 continue;
3433
3434 auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
3435
3436 // Compute the node callee edge alloc types corresponding to the context ids
3437 // for this caller edge.
3438 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
3439 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
3440 for (auto &CalleeEdge : Node->CalleeEdges)
3441 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
3442 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
3443
3444 // Don't clone if doing so will not disambiguate any alloc types amongst
3445 // caller edges (including the callee edges that would be cloned).
3446 // Otherwise we will simply move all edges to the clone.
3447 //
3448 // First check if by cloning we will disambiguate the caller allocation
3449 // type from node's allocation type. Query allocTypeToUse so that we don't
3450 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
3451 // neither of these should be None type.
3452 //
3453 // Then check if by cloning node at least one of the callee edges will be
3454 // disambiguated by splitting out different context ids.
3455 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
3456 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3457 if (allocTypeToUse(CallerAllocTypeForAlloc) ==
3458 allocTypeToUse(Node->AllocTypes) &&
3459 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
3460 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges))
3461 continue;
3462
3463 // First see if we can use an existing clone. Check each clone and its
3464 // callee edges for matching alloc types.
3465 ContextNode *Clone = nullptr;
3466 for (auto *CurClone : Node->Clones) {
3467 if (allocTypeToUse(CurClone->AllocTypes) !=
3468 allocTypeToUse(CallerAllocTypeForAlloc))
3469 continue;
3470
3471 bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) &&
3472 hasSingleAllocType(CallerAllocTypeForAlloc);
3473 // The above check should mean that if both have single alloc types that
3474 // they should be equal.
3475 assert(!BothSingleAlloc ||
3476 CurClone->AllocTypes == CallerAllocTypeForAlloc);
3477
3478 // If either both have a single alloc type (which are the same), or if the
3479 // clone's callee edges have the same alloc types as those for the current
3480 // allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge),
3481 // then we can reuse this clone.
3482 if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>(
3483 CalleeEdgeAllocTypesForCallerEdge, CurClone)) {
3484 Clone = CurClone;
3485 break;
3486 }
3487 }
3488
3489 // The edge iterator is adjusted when we move the CallerEdge to the clone.
3490 if (Clone)
3491 moveEdgeToExistingCalleeClone(CallerEdge, Clone, /*NewClone=*/false,
3492 CallerEdgeContextsForAlloc);
3493 else
3494 Clone = moveEdgeToNewCalleeClone(CallerEdge, CallerEdgeContextsForAlloc);
3495
3496 // Sanity check that no alloc types on clone or its edges are None.
3497 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
3498 }
3499
3500 // We should still have some context ids on the original Node.
3501 assert(!Node->emptyContextIds());
3502
3503 // Sanity check that no alloc types on node or edges are None.
3504 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3505
3506 if (VerifyNodes)
3507 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3508}
3509
3510 void ModuleCallsiteContextGraph::updateAllocationCall(
3511 CallInfo &Call, AllocationType AllocType) {
3512 std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
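// This results in a string function attribute on the allocation call, e.g.
// "memprof"="cold", which getAllocationCallType reads back later.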
3513 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
3514 "memprof", AllocTypeString);
3515 cast<CallBase>(Call.call())->addFnAttr(A);
3516 OREGetter(Call.call()->getFunction())
3517 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
3518 << ore::NV("AllocationCall", Call.call()) << " in clone "
3519 << ore::NV("Caller", Call.call()->getFunction())
3520 << " marked with memprof allocation attribute "
3521 << ore::NV("Attribute", AllocTypeString));
3522}
3523
3524 void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
3525 AllocationType AllocType) {
3526 auto *AI = cast<AllocInfo *>(Call.call());
3527 assert(AI);
3528 assert(AI->Versions.size() > Call.cloneNo());
3529 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
3530}
3531
3532AllocationType
3533ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
3534 const auto *CB = cast<CallBase>(Call.call());
3535 if (!CB->getAttributes().hasFnAttr("memprof"))
3536 return AllocationType::None;
3537 return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
3538 ? AllocationType::Cold
3539 : AllocationType::NotCold;
3540}
3541
3542AllocationType
3543IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
3544 const auto *AI = cast<AllocInfo *>(Call.call());
3545 assert(AI->Versions.size() > Call.cloneNo());
3546 return (AllocationType)AI->Versions[Call.cloneNo()];
3547}
3548
3549void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
3550 FuncInfo CalleeFunc) {
3551 if (CalleeFunc.cloneNo() > 0)
3552 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
3553 OREGetter(CallerCall.call()->getFunction())
3554 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
3555 << ore::NV("Call", CallerCall.call()) << " in clone "
3556 << ore::NV("Caller", CallerCall.call()->getFunction())
3557 << " assigned to call function clone "
3558 << ore::NV("Callee", CalleeFunc.func()));
3559}
3560
3561void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
3562 FuncInfo CalleeFunc) {
3563 auto *CI = cast<CallsiteInfo *>(CallerCall.call());
3564 assert(CI &&
3565 "Caller cannot be an allocation which should not have profiled calls");
3566 assert(CI->Clones.size() > CallerCall.cloneNo());
3567 CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
3568}
3569
3570CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
3571 Instruction *>::FuncInfo
3572ModuleCallsiteContextGraph::cloneFunctionForCallsite(
3573 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
3574 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
3575 // Use existing LLVM facilities for cloning and obtaining Call in clone
3576 ValueToValueMapTy VMap;
3577 auto *NewFunc = CloneFunction(Func.func(), VMap);
3578 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
3579 assert(!Func.func()->getParent()->getFunction(Name));
3580 NewFunc->setName(Name);
3581 for (auto &Inst : CallsWithMetadataInFunc) {
3582 // This map always has the initial version in it.
3583 assert(Inst.cloneNo() == 0);
3584 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
3585 }
3586 OREGetter(Func.func())
3587 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
3588 << "created clone " << ore::NV("NewFunction", NewFunc));
3589 return {NewFunc, CloneNo};
3590}
3591
3592CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
3593 IndexCall>::FuncInfo
3594IndexCallsiteContextGraph::cloneFunctionForCallsite(
3595 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
3596 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
3597 // Check how many clones we have of Call (and therefore function).
3598 // The next clone number is the current size of versions array.
3599 // Confirm this matches the CloneNo provided by the caller, which is based on
3600 // the number of function clones we have.
3601 assert(CloneNo == (isa<AllocInfo *>(Call.call())
3602 ? cast<AllocInfo *>(Call.call())->Versions.size()
3603 : cast<CallsiteInfo *>(Call.call())->Clones.size()));
3604 // Walk all the instructions in this function. Create a new version for
3605 // each (by adding an entry to the Versions/Clones summary array), and copy
3606 // over the version being called for the function clone being cloned here.
3607 // Additionally, add an entry to the CallMap for the new function clone,
3608 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
3609 // to the new call clone.
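// For example, when creating function clone number 2, each call's
// Versions/Clones array grows from size 2 to 3; the new entry is a
// placeholder filled in later by updateAllocationCall/updateCall.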
3610 for (auto &Inst : CallsWithMetadataInFunc) {
3611 // This map always has the initial version in it.
3612 assert(Inst.cloneNo() == 0);
3613 if (auto *AI = dyn_cast<AllocInfo *>(Inst.call())) {
3614 assert(AI->Versions.size() == CloneNo);
3615 // We assign the allocation type later (in updateAllocationCall), just add
3616 // an entry for it here.
3617 AI->Versions.push_back(0);
3618 } else {
3619 auto *CI = cast<CallsiteInfo *>(Inst.call());
3620 assert(CI && CI->Clones.size() == CloneNo);
3621 // We assign the clone number later (in updateCall), just add an entry for
3622 // it here.
3623 CI->Clones.push_back(0);
3624 }
3625 CallMap[Inst] = {Inst.call(), CloneNo};
3626 }
3627 return {Func.func(), CloneNo};
3628}
3629
3630// This method assigns cloned callsites to functions, cloning the functions as
3631// needed. The assignment is greedy and proceeds roughly as follows:
3632//
3633// For each function Func:
3634// For each call with graph Node having clones:
3635// Initialize ClonesWorklist to Node and its clones
3636// Initialize NodeCloneCount to 0
3637// While ClonesWorklist is not empty:
3638// Clone = pop front ClonesWorklist
3639// NodeCloneCount++
3640// If Func has been cloned less than NodeCloneCount times:
3641// If NodeCloneCount is 1:
3642// Assign Clone to original Func
3643// Continue
3644// Create a new function clone
3645// If other callers not assigned to call a function clone yet:
3646// Assign them to call new function clone
3647// Continue
3648// Assign any other caller calling the cloned version to new clone
3649//
3650// For each caller of Clone:
3651// If caller is assigned to call a specific function clone:
3652// If we cannot assign Clone to that function clone:
3653// Create new callsite Clone NewClone
3654// Add NewClone to ClonesWorklist
3655// Continue
3656// Assign Clone to existing caller's called function clone
3657// Else:
3658// If Clone not already assigned to a function clone:
3659// Assign to first function clone without assignment
3660// Assign caller to selected function clone
3661template <typename DerivedCCG, typename FuncTy, typename CallTy>
3662bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
3663 bool Changed = false;
3664
3665 // Keep track of the assignment of nodes (callsites) to function clones they
3666 // call.
3667 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
3668
3669 // Update caller node to call function version CalleeFunc, by recording the
3670 // assignment in CallsiteToCalleeFuncCloneMap.
3671 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
3672 const FuncInfo &CalleeFunc) {
3673 assert(Caller->hasCall());
3674 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
3675 };
3676
3677 // Walk all functions for which we saw calls with memprof metadata, and handle
3678 // cloning for each of their calls.
3679 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
3680 FuncInfo OrigFunc(Func);
3681 // Map from each clone of OrigFunc to a map of remappings of each call of
3682 // interest (from original uncloned call to the corresponding cloned call in
3683 // that function clone).
3684 std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
3685 for (auto &Call : CallsWithMetadata) {
3686 ContextNode *Node = getNodeForInst(Call);
3687 // Skip call if we do not have a node for it (all uses of its stack ids
3688 // were either on inlined chains or pruned from the MIBs), or if we did
3689 // not create any clones for it.
3690 if (!Node || Node->Clones.empty())
3691 continue;
3692 assert(Node->hasCall() &&
3693 "Not having a call should have prevented cloning");
3694
3695 // Track the assignment of function clones to clones of the current
3696 // callsite Node being handled.
3697 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
3698
3699 // Assign callsite version CallsiteClone to function version FuncClone,
3700 // and also assign (possibly cloned) Call to CallsiteClone.
3701 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
3702 CallInfo &Call,
3703 ContextNode *CallsiteClone,
3704 bool IsAlloc) {
3705 // Record the clone of callsite node assigned to this function clone.
3706 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
3707
3708 assert(FuncClonesToCallMap.count(FuncClone));
3709 std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
3710 CallInfo CallClone(Call);
3711 if (CallMap.count(Call))
3712 CallClone = CallMap[Call];
3713 CallsiteClone->setCall(CallClone);
3714 // Need to do the same for all matching calls.
3715 for (auto &MatchingCall : Node->MatchingCalls) {
3716 CallInfo CallClone(MatchingCall);
3717 if (CallMap.count(MatchingCall))
3718 CallClone = CallMap[MatchingCall];
3719 // Updates the call in the list.
3720 MatchingCall = CallClone;
3721 }
3722 };
3723
3724 // Keep track of the clones of callsite Node that need to be assigned to
3725 // function clones. This list may be expanded in the loop body below if we
3726 // find additional cloning is required.
3727 std::deque<ContextNode *> ClonesWorklist;
3728 // Ignore original Node if we moved all of its contexts to clones.
3729 if (!Node->emptyContextIds())
3730 ClonesWorklist.push_back(Node);
3731 ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(),
3732 Node->Clones.end());
3733
3734 // Now walk through all of the clones of this callsite Node that we need,
3735 // and determine the assignment to a corresponding clone of the current
3736 // function (creating new function clones as needed).
3737 unsigned NodeCloneCount = 0;
3738 while (!ClonesWorklist.empty()) {
3739 ContextNode *Clone = ClonesWorklist.front();
3740 ClonesWorklist.pop_front();
3741 NodeCloneCount++;
3742 if (VerifyNodes)
3743 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
3744
3745 // Need to create a new function clone if we have more callsite clones
3746 // than existing function clones, which would have been assigned to an
3747 // earlier clone in the list (we assign callsite clones to function
3748 // clones greedily).
3749 if (FuncClonesToCallMap.size() < NodeCloneCount) {
3750 // If this is the first callsite copy, assign to original function.
3751 if (NodeCloneCount == 1) {
3752 // Since FuncClonesToCallMap is empty in this case, no clones have
3753 // been created for this function yet, and no callers should have
3754 // been assigned a function clone for this callee node yet.
3755 assert(llvm::none_of(
3756 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
3757 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
3758 }));
3759 // Initialize with empty call map, assign Clone to original function
3760 // and its callers, and skip to the next clone.
3761 FuncClonesToCallMap[OrigFunc] = {};
3762 AssignCallsiteCloneToFuncClone(
3763 OrigFunc, Call, Clone,
3764 AllocationCallToContextNodeMap.count(Call));
3765 for (auto &CE : Clone->CallerEdges) {
3766 // Ignore any caller that does not have a recorded callsite Call.
3767 if (!CE->Caller->hasCall())
3768 continue;
3769 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
3770 }
3771 continue;
3772 }
3773
3774 // First locate which copy of OrigFunc to clone again. If a caller
3775 // of this callsite clone was already assigned to call a particular
3776 // function clone, we need to redirect all of those callers to the
3777 // new function clone, and update their other callees within this
3778 // function.
3779 FuncInfo PreviousAssignedFuncClone;
3780 auto EI = llvm::find_if(
3781 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
3782 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
3783 });
3784 bool CallerAssignedToCloneOfFunc = false;
3785 if (EI != Clone->CallerEdges.end()) {
3786 const std::shared_ptr<ContextEdge> &Edge = *EI;
3787 PreviousAssignedFuncClone =
3788 CallsiteToCalleeFuncCloneMap[Edge->Caller];
3789 CallerAssignedToCloneOfFunc = true;
3790 }
3791
3792 // Clone function and save it along with the CallInfo map created
3793 // during cloning in the FuncClonesToCallMap.
3794 std::map<CallInfo, CallInfo> NewCallMap;
3795 unsigned CloneNo = FuncClonesToCallMap.size();
3796 assert(CloneNo > 0 && "Clone 0 is the original function, which "
3797 "should already exist in the map");
3798 FuncInfo NewFuncClone = cloneFunctionForCallsite(
3799 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
3800 FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
3801 FunctionClonesAnalysis++;
3802 Changed = true;
3803
3804 // If no caller callsites were already assigned to a clone of this
3805 // function, we can simply assign this clone to the new func clone
3806 // and update all callers to it, then skip to the next clone.
3807 if (!CallerAssignedToCloneOfFunc) {
3808 AssignCallsiteCloneToFuncClone(
3809 NewFuncClone, Call, Clone,
3810 AllocationCallToContextNodeMap.count(Call));
3811 for (auto &CE : Clone->CallerEdges) {
3812 // Ignore any caller that does not have a recorded callsite Call.
3813 if (!CE->Caller->hasCall())
3814 continue;
3815 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
3816 }
3817 continue;
3818 }
3819
3820 // We may need to do additional node cloning in this case.
3821 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
3822 // that were previously assigned to call PreviousAssignedFuncClone,
3823 // to record that they now call NewFuncClone.
3824 // The none type edge removal may remove some of this Clone's caller
3825 // edges, if it is reached via another of its caller's callees.
3826 // Iterate over a copy and skip any that were removed.
3827 auto CallerEdges = Clone->CallerEdges;
3828 for (auto CE : CallerEdges) {
3829 // Skip any that have been removed on an earlier iteration.
3830 if (CE->isRemoved()) {
3831 assert(!is_contained(Clone->CallerEdges, CE));
3832 continue;
3833 }
3834 assert(CE);
3835 // Ignore any caller that does not have a recorded callsite Call.
3836 if (!CE->Caller->hasCall())
3837 continue;
3838
3839 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
3840 // We subsequently fall through to later handling that
3841 // will perform any additional cloning required for
3842 // callers that were calling other function clones.
3843 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
3844 PreviousAssignedFuncClone)
3845 continue;
3846
3847 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
3848
3849 // If we are cloning a function that was already assigned to some
3850 // callers, then essentially we are creating new callsite clones
3851 // of the other callsites in that function that are reached by those
3852 // callers. Clone the other callees of the current callsite's caller
3853 // that were already assigned to PreviousAssignedFuncClone
3854 // accordingly. This is important since we subsequently update the
3855 // calls from the nodes in the graph and their assignments to callee
3856 // functions recorded in CallsiteToCalleeFuncCloneMap.
3857 // The none type edge removal may remove some of this caller's
3858 // callee edges, if it is reached via another of its callees.
3859 // Iterate over a copy and skip any that were removed.
3860 auto CalleeEdges = CE->Caller->CalleeEdges;
3861 for (auto CalleeEdge : CalleeEdges) {
3862 // Skip any that have been removed on an earlier iteration when
3863 // cleaning up newly None type callee edges.
3864 if (CalleeEdge->isRemoved()) {
3865 assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge));
3866 continue;
3867 }
3868 assert(CalleeEdge);
3869 ContextNode *Callee = CalleeEdge->Callee;
3870 // Skip the current callsite, we are looking for other
3871 // callsites Caller calls, as well as any that does not have a
3872 // recorded callsite Call.
3873 if (Callee == Clone || !Callee->hasCall())
3874 continue;
3875 ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
3876 removeNoneTypeCalleeEdges(NewClone);
3877 // Moving the edge may have resulted in some none type
3878 // callee edges on the original Callee.
3879 removeNoneTypeCalleeEdges(Callee);
3880 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
3881 // If the Callee node was already assigned to call a specific
3882 // function version, make sure its new clone is assigned to call
3883 // that same function clone.
3884 if (CallsiteToCalleeFuncCloneMap.count(Callee))
3885 RecordCalleeFuncOfCallsite(
3886 NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
3887 // Update NewClone with the new Call clone of this callsite's Call
3888 // created for the new function clone created earlier.
3889 // Recall that we have already ensured when building the graph
3890 // that each caller can only call callsites within the same
3891 // function, so we are guaranteed that Callee Call is in the
3892 // current OrigFunc.
3893 // CallMap is set up as indexed by original Call at clone 0.
3894 CallInfo OrigCall(Callee->getOrigNode()->Call);
3895 OrigCall.setCloneNo(0);
3896 std::map<CallInfo, CallInfo> &CallMap =
3897 FuncClonesToCallMap[NewFuncClone];
3898 assert(CallMap.count(OrigCall));
3899 CallInfo NewCall(CallMap[OrigCall]);
3900 assert(NewCall);
3901 NewClone->setCall(NewCall);
3902 // Need to do the same for all matching calls.
3903 for (auto &MatchingCall : NewClone->MatchingCalls) {
3904 CallInfo OrigMatchingCall(MatchingCall);
3905 OrigMatchingCall.setCloneNo(0);
3906 assert(CallMap.count(OrigMatchingCall));
3907 CallInfo NewCall(CallMap[OrigMatchingCall]);
3908 assert(NewCall);
3909 // Updates the call in the list.
3910 MatchingCall = NewCall;
3911 }
3912 }
3913 }
3914 // Fall through to handling below to perform the recording of the
3915 // function for this callsite clone. This enables handling of cases
3916 // where the callers were assigned to different clones of a function.
3917 }
3918
3919 // See if we can use existing function clone. Walk through
3920 // all caller edges to see if any have already been assigned to
3921 // a clone of this callsite's function. If we can use it, do so. If not,
3922 // because that function clone is already assigned to a different clone
3923 // of this callsite, then we need to clone again.
3924 // Basically, this checking is needed to handle the case where different
3925 // caller functions/callsites may need versions of this function
3926 // containing different mixes of callsite clones across the different
3927 // callsites within the function. If that happens, we need to create
3928 // additional function clones to handle the various combinations.
3929 //
3930 // Keep track of any new clones of this callsite created by the
3931 // following loop, as well as any existing clone that we decided to
3932 // assign this clone to.
3933 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
3934 FuncInfo FuncCloneAssignedToCurCallsiteClone;
3935 // Iterate over a copy of Clone's caller edges, since we may need to
3936 // remove edges in the moveEdgeTo* methods, and this simplifies the
3937 // handling and makes it less error-prone.
3938 auto CloneCallerEdges = Clone->CallerEdges;
3939 for (auto &Edge : CloneCallerEdges) {
3940 // Ignore any caller that does not have a recorded callsite Call.
3941 if (!Edge->Caller->hasCall())
3942 continue;
3943 // If this caller already assigned to call a version of OrigFunc, need
3944 // to ensure we can assign this callsite clone to that function clone.
3945 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
3946 FuncInfo FuncCloneCalledByCaller =
3947 CallsiteToCalleeFuncCloneMap[Edge->Caller];
3948 // First we need to confirm that this function clone is available
3949 // for use by this callsite node clone.
3950 //
3951 // While FuncCloneToCurNodeCloneMap is built only for this Node and
3952 // its callsite clones, one of those callsite clones X could have
3953 // been assigned to the same function clone called by Edge's caller
3954 // - if Edge's caller calls another callsite within Node's original
3955 // function, and that callsite has another caller reaching clone X.
3956 // We need to clone Node again in this case.
3957 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
3958 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
3959 Clone) ||
3960 // Detect when we have multiple callers of this callsite that
3961 // have already been assigned to specific, and different, clones
3962 // of OrigFunc (due to other unrelated callsites in Func they
3963 // reach via call contexts). Is this Clone of callsite Node
3964 // assigned to a different clone of OrigFunc? If so, clone Node
3965 // again.
3966 (FuncCloneAssignedToCurCallsiteClone &&
3967 FuncCloneAssignedToCurCallsiteClone !=
3968 FuncCloneCalledByCaller)) {
3969 // We need to use a different newly created callsite clone, in
3970 // order to assign it to another new function clone on a
3971 // subsequent iteration over the Clones array (adjusted below).
3972 // Note we specifically do not reset the
3973 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
3974 // when this new clone is processed later we know which version of
3975 // the function to copy (so that other callsite clones we have
3976 // assigned to that function clone are properly cloned over). See
3977 // comments in the function cloning handling earlier.
3978
3979 // Check if we already have cloned this callsite again while
3980 // walking through caller edges, for a caller calling the same
3981 // function clone. If so, we can move this edge to that new clone
3982 // rather than creating yet another new clone.
3983 if (FuncCloneToNewCallsiteCloneMap.count(
3984 FuncCloneCalledByCaller)) {
3985 ContextNode *NewClone =
3986 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
3987 moveEdgeToExistingCalleeClone(Edge, NewClone);
3988 // Cleanup any none type edges cloned over.
3989 removeNoneTypeCalleeEdges(NewClone);
3990 } else {
3991 // Create a new callsite clone.
3992 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge);
3993 removeNoneTypeCalleeEdges(NewClone);
3994 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
3995 NewClone;
3996 // Add to list of clones and process later.
3997 ClonesWorklist.push_back(NewClone);
3998 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
3999 }
4000 // Moving the caller edge may have resulted in some none type
4001 // callee edges.
4002 removeNoneTypeCalleeEdges(Clone);
4003 // We will handle the newly created callsite clone in a subsequent
4004 // iteration over this Node's Clones.
4005 continue;
4006 }
4007
4008 // Otherwise, we can use the function clone already assigned to this
4009 // caller.
4010 if (!FuncCloneAssignedToCurCallsiteClone) {
4011 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
4012 // Assign Clone to FuncCloneCalledByCaller
4013 AssignCallsiteCloneToFuncClone(
4014 FuncCloneCalledByCaller, Call, Clone,
4015 AllocationCallToContextNodeMap.count(Call));
4016 } else
4017 // Don't need to do anything - callsite is already calling this
4018 // function clone.
4019 assert(FuncCloneAssignedToCurCallsiteClone ==
4020 FuncCloneCalledByCaller);
4021
4022 } else {
4023 // We have not already assigned this caller to a version of
4024 // OrigFunc. Do the assignment now.
4025
4026 // First check if we have already assigned this callsite clone to a
4027 // clone of OrigFunc for another caller during this iteration over
4028 // its caller edges.
4029 if (!FuncCloneAssignedToCurCallsiteClone) {
4030 // Find first function in FuncClonesToCallMap without an assigned
4031 // clone of this callsite Node. We should always have one
4032 // available at this point due to the earlier cloning when the
4033 // FuncClonesToCallMap size was smaller than the clone number.
4034 for (auto &CF : FuncClonesToCallMap) {
4035 if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
4036 FuncCloneAssignedToCurCallsiteClone = CF.first;
4037 break;
4038 }
4039 }
4040 assert(FuncCloneAssignedToCurCallsiteClone);
4041 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
4042 AssignCallsiteCloneToFuncClone(
4043 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
4044 AllocationCallToContextNodeMap.count(Call));
4045 } else
4046 assert(FuncCloneToCurNodeCloneMap
4047 [FuncCloneAssignedToCurCallsiteClone] == Clone);
4048 // Update callers to record function version called.
4049 RecordCalleeFuncOfCallsite(Edge->Caller,
4050 FuncCloneAssignedToCurCallsiteClone);
4051 }
4052 }
4053 }
4054 if (VerifyCCG) {
4055 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
4056 for (const auto &PE : Node->CalleeEdges)
4057 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
4058 for (const auto &CE : Node->CallerEdges)
4059 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
4060 for (auto *Clone : Node->Clones) {
4061 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
4062 for (const auto &PE : Clone->CalleeEdges)
4063 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
4064 for (const auto &CE : Clone->CallerEdges)
4065 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
4066 }
4067 }
4068 }
4069 }
4070
4071 uint8_t BothTypes =
4072 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
4073
4074 auto UpdateCalls = [&](ContextNode *Node,
4075 DenseSet<const ContextNode *> &Visited,
4076 auto &&UpdateCalls) {
4077 auto Inserted = Visited.insert(Node);
4078 if (!Inserted.second)
4079 return;
4080
4081 for (auto *Clone : Node->Clones)
4082 UpdateCalls(Clone, Visited, UpdateCalls);
4083
4084 for (auto &Edge : Node->CallerEdges)
4085 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
4086
4087 // Skip if either no call to update, or if we ended up with no context ids
4088 // (we moved all edges onto other clones).
4089 if (!Node->hasCall() || Node->emptyContextIds())
4090 return;
4091
4092 if (Node->IsAllocation) {
4093 auto AT = allocTypeToUse(Node->AllocTypes);
4094 // If the allocation type is ambiguous, and more aggressive hinting
4095 // has been enabled via the MinClonedColdBytePercent flag, see if this
4096 // allocation should be hinted cold anyway because its fraction of cold bytes
4097 // allocated is at least the given threshold.
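// As a concrete illustration of the check below (using 80 as the configured
// MinClonedColdBytePercent): contexts that allocated 800 cold bytes out of
// 1000 total bytes satisfy 800 * 100 >= 1000 * 80, so the ambiguous
// allocation type is upgraded to Cold.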
4098 if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
4099 !ContextIdToContextSizeInfos.empty()) {
4100 uint64_t TotalCold = 0;
4101 uint64_t Total = 0;
4102 for (auto Id : Node->getContextIds()) {
4103 auto TypeI = ContextIdToAllocationType.find(Id);
4104 assert(TypeI != ContextIdToAllocationType.end());
4105 auto CSI = ContextIdToContextSizeInfos.find(Id);
4106 if (CSI != ContextIdToContextSizeInfos.end()) {
4107 for (auto &Info : CSI->second) {
4108 Total += Info.TotalSize;
4109 if (TypeI->second == AllocationType::Cold)
4110 TotalCold += Info.TotalSize;
4111 }
4112 }
4113 }
4114 if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
4115 AT = AllocationType::Cold;
4116 }
4117 updateAllocationCall(Node->Call, AT);
4118 assert(Node->MatchingCalls.empty());
4119 return;
4120 }
4121
4122 if (!CallsiteToCalleeFuncCloneMap.count(Node))
4123 return;
4124
4125 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
4126 updateCall(Node->Call, CalleeFunc);
4127 // Update all the matching calls as well.
4128 for (auto &Call : Node->MatchingCalls)
4129 updateCall(Call, CalleeFunc);
4130 };
4131
4132 // Performs DFS traversal starting from allocation nodes to update calls to
4133 // reflect cloning decisions recorded earlier. For regular LTO this will
4134 // update the actual calls in the IR to call the appropriate function clone
4135 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
4136 // are recorded in the summary entries.
4137 DenseSet<const ContextNode *> Visited;
4138 for (auto &Entry : AllocationCallToContextNodeMap)
4139 UpdateCalls(Entry.second, Visited, UpdateCalls);
4140
4141 return Changed;
4142}
4143
4144static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
4145 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
4146 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
4147 &FuncToAliasMap) {
4148 // The first "clone" is the original copy, we should only call this if we
4149 // needed to create new clones.
4150 assert(NumClones > 1);
4151 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
4152 VMaps.reserve(NumClones - 1);
4153 FunctionsClonedThinBackend++;
4154 for (unsigned I = 1; I < NumClones; I++) {
4155 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
4156 auto *NewF = CloneFunction(&F, *VMaps.back());
4157 FunctionClonesThinBackend++;
4158 // Strip memprof and callsite metadata from clone as they are no longer
4159 // needed.
4160 for (auto &BB : *NewF) {
4161 for (auto &Inst : BB) {
4162 Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
4163 Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
4164 }
4165 }
4166 std::string Name = getMemProfFuncName(F.getName(), I);
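// Naming sketch (assuming the usual ".memprof." clone suffix): clone 1 of a
// function named "foo" is expected to get the name "foo.memprof.1" here, and
// callsite updates elsewhere look the clone up by this same
// getMemProfFuncName()-derived name.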
4167 auto *PrevF = M.getFunction(Name);
4168 if (PrevF) {
4169 // We might have created this when adjusting a callsite in another
4170 // function. It should be a declaration.
4171 assert(PrevF->isDeclaration());
4172 NewF->takeName(PrevF);
4173 PrevF->replaceAllUsesWith(NewF);
4174 PrevF->eraseFromParent();
4175 } else
4176 NewF->setName(Name);
4177 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
4178 << "created clone " << ore::NV("NewFunction", NewF));
4179
4180 // Now handle aliases to this function, and clone those as well.
4181 if (!FuncToAliasMap.count(&F))
4182 continue;
4183 for (auto *A : FuncToAliasMap[&F]) {
4184 std::string Name = getMemProfFuncName(A->getName(), I);
4185 auto *PrevA = M.getNamedAlias(Name);
4186 auto *NewA = GlobalAlias::create(A->getValueType(),
4187 A->getType()->getPointerAddressSpace(),
4188 A->getLinkage(), Name, NewF);
4189 NewA->copyAttributesFrom(A);
4190 if (PrevA) {
4191 // We might have created this when adjusting a callsite in another
4192 // function. It should be a declaration.
4193 assert(PrevA->isDeclaration());
4194 NewA->takeName(PrevA);
4195 PrevA->replaceAllUsesWith(NewA);
4196 PrevA->eraseFromParent();
4197 }
4198 }
4199 }
4200 return VMaps;
4201}
4202
4203// Locate the summary for F. This is complicated by the fact that it might
4204// have been internalized or promoted.
4205static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
4206 const ModuleSummaryIndex *ImportSummary) {
4207 // FIXME: Ideally we would retain the original GUID in some fashion on the
4208 // function (e.g. as metadata), but for now do our best to locate the
4209 // summary without that information.
4210 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
4211 if (!TheFnVI)
4212 // See if theFn was internalized, by checking index directly with
4213 // original name (this avoids the name adjustment done by getGUID() for
4214 // internal symbols).
4215 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName()));
4216 if (TheFnVI)
4217 return TheFnVI;
4218 // Now query with the original name before any promotion was performed.
4219 StringRef OrigName =
4220 ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
4221 std::string OrigId = GlobalValue::getGlobalIdentifier(
4222 OrigName, GlobalValue::InternalLinkage, M.getSourceFileName());
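// For example, a local "foo" promoted during the thin link now carries a
// decorated global name; OrigName recovers plain "foo", and combining it with
// this module's source file name rebuilds the identifier from which the
// original internal-linkage GUID was computed.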
4223 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
4224 if (TheFnVI)
4225 return TheFnVI;
4226 // Could be a promoted local imported from another module. We need to pass
4227 // down more info here to find the original module id. For now, try with
4228 // the OrigName which might have been stored in the OidGuidMap in the
4229 // index. This would not work if there were same-named locals in multiple
4230 // modules, however.
4231 auto OrigGUID =
4232 ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName));
4233 if (OrigGUID)
4234 TheFnVI = ImportSummary->getValueInfo(OrigGUID);
4235 return TheFnVI;
4236}
4237
4238bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo(
4239 Module &M) {
4240 ICallAnalysis = std::make_unique<ICallPromotionAnalysis>();
4241 Symtab = std::make_unique<InstrProfSymtab>();
4242 // Don't add canonical names, to avoid adding multiple functions to the
4243 // symtab when they both have the same root name with "." suffixes stripped.
4244 // If we pick the wrong one then this could lead to incorrect ICP and calling
4245 // a memprof clone that we don't actually create (resulting in linker unsats).
4246 // What this means is that the GUID of the function (or its PGOFuncName
4247 // metadata) *must* match that in the VP metadata to allow promotion.
4248 // In practice this should not be a limitation, since local functions should
4249 // have PGOFuncName metadata and global function names shouldn't need any
4250 // special handling (they should not get the ".llvm.*" suffix that the
4251 // canonicalization handling is attempting to strip).
4252 if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) {
4253 std::string SymtabFailure = toString(std::move(E));
4254 M.getContext().emitError("Failed to create symtab: " + SymtabFailure);
4255 return false;
4256 }
4257 return true;
4258}
4259
4260bool MemProfContextDisambiguation::applyImport(Module &M) {
4261 assert(ImportSummary);
4262 bool Changed = false;
4263
4264 // We also need to clone any aliases that reference cloned functions, because
4265 // the modified callsites may invoke via the alias. Keep track of the aliases
4266 // for each function.
4267 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
4268 FuncToAliasMap;
4269 for (auto &A : M.aliases()) {
4270 auto *Aliasee = A.getAliaseeObject();
4271 if (auto *F = dyn_cast<Function>(Aliasee))
4272 FuncToAliasMap[F].insert(&A);
4273 }
4274
4275 if (!initializeIndirectCallPromotionInfo(M))
4276 return false;
4277
4278 for (auto &F : M) {
4279 if (F.isDeclaration() || isMemProfClone(F))
4280 continue;
4281
4282 OptimizationRemarkEmitter ORE(&F);
4283
4284 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
4285 bool ClonesCreated = false;
4286 unsigned NumClonesCreated = 0;
4287 auto CloneFuncIfNeeded = [&](unsigned NumClones) {
4288 // We should at least have version 0 which is the original copy.
4289 assert(NumClones > 0);
4290 // If only one copy needed use original.
4291 if (NumClones == 1)
4292 return;
4293 // If we already performed cloning of this function, confirm that the
4294 // requested number of clones matches (the thin link should ensure the
4295 // number of clones for each constituent callsite is consistent within
4296 // each function), before returning.
4297 if (ClonesCreated) {
4298 assert(NumClonesCreated == NumClones);
4299 return;
4300 }
4301 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
4302 // The first "clone" is the original copy, which doesn't have a VMap.
4303 assert(VMaps.size() == NumClones - 1);
4304 Changed = true;
4305 ClonesCreated = true;
4306 NumClonesCreated = NumClones;
4307 };
4308
4309 auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
4310 Function *CalledFunction) {
4311 // Perform cloning if not yet done.
4312 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
4313
4314 assert(!isMemProfClone(*CalledFunction));
4315
4316 // Update the calls per the summary info.
4317 // Save orig name since it gets updated in the first iteration
4318 // below.
4319 auto CalleeOrigName = CalledFunction->getName();
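// Sketch of the mapping applied by the loop below: if StackNode.Clones were
// {0, 2}, version 0 of this callsite would keep calling CalleeOrigName, while
// clone 1 of the caller (reached through VMaps[0]) would be redirected to the
// callee clone named getMemProfFuncName(CalleeOrigName, 2).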
4320 for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
4321 // Do nothing if this version calls the original version of its
4322 // callee.
4323 if (!StackNode.Clones[J])
4324 continue;
4325 auto NewF = M.getOrInsertFunction(
4326 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
4327 CalledFunction->getFunctionType());
4328 CallBase *CBClone;
4329 // Copy 0 is the original function.
4330 if (!J)
4331 CBClone = CB;
4332 else
4333 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
4334 CBClone->setCalledFunction(NewF);
4335 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
4336 << ore::NV("Call", CBClone) << " in clone "
4337 << ore::NV("Caller", CBClone->getFunction())
4338 << " assigned to call function clone "
4339 << ore::NV("Callee", NewF.getCallee()));
4340 }
4341 };
4342
4343 // Locate the summary for F.
4344 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
4345 // If not found, this could be an imported local (see comment in
4346 // findValueInfoForFunc). Skip for now as it will be cloned in its original
4347 // module (where it would have been promoted to global scope so should
4348 // satisfy any reference in this module).
4349 if (!TheFnVI)
4350 continue;
4351
4352 auto *GVSummary =
4353 ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
4354 if (!GVSummary) {
4355 // Must have been imported, use the summary which matches the definition
4356 // (might be multiple if this was a linkonce_odr).
4357 auto SrcModuleMD = F.getMetadata("thinlto_src_module");
4358 assert(SrcModuleMD &&
4359 "enable-import-metadata is needed to emit thinlto_src_module");
4360 StringRef SrcModule =
4361 dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
4362 for (auto &GVS : TheFnVI.getSummaryList()) {
4363 if (GVS->modulePath() == SrcModule) {
4364 GVSummary = GVS.get();
4365 break;
4366 }
4367 }
4368 assert(GVSummary && GVSummary->modulePath() == SrcModule);
4369 }
4370
4371 // If this was an imported alias skip it as we won't have the function
4372 // summary, and it should be cloned in the original module.
4373 if (isa<AliasSummary>(GVSummary))
4374 continue;
4375
4376 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
4377
4378 if (FS->allocs().empty() && FS->callsites().empty())
4379 continue;
4380
4381 auto SI = FS->callsites().begin();
4382 auto AI = FS->allocs().begin();
4383
4384 // To handle callsite infos synthesized for tail calls which have missing
4385 // frames in the profiled context, map callee VI to the synthesized callsite
4386 // info.
4387 DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
4388 // Iterate the callsites for this function in reverse, since we place all
4389 // those synthesized for tail calls at the end.
4390 for (auto CallsiteIt = FS->callsites().rbegin();
4391 CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
4392 auto &Callsite = *CallsiteIt;
4393 // Stop as soon as we see a non-synthesized callsite info (see comment
4394 // above loop). All the entries added for discovered tail calls have empty
4395 // stack ids.
4396 if (!Callsite.StackIdIndices.empty())
4397 break;
4398 MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
4399 }
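// For instance, if a profiled context included a frame for a callee only
// reachable from this function via a tail call, the thin link synthesized a
// CallsiteInfo for it with an empty stack id list; the map built above lets
// the tail-call handling further below locate that record by callee ValueInfo.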
4400
4401 // Keeps track of needed ICP for the function.
4402 SmallVector<ICallAnalysisData> ICallAnalysisInfo;
4403
4404 // Assume for now that the instructions are in the exact same order
4405 // as when the summary was created, but confirm this is correct by
4406 // matching the stack ids.
4407 for (auto &BB : F) {
4408 for (auto &I : BB) {
4409 auto *CB = dyn_cast<CallBase>(&I);
4410 // Same handling as when creating module summary.
4411 if (!mayHaveMemprofSummary(CB))
4412 continue;
4413
4414 auto *CalledValue = CB->getCalledOperand();
4415 auto *CalledFunction = CB->getCalledFunction();
4416 if (CalledValue && !CalledFunction) {
4417 CalledValue = CalledValue->stripPointerCasts();
4418 // Stripping pointer casts can reveal a called function.
4419 CalledFunction = dyn_cast<Function>(CalledValue);
4420 }
4421 // Check if this is an alias to a function. If so, get the
4422 // called aliasee for the checks below.
4423 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
4424 assert(!CalledFunction &&
4425 "Expected null called function in callsite for alias");
4426 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
4427 }
4428
4429 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
4430 I.getMetadata(LLVMContext::MD_callsite));
4431 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
4432
4433 // Include allocs that were already assigned a memprof function
4434 // attribute in the statistics.
4435 if (CB->getAttributes().hasFnAttr("memprof")) {
4436 assert(!MemProfMD);
4437 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
4438 ? AllocTypeColdThinBackend++
4439 : AllocTypeNotColdThinBackend++;
4440 OrigAllocsThinBackend++;
4441 AllocVersionsThinBackend++;
4442 if (!MaxAllocVersionsThinBackend)
4443 MaxAllocVersionsThinBackend = 1;
4444 continue;
4445 }
4446
4447 if (MemProfMD) {
4448 // Consult the next alloc node.
4449 assert(AI != FS->allocs().end());
4450 auto &AllocNode = *(AI++);
4451
4452 // Sanity check that the MIB stack ids match between the summary and
4453 // instruction metadata.
4454 auto MIBIter = AllocNode.MIBs.begin();
4455 for (auto &MDOp : MemProfMD->operands()) {
4456 assert(MIBIter != AllocNode.MIBs.end());
4457 LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter =
4458 MIBIter->StackIdIndices.begin();
4459 auto *MIBMD = cast<const MDNode>(MDOp);
4460 MDNode *StackMDNode = getMIBStackNode(MIBMD);
4461 assert(StackMDNode);
4462 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
4463 auto ContextIterBegin =
4464 StackContext.beginAfterSharedPrefix(CallsiteContext);
4465 // Skip the checking on the first iteration.
4466 uint64_t LastStackContextId =
4467 (ContextIterBegin != StackContext.end() &&
4468 *ContextIterBegin == 0)
4469 ? 1
4470 : 0;
4471 for (auto ContextIter = ContextIterBegin;
4472 ContextIter != StackContext.end(); ++ContextIter) {
4473 // If this is a direct recursion, simply skip the duplicate
4474 // entries, to be consistent with how the summary ids were
4475 // generated during ModuleSummaryAnalysis.
4476 if (LastStackContextId == *ContextIter)
4477 continue;
4478 LastStackContextId = *ContextIter;
4479 assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
4480 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
4481 *ContextIter);
4482 StackIdIndexIter++;
4483 }
4484 MIBIter++;
4485 }
4486
4487 // Perform cloning if not yet done.
4488 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());
4489
4490 OrigAllocsThinBackend++;
4491 AllocVersionsThinBackend += AllocNode.Versions.size();
4492 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
4493 MaxAllocVersionsThinBackend = AllocNode.Versions.size();
4494
4495 // If there is only one version that means we didn't end up
4496 // considering this function for cloning, and in that case the alloc
4497 // will still be none type or should have gotten the default NotCold.
4498 // Skip that after calling clone helper since that does some sanity
4499 // checks that confirm we haven't decided yet that we need cloning.
4500 // We might have a single version that is cold due to the
4501 // MinClonedColdBytePercent heuristic, make sure we don't skip in that
4502 // case.
4503 if (AllocNode.Versions.size() == 1 &&
4504 (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
4505 assert((AllocationType)AllocNode.Versions[0] ==
4506 AllocationType::NotCold ||
4507 (AllocationType)AllocNode.Versions[0] ==
4508 AllocationType::None);
4509 UnclonableAllocsThinBackend++;
4510 continue;
4511 }
4512
4513 // All versions should have a singular allocation type.
4514 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
4515 return Type == ((uint8_t)AllocationType::NotCold |
4516 (uint8_t)AllocationType::Cold);
4517 }));
4518
4519 // Update the allocation types per the summary info.
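// For example, Versions == {NotCold, Cold} means the original copy of the
// enclosing function keeps a not-cold "memprof" attribute on this allocation,
// while clone 1 (reached via VMaps[0]) gets the "cold" attribute string on
// its copy.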
4520 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
4521 // Ignore any that didn't get an assigned allocation type.
4522 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
4523 continue;
4524 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
4525 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
4526 : AllocTypeNotColdThinBackend++;
4527 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
4528 auto A = llvm::Attribute::get(F.getContext(), "memprof",
4529 AllocTypeString);
4530 CallBase *CBClone;
4531 // Copy 0 is the original function.
4532 if (!J)
4533 CBClone = CB;
4534 else
4535 // Since VMaps are only created for new clones, we index with
4536 // clone J-1 (J==0 is the original clone and does not have a VMaps
4537 // entry).
4538 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
4539 CBClone->addFnAttr(A);
4540 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
4541 << ore::NV("AllocationCall", CBClone) << " in clone "
4542 << ore::NV("Caller", CBClone->getFunction())
4543 << " marked with memprof allocation attribute "
4544 << ore::NV("Attribute", AllocTypeString));
4545 }
4546 } else if (!CallsiteContext.empty()) {
4547 if (!CalledFunction) {
4548#ifndef NDEBUG
4549 // We should have skipped inline assembly calls.
4550 auto *CI = dyn_cast<CallInst>(CB);
4551 assert(!CI || !CI->isInlineAsm());
4552#endif
4553 // We should have skipped direct calls via a Constant.
4554 assert(CalledValue && !isa<Constant>(CalledValue));
4555
4556 // This is an indirect call, see if we have profile information and
4557 // whether any clones were recorded for the profiled targets (that
4558 // we synthesized CallsiteInfo summary records for when building the
4559 // index).
4560 auto NumClones =
4561 recordICPInfo(CB, FS->callsites(), SI, ICallAnalysisInfo);
4562
4563 // Perform cloning if not yet done. This is done here in case
4564 // we don't need to do ICP, but might need to clone this
4565 // function as it is the target of other cloned calls.
4566 if (NumClones)
4567 CloneFuncIfNeeded(NumClones);
4568 }
4569
4570 else {
4571 // Consult the next callsite node.
4572 assert(SI != FS->callsites().end());
4573 auto &StackNode = *(SI++);
4574
4575#ifndef NDEBUG
4576 // Sanity check that the stack ids match between the summary and
4577 // instruction metadata.
4578 auto StackIdIndexIter = StackNode.StackIdIndices.begin();
4579 for (auto StackId : CallsiteContext) {
4580 assert(StackIdIndexIter != StackNode.StackIdIndices.end());
4581 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
4582 StackId);
4583 StackIdIndexIter++;
4584 }
4585#endif
4586
4587 CloneCallsite(StackNode, CB, CalledFunction);
4588 }
4589 } else if (CB->isTailCall() && CalledFunction) {
4590 // Locate the synthesized callsite info for the callee VI, if any was
4591 // created, and use that for cloning.
4592 ValueInfo CalleeVI =
4593 findValueInfoForFunc(*CalledFunction, M, ImportSummary);
4594 if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
4595 auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
4596 assert(Callsite != MapTailCallCalleeVIToCallsite.end());
4597 CloneCallsite(Callsite->second, CB, CalledFunction);
4598 }
4599 }
4600 }
4601 }
4602
4603 // Now do any promotion required for cloning.
4604 performICP(M, FS->callsites(), VMaps, ICallAnalysisInfo, ORE);
4605 }
4606
4607 // We skip some of the functions and instructions above, so remove all the
4608 // metadata in a single sweep here.
4609 for (auto &F : M) {
4610 // We can skip memprof clones because createFunctionClones already strips
4611 // the metadata from the newly created clones.
4612 if (F.isDeclaration() || isMemProfClone(F))
4613 continue;
4614 for (auto &BB : F) {
4615 for (auto &I : BB) {
4616 if (!isa<CallBase>(I))
4617 continue;
4618 I.setMetadata(LLVMContext::MD_memprof, nullptr);
4619 I.setMetadata(LLVMContext::MD_callsite, nullptr);
4620 }
4621 }
4622 }
4623
4624 return Changed;
4625}
4626
4627unsigned MemProfContextDisambiguation::recordICPInfo(
4628 CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
4629 ArrayRef<CallsiteInfo>::iterator &SI,
4630 SmallVector<ICallAnalysisData> &ICallAnalysisInfo) {
4631 // First see if we have profile information for this indirect call.
4632 uint32_t NumCandidates;
4633 uint64_t TotalCount;
4634 auto CandidateProfileData =
4635 ICallAnalysis->getPromotionCandidatesForInstruction(CB, TotalCount,
4636 NumCandidates);
4637 if (CandidateProfileData.empty())
4638 return 0;
4639
4640 // Iterate through all of the candidate profiled targets along with the
4641 // CallsiteInfo summary records synthesized for them when building the index,
4642 // and see if any are cloned and/or refer to clones.
4643 bool ICPNeeded = false;
4644 unsigned NumClones = 0;
4645 size_t CallsiteInfoStartIndex = std::distance(AllCallsites.begin(), SI);
4646 for (const auto &Candidate : CandidateProfileData) {
4647#ifndef NDEBUG
4648 auto CalleeValueInfo =
4649#endif
4650 ImportSummary->getValueInfo(Candidate.Value);
4651 // We might not have a ValueInfo if this is a distributed
4652 // ThinLTO backend and decided not to import that function.
4653 assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
4654 assert(SI != AllCallsites.end());
4655 auto &StackNode = *(SI++);
4656 // See if any of the clones of the indirect callsite for this
4657 // profiled target should call a cloned version of the profiled
4658 // target. We only need to do the ICP here if so.
4659 ICPNeeded |= llvm::any_of(StackNode.Clones,
4660 [](unsigned CloneNo) { return CloneNo != 0; });
4661 // Every callsite in the same function should have been cloned the same
4662 // number of times.
4663 assert(!NumClones || NumClones == StackNode.Clones.size());
4664 NumClones = StackNode.Clones.size();
4665 }
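// Example: with candidate targets {A, B} whose StackNode.Clones are {0, 0}
// and {0, 1} respectively, ICPNeeded becomes true because some clone of this
// callsite must call B's clone 1, so the candidate data is recorded below for
// later promotion.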
4666 if (!ICPNeeded)
4667 return NumClones;
4668 // Save information for ICP, which is performed later to avoid messing up the
4669 // current function traversal.
4670 ICallAnalysisInfo.push_back({CB, CandidateProfileData.vec(), NumCandidates,
4671 TotalCount, CallsiteInfoStartIndex});
4672 return NumClones;
4673}
4674
4675void MemProfContextDisambiguation::performICP(
4676 Module &M, ArrayRef<CallsiteInfo> AllCallsites,
4677 ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
4678 ArrayRef<ICallAnalysisData> ICallAnalysisInfo,
4679 OptimizationRemarkEmitter &ORE) {
4680 // Now do any promotion required for cloning. Specifically, for each
4681 // recorded ICP candidate (which was only recorded because one clone of that
4682 // candidate should call a cloned target), we perform ICP (speculative
4683 // devirtualization) for each clone of the callsite, and update its callee
4684 // to the appropriate clone. Note that the ICP compares against the original
4685 // version of the target, which is what is in the vtable.
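// Rough shape of each promoted callsite clone (details are handled by
// pgo::promoteIndirectCall):
//   if (CalledOperand == &OriginalTarget)
//     OriginalTarget(...);   // new direct call, retargeted below to the
//                            // appropriate memprof clone when needed
//   else
//     CalledOperand(...);    // remaining indirect call
// The compare is deliberately against the original symbol, since that is the
// address present in the vtable.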
4686 for (auto &Info : ICallAnalysisInfo) {
4687 auto *CB = Info.CB;
4688 auto CallsiteIndex = Info.CallsiteInfoStartIndex;
4689 auto TotalCount = Info.TotalCount;
4690 unsigned NumPromoted = 0;
4691 unsigned NumClones = 0;
4692
4693 for (auto &Candidate : Info.CandidateProfileData) {
4694 auto &StackNode = AllCallsites[CallsiteIndex++];
4695
4696 // All calls in the same function must have the same number of clones.
4697 assert(!NumClones || NumClones == StackNode.Clones.size());
4698 NumClones = StackNode.Clones.size();
4699
4700 // See if the target is in the module. If it wasn't imported, it is
4701 // possible that this profile could have been collected on a different
4702 // target (or version of the code), and we need to be conservative
4703 // (similar to what is done in the ICP pass).
4704 Function *TargetFunction = Symtab->getFunction(Candidate.Value);
4705 if (TargetFunction == nullptr ||
4706 // Any ThinLTO global dead symbol removal should have already
4707 // occurred, so it should be safe to promote when the target is a
4708 // declaration.
4709 // TODO: Remove internal option once more fully tested.
4710 (MemProfRequireDefinitionForPromotion &&
4711 TargetFunction->isDeclaration())) {
4712 ORE.emit([&]() {
4713 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB)
4714 << "Memprof cannot promote indirect call: target with md5sum "
4715 << ore::NV("target md5sum", Candidate.Value) << " not found";
4716 });
4717 // FIXME: See if we can use the new declaration importing support to
4718 // at least get the declarations imported for this case. Hot indirect
4719 // targets should have been imported normally, however.
4720 continue;
4721 }
4722
4723 // Check if legal to promote
4724 const char *Reason = nullptr;
4725 if (!isLegalToPromote(*CB, TargetFunction, &Reason)) {
4726 ORE.emit([&]() {
4727 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
4728 << "Memprof cannot promote indirect call to "
4729 << ore::NV("TargetFunction", TargetFunction)
4730 << " with count of " << ore::NV("TotalCount", TotalCount)
4731 << ": " << Reason;
4732 });
4733 continue;
4734 }
4735
4736 assert(!isMemProfClone(*TargetFunction));
4737
4738 // Handle each call clone, applying ICP so that each clone directly
4739 // calls the specified callee clone, guarded by the appropriate ICP
4740 // check.
4741 CallBase *CBClone = CB;
4742 for (unsigned J = 0; J < NumClones; J++) {
4743 // Copy 0 is the original function.
4744 if (J > 0)
4745 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
4746 // We do the promotion using the original name, so that the comparison
4747 // is against the name in the vtable. Then just below, change the new
4748 // direct call to call the cloned function.
4749 auto &DirectCall =
4750 pgo::promoteIndirectCall(*CBClone, TargetFunction, Candidate.Count,
4751 TotalCount, isSamplePGO, &ORE);
4752 auto *TargetToUse = TargetFunction;
4753 // Call original if this version calls the original version of its
4754 // callee.
4755 if (StackNode.Clones[J]) {
4756 TargetToUse =
4757 cast<Function>(M.getOrInsertFunction(
4758 getMemProfFuncName(TargetFunction->getName(),
4759 StackNode.Clones[J]),
4760 TargetFunction->getFunctionType())
4761 .getCallee());
4762 }
4763 DirectCall.setCalledFunction(TargetToUse);
4764 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
4765 << ore::NV("Call", CBClone) << " in clone "
4766 << ore::NV("Caller", CBClone->getFunction())
4767 << " promoted and assigned to call function clone "
4768 << ore::NV("Callee", TargetToUse));
4769 }
4770
4771 // Update TotalCount (all clones should get same count above)
4772 TotalCount -= Candidate.Count;
4773 NumPromoted++;
4774 }
4775 // Adjust the MD.prof metadata for all clones, now that we have the new
4776 // TotalCount and the number promoted.
4777 CallBase *CBClone = CB;
4778 for (unsigned J = 0; J < NumClones; J++) {
4779 // Copy 0 is the original function.
4780 if (J > 0)
4781 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
4782 // First delete the old one.
4783 CBClone->setMetadata(LLVMContext::MD_prof, nullptr);
4784 // If all promoted, we don't need the MD.prof metadata.
4785 // Otherwise we need update with the un-promoted records back.
4786 if (TotalCount != 0)
4787 annotateValueSite(
4788 M, *CBClone, ArrayRef(Info.CandidateProfileData).slice(NumPromoted),
4789 TotalCount, IPVK_IndirectCallTarget, Info.NumCandidates);
4790 }
4791 }
4792}
4793
4794template <typename DerivedCCG, typename FuncTy, typename CallTy>
4795bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
4796 if (DumpCCG) {
4797 dbgs() << "CCG before cloning:\n";
4798 dbgs() << *this;
4799 }
4800 if (ExportToDot)
4801 exportToDot("postbuild");
4802
4803 if (VerifyCCG) {
4804 check();
4805 }
4806
4807 identifyClones();
4808
4809 if (VerifyCCG) {
4810 check();
4811 }
4812
4813 if (DumpCCG) {
4814 dbgs() << "CCG after cloning:\n";
4815 dbgs() << *this;
4816 }
4817 if (ExportToDot)
4818 exportToDot("cloned");
4819
4820 bool Changed = assignFunctions();
4821
4822 if (DumpCCG) {
4823 dbgs() << "CCG after assigning function clones:\n";
4824 dbgs() << *this;
4825 }
4826 if (ExportToDot)
4827 exportToDot("clonefuncassign");
4828
4829 if (MemProfReportHintedSizes)
4830 printTotalSizes(errs());
4831
4832 return Changed;
4833}
4834
4835bool MemProfContextDisambiguation::processModule(
4836 Module &M,
4837 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
4838
4839 // If we have an import summary, then the cloning decisions were made during
4840 // the thin link on the index. Apply them and return.
4841 if (ImportSummary)
4842 return applyImport(M);
4843
4844 // TODO: If/when other types of memprof cloning are enabled beyond just for
4845 // hot and cold, we will need to change this to individually control the
4846 // AllocationType passed to addStackNodesForMIB during CCG construction.
4847 // Note that we specifically check this after applying imports above, so that
4848 // the option isn't needed to be passed to distributed ThinLTO backend
4849 // clang processes, which won't necessarily have visibility into the linker
4850 // dependences. Instead the information is communicated from the LTO link to
4851 // the backends via the combined summary index.
4852 if (!SupportsHotColdNew)
4853 return false;
4854
4855 ModuleCallsiteContextGraph CCG(M, OREGetter);
4856 return CCG.process();
4857}
4858
4859MemProfContextDisambiguation::MemProfContextDisambiguation(
4860 const ModuleSummaryIndex *Summary, bool isSamplePGO)
4861 : ImportSummary(Summary), isSamplePGO(isSamplePGO) {
4862 if (ImportSummary) {
4863 // The MemProfImportSummary should only be used for testing ThinLTO
4864 // distributed backend handling via opt, in which case we don't have a
4865 // summary from the pass pipeline.
4866 assert(MemProfImportSummary.empty());
4867 return;
4868 }
4869 if (MemProfImportSummary.empty())
4870 return;
4871
4872 auto ReadSummaryFile =
4873 errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
4874 if (!ReadSummaryFile) {
4875 logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
4876 "Error loading file '" + MemProfImportSummary +
4877 "': ");
4878 return;
4879 }
4880 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
4881 if (!ImportSummaryForTestingOrErr) {
4882 logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
4883 "Error parsing file '" + MemProfImportSummary +
4884 "': ");
4885 return;
4886 }
4887 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
4888 ImportSummary = ImportSummaryForTesting.get();
4889}
4890
4891PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
4892 ModuleAnalysisManager &AM) {
4893 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
4894 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
4895 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
4896 };
4897 if (!processModule(M, OREGetter))
4898 return PreservedAnalyses::all();
4899 return PreservedAnalyses::none();
4900}
4901
4902void MemProfContextDisambiguation::run(
4903 ModuleSummaryIndex &Index,
4904 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
4905 isPrevailing) {
4906 // TODO: If/when other types of memprof cloning are enabled beyond just for
4907 // hot and cold, we will need to change this to individually control the
4908 // AllocationType passed to addStackNodesForMIB during CCG construction.
4909 // The index was set from the option, so these should be in sync.
4910 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
4911 if (!SupportsHotColdNew)
4912 return;
4913
4914 IndexCallsiteContextGraph CCG(Index, isPrevailing);
4915 CCG.process();
4916}