LLVM 19.0.0git
MemProfContextDisambiguation.cpp
1//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements support for context disambiguation of allocation
10// calls for profile guided heap optimization. Specifically, it uses Memprof
11// profiles which indicate context specific allocation behavior (currently
12// distinguishing cold vs hot memory allocations). Cloning is performed to
13// expose the cold allocation call contexts, and the allocation calls are
14// subsequently annotated with an attribute for later transformation.
15//
16// The transformations can be performed either directly on IR (regular LTO), or
17// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18// Both types of LTO operate on the same base graph representation, which
19// uses CRTP to support either IR or Index formats.
20//
21//===----------------------------------------------------------------------===//
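// For example (illustrative only), an allocation call carrying MemProf
// metadata has roughly the form:
//   %call = call ptr @_Znam(i64 10), !memprof !M, !callsite !C
// where !M holds one MIB per profiled context (a list of stack frame ids plus
// an allocation type string such as "notcold" or "cold"), and !C holds the
// stack ids of the allocation call itself (reflecting any inlining). After
// disambiguation, each (possibly cloned) allocation call is annotated with a
// "memprof" string attribute (e.g. "memprof"="cold") for later transformation.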
22
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/ADT/DenseSet.h"
26#include "llvm/ADT/MapVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
36#include "llvm/IR/Constants.h"
38#include "llvm/IR/Module.h"
40#include "llvm/Pass.h"
45#include "llvm/Transforms/IPO.h"
47#include <deque>
48#include <sstream>
49#include <unordered_map>
50#include <vector>
51using namespace llvm;
52using namespace llvm::memprof;
53
54#define DEBUG_TYPE "memprof-context-disambiguation"
55
56STATISTIC(FunctionClonesAnalysis,
57 "Number of function clones created during whole program analysis");
58STATISTIC(FunctionClonesThinBackend,
59 "Number of function clones created during ThinLTO backend");
60STATISTIC(FunctionsClonedThinBackend,
61 "Number of functions that had clones created during ThinLTO backend");
62STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
63 "cloned) during whole program analysis");
64STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
65 "during whole program analysis");
66STATISTIC(AllocTypeNotColdThinBackend,
67 "Number of not cold static allocations (possibly cloned) during "
68 "ThinLTO backend");
69STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
70 "(possibly cloned) during ThinLTO backend");
71STATISTIC(OrigAllocsThinBackend,
72 "Number of original (not cloned) allocations with memprof profiles "
73 "during ThinLTO backend");
74STATISTIC(
75 AllocVersionsThinBackend,
76 "Number of allocation versions (including clones) during ThinLTO backend");
77STATISTIC(MaxAllocVersionsThinBackend,
78 "Maximum number of allocation versions created for an original "
79 "allocation during ThinLTO backend");
80STATISTIC(UnclonableAllocsThinBackend,
81 "Number of unclonable ambigous allocations during ThinLTO backend");
82STATISTIC(RemovedEdgesWithMismatchedCallees,
83 "Number of edges removed due to mismatched callees (profiled vs IR)");
84STATISTIC(FoundProfiledCalleeCount,
85 "Number of profiled callees found via tail calls");
86STATISTIC(FoundProfiledCalleeDepth,
87 "Aggregate depth of profiled callees found via tail calls");
88STATISTIC(FoundProfiledCalleeMaxDepth,
89 "Maximum depth of profiled callees found via tail calls");
90STATISTIC(FoundProfiledCalleeNonUniquelyCount,
91 "Number of profiled callees found via multiple tail call chains");
92
94 "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
95 cl::value_desc("filename"),
96 cl::desc("Specify the path prefix of the MemProf dot files."));
97
98static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
99 cl::Hidden,
100 cl::desc("Export graph to dot files."));
101
102static cl::opt<bool>
103 DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
104 cl::desc("Dump CallingContextGraph to stdout after each stage."));
105
106static cl::opt<bool>
107 VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
108 cl::desc("Perform verification checks on CallingContextGraph."));
109
110static cl::opt<bool>
111 VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
112 cl::desc("Perform frequent verification checks on nodes."));
113
115 "memprof-import-summary",
116 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
117 cl::Hidden);
118
119static cl::opt<unsigned>
120 TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
121 cl::Hidden,
122 cl::desc("Max depth to recursively search for missing "
123 "frames through tail calls."));
124
125namespace llvm {
127 "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
128 cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
129
130// Indicate we are linking with an allocator that supports hot/cold operator
131// new interfaces.
133 "supports-hot-cold-new", cl::init(false), cl::Hidden,
134 cl::desc("Linking with hot/cold operator new interfaces"));
135} // namespace llvm
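// For example (illustrative), the whole-program analysis can be exercised
// directly from a test via opt:
//   opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
//       -memprof-dump-ccg input.ll -S
// and the ThinLTO backend path can be driven similarly using the
// -memprof-import-summary option declared above.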
136
137namespace {
138/// CRTP base for graphs built from either IR or ThinLTO summary index.
139///
140/// The graph represents the call contexts in all memprof metadata on allocation
141/// calls, with nodes for the allocations themselves, as well as for the calls
142/// in each context. The graph is initially built from the allocation memprof
143/// metadata (or summary) MIBs. It is then updated to match calls with callsite
144/// metadata onto the nodes, updating it to reflect any inlining performed on
145/// those calls.
146///
147/// Each MIB (representing an allocation's call context with allocation
148/// behavior) is assigned a unique context id during the graph build. The edges
149/// and nodes in the graph are decorated with the context ids they carry. This
150/// is used to correctly update the graph when cloning is performed so that we
151/// can uniquify the context for a single (possibly cloned) allocation.
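///
/// For example (illustrative): if an allocation in function C is reached via
/// contexts main -> A -> C (cold) and main -> B -> C (notcold), its two MIBs
/// are assigned context ids 1 and 2. The graph then has an allocation node
/// for the call in C with caller edges to stack nodes for the callsites in A
/// and B (carrying ids {1} and {2} respectively), each of which has a caller
/// edge to the node for the corresponding callsite in main. Cloning C for the
/// cold context simply moves the edges carrying id 1 onto the clone.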
152template <typename DerivedCCG, typename FuncTy, typename CallTy>
153class CallsiteContextGraph {
154public:
155 CallsiteContextGraph() = default;
156 CallsiteContextGraph(const CallsiteContextGraph &) = default;
157 CallsiteContextGraph(CallsiteContextGraph &&) = default;
158
159 /// Main entry point to perform analysis and transformations on graph.
160 bool process();
161
162 /// Perform cloning on the graph necessary to uniquely identify the allocation
163 /// behavior of an allocation based on its context.
164 void identifyClones();
165
166 /// Assign callsite clones to functions, cloning functions as needed to
167 /// accommodate the combinations of their callsite clones reached by callers.
168 /// For regular LTO this clones functions and callsites in the IR, but for
169 /// ThinLTO the cloning decisions are noted in the summaries and later applied
170 /// in applyImport.
171 bool assignFunctions();
172
173 void dump() const;
174 void print(raw_ostream &OS) const;
175
176 friend raw_ostream &operator<<(raw_ostream &OS,
177 const CallsiteContextGraph &CCG) {
178 CCG.print(OS);
179 return OS;
180 }
181
182 friend struct GraphTraits<
183 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
184 friend struct DOTGraphTraits<
185 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
186
187 void exportToDot(std::string Label) const;
188
189 /// Represents a function clone via FuncTy pointer and clone number pair.
190 struct FuncInfo final
191 : public std::pair<FuncTy *, unsigned /*Clone number*/> {
192 using Base = std::pair<FuncTy *, unsigned>;
193 FuncInfo(const Base &B) : Base(B) {}
194 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
195 explicit operator bool() const { return this->first != nullptr; }
196 FuncTy *func() const { return this->first; }
197 unsigned cloneNo() const { return this->second; }
198 };
199
200 /// Represents a callsite clone via CallTy and clone number pair.
201 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
202 using Base = std::pair<CallTy, unsigned>;
203 CallInfo(const Base &B) : Base(B) {}
204 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
205 : Base(Call, CloneNo) {}
206 explicit operator bool() const { return (bool)this->first; }
207 CallTy call() const { return this->first; }
208 unsigned cloneNo() const { return this->second; }
209 void setCloneNo(unsigned N) { this->second = N; }
210 void print(raw_ostream &OS) const {
211 if (!operator bool()) {
212 assert(!cloneNo());
213 OS << "null Call";
214 return;
215 }
216 call()->print(OS);
217 OS << "\t(clone " << cloneNo() << ")";
218 }
219 void dump() const {
220 print(dbgs());
221 dbgs() << "\n";
222 }
223 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
224 Call.print(OS);
225 return OS;
226 }
227 };
228
229 struct ContextEdge;
230
231 /// Node in the Callsite Context Graph
232 struct ContextNode {
233 // Keep this for now since in the IR case where we have an Instruction* it
234 // is not as immediately discoverable. Used for printing richer information
235 // when dumping graph.
236 bool IsAllocation;
237
238 // Keeps track of when the Call was reset to null because there was
239 // recursion.
240 bool Recursive = false;
241
242 // The corresponding allocation or interior call.
243 CallInfo Call;
244
245 // For alloc nodes this is a unique id assigned when constructed, and for
246 // callsite stack nodes it is the original stack id when the node is
247 // constructed from the memprof MIB metadata on the alloc nodes. Note that
248 // this is only used when matching callsite metadata onto the stack nodes
249 // created when processing the allocation memprof MIBs, and for labeling
250 // nodes in the dot graph. Therefore we don't bother to assign a value for
251 // clones.
252 uint64_t OrigStackOrAllocId = 0;
253
254 // This will be formed by ORing together the AllocationType enum values
255 // for contexts including this node.
256 uint8_t AllocTypes = 0;
257
258 // Edges to all callees in the profiled call stacks.
259 // TODO: Should this be a map (from Callee node) for more efficient lookup?
260 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
261
262 // Edges to all callers in the profiled call stacks.
263 // TODO: Should this be a map (from Caller node) for more efficient lookup?
264 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
265
266 // Get the list of edges from which we can compute allocation information
267 // such as the context ids and allocation type of this node.
268 const std::vector<std::shared_ptr<ContextEdge>> *
269 getEdgesWithAllocInfo() const {
270 // If node has any callees, compute from those, otherwise compute from
271 // callers (i.e. if this is the leaf allocation node).
272 if (!CalleeEdges.empty())
273 return &CalleeEdges;
274 if (!CallerEdges.empty()) {
275 // A node with caller edges but no callee edges must be the allocation
276 // node.
277 assert(IsAllocation);
278 return &CallerEdges;
279 }
280 return nullptr;
281 }
282
283 // Compute the context ids for this node from the union of its edge context
284 // ids.
285 DenseSet<uint32_t> getContextIds() const {
286 DenseSet<uint32_t> ContextIds;
287 auto *Edges = getEdgesWithAllocInfo();
288 if (!Edges)
289 return {};
290 unsigned Count = 0;
291 for (auto &Edge : *Edges)
292 Count += Edge->getContextIds().size();
293 ContextIds.reserve(Count);
294 for (auto &Edge : *Edges)
295 ContextIds.insert(Edge->getContextIds().begin(),
296 Edge->getContextIds().end());
297 return ContextIds;
298 }
299
300 // Compute the allocation type for this node from the OR of its edge
301 // allocation types.
302 uint8_t computeAllocType() const {
303 auto *Edges = getEdgesWithAllocInfo();
304 if (!Edges)
305 return (uint8_t)AllocationType::None;
306 uint8_t BothTypes =
307 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
308 uint8_t AllocType = (uint8_t)AllocationType::None;
309 for (auto &Edge : *Edges) {
310 AllocType |= Edge->AllocTypes;
311 // Bail early if alloc type reached both, no further refinement.
312 if (AllocType == BothTypes)
313 return AllocType;
314 }
315 return AllocType;
316 }
317
318 // The context ids set for this node is empty if its edge context ids are
319 // also all empty.
320 bool emptyContextIds() const {
321 auto *Edges = getEdgesWithAllocInfo();
322 if (!Edges)
323 return true;
324 for (auto &Edge : *Edges) {
325 if (!Edge->getContextIds().empty())
326 return false;
327 }
328 return true;
329 }
330
331 // List of clones of this ContextNode, initially empty.
332 std::vector<ContextNode *> Clones;
333
334 // If a clone, points to the original uncloned node.
335 ContextNode *CloneOf = nullptr;
336
337 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
338
339 ContextNode(bool IsAllocation, CallInfo C)
340 : IsAllocation(IsAllocation), Call(C) {}
341
342 void addClone(ContextNode *Clone) {
343 if (CloneOf) {
344 CloneOf->Clones.push_back(Clone);
345 Clone->CloneOf = CloneOf;
346 } else {
347 Clones.push_back(Clone);
348 assert(!Clone->CloneOf);
349 Clone->CloneOf = this;
350 }
351 }
352
353 ContextNode *getOrigNode() {
354 if (!CloneOf)
355 return this;
356 return CloneOf;
357 }
358
359 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
360 unsigned int ContextId);
361
362 ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
363 ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
364 void eraseCalleeEdge(const ContextEdge *Edge);
365 void eraseCallerEdge(const ContextEdge *Edge);
366
367 void setCall(CallInfo C) { Call = C; }
368
369 bool hasCall() const { return (bool)Call.call(); }
370
371 void printCall(raw_ostream &OS) const { Call.print(OS); }
372
373 // True if this node was effectively removed from the graph, in which case
374 // it should have an allocation type of None and empty context ids.
375 bool isRemoved() const {
376 assert((AllocTypes == (uint8_t)AllocationType::None) ==
377 emptyContextIds());
378 return AllocTypes == (uint8_t)AllocationType::None;
379 }
380
381 void dump() const;
382 void print(raw_ostream &OS) const;
383
384 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
385 Node.print(OS);
386 return OS;
387 }
388 };
389
390 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
391 /// callee.
392 struct ContextEdge {
393 ContextNode *Callee;
394 ContextNode *Caller;
395
396 // This will be formed by ORing together the AllocationType enum values
397 // for contexts including this edge.
398 uint8_t AllocTypes = 0;
399
400 // The set of IDs for contexts including this edge.
401 DenseSet<uint32_t> ContextIds;
402
403 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
404 DenseSet<uint32_t> ContextIds)
405 : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
406 ContextIds(std::move(ContextIds)) {}
407
408 DenseSet<uint32_t> &getContextIds() { return ContextIds; }
409
410 void dump() const;
411 void print(raw_ostream &OS) const;
412
413 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
414 Edge.print(OS);
415 return OS;
416 }
417 };
418
419 /// Helpers to remove callee edges that have allocation type None (due to not
420 /// carrying any context ids) after transformations.
421 void removeNoneTypeCalleeEdges(ContextNode *Node);
422 void
423 recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
424 DenseSet<const ContextNode *> &Visited);
425
426protected:
427 /// Get a list of nodes corresponding to the stack ids in the given callsite
428 /// context.
429 template <class NodeT, class IteratorT>
430 std::vector<uint64_t>
431 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
432
433 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
434 /// metadata (or summary).
435 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
436
437 /// Adds nodes for the given MIB stack ids.
438 template <class NodeT, class IteratorT>
439 void addStackNodesForMIB(ContextNode *AllocNode,
440 CallStack<NodeT, IteratorT> &StackContext,
441 CallStack<NodeT, IteratorT> &CallsiteContext,
442 AllocationType AllocType);
443
444 /// Matches all callsite metadata (or summary) to the nodes created for
445 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
446 /// inlining performed on those callsite instructions.
447 void updateStackNodes();
448
449 /// Update graph to conservatively handle any callsite stack nodes that target
450 /// multiple different callee target functions.
451 void handleCallsitesWithMultipleTargets();
452
453 /// Save lists of calls with MemProf metadata in each function, for faster
454 /// iteration.
455 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
456
457 /// Map from callsite node to the enclosing caller function.
458 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
459
460private:
461 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
462
463 using CallContextInfo = std::tuple<CallTy, std::vector<uint64_t>,
464 const FuncTy *, DenseSet<uint32_t>>;
465
466 /// Assigns the given Node to calls at or inlined into the location with
467 /// the Node's stack id, after post order traversing and processing its
468 /// caller nodes. Uses the call information recorded in the given
469 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
470 /// as needed. Called by updateStackNodes which sets up the given
471 /// StackIdToMatchingCalls map.
472 void assignStackNodesPostOrder(
473 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
474 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls);
475
476 /// Duplicates the given set of context ids, updating the provided
477 /// map from each original id with the newly generated context ids,
478 /// and returning the new duplicated id set.
479 DenseSet<uint32_t> duplicateContextIds(
480 const DenseSet<uint32_t> &StackSequenceContextIds,
481 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
482
483 /// Propagates all duplicated context ids across the graph.
484 void propagateDuplicateContextIds(
485 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
486
487 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
488 /// else to its callers. Also updates OrigNode's edges to remove any context
489 /// ids moved to the newly created edge.
490 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
491 bool TowardsCallee,
492 DenseSet<uint32_t> RemainingContextIds);
493
494 /// Get the stack id corresponding to the given Id or Index (for IR this will
495 /// return itself, for a summary index this will return the id recorded in the
496 /// index for that stack id index value).
497 uint64_t getStackId(uint64_t IdOrIndex) const {
498 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
499 }
500
501 /// Returns true if the given call targets the callee of the given edge, or if
502 /// we were able to identify the call chain through intermediate tail calls.
503 /// In the latter case new context nodes are added to the graph for the
504 /// identified tail calls, and their synthesized nodes are added to
505 /// TailCallToContextNodeMap. The EdgeIter is updated in either case to the
506 /// next element after the input position (either incremented or updated after
507 /// removing the old edge).
508 bool
509 calleesMatch(CallTy Call, EdgeIter &EI,
510 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
511
512 /// Returns true if the given call targets the given function, or if we were
513 /// able to identify the call chain through intermediate tail calls (in which
514 /// case FoundCalleeChain will be populated).
515 bool calleeMatchesFunc(
516 CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
517 std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
518 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
519 Call, Func, CallerFunc, FoundCalleeChain);
520 }
521
522 /// Get a list of nodes corresponding to the stack ids in the given
523 /// callsite's context.
524 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
525 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
526 Call);
527 }
528
529 /// Get the last stack id in the context for callsite.
530 uint64_t getLastStackId(CallTy Call) {
531 return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
532 }
533
534 /// Update the allocation call to record type of allocated memory.
535 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
536 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
537 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
538 }
539
540 /// Update non-allocation call to invoke (possibly cloned) function
541 /// CalleeFunc.
542 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
543 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
544 }
545
546 /// Clone the given function for the given callsite, recording mapping of all
547 /// of the function's tracked calls to their new versions in the CallMap.
548 /// Assigns new clones to clone number CloneNo.
549 FuncInfo cloneFunctionForCallsite(
550 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
551 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
552 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
553 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
554 }
555
556 /// Gets a label to use in the dot graph for the given call clone in the given
557 /// function.
558 std::string getLabel(const FuncTy *Func, const CallTy Call,
559 unsigned CloneNo) const {
560 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
561 }
562
563 /// Helpers to find the node corresponding to the given call or stackid.
564 ContextNode *getNodeForInst(const CallInfo &C);
565 ContextNode *getNodeForAlloc(const CallInfo &C);
566 ContextNode *getNodeForStackId(uint64_t StackId);
567
568 /// Computes the alloc type corresponding to the given context ids, by
569 /// unioning their recorded alloc types.
570 uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds);
571
572 /// Returns the allocation type of the intersection of the contexts of two
573 /// nodes (based on their provided context id sets), optimized for the case
574 /// when Node1Ids is smaller than Node2Ids.
575 uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
576 const DenseSet<uint32_t> &Node2Ids);
577
578 /// Returns the allocation type of the intersection of the contexts of two
579 /// nodes (based on their provided context id sets).
580 uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
581 const DenseSet<uint32_t> &Node2Ids);
582
583 /// Create a clone of Edge's callee and move Edge to that new callee node,
584 /// performing the necessary context id and allocation type updates.
585 /// If callee's caller edge iterator is supplied, it is updated when removing
586 /// the edge from that list. If ContextIdsToMove is non-empty, only that
587 /// subset of Edge's ids are moved to an edge to the new callee.
588 ContextNode *
589 moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
590 EdgeIter *CallerEdgeI = nullptr,
591 DenseSet<uint32_t> ContextIdsToMove = {});
592
593 /// Change the callee of Edge to existing callee clone NewCallee, performing
594 /// the necessary context id and allocation type updates.
595 /// If callee's caller edge iterator is supplied, it is updated when removing
596 /// the edge from that list. If ContextIdsToMove is non-empty, only that
597 /// subset of Edge's ids are moved to an edge to the new callee.
598 void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
599 ContextNode *NewCallee,
600 EdgeIter *CallerEdgeI = nullptr,
601 bool NewClone = false,
602 DenseSet<uint32_t> ContextIdsToMove = {});
603
604 /// Recursively perform cloning on the graph for the given Node and its
605 /// callers, in order to uniquely identify the allocation behavior of an
606 /// allocation given its context. The context ids of the allocation being
607 /// processed are given in AllocContextIds.
608 void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
609 const DenseSet<uint32_t> &AllocContextIds);
610
611 /// Map from each context ID to the AllocationType assigned to that context.
612 DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
613
614 /// Identifies the context node created for a stack id when adding the MIB
615 /// contexts to the graph. This is used to locate the context nodes when
616 /// trying to assign the corresponding callsites with those stack ids to these
617 /// nodes.
618 DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
619
620 /// Maps to track the calls to their corresponding nodes in the graph.
621 MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
622 MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
623
624 /// Owner of all ContextNode unique_ptrs.
625 std::vector<std::unique_ptr<ContextNode>> NodeOwner;
626
627 /// Perform sanity checks on graph when requested.
628 void check() const;
629
630 /// Keeps track of the last unique context id assigned.
631 unsigned int LastContextId = 0;
632};
633
634template <typename DerivedCCG, typename FuncTy, typename CallTy>
635using ContextNode =
636 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
637template <typename DerivedCCG, typename FuncTy, typename CallTy>
638using ContextEdge =
639 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
640template <typename DerivedCCG, typename FuncTy, typename CallTy>
641using FuncInfo =
642 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
643template <typename DerivedCCG, typename FuncTy, typename CallTy>
644using CallInfo =
645 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
646
647/// CRTP derived class for graphs built from IR (regular LTO).
648class ModuleCallsiteContextGraph
649 : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
650 Instruction *> {
651public:
652 ModuleCallsiteContextGraph(
653 Module &M,
654 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
655
656private:
657 friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
658 Instruction *>;
659
660 uint64_t getStackId(uint64_t IdOrIndex) const;
661 bool calleeMatchesFunc(
662 Instruction *Call, const Function *Func, const Function *CallerFunc,
663 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
664 bool findProfiledCalleeThroughTailCalls(
665 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
666 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
667 bool &FoundMultipleCalleeChains);
668 uint64_t getLastStackId(Instruction *Call);
669 std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
670 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
671 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
672 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
673 Instruction *>::FuncInfo
674 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
675 std::map<CallInfo, CallInfo> &CallMap,
676 std::vector<CallInfo> &CallsWithMetadataInFunc,
677 unsigned CloneNo);
678 std::string getLabel(const Function *Func, const Instruction *Call,
679 unsigned CloneNo) const;
680
681 const Module &Mod;
682 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
683};
684
685/// Represents a call in the summary index graph, which can either be an
686/// allocation or an interior callsite node in an allocation's context.
687/// Holds a pointer to the corresponding data structure in the index.
688struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
689 IndexCall() : PointerUnion() {}
690 IndexCall(std::nullptr_t) : IndexCall() {}
691 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
692 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
693 IndexCall(PointerUnion PT) : PointerUnion(PT) {}
694
695 IndexCall *operator->() { return this; }
696
697 PointerUnion<CallsiteInfo *, AllocInfo *> getBase() const { return *this; }
698
699 void print(raw_ostream &OS) const {
700 if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(getBase())) {
701 OS << *AI;
702 } else {
703 auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(getBase());
704 assert(CI);
705 OS << *CI;
706 }
707 }
708};
709
710/// CRTP derived class for graphs built from summary index (ThinLTO).
711class IndexCallsiteContextGraph
712 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
713 IndexCall> {
714public:
715 IndexCallsiteContextGraph(
716 ModuleSummaryIndex &Index,
717 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
718 isPrevailing);
719
720 ~IndexCallsiteContextGraph() {
721 // Now that we are done with the graph it is safe to add the new
722 // CallsiteInfo structs to the function summary vectors. The graph nodes
723 // point into locations within these vectors, so we don't want to add them
724 // any earlier.
725 for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
726 auto *FS = I.first;
727 for (auto &Callsite : I.second)
728 FS->addCallsite(*Callsite.second);
729 }
730 }
731
732private:
733 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
734 IndexCall>;
735
736 uint64_t getStackId(uint64_t IdOrIndex) const;
737 bool calleeMatchesFunc(
738 IndexCall &Call, const FunctionSummary *Func,
739 const FunctionSummary *CallerFunc,
740 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
741 bool findProfiledCalleeThroughTailCalls(
742 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
743 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
744 bool &FoundMultipleCalleeChains);
745 uint64_t getLastStackId(IndexCall &Call);
746 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
747 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
748 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
749 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
750 IndexCall>::FuncInfo
751 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
752 std::map<CallInfo, CallInfo> &CallMap,
753 std::vector<CallInfo> &CallsWithMetadataInFunc,
754 unsigned CloneNo);
755 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
756 unsigned CloneNo) const;
757
758 // Saves mapping from function summaries containing memprof records back to
759 // their VI, for use in checking and debugging.
760 std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
761
762 const ModuleSummaryIndex &Index;
763 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
764 isPrevailing;
765
766 // Saves/owns the callsite info structures synthesized for missing tail call
767 // frames that we discover while building the graph.
768 // It maps from the summary of the function making the tail call, to a map
769 // of callee ValueInfo to corresponding synthesized callsite info.
770 std::unordered_map<FunctionSummary *,
771 std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
772 FunctionCalleesToSynthesizedCallsiteInfos;
773};
774} // namespace
775
776namespace llvm {
777template <>
778struct DenseMapInfo<typename CallsiteContextGraph<
779 ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
780 : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
781template <>
782struct DenseMapInfo<typename CallsiteContextGraph<
783 IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
784 : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
785template <>
786struct DenseMapInfo<IndexCall>
787 : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
788} // end namespace llvm
789
790namespace {
791
792struct FieldSeparator {
793 bool Skip = true;
794 const char *Sep;
795
796 FieldSeparator(const char *Sep = ", ") : Sep(Sep) {}
797};
798
799raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) {
800 if (FS.Skip) {
801 FS.Skip = false;
802 return OS;
803 }
804 return OS << FS.Sep;
805}
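// Usage (illustrative): with FieldSeparator FS, streaming OS << FS << "a"
// << FS << "b" prints "a, b", since the first insertion of FS is skipped.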
806
807// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
808// type we should actually use on the corresponding allocation.
809// If we can't clone a node that has NotCold+Cold alloc type, we will fall
810// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
811// from NotCold.
812AllocationType allocTypeToUse(uint8_t AllocTypes) {
813 assert(AllocTypes != (uint8_t)AllocationType::None);
814 if (AllocTypes ==
815 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
816 return AllocationType::NotCold;
817 else
818 return (AllocationType)AllocTypes;
819}
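// E.g. allocTypeToUse((uint8_t)AllocationType::NotCold |
//                     (uint8_t)AllocationType::Cold) yields NotCold, while a
// pure Cold input is returned unchanged.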
820
821// Helper to check if the alloc types for all edges recorded in the
822// InAllocTypes vector match the alloc types for all edges in the Edges
823// vector.
824template <typename DerivedCCG, typename FuncTy, typename CallTy>
825bool allocTypesMatch(
826 const std::vector<uint8_t> &InAllocTypes,
827 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
828 &Edges) {
829 return std::equal(
830 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(),
831 [](const uint8_t &l,
832 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
833 // Can share if one of the edges is None type - don't
834 // care about the type along that edge as it doesn't
835 // exist for those context ids.
836 if (l == (uint8_t)AllocationType::None ||
837 r->AllocTypes == (uint8_t)AllocationType::None)
838 return true;
839 return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
840 });
841}
842
843} // end anonymous namespace
844
845template <typename DerivedCCG, typename FuncTy, typename CallTy>
846typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
847CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
848 const CallInfo &C) {
849 ContextNode *Node = getNodeForAlloc(C);
850 if (Node)
851 return Node;
852
853 return NonAllocationCallToContextNodeMap.lookup(C);
854}
855
856template <typename DerivedCCG, typename FuncTy, typename CallTy>
857typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
858CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
859 const CallInfo &C) {
860 return AllocationCallToContextNodeMap.lookup(C);
861}
862
863template <typename DerivedCCG, typename FuncTy, typename CallTy>
864typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
865CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
866 uint64_t StackId) {
867 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
868 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
869 return StackEntryNode->second;
870 return nullptr;
871}
872
873template <typename DerivedCCG, typename FuncTy, typename CallTy>
874void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
875 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
876 unsigned int ContextId) {
877 for (auto &Edge : CallerEdges) {
878 if (Edge->Caller == Caller) {
879 Edge->AllocTypes |= (uint8_t)AllocType;
880 Edge->getContextIds().insert(ContextId);
881 return;
882 }
883 }
884 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
885 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
886 CallerEdges.push_back(Edge);
887 Caller->CalleeEdges.push_back(Edge);
888}
889
890template <typename DerivedCCG, typename FuncTy, typename CallTy>
891void CallsiteContextGraph<
892 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
893 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
894 auto Edge = *EI;
895 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
896 assert(Edge->ContextIds.empty());
897 Edge->Callee->eraseCallerEdge(Edge.get());
898 EI = Node->CalleeEdges.erase(EI);
899 } else
900 ++EI;
901 }
902}
903
904template <typename DerivedCCG, typename FuncTy, typename CallTy>
905typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
906CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
907 findEdgeFromCallee(const ContextNode *Callee) {
908 for (const auto &Edge : CalleeEdges)
909 if (Edge->Callee == Callee)
910 return Edge.get();
911 return nullptr;
912}
913
914template <typename DerivedCCG, typename FuncTy, typename CallTy>
915typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
916CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
917 findEdgeFromCaller(const ContextNode *Caller) {
918 for (const auto &Edge : CallerEdges)
919 if (Edge->Caller == Caller)
920 return Edge.get();
921 return nullptr;
922}
923
924template <typename DerivedCCG, typename FuncTy, typename CallTy>
925void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
926 eraseCalleeEdge(const ContextEdge *Edge) {
927 auto EI = llvm::find_if(
928 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
929 return CalleeEdge.get() == Edge;
930 });
931 assert(EI != CalleeEdges.end());
932 CalleeEdges.erase(EI);
933}
934
935template <typename DerivedCCG, typename FuncTy, typename CallTy>
936void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
937 eraseCallerEdge(const ContextEdge *Edge) {
938 auto EI = llvm::find_if(
939 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
940 return CallerEdge.get() == Edge;
941 });
942 assert(EI != CallerEdges.end());
943 CallerEdges.erase(EI);
944}
945
946template <typename DerivedCCG, typename FuncTy, typename CallTy>
947uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
948 DenseSet<uint32_t> &ContextIds) {
949 uint8_t BothTypes =
950 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
951 uint8_t AllocType = (uint8_t)AllocationType::None;
952 for (auto Id : ContextIds) {
953 AllocType |= (uint8_t)ContextIdToAllocationType[Id];
954 // Bail early if alloc type reached both, no further refinement.
955 if (AllocType == BothTypes)
956 return AllocType;
957 }
958 return AllocType;
959}
960
961template <typename DerivedCCG, typename FuncTy, typename CallTy>
962uint8_t
963CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
964 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
965 uint8_t BothTypes =
966 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
967 uint8_t AllocType = (uint8_t)AllocationType::None;
968 for (auto Id : Node1Ids) {
969 if (!Node2Ids.count(Id))
970 continue;
971 AllocType |= (uint8_t)ContextIdToAllocationType[Id];
972 // Bail early if alloc type reached both, no further refinement.
973 if (AllocType == BothTypes)
974 return AllocType;
975 }
976 return AllocType;
977}
978
979template <typename DerivedCCG, typename FuncTy, typename CallTy>
980uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
981 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
982 if (Node1Ids.size() < Node2Ids.size())
983 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
984 else
985 return intersectAllocTypesImpl(Node2Ids, Node1Ids);
986}
987
988template <typename DerivedCCG, typename FuncTy, typename CallTy>
989typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
990CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
991 CallInfo Call, const FuncTy *F) {
992 assert(!getNodeForAlloc(Call));
993 NodeOwner.push_back(
994 std::make_unique<ContextNode>(/*IsAllocation=*/true, Call));
995 ContextNode *AllocNode = NodeOwner.back().get();
996 AllocationCallToContextNodeMap[Call] = AllocNode;
997 NodeToCallingFunc[AllocNode] = F;
998 // Use LastContextId as a unique id for MIB allocation nodes.
999 AllocNode->OrigStackOrAllocId = LastContextId;
1000 // Alloc type should be updated as we add in the MIBs. We should assert
1001 // afterwards that it is not still None.
1002 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1003
1004 return AllocNode;
1005}
1006
1007template <typename DerivedCCG, typename FuncTy, typename CallTy>
1008template <class NodeT, class IteratorT>
1009void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
1010 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
1011 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType) {
1012 // Treat the hot alloc type as NotCold before the disambiguation for "hot"
1013 // is done.
1014 if (AllocType == AllocationType::Hot)
1015 AllocType = AllocationType::NotCold;
1016
1017 ContextIdToAllocationType[++LastContextId] = AllocType;
1018
1019 // Update alloc type and context ids for this MIB.
1020 AllocNode->AllocTypes |= (uint8_t)AllocType;
1021
1022 // Now add or update nodes for each stack id in alloc's context.
1023 // Later when processing the stack ids on non-alloc callsites we will adjust
1024 // for any inlining in the context.
1025 ContextNode *PrevNode = AllocNode;
1026 // Look for recursion (direct recursion should have been collapsed by
1027 // module summary analysis, here we should just be detecting mutual
1028 // recursion). Mark these nodes so we don't try to clone.
1029 SmallSet<uint64_t, 8> StackIdSet;
1030 // Skip any on the allocation call (inlining).
1031 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
1032 ContextIter != StackContext.end(); ++ContextIter) {
1033 auto StackId = getStackId(*ContextIter);
1034 ContextNode *StackNode = getNodeForStackId(StackId);
1035 if (!StackNode) {
1036 NodeOwner.push_back(
1037 std::make_unique<ContextNode>(/*IsAllocation=*/false));
1038 StackNode = NodeOwner.back().get();
1039 StackEntryIdToContextNodeMap[StackId] = StackNode;
1040 StackNode->OrigStackOrAllocId = StackId;
1041 }
1042 auto Ins = StackIdSet.insert(StackId);
1043 if (!Ins.second)
1044 StackNode->Recursive = true;
1045 StackNode->AllocTypes |= (uint8_t)AllocType;
1046 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1047 PrevNode = StackNode;
1048 }
1049}
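// For example (illustrative): for an MIB whose context is main -> A -> B,
// with the allocation call in B and new context id N, the loop above creates
// or reuses stack nodes for the callsite in A (which calls B) and the
// callsite in main (which calls A), adding caller edges
// alloc <- A's callsite <- main's callsite, each carrying id N and OR-ing in
// the MIB's allocation type. A stack id that repeats within one context marks
// its node Recursive, excluding it from cloning.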
1050
1051template <typename DerivedCCG, typename FuncTy, typename CallTy>
1052DenseSet<uint32_t>
1053CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1054 const DenseSet<uint32_t> &StackSequenceContextIds,
1055 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1056 DenseSet<uint32_t> NewContextIds;
1057 for (auto OldId : StackSequenceContextIds) {
1058 NewContextIds.insert(++LastContextId);
1059 OldToNewContextIds[OldId].insert(LastContextId);
1060 assert(ContextIdToAllocationType.count(OldId));
1061 // The new context has the same allocation type as original.
1062 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1063 }
1064 return NewContextIds;
1065}
1066
1067template <typename DerivedCCG, typename FuncTy, typename CallTy>
1068void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1069 propagateDuplicateContextIds(
1070 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1071 // Build a set of duplicated context ids corresponding to the input id set.
1072 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1073 DenseSet<uint32_t> NewIds;
1074 for (auto Id : ContextIds)
1075 if (auto NewId = OldToNewContextIds.find(Id);
1076 NewId != OldToNewContextIds.end())
1077 NewIds.insert(NewId->second.begin(), NewId->second.end());
1078 return NewIds;
1079 };
1080
1081 // Recursively update context ids sets along caller edges.
1082 auto UpdateCallers = [&](ContextNode *Node,
1083 DenseSet<const ContextEdge *> &Visited,
1084 auto &&UpdateCallers) -> void {
1085 for (const auto &Edge : Node->CallerEdges) {
1086 auto Inserted = Visited.insert(Edge.get());
1087 if (!Inserted.second)
1088 continue;
1089 ContextNode *NextNode = Edge->Caller;
1090 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1091 // Only need to recursively iterate to NextNode via this caller edge if
1092 // it resulted in any added ids to NextNode.
1093 if (!NewIdsToAdd.empty()) {
1094 Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
1095 UpdateCallers(NextNode, Visited, UpdateCallers);
1096 }
1097 }
1098 };
1099
1100 DenseSet<const ContextEdge *> Visited;
1101 for (auto &Entry : AllocationCallToContextNodeMap) {
1102 auto *Node = Entry.second;
1103 UpdateCallers(Node, Visited, UpdateCallers);
1104 }
1105}
1106
1107template <typename DerivedCCG, typename FuncTy, typename CallTy>
1108void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1109 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
1110 // This must be passed by value to make a copy since it will be adjusted
1111 // as ids are moved.
1112 DenseSet<uint32_t> RemainingContextIds) {
1113 auto &OrigEdges =
1114 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1115 // Increment iterator in loop so that we can remove edges as needed.
1116 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1117 auto Edge = *EI;
1118 // Remove any matching context ids from Edge, return set that were found and
1119 // removed, these are the new edge's context ids. Also update the remaining
1120 // (not found ids).
1121 DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds;
1122 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1123 NotFoundContextIds);
1124 RemainingContextIds.swap(NotFoundContextIds);
1125 // If no matching context ids for this edge, skip it.
1126 if (NewEdgeContextIds.empty()) {
1127 ++EI;
1128 continue;
1129 }
1130 if (TowardsCallee) {
1131 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1132 auto NewEdge = std::make_shared<ContextEdge>(
1133 Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
1134 NewNode->CalleeEdges.push_back(NewEdge);
1135 NewEdge->Callee->CallerEdges.push_back(NewEdge);
1136 } else {
1137 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1138 auto NewEdge = std::make_shared<ContextEdge>(
1139 NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
1140 NewNode->CallerEdges.push_back(NewEdge);
1141 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1142 }
1143 // Remove old edge if context ids empty.
1144 if (Edge->getContextIds().empty()) {
1145 if (TowardsCallee) {
1146 Edge->Callee->eraseCallerEdge(Edge.get());
1147 EI = OrigNode->CalleeEdges.erase(EI);
1148 } else {
1149 Edge->Caller->eraseCalleeEdge(Edge.get());
1150 EI = OrigNode->CallerEdges.erase(EI);
1151 }
1152 continue;
1153 }
1154 ++EI;
1155 }
1156}
1157
1158template <typename DerivedCCG, typename FuncTy, typename CallTy>
1159static void checkEdge(
1160 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1161 // Confirm that alloc type is not None and that we have at least one context
1162 // id.
1163 assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1164 assert(!Edge->ContextIds.empty());
1165}
1166
1167template <typename DerivedCCG, typename FuncTy, typename CallTy>
1168static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
1169 bool CheckEdges = true) {
1170 if (Node->isRemoved())
1171 return;
1172#ifndef NDEBUG
1173 // Compute node's context ids once for use in asserts.
1174 auto NodeContextIds = Node->getContextIds();
1175#endif
1176 // Node's context ids should be the union of both its callee and caller edge
1177 // context ids.
1178 if (Node->CallerEdges.size()) {
1179 DenseSet<uint32_t> CallerEdgeContextIds(
1180 Node->CallerEdges.front()->ContextIds);
1181 for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
1182 if (CheckEdges)
1183 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1184 set_union(CallerEdgeContextIds, Edge->ContextIds);
1185 }
1186 // Node can have more context ids than callers if some contexts terminate at
1187 // node and some are longer.
1188 assert(NodeContextIds == CallerEdgeContextIds ||
1189 set_is_subset(CallerEdgeContextIds, NodeContextIds));
1190 }
1191 if (Node->CalleeEdges.size()) {
1192 DenseSet<uint32_t> CalleeEdgeContextIds(
1193 Node->CalleeEdges.front()->ContextIds);
1194 for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
1195 if (CheckEdges)
1196 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1197 set_union(CalleeEdgeContextIds, Edge->getContextIds());
1198 }
1199 assert(NodeContextIds == CalleeEdgeContextIds);
1200 }
1201}
1202
1203template <typename DerivedCCG, typename FuncTy, typename CallTy>
1204void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1205 assignStackNodesPostOrder(ContextNode *Node,
1206 DenseSet<const ContextNode *> &Visited,
1207 DenseMap<uint64_t, std::vector<CallContextInfo>>
1208 &StackIdToMatchingCalls) {
1209 auto Inserted = Visited.insert(Node);
1210 if (!Inserted.second)
1211 return;
1212 // Post order traversal. Iterate over a copy since we may add nodes and
1213 // therefore new callers during the recursive call, invalidating any
1214 // iterator over the original edge vector. We don't need to process these
1215 // new nodes as they were already processed on creation.
1216 auto CallerEdges = Node->CallerEdges;
1217 for (auto &Edge : CallerEdges) {
1218 // Skip any that have been removed during the recursion.
1219 if (!Edge)
1220 continue;
1221 assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls);
1222 }
1223
1224 // If this node's stack id is in the map, update the graph to contain new
1225 // nodes representing any inlining at interior callsites. Note we move the
1226 // associated context ids over to the new nodes.
1227
1228 // Ignore this node if it is for an allocation or we didn't record any
1229 // stack id lists ending at it.
1230 if (Node->IsAllocation ||
1231 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1232 return;
1233
1234 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1235 // Handle the simple case first. A single call with a single stack id.
1236 // In this case there is no need to create any new context nodes, simply
1237 // assign the context node for stack id to this Call.
1238 if (Calls.size() == 1) {
1239 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1240 if (Ids.size() == 1) {
1241 assert(SavedContextIds.empty());
1242 // It should be this Node
1243 assert(Node == getNodeForStackId(Ids[0]));
1244 if (Node->Recursive)
1245 return;
1246 Node->setCall(Call);
1247 NonAllocationCallToContextNodeMap[Call] = Node;
1248 NodeToCallingFunc[Node] = Func;
1249 return;
1250 }
1251 }
1252
1253 // Find the node for the last stack id, which should be the same
1254 // across all calls recorded for this id, and is this node's id.
1255 uint64_t LastId = Node->OrigStackOrAllocId;
1256 ContextNode *LastNode = getNodeForStackId(LastId);
1257 // We should only have kept stack ids that had nodes.
1258 assert(LastNode);
1259
1260 for (unsigned I = 0; I < Calls.size(); I++) {
1261 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1262 // Skip any for which we didn't assign any ids, these don't get a node in
1263 // the graph.
1264 if (SavedContextIds.empty())
1265 continue;
1266
1267 assert(LastId == Ids.back());
1268
1269 ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1270 assert(FirstNode);
1271
1272 // Recompute the context ids for this stack id sequence (the
1273 // intersection of the context ids of the corresponding nodes).
1274 // Start with the ids we saved in the map for this call, which could be
1275 // duplicated context ids. We have to recompute as we might have overlap
1276 // between the saved context ids for different last nodes, and
1277 // removed them already during the post order traversal.
1278 set_intersect(SavedContextIds, FirstNode->getContextIds());
1279 ContextNode *PrevNode = nullptr;
1280 for (auto Id : Ids) {
1281 ContextNode *CurNode = getNodeForStackId(Id);
1282 // We should only have kept stack ids that had nodes and weren't
1283 // recursive.
1284 assert(CurNode);
1285 assert(!CurNode->Recursive);
1286 if (!PrevNode) {
1287 PrevNode = CurNode;
1288 continue;
1289 }
1290 auto *Edge = CurNode->findEdgeFromCallee(PrevNode);
1291 if (!Edge) {
1292 SavedContextIds.clear();
1293 break;
1294 }
1295 PrevNode = CurNode;
1296 set_intersect(SavedContextIds, Edge->getContextIds());
1297
1298 // If we now have no context ids for clone, skip this call.
1299 if (SavedContextIds.empty())
1300 break;
1301 }
1302 if (SavedContextIds.empty())
1303 continue;
1304
1305 // Create new context node.
1306 NodeOwner.push_back(
1307 std::make_unique<ContextNode>(/*IsAllocation=*/false, Call));
1308 ContextNode *NewNode = NodeOwner.back().get();
1309 NodeToCallingFunc[NewNode] = Func;
1310 NonAllocationCallToContextNodeMap[Call] = NewNode;
1311 NewNode->AllocTypes = computeAllocType(SavedContextIds);
1312
1313 // Connect to callees of innermost stack frame in inlined call chain.
1314 // This updates context ids for FirstNode's callees to reflect those
1315 // moved to NewNode.
1316 connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);
1317
1318 // Connect to callers of outermost stack frame in inlined call chain.
1319 // This updates context ids for LastNode's callers to reflect those
1320 // moved to NewNode.
1321 connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
1322
1323 // Now we need to remove context ids from edges/nodes between First and
1324 // Last Node.
1325 PrevNode = nullptr;
1326 for (auto Id : Ids) {
1327 ContextNode *CurNode = getNodeForStackId(Id);
1328 // We should only have kept stack ids that had nodes.
1329 assert(CurNode);
1330
1331 // Remove the context ids moved to NewNode from CurNode, and the
1332 // edge from the prior node.
1333 if (PrevNode) {
1334 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1335 assert(PrevEdge);
1336 set_subtract(PrevEdge->getContextIds(), SavedContextIds);
1337 if (PrevEdge->getContextIds().empty()) {
1338 PrevNode->eraseCallerEdge(PrevEdge);
1339 CurNode->eraseCalleeEdge(PrevEdge);
1340 }
1341 }
1342 // Since we update the edges from leaf to tail, only look at the callee
1343 // edges. This isn't an alloc node, so if there are no callee edges, the
1344 // alloc type is None.
1345 CurNode->AllocTypes = CurNode->CalleeEdges.empty()
1346 ? (uint8_t)AllocationType::None
1347 : CurNode->computeAllocType();
1348 PrevNode = CurNode;
1349 }
1350 if (VerifyNodes) {
1351 checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
1352 for (auto Id : Ids) {
1353 ContextNode *CurNode = getNodeForStackId(Id);
1354 // We should only have kept stack ids that had nodes.
1355 assert(CurNode);
1356 checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
1357 }
1358 }
1359 }
1360}
1361
1362template <typename DerivedCCG, typename FuncTy, typename CallTy>
1363void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
1364 // Map of stack id to all calls with that as the last (outermost caller)
1365 // callsite id that has a context node (some might not due to pruning
1366 // performed during matching of the allocation profile contexts).
1367 // The CallContextInfo contains the Call and a list of its stack ids with
1368 // ContextNodes, the function containing Call, and the set of context ids
1369 // the analysis will eventually identify for use in any new node created
1370 // for that callsite.
1371 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
1372 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1373 for (auto &Call : CallsWithMetadata) {
1374 // Ignore allocations, already handled.
1375 if (AllocationCallToContextNodeMap.count(Call))
1376 continue;
1377 auto StackIdsWithContextNodes =
1378 getStackIdsWithContextNodesForCall(Call.call());
1379 // If there were no nodes created for MIBs on allocs (maybe this was in
1380 // the unambiguous part of the MIB stack that was pruned), ignore.
1381 if (StackIdsWithContextNodes.empty())
1382 continue;
1383 // Otherwise, record this Call along with the list of ids for the last
1384 // (outermost caller) stack id with a node.
1385 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1386 {Call.call(), StackIdsWithContextNodes, Func, {}});
1387 }
1388 }
1389
1390 // First make a pass through all stack ids that correspond to a call,
1391 // as identified in the above loop. Compute the context ids corresponding to
1392 // each of these calls when they correspond to multiple stack ids due to
1393 // due to inlining. Perform any duplication of context ids required when
1394 // there is more than one call with the same stack ids. Their (possibly newly
1395 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1396 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1397 for (auto &It : StackIdToMatchingCalls) {
1398 auto &Calls = It.getSecond();
1399 // Skip single calls with a single stack id. These don't need a new node.
1400 if (Calls.size() == 1) {
1401 auto &Ids = std::get<1>(Calls[0]);
1402 if (Ids.size() == 1)
1403 continue;
1404 }
1405 // In order to do the best and maximal matching of inlined calls to context
1406 // node sequences we will sort the vectors of stack ids in descending order
1407 // of length, and within each length, lexicographically by stack id. The
1408 // latter is so that we can specially handle calls that have identical stack
1409 // id sequences (either due to cloning or artificially because of the MIB
1410 // context pruning).
1411 std::stable_sort(Calls.begin(), Calls.end(),
1412 [](const CallContextInfo &A, const CallContextInfo &B) {
1413 auto &IdsA = std::get<1>(A);
1414 auto &IdsB = std::get<1>(B);
1415 return IdsA.size() > IdsB.size() ||
1416 (IdsA.size() == IdsB.size() && IdsA < IdsB);
1417 });
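// Illustrative example (hypothetical stack ids, not from the original source):
// if this entry's calls have stack id sequences {3, 7, 9}, {7, 9} and {7, 9}
// (all ending in the map key 9), the sort above orders them as {3, 7, 9},
// {7, 9}, {7, 9}, so the longest inlined sequence is matched first and
// identical sequences become adjacent, which the duplicate handling below
// relies on.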
1418
1419 // Find the node for the last stack id, which should be the same
1420 // across all calls recorded for this id, and is the id for this
1421 // entry in the StackIdToMatchingCalls map.
1422 uint64_t LastId = It.getFirst();
1423 ContextNode *LastNode = getNodeForStackId(LastId);
1424 // We should only have kept stack ids that had nodes.
1425 assert(LastNode);
1426
1427 if (LastNode->Recursive)
1428 continue;
1429
1430 // Initialize the context ids with the last node's. We will subsequently
1431 // refine the context ids by computing the intersection along all edges.
1432 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1433 assert(!LastNodeContextIds.empty());
1434
1435 for (unsigned I = 0; I < Calls.size(); I++) {
1436 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1437 assert(SavedContextIds.empty());
1438 assert(LastId == Ids.back());
1439
1440 // First compute the context ids for this stack id sequence (the
1441 // intersection of the context ids of the corresponding nodes).
1442 // Start with the remaining saved ids for the last node.
1443 assert(!LastNodeContextIds.empty());
1444 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
1445
1446 ContextNode *PrevNode = LastNode;
1447 ContextNode *CurNode = LastNode;
1448 bool Skip = false;
1449
1450 // Iterate backwards through the stack Ids, starting after the last Id
1451 // in the list, which was handled once outside for all Calls.
1452 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1453 auto Id = *IdIter;
1454 CurNode = getNodeForStackId(Id);
1455 // We should only have kept stack ids that had nodes.
1456 assert(CurNode);
1457
1458 if (CurNode->Recursive) {
1459 Skip = true;
1460 break;
1461 }
1462
1463 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1464 // If there is no edge then the nodes belong to different MIB contexts,
1465 // and we should skip this inlined context sequence. For example, this
1466 // particular inlined context may include stack ids A->B, and we may
1467 // indeed have nodes for both A and B, but it is possible that they were
1468 // never profiled in sequence in a single MIB for any allocation (i.e.
1469 // we might have profiled an allocation that involves the callsite A,
1470 // but through a different one of its callee callsites, and we might
1471 // have profiled an allocation that involves callsite B, but reached
1472 // from a different caller callsite).
1473 if (!Edge) {
1474 Skip = true;
1475 break;
1476 }
1477 PrevNode = CurNode;
1478
1479 // Update the context ids, which is the intersection of the ids along
1480 // all edges in the sequence.
1481 set_intersect(StackSequenceContextIds, Edge->getContextIds());
1482
1483 // If we now have no context ids for the clone, skip this call.
1484 if (StackSequenceContextIds.empty()) {
1485 Skip = true;
1486 break;
1487 }
1488 }
1489 if (Skip)
1490 continue;
1491
1492 // If some of this call's stack ids did not have corresponding nodes (due
1493 // to pruning), don't include any context ids for contexts that extend
1494 // beyond these nodes. Otherwise we would be matching part of unrelated /
1495 // not fully matching stack contexts. To do this, subtract any context ids
1496 // found in caller nodes of the last node found above.
1497 if (Ids.back() != getLastStackId(Call)) {
1498 for (const auto &PE : LastNode->CallerEdges) {
1499 set_subtract(StackSequenceContextIds, PE->getContextIds());
1500 if (StackSequenceContextIds.empty())
1501 break;
1502 }
1503 // If we now have no context ids for the clone, skip this call.
1504 if (StackSequenceContextIds.empty())
1505 continue;
1506 }
1507
1508 // Check if the next set of stack ids is the same (since the Calls vector
1509 // of tuples is sorted by the stack ids we can just look at the next one).
1510 bool DuplicateContextIds = false;
1511 if (I + 1 < Calls.size()) {
1512 auto NextIds = std::get<1>(Calls[I + 1]);
1513 DuplicateContextIds = Ids == NextIds;
1514 }
1515
1516 // If we don't have duplicate context ids, then we can assign all the
1517 // context ids computed for the original node sequence to this call.
1518 // If there are duplicate calls with the same stack ids then we synthesize
1519 // new context ids that are duplicates of the originals. These are
1520 // assigned to SavedContextIds, which is a reference into the map entry
1521 // for this call, allowing us to access these ids later on.
1522 OldToNewContextIds.reserve(OldToNewContextIds.size() +
1523 StackSequenceContextIds.size());
1524 SavedContextIds =
1525 DuplicateContextIds
1526 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
1527 : StackSequenceContextIds;
1528 assert(!SavedContextIds.empty());
1529
1530 if (!DuplicateContextIds) {
1531 // Update saved last node's context ids to remove those that are
1532 // assigned to other calls, so that it is ready for the next call at
1533 // this stack id.
1534 set_subtract(LastNodeContextIds, StackSequenceContextIds);
1535 if (LastNodeContextIds.empty())
1536 break;
1537 }
1538 }
1539 }
1540
1541 // Propagate the duplicate context ids over the graph.
1542 propagateDuplicateContextIds(OldToNewContextIds);
1543
1544 if (VerifyCCG)
1545 check();
1546
1547 // Now perform a post-order traversal over the graph, starting with the
1548 // allocation nodes, essentially processing nodes from callers to callees.
1549 // For any that contains an id in the map, update the graph to contain new
1550 // nodes representing any inlining at interior callsites. Note we move the
1551 // associated context ids over to the new nodes.
1552 DenseSet<const ContextNode *> Visited;
1553 for (auto &Entry : AllocationCallToContextNodeMap)
1554 assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls);
1555 if (VerifyCCG)
1556 check();
1557}
1558
1559uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
1560 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1561 Call->getMetadata(LLVMContext::MD_callsite));
1562 return CallsiteContext.back();
1563}
1564
1565uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
1566 assert(isa<CallsiteInfo *>(Call.getBase()));
1567 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1568 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1569 // Need to convert index into stack id.
1570 return Index.getStackIdAtIndex(CallsiteContext.back());
1571}
1572
1573static const std::string MemProfCloneSuffix = ".memprof.";
1574
1575static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
1576 // We use CloneNo == 0 to refer to the original version, which doesn't get
1577 // renamed with a suffix.
1578 if (!CloneNo)
1579 return Base.str();
1580 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
1581}
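// For example, getMemProfFuncName("foo", 2) produces "foo.memprof.2", while
// getMemProfFuncName("foo", 0) returns "foo" unchanged (the original version
// keeps its name).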
1582
1583std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
1584 const Instruction *Call,
1585 unsigned CloneNo) const {
1586 return (Twine(Call->getFunction()->getName()) + " -> " +
1587 cast<CallBase>(Call)->getCalledFunction()->getName())
1588 .str();
1589}
1590
1591std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
1592 const IndexCall &Call,
1593 unsigned CloneNo) const {
1594 auto VI = FSToVIMap.find(Func);
1595 assert(VI != FSToVIMap.end());
1596 if (isa<AllocInfo *>(Call.getBase()))
1597 return (VI->second.name() + " -> alloc").str();
1598 else {
1599 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call.getBase());
1600 return (VI->second.name() + " -> " +
1601 getMemProfFuncName(Callsite->Callee.name(),
1602 Callsite->Clones[CloneNo]))
1603 .str();
1604 }
1605}
1606
1607std::vector<uint64_t>
1608ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
1609 Instruction *Call) {
1610 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1611 Call->getMetadata(LLVMContext::MD_callsite));
1612 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
1613 CallsiteContext);
1614}
1615
1616std::vector<uint64_t>
1617IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
1618 assert(isa<CallsiteInfo *>(Call.getBase()));
1619 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1620 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1621 return getStackIdsWithContextNodes<CallsiteInfo,
1622 SmallVector<unsigned>::const_iterator>(
1623 CallsiteContext);
1624}
1625
1626template <typename DerivedCCG, typename FuncTy, typename CallTy>
1627template <class NodeT, class IteratorT>
1628std::vector<uint64_t>
1629CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
1630 CallStack<NodeT, IteratorT> &CallsiteContext) {
1631 std::vector<uint64_t> StackIds;
1632 for (auto IdOrIndex : CallsiteContext) {
1633 auto StackId = getStackId(IdOrIndex);
1634 ContextNode *Node = getNodeForStackId(StackId);
1635 if (!Node)
1636 break;
1637 StackIds.push_back(StackId);
1638 }
1639 return StackIds;
1640}
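// Illustrative example (hypothetical ids): if the callsite context contains
// stack ids {A, B, C} but only A and B have context nodes (C was pruned during
// matching of the allocation contexts), the returned list is {A, B}; the walk
// stops at the first id without a node.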
1641
1642ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
1643 Module &M,
1644 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
1645 : Mod(M), OREGetter(OREGetter) {
1646 for (auto &F : M) {
1647 std::vector<CallInfo> CallsWithMetadata;
1648 for (auto &BB : F) {
1649 for (auto &I : BB) {
1650 if (!isa<CallBase>(I))
1651 continue;
1652 if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
1653 CallsWithMetadata.push_back(&I);
1654 auto *AllocNode = addAllocNode(&I, &F);
1655 auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
1656 assert(CallsiteMD);
1657 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
1658 // Add all of the MIBs and their stack nodes.
1659 for (auto &MDOp : MemProfMD->operands()) {
1660 auto *MIBMD = cast<const MDNode>(MDOp);
1661 MDNode *StackNode = getMIBStackNode(MIBMD);
1662 assert(StackNode);
1663 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
1664 addStackNodesForMIB<MDNode, MDNode::op_iterator>(
1665 AllocNode, StackContext, CallsiteContext,
1666 getMIBAllocType(MIBMD));
1667 }
1668 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
1669 // Memprof and callsite metadata on memory allocations no longer
1670 // needed.
1671 I.setMetadata(LLVMContext::MD_memprof, nullptr);
1672 I.setMetadata(LLVMContext::MD_callsite, nullptr);
1673 }
1674 // For callsite metadata, add to list for this function for later use.
1675 else if (I.getMetadata(LLVMContext::MD_callsite))
1676 CallsWithMetadata.push_back(&I);
1677 }
1678 }
1679 if (!CallsWithMetadata.empty())
1680 FuncToCallsWithMetadata[&F] = CallsWithMetadata;
1681 }
1682
1683 if (DumpCCG) {
1684 dbgs() << "CCG before updating call stack chains:\n";
1685 dbgs() << *this;
1686 }
1687
1688 if (ExportToDot)
1689 exportToDot("prestackupdate");
1690
1691 updateStackNodes();
1692
1693 handleCallsitesWithMultipleTargets();
1694
1695 // Strip off remaining callsite metadata, no longer needed.
1696 for (auto &FuncEntry : FuncToCallsWithMetadata)
1697 for (auto &Call : FuncEntry.second)
1698 Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
1699}
1700
1701IndexCallsiteContextGraph::IndexCallsiteContextGraph(
1702 ModuleSummaryIndex &Index,
1703 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1704 isPrevailing)
1705 : Index(Index), isPrevailing(isPrevailing) {
1706 for (auto &I : Index) {
1707 auto VI = Index.getValueInfo(I);
1708 for (auto &S : VI.getSummaryList()) {
1709 // We should only add the prevailing nodes. Otherwise we may try to clone
1710 // in a weak copy that won't be linked (and may be different than the
1711 // prevailing version).
1712 // We only keep the memprof summary on the prevailing copy now when
1713 // building the combined index, as a space optimization, however don't
1714 // rely on this optimization. The linker doesn't resolve local linkage
1715 // values so don't check whether those are prevailing.
1716 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
1717 !isPrevailing(VI.getGUID(), S.get()))
1718 continue;
1719 auto *FS = dyn_cast<FunctionSummary>(S.get());
1720 if (!FS)
1721 continue;
1722 std::vector<CallInfo> CallsWithMetadata;
1723 if (!FS->allocs().empty()) {
1724 for (auto &AN : FS->mutableAllocs()) {
1725 // This can happen because of recursion elimination handling that
1726 // currently exists in ModuleSummaryAnalysis. Skip these for now.
1727 // We still added them to the summary because we need to be able to
1728 // correlate properly in applyImport in the backends.
1729 if (AN.MIBs.empty())
1730 continue;
1731 CallsWithMetadata.push_back({&AN});
1732 auto *AllocNode = addAllocNode({&AN}, FS);
1733 // Pass an empty CallStack to the CallsiteContext (second)
1734 // parameter, since for ThinLTO we already collapsed out the inlined
1735 // stack ids on the allocation call during ModuleSummaryAnalysis.
1736 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
1737 EmptyContext;
1738 // Now add all of the MIBs and their stack nodes.
1739 for (auto &MIB : AN.MIBs) {
1740 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
1741 StackContext(&MIB);
1742 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
1743 AllocNode, StackContext, EmptyContext, MIB.AllocType);
1744 }
1745 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
1746 // Initialize version 0 on the summary alloc node to the current alloc
1747 // type, unless it has both types, in which case make it default, so
1748 // that in the case where we aren't able to clone, the original version
1749 // always ends up with the default allocation behavior.
1750 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
1751 }
1752 }
1753 // For callsite metadata, add to list for this function for later use.
1754 if (!FS->callsites().empty())
1755 for (auto &SN : FS->mutableCallsites())
1756 CallsWithMetadata.push_back({&SN});
1757
1758 if (!CallsWithMetadata.empty())
1759 FuncToCallsWithMetadata[FS] = CallsWithMetadata;
1760
1761 if (!FS->allocs().empty() || !FS->callsites().empty())
1762 FSToVIMap[FS] = VI;
1763 }
1764 }
1765
1766 if (DumpCCG) {
1767 dbgs() << "CCG before updating call stack chains:\n";
1768 dbgs() << *this;
1769 }
1770
1771 if (ExportToDot)
1772 exportToDot("prestackupdate");
1773
1774 updateStackNodes();
1775
1776 handleCallsitesWithMultipleTargets();
1777}
1778
1779template <typename DerivedCCG, typename FuncTy, typename CallTy>
1780void CallsiteContextGraph<DerivedCCG, FuncTy,
1781 CallTy>::handleCallsitesWithMultipleTargets() {
1782 // Look for and workaround callsites that call multiple functions.
1783 // This can happen for indirect calls, which need better handling, and in
1784 // more rare cases (e.g. macro expansion).
1785 // TODO: To fix this for indirect calls we will want to perform speculative
1786 // devirtualization using either the normal PGO info with ICP, or using the
1787 // information in the profiled MemProf contexts. We can do this prior to
1788 // this transformation for regular LTO, and for ThinLTO we can simulate that
1789 // effect in the summary and perform the actual speculative devirtualization
1790 // while cloning in the ThinLTO backend.
1791
1792 // Keep track of the new nodes synthesized for discovered tail calls missing
1793 // from the profiled contexts.
1794 MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
1795
1796 for (auto &Entry : NonAllocationCallToContextNodeMap) {
1797 auto *Node = Entry.second;
1798 assert(Node->Clones.empty());
1799 // Check all node callees and see if in the same function.
1800 auto Call = Node->Call.call();
1801 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1802 auto Edge = *EI;
1803 if (!Edge->Callee->hasCall()) {
1804 ++EI;
1805 continue;
1806 }
1807 assert(NodeToCallingFunc.count(Edge->Callee));
1808 // Check if the called function matches that of the callee node.
1809 if (calleesMatch(Call, EI, TailCallToContextNodeMap))
1810 continue;
1811 RemovedEdgesWithMismatchedCallees++;
1812 // Work around by setting Node to have a null call, so it gets
1813 // skipped during cloning. Otherwise assignFunctions will assert
1814 // because its data structures are not designed to handle this case.
1815 Node->setCall(CallInfo());
1816 break;
1817 }
1818 }
1819
1820 // Remove all mismatched nodes identified in the above loop from the node map
1821 // (checking whether they have a null call which is set above). For a
1822 // MapVector like NonAllocationCallToContextNodeMap it is much more efficient
1823 // to do the removal via remove_if than by individually erasing entries above.
1824 NonAllocationCallToContextNodeMap.remove_if(
1825 [](const auto &it) { return !it.second->hasCall(); });
1826
1827 // Add the new nodes after the above loop so that the iteration is not
1828 // invalidated.
1829 for (auto &[Call, Node] : TailCallToContextNodeMap)
1830 NonAllocationCallToContextNodeMap[Call] = Node;
1831}
1832
1833uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1834 // In the Module (IR) case this is already the Id.
1835 return IdOrIndex;
1836}
1837
1838uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1839 // In the Index case this is an index into the stack id list in the summary
1840 // index, convert it to an Id.
1841 return Index.getStackIdAtIndex(IdOrIndex);
1842}
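// Illustrative example: in the index case an IdOrIndex of 0 refers to the
// first entry of the summary's stack id list and is translated here to the
// full 64-bit stack id stored there, whereas the IR case above needs no such
// translation.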
1843
1844template <typename DerivedCCG, typename FuncTy, typename CallTy>
1845bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
1846 CallTy Call, EdgeIter &EI,
1847 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
1848 auto Edge = *EI;
1849 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
1850 const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
1851 // Will be populated in order of callee to caller if we find a chain of tail
1852 // calls between the profiled caller and callee.
1853 std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
1854 if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
1855 FoundCalleeChain)) {
1856 ++EI;
1857 return false;
1858 }
1859
1860 // The usual case where the profiled callee matches that of the IR/summary.
1861 if (FoundCalleeChain.empty()) {
1862 ++EI;
1863 return true;
1864 }
1865
1866 auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
1867 auto *CurEdge = Callee->findEdgeFromCaller(Caller);
1868 // If there is already an edge between these nodes, simply update it and
1869 // return.
1870 if (CurEdge) {
1871 CurEdge->ContextIds.insert(Edge->ContextIds.begin(),
1872 Edge->ContextIds.end());
1873 CurEdge->AllocTypes |= Edge->AllocTypes;
1874 return;
1875 }
1876 // Otherwise, create a new edge and insert it into the caller and callee
1877 // lists.
1878 auto NewEdge = std::make_shared<ContextEdge>(
1879 Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
1880 Callee->CallerEdges.push_back(NewEdge);
1881 if (Caller == Edge->Caller) {
1882 // If we are inserting the new edge into the current edge's caller, insert
1883 // the new edge before the current iterator position, and then increment
1884 // back to the current edge.
1885 EI = Caller->CalleeEdges.insert(EI, NewEdge);
1886 ++EI;
1887 assert(*EI == Edge &&
1888 "Iterator position not restored after insert and increment");
1889 } else
1890 Caller->CalleeEdges.push_back(NewEdge);
1891 };
1892
1893 // Create new nodes for each found callee and connect in between the profiled
1894 // caller and callee.
1895 auto *CurCalleeNode = Edge->Callee;
1896 for (auto &[NewCall, Func] : FoundCalleeChain) {
1897 ContextNode *NewNode = nullptr;
1898 // First check if we have already synthesized a node for this tail call.
1899 if (TailCallToContextNodeMap.count(NewCall)) {
1900 NewNode = TailCallToContextNodeMap[NewCall];
1901 NewNode->AllocTypes |= Edge->AllocTypes;
1902 } else {
1903 FuncToCallsWithMetadata[Func].push_back({NewCall});
1904 // Create Node and record node info.
1905 NodeOwner.push_back(
1906 std::make_unique<ContextNode>(/*IsAllocation=*/false, NewCall));
1907 NewNode = NodeOwner.back().get();
1908 NodeToCallingFunc[NewNode] = Func;
1909 TailCallToContextNodeMap[NewCall] = NewNode;
1910 NewNode->AllocTypes = Edge->AllocTypes;
1911 }
1912
1913 // Hook up node to its callee node
1914 AddEdge(NewNode, CurCalleeNode);
1915
1916 CurCalleeNode = NewNode;
1917 }
1918
1919 // Hook up edge's original caller to new callee node.
1920 AddEdge(Edge->Caller, CurCalleeNode);
1921
1922 // Remove old edge
1923 Edge->Callee->eraseCallerEdge(Edge.get());
1924 EI = Edge->Caller->CalleeEdges.erase(EI);
1925
1926 return true;
1927}
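// Illustrative example (hypothetical functions): suppose the profiled context
// names C as the callee of this callsite, but in the IR the call goes to B,
// which tail-calls C. calleeMatchesFunc then returns the single-element chain
// {B's tail call to C}, and the code above synthesizes a context node for that
// tail callsite, hooks up Caller -> B -> C edges carrying the original edge's
// context ids and alloc types, and erases the stale direct Caller -> C edge.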
1928
1929bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
1930 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
1931 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
1932 bool &FoundMultipleCalleeChains) {
1933 // Stop recursive search if we have already explored the maximum specified
1934 // depth.
1935 if (Depth > TailCallSearchDepth)
1936 return false;
1937
1938 auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
1939 FoundCalleeChain.push_back({Callsite, F});
1940 };
1941
1942 auto *CalleeFunc = dyn_cast<Function>(CurCallee);
1943 if (!CalleeFunc) {
1944 auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
1945 assert(Alias);
1946 CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
1947 assert(CalleeFunc);
1948 }
1949
1950 // Look for tail calls in this function, and check if they either call the
1951 // profiled callee directly, or indirectly (via a recursive search).
1952 // Only succeed if there is a single unique tail call chain found between the
1953 // profiled caller and callee, otherwise we could perform incorrect cloning.
1954 bool FoundSingleCalleeChain = false;
1955 for (auto &BB : *CalleeFunc) {
1956 for (auto &I : BB) {
1957 auto *CB = dyn_cast<CallBase>(&I);
1958 if (!CB || !CB->isTailCall())
1959 continue;
1960 auto *CalledValue = CB->getCalledOperand();
1961 auto *CalledFunction = CB->getCalledFunction();
1962 if (CalledValue && !CalledFunction) {
1963 CalledValue = CalledValue->stripPointerCasts();
1964 // Stripping pointer casts can reveal a called function.
1965 CalledFunction = dyn_cast<Function>(CalledValue);
1966 }
1967 // Check if this is an alias to a function. If so, get the
1968 // called aliasee for the checks below.
1969 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
1970 assert(!CalledFunction &&
1971 "Expected null called function in callsite for alias");
1972 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
1973 }
1974 if (!CalledFunction)
1975 continue;
1976 if (CalledFunction == ProfiledCallee) {
1977 if (FoundSingleCalleeChain) {
1978 FoundMultipleCalleeChains = true;
1979 return false;
1980 }
1981 FoundSingleCalleeChain = true;
1982 FoundProfiledCalleeCount++;
1983 FoundProfiledCalleeDepth += Depth;
1984 if (Depth > FoundProfiledCalleeMaxDepth)
1985 FoundProfiledCalleeMaxDepth = Depth;
1986 SaveCallsiteInfo(&I, CalleeFunc);
1987 } else if (findProfiledCalleeThroughTailCalls(
1988 ProfiledCallee, CalledFunction, Depth + 1,
1989 FoundCalleeChain, FoundMultipleCalleeChains)) {
1990 // findProfiledCalleeThroughTailCalls should not have returned
1991 // true if FoundMultipleCalleeChains.
1992 assert(!FoundMultipleCalleeChains);
1993 if (FoundSingleCalleeChain) {
1994 FoundMultipleCalleeChains = true;
1995 return false;
1996 }
1997 FoundSingleCalleeChain = true;
1998 SaveCallsiteInfo(&I, CalleeFunc);
1999 } else if (FoundMultipleCalleeChains)
2000 return false;
2001 }
2002 }
2003
2004 return FoundSingleCalleeChain;
2005}
2006
2007bool ModuleCallsiteContextGraph::calleeMatchesFunc(
2008 Instruction *Call, const Function *Func, const Function *CallerFunc,
2009 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
2010 auto *CB = dyn_cast<CallBase>(Call);
2011 if (!CB->getCalledOperand())
2012 return false;
2013 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2014 auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
2015 if (CalleeFunc == Func)
2016 return true;
2017 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2018 if (Alias && Alias->getAliasee() == Func)
2019 return true;
2020
2021 // Recursively search for the profiled callee through tail calls starting with
2022 // the actual Callee. The discovered tail call chain is saved in
2023 // FoundCalleeChain, and we will fixup the graph to include these callsites
2024 // after returning.
2025 // FIXME: We will currently redo the same recursive walk if we find the same
2026 // mismatched callee from another callsite. We can improve this with more
2027 // bookkeeping of the created chain of new nodes for each mismatch.
2028 unsigned Depth = 1;
2029 bool FoundMultipleCalleeChains = false;
2030 if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
2031 FoundCalleeChain,
2032 FoundMultipleCalleeChains)) {
2033 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
2034 << Func->getName() << " from " << CallerFunc->getName()
2035 << " that actually called " << CalleeVal->getName()
2036 << (FoundMultipleCalleeChains
2037 ? " (found multiple possible chains)"
2038 : "")
2039 << "\n");
2040 if (FoundMultipleCalleeChains)
2041 FoundProfiledCalleeNonUniquelyCount++;
2042 return false;
2043 }
2044
2045 return true;
2046}
2047
2048bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2049 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
2050 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
2051 bool &FoundMultipleCalleeChains) {
2052 // Stop recursive search if we have already explored the maximum specified
2053 // depth.
2054 if (Depth > TailCallSearchDepth)
2055 return false;
2056
2057 auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
2058 // Make a CallsiteInfo for each discovered callee, if one hasn't already
2059 // been synthesized.
2060 if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
2061 !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
2062 // StackIds is empty (we don't have debug info available in the index for
2063 // these callsites)
2064 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
2065 std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
2066 CallsiteInfo *NewCallsiteInfo =
2067 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
2068 FoundCalleeChain.push_back({NewCallsiteInfo, FS});
2069 };
2070
2071 // Look for tail calls in this function, and check if they either call the
2072 // profiled callee directly, or indirectly (via a recursive search).
2073 // Only succeed if there is a single unique tail call chain found between the
2074 // profiled caller and callee, otherwise we could perform incorrect cloning.
2075 bool FoundSingleCalleeChain = false;
2076 for (auto &S : CurCallee.getSummaryList()) {
2077 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2078 !isPrevailing(CurCallee.getGUID(), S.get()))
2079 continue;
2080 auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
2081 if (!FS)
2082 continue;
2083 auto FSVI = CurCallee;
2084 auto *AS = dyn_cast<AliasSummary>(S.get());
2085 if (AS)
2086 FSVI = AS->getAliaseeVI();
2087 for (auto &CallEdge : FS->calls()) {
2088 if (!CallEdge.second.hasTailCall())
2089 continue;
2090 if (CallEdge.first == ProfiledCallee) {
2091 if (FoundSingleCalleeChain) {
2092 FoundMultipleCalleeChains = true;
2093 return false;
2094 }
2095 FoundSingleCalleeChain = true;
2096 FoundProfiledCalleeCount++;
2097 FoundProfiledCalleeDepth += Depth;
2098 if (Depth > FoundProfiledCalleeMaxDepth)
2099 FoundProfiledCalleeMaxDepth = Depth;
2100 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2101 // Add FS to FSToVIMap in case it isn't already there.
2102 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2103 FSToVIMap[FS] = FSVI;
2104 } else if (findProfiledCalleeThroughTailCalls(
2105 ProfiledCallee, CallEdge.first, Depth + 1,
2106 FoundCalleeChain, FoundMultipleCalleeChains)) {
2107 // findProfiledCalleeThroughTailCalls should not have returned
2108 // true if FoundMultipleCalleeChains.
2109 assert(!FoundMultipleCalleeChains);
2110 if (FoundSingleCalleeChain) {
2111 FoundMultipleCalleeChains = true;
2112 return false;
2113 }
2114 FoundSingleCalleeChain = true;
2115 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2116 // Add FS to FSToVIMap in case it isn't already there.
2117 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2118 FSToVIMap[FS] = FSVI;
2119 } else if (FoundMultipleCalleeChains)
2120 return false;
2121 }
2122 }
2123
2124 return FoundSingleCalleeChain;
2125}
2126
2127bool IndexCallsiteContextGraph::calleeMatchesFunc(
2128 IndexCall &Call, const FunctionSummary *Func,
2129 const FunctionSummary *CallerFunc,
2130 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
2131 ValueInfo Callee =
2132 dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee;
2133 // If there is no summary list then this is a call to an externally defined
2134 // symbol.
2135 AliasSummary *Alias =
2136 Callee.getSummaryList().empty()
2137 ? nullptr
2138 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
2139 assert(FSToVIMap.count(Func));
2140 auto FuncVI = FSToVIMap[Func];
2141 if (Callee == FuncVI ||
2142 // If callee is an alias, check the aliasee, since only function
2143 // summary base objects will contain the stack node summaries and thus
2144 // get a context node.
2145 (Alias && Alias->getAliaseeVI() == FuncVI))
2146 return true;
2147
2148 // Recursively search for the profiled callee through tail calls starting with
2149 // the actual Callee. The discovered tail call chain is saved in
2150 // FoundCalleeChain, and we will fixup the graph to include these callsites
2151 // after returning.
2152 // FIXME: We will currently redo the same recursive walk if we find the same
2153 // mismatched callee from another callsite. We can improve this with more
2154 // bookkeeping of the created chain of new nodes for each mismatch.
2155 unsigned Depth = 1;
2156 bool FoundMultipleCalleeChains = false;
2157 if (!findProfiledCalleeThroughTailCalls(
2158 FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
2159 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
2160 << " from " << FSToVIMap[CallerFunc]
2161 << " that actually called " << Callee
2162 << (FoundMultipleCalleeChains
2163 ? " (found multiple possible chains)"
2164 : "")
2165 << "\n");
2166 if (FoundMultipleCalleeChains)
2167 FoundProfiledCalleeNonUniquelyCount++;
2168 return false;
2169 }
2170
2171 return true;
2172}
2173
2174static std::string getAllocTypeString(uint8_t AllocTypes) {
2175 if (!AllocTypes)
2176 return "None";
2177 std::string Str;
2178 if (AllocTypes & (uint8_t)AllocationType::NotCold)
2179 Str += "NotCold";
2180 if (AllocTypes & (uint8_t)AllocationType::Cold)
2181 Str += "Cold";
2182 return Str;
2183}
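// For example, a node or edge carrying both NotCold and Cold context ids is
// printed as "NotColdCold", and an AllocTypes value of 0 prints as "None".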
2184
2185template <typename DerivedCCG, typename FuncTy, typename CallTy>
2186void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
2187 const {
2188 print(dbgs());
2189 dbgs() << "\n";
2190}
2191
2192template <typename DerivedCCG, typename FuncTy, typename CallTy>
2193void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
2194 raw_ostream &OS) const {
2195 OS << "Node " << this << "\n";
2196 OS << "\t";
2197 printCall(OS);
2198 if (Recursive)
2199 OS << " (recursive)";
2200 OS << "\n";
2201 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
2202 OS << "\tContextIds:";
2203 // Make a copy of the computed context ids that we can sort for stability.
2204 auto ContextIds = getContextIds();
2205 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2206 std::sort(SortedIds.begin(), SortedIds.end());
2207 for (auto Id : SortedIds)
2208 OS << " " << Id;
2209 OS << "\n";
2210 OS << "\tCalleeEdges:\n";
2211 for (auto &Edge : CalleeEdges)
2212 OS << "\t\t" << *Edge << "\n";
2213 OS << "\tCallerEdges:\n";
2214 for (auto &Edge : CallerEdges)
2215 OS << "\t\t" << *Edge << "\n";
2216 if (!Clones.empty()) {
2217 OS << "\tClones: ";
2218 FieldSeparator FS;
2219 for (auto *Clone : Clones)
2220 OS << FS << Clone;
2221 OS << "\n";
2222 } else if (CloneOf) {
2223 OS << "\tClone of " << CloneOf << "\n";
2224 }
2225}
2226
2227template <typename DerivedCCG, typename FuncTy, typename CallTy>
2228void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
2229 const {
2230 print(dbgs());
2231 dbgs() << "\n";
2232}
2233
2234template <typename DerivedCCG, typename FuncTy, typename CallTy>
2235void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
2236 raw_ostream &OS) const {
2237 OS << "Edge from Callee " << Callee << " to Caller: " << Caller
2238 << " AllocTypes: " << getAllocTypeString(AllocTypes);
2239 OS << " ContextIds:";
2240 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2241 std::sort(SortedIds.begin(), SortedIds.end());
2242 for (auto Id : SortedIds)
2243 OS << " " << Id;
2244}
2245
2246template <typename DerivedCCG, typename FuncTy, typename CallTy>
2247void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
2248 print(dbgs());
2249}
2250
2251template <typename DerivedCCG, typename FuncTy, typename CallTy>
2252void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
2253 raw_ostream &OS) const {
2254 OS << "Callsite Context Graph:\n";
2255 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2256 for (const auto Node : nodes<GraphType>(this)) {
2257 if (Node->isRemoved())
2258 continue;
2259 Node->print(OS);
2260 OS << "\n";
2261 }
2262}
2263
2264template <typename DerivedCCG, typename FuncTy, typename CallTy>
2265void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
2266 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2267 for (const auto Node : nodes<GraphType>(this)) {
2268 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2269 for (auto &Edge : Node->CallerEdges)
2270 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
2271 }
2272}
2273
2274template <typename DerivedCCG, typename FuncTy, typename CallTy>
2275struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
2276 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2277 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
2278
2279 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
2280 static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
2281
2282 using nodes_iterator =
2283 mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
2284 decltype(&getNode)>;
2285
2286 static nodes_iterator nodes_begin(GraphType G) {
2287 return nodes_iterator(G->NodeOwner.begin(), &getNode);
2288 }
2289
2290 static nodes_iterator nodes_end(GraphType G) {
2291 return nodes_iterator(G->NodeOwner.end(), &getNode);
2292 }
2293
2294 static NodeRef getEntryNode(GraphType G) {
2295 return G->NodeOwner.begin()->get();
2296 }
2297
2298 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
2299 static const ContextNode<DerivedCCG, FuncTy, CallTy> *
2300 GetCallee(const EdgePtrTy &P) {
2301 return P->Callee;
2302 }
2303
2304 using ChildIteratorType =
2305 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
2306 DerivedCCG, FuncTy, CallTy>>>::const_iterator,
2307 decltype(&GetCallee)>;
2308
2309 static ChildIteratorType child_begin(NodeRef N) {
2310 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
2311 }
2312
2313 static ChildIteratorType child_end(NodeRef N) {
2314 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
2315 }
2316};
2317
2318template <typename DerivedCCG, typename FuncTy, typename CallTy>
2319struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
2320 : public DefaultDOTGraphTraits {
2321 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
2322
2323 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2324 using GTraits = GraphTraits<GraphType>;
2325 using NodeRef = typename GTraits::NodeRef;
2326 using ChildIteratorType = typename GTraits::ChildIteratorType;
2327
2328 static std::string getNodeLabel(NodeRef Node, GraphType G) {
2329 std::string LabelString =
2330 (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
2331 Twine(Node->OrigStackOrAllocId))
2332 .str();
2333 LabelString += "\n";
2334 if (Node->hasCall()) {
2335 auto Func = G->NodeToCallingFunc.find(Node);
2336 assert(Func != G->NodeToCallingFunc.end());
2337 LabelString +=
2338 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
2339 } else {
2340 LabelString += "null call";
2341 if (Node->Recursive)
2342 LabelString += " (recursive)";
2343 else
2344 LabelString += " (external)";
2345 }
2346 return LabelString;
2347 }
2348
2349 static std::string getNodeAttributes(NodeRef Node, GraphType) {
2350 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
2351 getContextIds(Node->getContextIds()) + "\"")
2352 .str();
2354 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str();
2355 AttributeString += ",style=\"filled\"";
2356 if (Node->CloneOf) {
2357 AttributeString += ",color=\"blue\"";
2358 AttributeString += ",style=\"filled,bold,dashed\"";
2359 } else
2360 AttributeString += ",style=\"filled\"";
2361 return AttributeString;
2362 }
2363
2364 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
2365 GraphType) {
2366 auto &Edge = *(ChildIter.getCurrent());
2367 return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
2368 Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"")
2369 .str();
2370 }
2371
2372 // Since the NodeOwner list includes nodes that are no longer connected to
2373 // the graph, skip them here.
2374 static bool isNodeHidden(NodeRef Node, GraphType) {
2375 return Node->isRemoved();
2376 }
2377
2378private:
2379 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
2380 std::string IdString = "ContextIds:";
2381 if (ContextIds.size() < 100) {
2382 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2383 std::sort(SortedIds.begin(), SortedIds.end());
2384 for (auto Id : SortedIds)
2385 IdString += (" " + Twine(Id)).str();
2386 } else {
2387 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
2388 }
2389 return IdString;
2390 }
2391
2392 static std::string getColor(uint8_t AllocTypes) {
2393 if (AllocTypes == (uint8_t)AllocationType::NotCold)
2394 // Color "brown1" actually looks like a lighter red.
2395 return "brown1";
2396 if (AllocTypes == (uint8_t)AllocationType::Cold)
2397 return "cyan";
2398 if (AllocTypes ==
2399 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
2400 // Lighter purple.
2401 return "mediumorchid1";
2402 return "gray";
2403 }
2404
2405 static std::string getNodeId(NodeRef Node) {
2406 std::stringstream SStream;
2407 SStream << std::hex << "N0x" << (unsigned long long)Node;
2408 std::string Result = SStream.str();
2409 return Result;
2410 }
2411};
2412
2413template <typename DerivedCCG, typename FuncTy, typename CallTy>
2414void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
2415 std::string Label) const {
2416 WriteGraph(this, "", false, Label,
2417 DotFilePathPrefix + "ccg." + Label + ".dot");
2418}
2419
2420template <typename DerivedCCG, typename FuncTy, typename CallTy>
2421typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
2422CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
2423 const std::shared_ptr<ContextEdge> &Edge, EdgeIter *CallerEdgeI,
2424 DenseSet<uint32_t> ContextIdsToMove) {
2425 ContextNode *Node = Edge->Callee;
2426 NodeOwner.push_back(
2427 std::make_unique<ContextNode>(Node->IsAllocation, Node->Call));
2428 ContextNode *Clone = NodeOwner.back().get();
2429 Node->addClone(Clone);
2430 assert(NodeToCallingFunc.count(Node));
2431 NodeToCallingFunc[Clone] = NodeToCallingFunc[Node];
2432 moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true,
2433 ContextIdsToMove);
2434 return Clone;
2435}
2436
2437template <typename DerivedCCG, typename FuncTy, typename CallTy>
2438void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
2439 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
2440 ContextNode *NewCallee, EdgeIter *CallerEdgeI,
2441 bool NewClone,
2442 DenseSet<uint32_t> ContextIdsToMove) {
2443 // NewCallee and Edge's current callee must be clones of the same original
2444 // node (Edge's current callee may be the original node too).
2445 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
2446
2447 ContextNode *OldCallee = Edge->Callee;
2448
2449 // We might already have an edge to the new callee from earlier cloning for a
2450 // different allocation. If one exists we will reuse it.
2451 auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
2452
2453 // Callers will pass an empty ContextIdsToMove set when they want to move the
2454 // edge. Copy in Edge's ids for simplicity.
2455 if (ContextIdsToMove.empty())
2456 ContextIdsToMove = Edge->getContextIds();
2457
2458 // If we are moving all of Edge's ids, then just move the whole Edge.
2459 // Otherwise only move the specified subset, to a new edge if needed.
2460 if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
2461 // Moving the whole Edge.
2462 if (CallerEdgeI)
2463 *CallerEdgeI = OldCallee->CallerEdges.erase(*CallerEdgeI);
2464 else
2465 OldCallee->eraseCallerEdge(Edge.get());
2466 if (ExistingEdgeToNewCallee) {
2467 // Since we already have an edge to NewCallee, simply move the ids
2468 // onto it, and remove the existing Edge.
2469 ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),
2470 ContextIdsToMove.end());
2471 ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
2472 assert(Edge->ContextIds == ContextIdsToMove);
2473 Edge->ContextIds.clear();
2474 Edge->AllocTypes = (uint8_t)AllocationType::None;
2475 Edge->Caller->eraseCalleeEdge(Edge.get());
2476 } else {
2477 // Otherwise just reconnect Edge to NewCallee.
2478 Edge->Callee = NewCallee;
2479 NewCallee->CallerEdges.push_back(Edge);
2480 // Don't need to update Edge's context ids since we are simply
2481 // reconnecting it.
2482 }
2483 // In either case, need to update the alloc types on NewCallee.
2484 NewCallee->AllocTypes |= Edge->AllocTypes;
2485 } else {
2486 // Only moving a subset of Edge's ids.
2487 if (CallerEdgeI)
2488 ++CallerEdgeI;
2489 // Compute the alloc type of the subset of ids being moved.
2490 auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
2491 if (ExistingEdgeToNewCallee) {
2492 // Since we already have an edge to NewCallee, simply move the ids
2493 // onto it.
2494 ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),
2495 ContextIdsToMove.end());
2496 ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
2497 } else {
2498 // Otherwise, create a new edge to NewCallee for the ids being moved.
2499 auto NewEdge = std::make_shared<ContextEdge>(
2500 NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
2501 Edge->Caller->CalleeEdges.push_back(NewEdge);
2502 NewCallee->CallerEdges.push_back(NewEdge);
2503 }
2504 // In either case, need to update the alloc types on NewCallee, and remove
2505 // those ids and update the alloc type on the original Edge.
2506 NewCallee->AllocTypes |= CallerEdgeAllocType;
2507 set_subtract(Edge->ContextIds, ContextIdsToMove);
2508 Edge->AllocTypes = computeAllocType(Edge->ContextIds);
2509 }
2510 // Now walk the old callee node's callee edges and move Edge's context ids
2511 // over to the corresponding edge into the clone (which is created here if
2512 // this is a newly created clone).
2513 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
2514 // The context ids moving to the new callee are the subset of this edge's
2515 // context ids and the context ids on the caller edge being moved.
2516 DenseSet<uint32_t> EdgeContextIdsToMove =
2517 set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
2518 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
2519 OldCalleeEdge->AllocTypes =
2520 computeAllocType(OldCalleeEdge->getContextIds());
2521 if (!NewClone) {
2522 // Update context ids / alloc type on corresponding edge to NewCallee.
2523 // There is a chance this may not exist if we are reusing an existing
2524 // clone, specifically during function assignment, where we would have
2525 // removed none type edges after creating the clone. If we can't find
2526 // a corresponding edge there, fall through to the cloning below.
2527 if (auto *NewCalleeEdge =
2528 NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) {
2529 NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
2530 EdgeContextIdsToMove.end());
2531 NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
2532 continue;
2533 }
2534 }
2535 auto NewEdge = std::make_shared<ContextEdge>(
2536 OldCalleeEdge->Callee, NewCallee,
2537 computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
2538 NewCallee->CalleeEdges.push_back(NewEdge);
2539 NewEdge->Callee->CallerEdges.push_back(NewEdge);
2540 }
2541 // Recompute the node alloc type now that its callee edges have been
2542 // updated (since we will compute from those edges).
2543 OldCallee->AllocTypes = OldCallee->computeAllocType();
2544 // OldCallee alloc type should be None iff its context id set is now empty.
2545 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
2546 OldCallee->emptyContextIds());
2547 if (VerifyCCG) {
2548 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
2549 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
2550 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
2551 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
2552 /*CheckEdges=*/false);
2553 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
2554 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
2555 /*CheckEdges=*/false);
2556 }
2557}
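// Illustrative example (hypothetical ids): if the caller edge into OldCallee
// carries context ids {1, 2, 3} and only {1, 2} are moved to NewCallee, the
// code above leaves {3} (with its recomputed alloc type) on the original edge,
// adds or reuses an edge from the caller to NewCallee carrying {1, 2}, and
// splits OldCallee's own callee edges so that ids {1, 2} follow the clone
// while {3} stays behind.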
2558
2559template <typename DerivedCCG, typename FuncTy, typename CallTy>
2560void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
2561 recursivelyRemoveNoneTypeCalleeEdges(
2562 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
2563 auto Inserted = Visited.insert(Node);
2564 if (!Inserted.second)
2565 return;
2566
2567 removeNoneTypeCalleeEdges(Node);
2568
2569 for (auto *Clone : Node->Clones)
2570 recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
2571
2572 // The recursive call may remove some of this Node's caller edges.
2573 // Iterate over a copy and skip any that were removed.
2574 auto CallerEdges = Node->CallerEdges;
2575 for (auto &Edge : CallerEdges) {
2576 // Skip any that have been removed by an earlier recursive call.
2577 if (Edge->Callee == nullptr && Edge->Caller == nullptr) {
2578 assert(!is_contained(Node->CallerEdges, Edge));
2579 continue;
2580 }
2581 recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
2582 }
2583}
2584
2585template <typename DerivedCCG, typename FuncTy, typename CallTy>
2586void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
2587 DenseSet<const ContextNode *> Visited;
2588 for (auto &Entry : AllocationCallToContextNodeMap) {
2589 Visited.clear();
2590 identifyClones(Entry.second, Visited, Entry.second->getContextIds());
2591 }
2592 Visited.clear();
2593 for (auto &Entry : AllocationCallToContextNodeMap)
2594 recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
2595 if (VerifyCCG)
2596 check();
2597}
2598
2599 // Helper function to check whether an AllocType is cold, notcold, or both.
2600 bool checkColdOrNotCold(uint8_t AllocType) {
2601 return (AllocType == (uint8_t)AllocationType::Cold) ||
2602 (AllocType == (uint8_t)AllocationType::NotCold) ||
2603 (AllocType ==
2604 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
2605}
2606
2607template <typename DerivedCCG, typename FuncTy, typename CallTy>
2608void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
2609 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
2610 const DenseSet<uint32_t> &AllocContextIds) {
2611 if (VerifyNodes)
2612 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2613 assert(!Node->CloneOf);
2614
2615 // If Node has a null call, then either it wasn't found in the module (regular
2616 // LTO) or summary index (ThinLTO), or there were other conditions blocking
2617 // cloning (e.g. recursion, calls multiple targets, etc).
2618 // Do this here so that we don't try to recursively clone callers below, which
2619 // isn't useful at least for this node.
2620 if (!Node->hasCall())
2621 return;
2622
2623#ifndef NDEBUG
2624 auto Insert =
2625#endif
2626 Visited.insert(Node);
2627 // We should not have visited this node yet.
2628 assert(Insert.second);
2629 // The recursive call to identifyClones may delete the current edge from the
2630 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
2631 // in an iterator and having recursive call erase from it. Other edges may
2632 // also get removed during the recursion, which will have null Callee and
2633 // Caller pointers (and are deleted later), so we skip those below.
2634 {
2635 auto CallerEdges = Node->CallerEdges;
2636 for (auto &Edge : CallerEdges) {
2637 // Skip any that have been removed by an earlier recursive call.
2638 if (Edge->Callee == nullptr && Edge->Caller == nullptr) {
2639 assert(!llvm::count(Node->CallerEdges, Edge));
2640 continue;
2641 }
2642 // Ignore any caller we previously visited via another edge.
2643 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
2644 identifyClones(Edge->Caller, Visited, AllocContextIds);
2645 }
2646 }
2647 }
2648
2649 // Check if we reached an unambiguous call or have only a single caller.
2650 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2651 return;
2652
2653 // We need to clone.
2654
2655 // Try to keep the original version as alloc type NotCold. This will make
2656 // cases with indirect calls or any other situation with an unknown call to
2657 // the original function get the default behavior. We do this by sorting the
2658 // CallerEdges of the Node we will clone by alloc type.
2659 //
2660 // Give NotCold edge the lowest sort priority so those edges are at the end of
2661 // the caller edges vector, and stay on the original version (since the below
2662 // code clones greedily until it finds all remaining edges have the same type
2663 // and leaves the remaining ones on the original Node).
2664 //
2665 // We shouldn't actually have any None type edges, so the sorting priority for
2666 // that is arbitrary, and we assert in that case below.
2667 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
2668 /*Cold*/ 1,
2669 /*NotColdCold*/ 2};
2670 std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(),
2671 [&](const std::shared_ptr<ContextEdge> &A,
2672 const std::shared_ptr<ContextEdge> &B) {
2673 // Nodes with non-empty context ids should be sorted before
2674 // those with empty context ids.
2675 if (A->ContextIds.empty())
2676 // Either B ContextIds are non-empty (in which case we
2677 // should return false because B < A), or B ContextIds
2678 // are empty, in which case they are equal, and we should
2679 // maintain the original relative ordering.
2680 return false;
2681 if (B->ContextIds.empty())
2682 return true;
2683
2684 if (A->AllocTypes == B->AllocTypes)
2685 // Use the first context id for each edge as a
2686 // tie-breaker.
2687 return *A->ContextIds.begin() < *B->ContextIds.begin();
2688 return AllocTypeCloningPriority[A->AllocTypes] <
2689 AllocTypeCloningPriority[B->AllocTypes];
2690 });
2691
2692 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2693
2694 // Iterate until we find no more opportunities for disambiguating the alloc
2695 // types via cloning. In most cases this loop will terminate once the Node
2696 // has a single allocation type, in which case no more cloning is needed.
2697 // We need to be able to remove Edge from CallerEdges, so need to adjust
2698 // iterator inside the loop.
2699 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
2700 auto CallerEdge = *EI;
2701
2702 // See if cloning the prior caller edge left this node with a single alloc
2703 // type or a single caller. In that case no more cloning of Node is needed.
2704 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2705 break;
2706
2707 // Only need to process the ids along this edge pertaining to the given
2708 // allocation.
2709 auto CallerEdgeContextsForAlloc =
2710 set_intersection(CallerEdge->getContextIds(), AllocContextIds);
2711 if (CallerEdgeContextsForAlloc.empty()) {
2712 ++EI;
2713 continue;
2714 }
2715 auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
2716
2717 // Compute the node callee edge alloc types corresponding to the context ids
2718 // for this caller edge.
2719 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
2720 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
2721 for (auto &CalleeEdge : Node->CalleeEdges)
2722 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
2723 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
2724
2725 // Don't clone if doing so will not disambiguate any alloc types amongst
2726 // caller edges (including the callee edges that would be cloned).
2727 // Otherwise we will simply move all edges to the clone.
2728 //
2729 // First check if by cloning we will disambiguate the caller allocation
2730 // type from node's allocation type. Query allocTypeToUse so that we don't
2731 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
2732 // neither of these should be None type.
2733 //
2734 // Then check if by cloning node at least one of the callee edges will be
2735 // disambiguated by splitting out different context ids.
2736 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
2737 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2738 if (allocTypeToUse(CallerAllocTypeForAlloc) ==
2739 allocTypeToUse(Node->AllocTypes) &&
2740 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
2741 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
2742 ++EI;
2743 continue;
2744 }
2745
2746 // First see if we can use an existing clone. Check each clone and its
2747 // callee edges for matching alloc types.
2748 ContextNode *Clone = nullptr;
2749 for (auto *CurClone : Node->Clones) {
2750 if (allocTypeToUse(CurClone->AllocTypes) !=
2751 allocTypeToUse(CallerAllocTypeForAlloc))
2752 continue;
2753
2754 if (!allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
2755 CalleeEdgeAllocTypesForCallerEdge, CurClone->CalleeEdges))
2756 continue;
2757 Clone = CurClone;
2758 break;
2759 }
2760
2761 // The edge iterator is adjusted when we move the CallerEdge to the clone.
2762 if (Clone)
2763 moveEdgeToExistingCalleeClone(CallerEdge, Clone, &EI, /*NewClone=*/false,
2764 CallerEdgeContextsForAlloc);
2765 else
2766 Clone =
2767 moveEdgeToNewCalleeClone(CallerEdge, &EI, CallerEdgeContextsForAlloc);
2768
2769 assert(EI == Node->CallerEdges.end() ||
2770 Node->AllocTypes != (uint8_t)AllocationType::None);
2771 // Sanity check that no alloc types on clone or its edges are None.
2772 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
2773 }
2774
2775 // We should still have some context ids on the original Node.
2776 assert(!Node->emptyContextIds());
2777
2778 // Sanity check that no alloc types on node or edges are None.
2779 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2780
2781 if (VerifyNodes)
2782 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2783}
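// Illustrative example (hypothetical shape): a node with AllocTypes ==
// NotColdCold that is reached from caller X only via cold contexts and from
// caller Y only via not-cold contexts gets cloned here. Because cold edges
// sort first, X's edge (and the matching portions of the callee edges) moves
// to a new clone whose contexts are purely cold, Y stays on the original node,
// and both the clone and the original end up with a single alloc type.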
2784
2785void ModuleCallsiteContextGraph::updateAllocationCall(
2786 CallInfo &Call, AllocationType AllocType) {
2787 std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
2788 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
2789 "memprof", AllocTypeString);
2790 cast<CallBase>(Call.call())->addFnAttr(A);
2791 OREGetter(Call.call()->getFunction())
2792 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
2793 << ore::NV("AllocationCall", Call.call()) << " in clone "
2794 << ore::NV("Caller", Call.call()->getFunction())
2795 << " marked with memprof allocation attribute "
2796 << ore::NV("Attribute", AllocTypeString));
2797}
2798
2799void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
2800 AllocationType AllocType) {
2801 auto *AI = Call.call().dyn_cast<AllocInfo *>();
2802 assert(AI);
2803 assert(AI->Versions.size() > Call.cloneNo());
2804 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
2805}
2806
2807void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
2808 FuncInfo CalleeFunc) {
2809 if (CalleeFunc.cloneNo() > 0)
2810 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
2811 OREGetter(CallerCall.call()->getFunction())
2812 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
2813 << ore::NV("Call", CallerCall.call()) << " in clone "
2814 << ore::NV("Caller", CallerCall.call()->getFunction())
2815 << " assigned to call function clone "
2816 << ore::NV("Callee", CalleeFunc.func()));
2817}
2818
2819void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
2820 FuncInfo CalleeFunc) {
2821 auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>();
2822 assert(CI &&
2823 "Caller cannot be an allocation which should not have profiled calls");
2824 assert(CI->Clones.size() > CallerCall.cloneNo());
2825 CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
2826}
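// For example (hypothetical values), setting CI->Clones[2] = 3 records that
// version 2 of this callsite should call clone 3 of its callee; the actual
// call rewriting happens later in the ThinLTO backend when this summary is
// applied to the IR.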
2827
2828CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
2829 Instruction *>::FuncInfo
2830ModuleCallsiteContextGraph::cloneFunctionForCallsite(
2831 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
2832 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
2833 // Use existing LLVM facilities for cloning and obtaining Call in clone
2834 ValueToValueMapTy VMap;
2835 auto *NewFunc = CloneFunction(Func.func(), VMap);
2836 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
2837 assert(!Func.func()->getParent()->getFunction(Name));
2838 NewFunc->setName(Name);
2839 for (auto &Inst : CallsWithMetadataInFunc) {
2840 // This map always has the initial version in it.
2841 assert(Inst.cloneNo() == 0);
2842 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
2843 }
2844 OREGetter(Func.func())
2845 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
2846 << "created clone " << ore::NV("NewFunction", NewFunc));
2847 return {NewFunc, CloneNo};
2848}
2849
2850CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
2851 IndexCall>::FuncInfo
2852IndexCallsiteContextGraph::cloneFunctionForCallsite(
2853 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
2854 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
2855 // Check how many clones we have of Call (and therefore function).
2856 // The next clone number is the current size of versions array.
2857 // Confirm this matches the CloneNo provided by the caller, which is based on
2858 // the number of function clones we have.
2859 assert(CloneNo ==
2860 (Call.call().is<AllocInfo *>()
2861 ? Call.call().dyn_cast<AllocInfo *>()->Versions.size()
2862 : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size()));
2863 // Walk all the instructions in this function. Create a new version for
2864 // each (by adding an entry to the Versions/Clones summary array), and copy
2865 // over the version being called for the function clone being cloned here.
2866 // Additionally, add an entry to the CallMap for the new function clone,
2867 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
2868 // to the new call clone.
2869 for (auto &Inst : CallsWithMetadataInFunc) {
2870 // This map always has the initial version in it.
2871 assert(Inst.cloneNo() == 0);
2872 if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) {
2873 assert(AI->Versions.size() == CloneNo);
2874 // We assign the allocation type later (in updateAllocationCall), just add
2875 // an entry for it here.
2876 AI->Versions.push_back(0);
2877 } else {
2878 auto *CI = Inst.call().dyn_cast<CallsiteInfo *>();
2879 assert(CI && CI->Clones.size() == CloneNo);
2880 // We assign the clone number later (in updateCall), just add an entry for
2881 // it here.
2882 CI->Clones.push_back(0);
2883 }
2884 CallMap[Inst] = {Inst.call(), CloneNo};
2885 }
2886 return {Func.func(), CloneNo};
2887}
2888
2889// This method assigns cloned callsites to functions, cloning the functions as
2890// needed. The assignment is greedy and proceeds roughly as follows:
2891//
2892// For each function Func:
2893// For each call with graph Node having clones:
2894// Initialize ClonesWorklist to Node and its clones
2895// Initialize NodeCloneCount to 0
2896// While ClonesWorklist is not empty:
2897// Clone = pop front ClonesWorklist
2898// NodeCloneCount++
2899// If Func has been cloned less than NodeCloneCount times:
2900// If NodeCloneCount is 1:
2901// Assign Clone to original Func
2902// Continue
2903// Create a new function clone
2904// If other callers not assigned to call a function clone yet:
2905// Assign them to call new function clone
2906// Continue
2907// Assign any other caller calling the cloned version to new clone
2908//
2909// For each caller of Clone:
2910// If caller is assigned to call a specific function clone:
2911// If we cannot assign Clone to that function clone:
2912// Create new callsite Clone NewClone
2913// Add NewClone to ClonesWorklist
2914// Continue
2915// Assign Clone to existing caller's called function clone
2916// Else:
2917// If Clone not already assigned to a function clone:
2918// Assign to first function clone without assignment
2919// Assign caller to selected function clone
2920template <typename DerivedCCG, typename FuncTy, typename CallTy>
2921bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
2922 bool Changed = false;
2923
2924 // Keep track of the assignment of nodes (callsites) to function clones they
2925 // call.
2926 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
2927
2928 // Update caller node to call function version CalleeFunc, by recording the
2929 // assignment in CallsiteToCalleeFuncCloneMap.
2930 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
2931 const FuncInfo &CalleeFunc) {
2932 assert(Caller->hasCall());
2933 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
2934 };
2935
2936 // Walk all functions for which we saw calls with memprof metadata, and handle
2937 // cloning for each of its calls.
2938 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
2939 FuncInfo OrigFunc(Func);
2940 // Map from each clone of OrigFunc to a map of remappings of each call of
2941 // interest (from original uncloned call to the corresponding cloned call in
2942 // that function clone).
2943 std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
2944 for (auto &Call : CallsWithMetadata) {
2945 ContextNode *Node = getNodeForInst(Call);
2946 // Skip call if we do not have a node for it (all uses of its stack ids
2947 // were either on inlined chains or pruned from the MIBs), or if we did
2948 // not create any clones for it.
2949 if (!Node || Node->Clones.empty())
2950 continue;
2951 assert(Node->hasCall() &&
2952 "Not having a call should have prevented cloning");
2953
2954 // Track the assignment of function clones to clones of the current
2955 // callsite Node being handled.
2956 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
2957
2958 // Assign callsite version CallsiteClone to function version FuncClone,
2959 // and also assign (possibly cloned) Call to CallsiteClone.
2960 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
2961 CallInfo &Call,
2962 ContextNode *CallsiteClone,
2963 bool IsAlloc) {
2964 // Record the clone of callsite node assigned to this function clone.
2965 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
2966
2967 assert(FuncClonesToCallMap.count(FuncClone));
2968 std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
2969 CallInfo CallClone(Call);
2970 if (CallMap.count(Call))
2971 CallClone = CallMap[Call];
2972 CallsiteClone->setCall(CallClone);
2973 };
2974
2975 // Keep track of the clones of callsite Node that need to be assigned to
2976 // function clones. This list may be expanded in the loop body below if we
2977 // find additional cloning is required.
2978 std::deque<ContextNode *> ClonesWorklist;
2979 // Ignore original Node if we moved all of its contexts to clones.
2980 if (!Node->emptyContextIds())
2981 ClonesWorklist.push_back(Node);
2982 ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(),
2983 Node->Clones.end());
2984
2985 // Now walk through all of the clones of this callsite Node that we need,
2986 // and determine the assignment to a corresponding clone of the current
2987 // function (creating new function clones as needed).
2988 unsigned NodeCloneCount = 0;
2989 while (!ClonesWorklist.empty()) {
2990 ContextNode *Clone = ClonesWorklist.front();
2991 ClonesWorklist.pop_front();
2992 NodeCloneCount++;
2993 if (VerifyNodes)
2994 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
2995
2996 // Need to create a new function clone if we have more callsite clones
2997 // than existing function clones, which would have been assigned to an
2998 // earlier clone in the list (we assign callsite clones to function
2999 // clones greedily).
3000 if (FuncClonesToCallMap.size() < NodeCloneCount) {
3001 // If this is the first callsite copy, assign to original function.
3002 if (NodeCloneCount == 1) {
3003 // Since FuncClonesToCallMap is empty in this case, no clones have
3004 // been created for this function yet, and no callers should have
3005 // been assigned a function clone for this callee node yet.
3006 assert(llvm::none_of(
3007 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
3008 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
3009 }));
3010 // Initialize with empty call map, assign Clone to original function
3011 // and its callers, and skip to the next clone.
3012 FuncClonesToCallMap[OrigFunc] = {};
3013 AssignCallsiteCloneToFuncClone(
3014 OrigFunc, Call, Clone,
3015 AllocationCallToContextNodeMap.count(Call));
3016 for (auto &CE : Clone->CallerEdges) {
3017 // Ignore any caller that does not have a recorded callsite Call.
3018 if (!CE->Caller->hasCall())
3019 continue;
3020 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
3021 }
3022 continue;
3023 }
3024
3025 // First locate which copy of OrigFunc to clone again. If a caller
3026 // of this callsite clone was already assigned to call a particular
3027 // function clone, we need to redirect all of those callers to the
3028 // new function clone, and update their other callees within this
3029 // function.
3030 FuncInfo PreviousAssignedFuncClone;
3031 auto EI = llvm::find_if(
3032 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
3033 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
3034 });
3035 bool CallerAssignedToCloneOfFunc = false;
3036 if (EI != Clone->CallerEdges.end()) {
3037 const std::shared_ptr<ContextEdge> &Edge = *EI;
3038 PreviousAssignedFuncClone =
3039 CallsiteToCalleeFuncCloneMap[Edge->Caller];
3040 CallerAssignedToCloneOfFunc = true;
3041 }
3042
3043 // Clone function and save it along with the CallInfo map created
3044 // during cloning in the FuncClonesToCallMap.
3045 std::map<CallInfo, CallInfo> NewCallMap;
3046 unsigned CloneNo = FuncClonesToCallMap.size();
3047 assert(CloneNo > 0 && "Clone 0 is the original function, which "
3048 "should already exist in the map");
3049 FuncInfo NewFuncClone = cloneFunctionForCallsite(
3050 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
3051 FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
3052 FunctionClonesAnalysis++;
3053 Changed = true;
3054
3055 // If no caller callsites were already assigned to a clone of this
3056 // function, we can simply assign this clone to the new func clone
3057 // and update all callers to it, then skip to the next clone.
3058 if (!CallerAssignedToCloneOfFunc) {
3059 AssignCallsiteCloneToFuncClone(
3060 NewFuncClone, Call, Clone,
3061 AllocationCallToContextNodeMap.count(Call));
3062 for (auto &CE : Clone->CallerEdges) {
3063 // Ignore any caller that does not have a recorded callsite Call.
3064 if (!CE->Caller->hasCall())
3065 continue;
3066 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
3067 }
3068 continue;
3069 }
3070
3071 // We may need to do additional node cloning in this case.
3072 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
3073 // that were previously assigned to call PreviousAssignedFuncClone,
3074 // to record that they now call NewFuncClone.
3075 for (auto CE : Clone->CallerEdges) {
3076 // Skip any that have been removed on an earlier iteration.
3077 if (!CE)
3078 continue;
3079 // Ignore any caller that does not have a recorded callsite Call.
3080 if (!CE->Caller->hasCall())
3081 continue;
3082
3083 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
3084 // We subsequently fall through to later handling that
3085 // will perform any additional cloning required for
3086 // callers that were calling other function clones.
3087 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
3088 PreviousAssignedFuncClone)
3089 continue;
3090
3091 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
3092
3093 // If we are cloning a function that was already assigned to some
3094 // callers, then essentially we are creating new callsite clones
3095 // of the other callsites in that function that are reached by those
3096 // callers. Clone the other callees of the current callsite's caller
3097 // that were already assigned to PreviousAssignedFuncClone
3098 // accordingly. This is important since we subsequently update the
3099 // calls from the nodes in the graph and their assignments to callee
3100 // functions recorded in CallsiteToCalleeFuncCloneMap.
3101 for (auto CalleeEdge : CE->Caller->CalleeEdges) {
3102 // Skip any that have been removed on an earlier iteration when
3103 // cleaning up newly None type callee edges.
3104 if (!CalleeEdge)
3105 continue;
3106 ContextNode *Callee = CalleeEdge->Callee;
3107 // Skip the current callsite; we are looking for other
3108 // callsites Caller calls, as well as any that do not have a
3109 // recorded callsite Call.
3110 if (Callee == Clone || !Callee->hasCall())
3111 continue;
3112 ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
3113 removeNoneTypeCalleeEdges(NewClone);
3114 // Moving the edge may have resulted in some none type
3115 // callee edges on the original Callee.
3116 removeNoneTypeCalleeEdges(Callee);
3117 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
3118 // If the Callee node was already assigned to call a specific
3119 // function version, make sure its new clone is assigned to call
3120 // that same function clone.
3121 if (CallsiteToCalleeFuncCloneMap.count(Callee))
3122 RecordCalleeFuncOfCallsite(
3123 NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
3124 // Update NewClone with the new Call clone of this callsite's Call
3125 // created for the new function clone created earlier.
3126 // Recall that we have already ensured when building the graph
3127 // that each caller can only call callsites within the same
3128 // function, so we are guaranteed that Callee Call is in the
3129 // current OrigFunc.
3130 // CallMap is set up as indexed by original Call at clone 0.
3131 CallInfo OrigCall(Callee->getOrigNode()->Call);
3132 OrigCall.setCloneNo(0);
3133 std::map<CallInfo, CallInfo> &CallMap =
3134 FuncClonesToCallMap[NewFuncClone];
3135 assert(CallMap.count(OrigCall));
3136 CallInfo NewCall(CallMap[OrigCall]);
3137 assert(NewCall);
3138 NewClone->setCall(NewCall);
3139 }
3140 }
3141 // Fall through to handling below to perform the recording of the
3142 // function for this callsite clone. This enables handling of cases
3143 // where the callers were assigned to different clones of a function.
3144 }
3145
3146 // See if we can use existing function clone. Walk through
3147 // all caller edges to see if any have already been assigned to
3148 // a clone of this callsite's function. If we can use it, do so. If not,
3149 // because that function clone is already assigned to a different clone
3150 // of this callsite, then we need to clone again.
3151 // Basically, this checking is needed to handle the case where different
3152 // caller functions/callsites may need versions of this function
3153 // containing different mixes of callsite clones across the different
3154 // callsites within the function. If that happens, we need to create
3155 // additional function clones to handle the various combinations.
3156 //
3157 // Keep track of any new clones of this callsite created by the
3158 // following loop, as well as any existing clone that we decided to
3159 // assign this clone to.
3160 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
3161 FuncInfo FuncCloneAssignedToCurCallsiteClone;
3162 // We need to be able to remove Edge from CallerEdges, so need to adjust
3163 // iterator in the loop.
3164 for (auto EI = Clone->CallerEdges.begin();
3165 EI != Clone->CallerEdges.end();) {
3166 auto Edge = *EI;
3167 // Ignore any caller that does not have a recorded callsite Call.
3168 if (!Edge->Caller->hasCall()) {
3169 EI++;
3170 continue;
3171 }
3172 // If this caller already assigned to call a version of OrigFunc, need
3173 // to ensure we can assign this callsite clone to that function clone.
3174 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
3175 FuncInfo FuncCloneCalledByCaller =
3176 CallsiteToCalleeFuncCloneMap[Edge->Caller];
3177 // First we need to confirm that this function clone is available
3178 // for use by this callsite node clone.
3179 //
3180 // While FuncCloneToCurNodeCloneMap is built only for this Node and
3181 // its callsite clones, one of those callsite clones X could have
3182 // been assigned to the same function clone called by Edge's caller
3183 // - if Edge's caller calls another callsite within Node's original
3184 // function, and that callsite has another caller reaching clone X.
3185 // We need to clone Node again in this case.
3186 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
3187 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
3188 Clone) ||
3189 // Detect when we have multiple callers of this callsite that
3190 // have already been assigned to specific, and different, clones
3191 // of OrigFunc (due to other unrelated callsites in Func they
3192 // reach via call contexts). Is this Clone of callsite Node
3193 // assigned to a different clone of OrigFunc? If so, clone Node
3194 // again.
3195 (FuncCloneAssignedToCurCallsiteClone &&
3196 FuncCloneAssignedToCurCallsiteClone !=
3197 FuncCloneCalledByCaller)) {
3198 // We need to use a different newly created callsite clone, in
3199 // order to assign it to another new function clone on a
3200 // subsequent iteration over the Clones array (adjusted below).
3201 // Note we specifically do not reset the
3202 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
3203 // when this new clone is processed later we know which version of
3204 // the function to copy (so that other callsite clones we have
3205 // assigned to that function clone are properly cloned over). See
3206 // comments in the function cloning handling earlier.
3207
3208 // Check if we already have cloned this callsite again while
3209 // walking through caller edges, for a caller calling the same
3210 // function clone. If so, we can move this edge to that new clone
3211 // rather than creating yet another new clone.
3212 if (FuncCloneToNewCallsiteCloneMap.count(
3213 FuncCloneCalledByCaller)) {
3214 ContextNode *NewClone =
3215 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
3216 moveEdgeToExistingCalleeClone(Edge, NewClone, &EI);
3217 // Cleanup any none type edges cloned over.
3218 removeNoneTypeCalleeEdges(NewClone);
3219 } else {
3220 // Create a new callsite clone.
3221 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI);
3222 removeNoneTypeCalleeEdges(NewClone);
3223 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
3224 NewClone;
3225 // Add to list of clones and process later.
3226 ClonesWorklist.push_back(NewClone);
3227 assert(EI == Clone->CallerEdges.end() ||
3228 Clone->AllocTypes != (uint8_t)AllocationType::None);
3229 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
3230 }
3231 // Moving the caller edge may have resulted in some none type
3232 // callee edges.
3233 removeNoneTypeCalleeEdges(Clone);
3234 // We will handle the newly created callsite clone in a subsequent
3235 // iteration over this Node's Clones. Continue here since we
3236 // already adjusted iterator EI while moving the edge.
3237 continue;
3238 }
3239
3240 // Otherwise, we can use the function clone already assigned to this
3241 // caller.
3242 if (!FuncCloneAssignedToCurCallsiteClone) {
3243 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
3244 // Assign Clone to FuncCloneCalledByCaller
3245 AssignCallsiteCloneToFuncClone(
3246 FuncCloneCalledByCaller, Call, Clone,
3247 AllocationCallToContextNodeMap.count(Call));
3248 } else
3249 // Don't need to do anything - callsite is already calling this
3250 // function clone.
3251 assert(FuncCloneAssignedToCurCallsiteClone ==
3252 FuncCloneCalledByCaller);
3253
3254 } else {
3255 // We have not already assigned this caller to a version of
3256 // OrigFunc. Do the assignment now.
3257
3258 // First check if we have already assigned this callsite clone to a
3259 // clone of OrigFunc for another caller during this iteration over
3260 // its caller edges.
3261 if (!FuncCloneAssignedToCurCallsiteClone) {
3262 // Find first function in FuncClonesToCallMap without an assigned
3263 // clone of this callsite Node. We should always have one
3264 // available at this point due to the earlier cloning when the
3265 // FuncClonesToCallMap size was smaller than the clone number.
3266 for (auto &CF : FuncClonesToCallMap) {
3267 if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
3268 FuncCloneAssignedToCurCallsiteClone = CF.first;
3269 break;
3270 }
3271 }
3272 assert(FuncCloneAssignedToCurCallsiteClone);
3273 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
3274 AssignCallsiteCloneToFuncClone(
3275 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
3276 AllocationCallToContextNodeMap.count(Call));
3277 } else
3278 assert(FuncCloneToCurNodeCloneMap
3279 [FuncCloneAssignedToCurCallsiteClone] == Clone);
3280 // Update callers to record function version called.
3281 RecordCalleeFuncOfCallsite(Edge->Caller,
3282 FuncCloneAssignedToCurCallsiteClone);
3283 }
3284
3285 EI++;
3286 }
3287 }
3288 if (VerifyCCG) {
3289 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
3290 for (const auto &PE : Node->CalleeEdges)
3291 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
3292 for (const auto &CE : Node->CallerEdges)
3293 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
3294 for (auto *Clone : Node->Clones) {
3295 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
3296 for (const auto &PE : Clone->CalleeEdges)
3297 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
3298 for (const auto &CE : Clone->CallerEdges)
3299 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
3300 }
3301 }
3302 }
3303 }
3304
3305 auto UpdateCalls = [&](ContextNode *Node,
3306 DenseSet<const ContextNode *> &Visited,
3307 auto &&UpdateCalls) {
3308 auto Inserted = Visited.insert(Node);
3309 if (!Inserted.second)
3310 return;
3311
3312 for (auto *Clone : Node->Clones)
3313 UpdateCalls(Clone, Visited, UpdateCalls);
3314
3315 for (auto &Edge : Node->CallerEdges)
3316 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
3317
3318 // Skip if either no call to update, or if we ended up with no context ids
3319 // (we moved all edges onto other clones).
3320 if (!Node->hasCall() || Node->emptyContextIds())
3321 return;
3322
3323 if (Node->IsAllocation) {
3324 updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
3325 return;
3326 }
3327
3328 if (!CallsiteToCalleeFuncCloneMap.count(Node))
3329 return;
3330
3331 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
3332 updateCall(Node->Call, CalleeFunc);
3333 };
3334
3335 // Performs DFS traversal starting from allocation nodes to update calls to
3336 // reflect cloning decisions recorded earlier. For regular LTO this will
3337 // update the actual calls in the IR to call the appropriate function clone
3338 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
3339 // are recorded in the summary entries.
3340 DenseSet<const ContextNode *> Visited;
3341 for (auto &Entry : AllocationCallToContextNodeMap)
3342 UpdateCalls(Entry.second, Visited, UpdateCalls);
3343
3344 return Changed;
3345}
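// --- Illustrative sketch, not part of this file: stripped of the caller
// constraints that drive the extra cloning above, the core of
// assignFunctions() is a greedy hand-out of callsite clones to function
// clones, growing the set of function clones on demand. The names below are
// hypothetical.
#include <map>

std::map<unsigned, unsigned> assignGreedy(unsigned NumCallsiteClones) {
  unsigned NumFuncClones = 1; // function clone 0 is the original function
  std::map<unsigned, unsigned> CallsiteCloneToFuncClone;
  for (unsigned I = 0; I < NumCallsiteClones; ++I) {
    // Create a new function clone whenever there are more callsite clones
    // than function clones (cf. cloneFunctionForCallsite above). In the real
    // algorithm, callers already bound to a particular function clone can
    // force additional callsite and function clones beyond this 1:1 mapping.
    if (I >= NumFuncClones)
      ++NumFuncClones;
    CallsiteCloneToFuncClone[I] = I; // greedy 1:1 assignment
  }
  return CallsiteCloneToFuncClone;
}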
3346
3347 static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
3348 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
3349 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
3350 &FuncToAliasMap) {
3351 // The first "clone" is the original copy, we should only call this if we
3352 // needed to create new clones.
3353 assert(NumClones > 1);
3354 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
3355 VMaps.reserve(NumClones - 1);
3356 FunctionsClonedThinBackend++;
3357 for (unsigned I = 1; I < NumClones; I++) {
3358 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
3359 auto *NewF = CloneFunction(&F, *VMaps.back());
3360 FunctionClonesThinBackend++;
3361 // Strip memprof and callsite metadata from clone as they are no longer
3362 // needed.
3363 for (auto &BB : *NewF) {
3364 for (auto &Inst : BB) {
3365 Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
3366 Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
3367 }
3368 }
3369 std::string Name = getMemProfFuncName(F.getName(), I);
3370 auto *PrevF = M.getFunction(Name);
3371 if (PrevF) {
3372 // We might have created this when adjusting callsite in another
3373 // function. It should be a declaration.
3374 assert(PrevF->isDeclaration());
3375 NewF->takeName(PrevF);
3376 PrevF->replaceAllUsesWith(NewF);
3377 PrevF->eraseFromParent();
3378 } else
3379 NewF->setName(Name);
3380 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
3381 << "created clone " << ore::NV("NewFunction", NewF));
3382
3383 // Now handle aliases to this function, and clone those as well.
3384 if (!FuncToAliasMap.count(&F))
3385 continue;
3386 for (auto *A : FuncToAliasMap[&F]) {
3387 std::string Name = getMemProfFuncName(A->getName(), I);
3388 auto *PrevA = M.getNamedAlias(Name);
3389 auto *NewA = GlobalAlias::create(A->getValueType(),
3390 A->getType()->getPointerAddressSpace(),
3391 A->getLinkage(), Name, NewF);
3392 NewA->copyAttributesFrom(A);
3393 if (PrevA) {
3394 // We might have created this when adjusting callsite in another
3395 // function. It should be a declaration.
3396 assert(PrevA->isDeclaration());
3397 NewA->takeName(PrevA);
3398 PrevA->replaceAllUsesWith(NewA);
3399 PrevA->eraseFromParent();
3400 }
3401 }
3402 }
3403 return VMaps;
3404}
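// --- Illustrative sketch, not part of this file: the clone names created
// above come from getMemProfFuncName (defined earlier in this file). The
// exact ".memprof.<N>" suffix shown here is an assumption based on the
// MemProfCloneSuffix constant; clone 0 is the original and keeps its name.
#include <string>

std::string memprofCloneName(const std::string &Base, unsigned CloneNo) {
  if (CloneNo == 0)
    return Base; // the original function is "clone 0" and is not renamed
  return Base + ".memprof." + std::to_string(CloneNo); // assumed suffix
}
// e.g. memprofCloneName("_Z3foov", 2) would yield "_Z3foov.memprof.2".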
3405
3406// Locate the summary for F. This is complicated by the fact that it might
3407// have been internalized or promoted.
3408 static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
3409 const ModuleSummaryIndex *ImportSummary) {
3410 // FIXME: Ideally we would retain the original GUID in some fashion on the
3411 // function (e.g. as metadata), but for now do our best to locate the
3412 // summary without that information.
3413 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
3414 if (!TheFnVI)
3415 // See if TheFn was internalized, by checking the index directly with the
3416 // original name (this avoids the name adjustment done by getGUID() for
3417 // internal symbols).
3418 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName()));
3419 if (TheFnVI)
3420 return TheFnVI;
3421 // Now query with the original name before any promotion was performed.
3422 StringRef OrigName =
3423 ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
3424 std::string OrigId = GlobalValue::getGlobalIdentifier(
3425 OrigName, GlobalValue::InternalLinkage, M.getSourceFileName());
3426 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
3427 if (TheFnVI)
3428 return TheFnVI;
3429 // Could be a promoted local imported from another module. We need to pass
3430 // down more info here to find the original module id. For now, try with
3431 // the OrigName which might have been stored in the OidGuidMap in the
3432 // index. This would not work if there were same-named locals in multiple
3433 // modules, however.
3434 auto OrigGUID =
3435 ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName));
3436 if (OrigGUID)
3437 TheFnVI = ImportSummary->getValueInfo(OrigGUID);
3438 return TheFnVI;
3439}
3440
3441bool MemProfContextDisambiguation::applyImport(Module &M) {
3442 assert(ImportSummary);
3443 bool Changed = false;
3444
3445 auto IsMemProfClone = [](const Function &F) {
3446 return F.getName().contains(MemProfCloneSuffix);
3447 };
3448
3449 // We also need to clone any aliases that reference cloned functions, because
3450 // the modified callsites may invoke via the alias. Keep track of the aliases
3451 // for each function.
3452 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
3453 FuncToAliasMap;
3454 for (auto &A : M.aliases()) {
3455 auto *Aliasee = A.getAliaseeObject();
3456 if (auto *F = dyn_cast<Function>(Aliasee))
3457 FuncToAliasMap[F].insert(&A);
3458 }
3459
3460 for (auto &F : M) {
3461 if (F.isDeclaration() || IsMemProfClone(F))
3462 continue;
3463
3464 OptimizationRemarkEmitter ORE(&F);
3465
3466 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
3467 bool ClonesCreated = false;
3468 unsigned NumClonesCreated = 0;
3469 auto CloneFuncIfNeeded = [&](unsigned NumClones) {
3470 // We should at least have version 0 which is the original copy.
3471 assert(NumClones > 0);
3472 // If only one copy needed use original.
3473 if (NumClones == 1)
3474 return;
3475 // If we already performed cloning of this function, confirm that the
3476 // requested number of clones matches (the thin link should ensure the
3477 // number of clones for each constituent callsite is consistent within
3478 // each function), before returning.
3479 if (ClonesCreated) {
3480 assert(NumClonesCreated == NumClones);
3481 return;
3482 }
3483 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
3484 // The first "clone" is the original copy, which doesn't have a VMap.
3485 assert(VMaps.size() == NumClones - 1);
3486 Changed = true;
3487 ClonesCreated = true;
3488 NumClonesCreated = NumClones;
3489 };
3490
3491 auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
3492 Function *CalledFunction) {
3493 // Perform cloning if not yet done.
3494 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
3495
3496 // Should have skipped indirect calls via mayHaveMemprofSummary.
3497 assert(CalledFunction);
3498 assert(!IsMemProfClone(*CalledFunction));
3499
3500 // Update the calls per the summary info.
3501 // Save orig name since it gets updated in the first iteration
3502 // below.
3503 auto CalleeOrigName = CalledFunction->getName();
3504 for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
3505 // Do nothing if this version calls the original version of its
3506 // callee.
3507 if (!StackNode.Clones[J])
3508 continue;
3509 auto NewF = M.getOrInsertFunction(
3510 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
3511 CalledFunction->getFunctionType());
3512 CallBase *CBClone;
3513 // Copy 0 is the original function.
3514 if (!J)
3515 CBClone = CB;
3516 else
3517 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
3518 CBClone->setCalledFunction(NewF);
3519 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
3520 << ore::NV("Call", CBClone) << " in clone "
3521 << ore::NV("Caller", CBClone->getFunction())
3522 << " assigned to call function clone "
3523 << ore::NV("Callee", NewF.getCallee()));
3524 }
3525 };
3526
3527 // Locate the summary for F.
3528 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
3529 // If not found, this could be an imported local (see comment in
3530 // findValueInfoForFunc). Skip for now as it will be cloned in its original
3531 // module (where it would have been promoted to global scope so should
3532 // satisfy any reference in this module).
3533 if (!TheFnVI)
3534 continue;
3535
3536 auto *GVSummary =
3537 ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
3538 if (!GVSummary) {
3539 // Must have been imported, use the summary which matches the definition
3540 // (might be multiple if this was a linkonce_odr).
3541 auto SrcModuleMD = F.getMetadata("thinlto_src_module");
3542 assert(SrcModuleMD &&
3543 "enable-import-metadata is needed to emit thinlto_src_module");
3544 StringRef SrcModule =
3545 dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
3546 for (auto &GVS : TheFnVI.getSummaryList()) {
3547 if (GVS->modulePath() == SrcModule) {
3548 GVSummary = GVS.get();
3549 break;
3550 }
3551 }
3552 assert(GVSummary && GVSummary->modulePath() == SrcModule);
3553 }
3554
3555 // If this was an imported alias skip it as we won't have the function
3556 // summary, and it should be cloned in the original module.
3557 if (isa<AliasSummary>(GVSummary))
3558 continue;
3559
3560 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
3561
3562 if (FS->allocs().empty() && FS->callsites().empty())
3563 continue;
3564
3565 auto SI = FS->callsites().begin();
3566 auto AI = FS->allocs().begin();
3567
3568 // To handle callsite infos synthesized for tail calls which have missing
3569 // frames in the profiled context, map callee VI to the synthesized callsite
3570 // info.
3571 DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
3572 // Iterate the callsites for this function in reverse, since we place all
3573 // those synthesized for tail calls at the end.
3574 for (auto CallsiteIt = FS->callsites().rbegin();
3575 CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
3576 auto &Callsite = *CallsiteIt;
3577 // Stop as soon as we see a non-synthesized callsite info (see comment
3578 // above loop). All the entries added for discovered tail calls have empty
3579 // stack ids.
3580 if (!Callsite.StackIdIndices.empty())
3581 break;
3582 MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
3583 }
3584
3585 // Assume for now that the instructions are in the exact same order
3586 // as when the summary was created, but confirm this is correct by
3587 // matching the stack ids.
3588 for (auto &BB : F) {
3589 for (auto &I : BB) {
3590 auto *CB = dyn_cast<CallBase>(&I);
3591 // Same handling as when creating module summary.
3592 if (!mayHaveMemprofSummary(CB))
3593 continue;
3594
3595 auto *CalledValue = CB->getCalledOperand();
3596 auto *CalledFunction = CB->getCalledFunction();
3597 if (CalledValue && !CalledFunction) {
3598 CalledValue = CalledValue->stripPointerCasts();
3599 // Stripping pointer casts can reveal a called function.
3600 CalledFunction = dyn_cast<Function>(CalledValue);
3601 }
3602 // Check if this is an alias to a function. If so, get the
3603 // called aliasee for the checks below.
3604 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
3605 assert(!CalledFunction &&
3606 "Expected null called function in callsite for alias");
3607 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
3608 }
3609
3610 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
3611 I.getMetadata(LLVMContext::MD_callsite));
3612 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
3613
3614 // Include allocs that were already assigned a memprof function
3615 // attribute in the statistics.
3616 if (CB->getAttributes().hasFnAttr("memprof")) {
3617 assert(!MemProfMD);
3618 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
3619 ? AllocTypeColdThinBackend++
3620 : AllocTypeNotColdThinBackend++;
3621 OrigAllocsThinBackend++;
3622 AllocVersionsThinBackend++;
3623 if (!MaxAllocVersionsThinBackend)
3624 MaxAllocVersionsThinBackend = 1;
3625 // Remove any remaining callsite metadata and we can skip the rest of
3626 // the handling for this instruction, since no cloning needed.
3627 I.setMetadata(LLVMContext::MD_callsite, nullptr);
3628 continue;
3629 }
3630
3631 if (MemProfMD) {
3632 // Consult the next alloc node.
3633 assert(AI != FS->allocs().end());
3634 auto &AllocNode = *(AI++);
3635
3636 // Sanity check that the MIB stack ids match between the summary and
3637 // instruction metadata.
3638 auto MIBIter = AllocNode.MIBs.begin();
3639 for (auto &MDOp : MemProfMD->operands()) {
3640 assert(MIBIter != AllocNode.MIBs.end());
3641 LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter =
3642 MIBIter->StackIdIndices.begin();
3643 auto *MIBMD = cast<const MDNode>(MDOp);
3644 MDNode *StackMDNode = getMIBStackNode(MIBMD);
3645 assert(StackMDNode);
3646 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
3647 auto ContextIterBegin =
3648 StackContext.beginAfterSharedPrefix(CallsiteContext);
3649 // Skip the checking on the first iteration.
3650 uint64_t LastStackContextId =
3651 (ContextIterBegin != StackContext.end() &&
3652 *ContextIterBegin == 0)
3653 ? 1
3654 : 0;
3655 for (auto ContextIter = ContextIterBegin;
3656 ContextIter != StackContext.end(); ++ContextIter) {
3657 // If this is a direct recursion, simply skip the duplicate
3658 // entries, to be consistent with how the summary ids were
3659 // generated during ModuleSummaryAnalysis.
3660 if (LastStackContextId == *ContextIter)
3661 continue;
3662 LastStackContextId = *ContextIter;
3663 assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
3664 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
3665 *ContextIter);
3666 StackIdIndexIter++;
3667 }
3668 MIBIter++;
3669 }
3670
3671 // Perform cloning if not yet done.
3672 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());
3673
3674 OrigAllocsThinBackend++;
3675 AllocVersionsThinBackend += AllocNode.Versions.size();
3676 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
3677 MaxAllocVersionsThinBackend = AllocNode.Versions.size();
3678
3679 // If there is only one version that means we didn't end up
3680 // considering this function for cloning, and in that case the alloc
3681 // will still be none type or should have gotten the default NotCold.
3682 // Skip that after calling clone helper since that does some sanity
3683 // checks that confirm we haven't decided yet that we need cloning.
3684 if (AllocNode.Versions.size() == 1) {
3685 assert((AllocationType)AllocNode.Versions[0] ==
3686 AllocationType::NotCold ||
3687 (AllocationType)AllocNode.Versions[0] ==
3688 AllocationType::None);
3689 UnclonableAllocsThinBackend++;
3690 continue;
3691 }
3692
3693 // All versions should have a singular allocation type.
3694 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
3695 return Type == ((uint8_t)AllocationType::NotCold |
3696 (uint8_t)AllocationType::Cold);
3697 }));
3698
3699 // Update the allocation types per the summary info.
3700 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
3701 // Ignore any that didn't get an assigned allocation type.
3702 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
3703 continue;
3704 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
3705 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
3706 : AllocTypeNotColdThinBackend++;
3707 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
3708 auto A = llvm::Attribute::get(F.getContext(), "memprof",
3709 AllocTypeString);
3710 CallBase *CBClone;
3711 // Copy 0 is the original function.
3712 if (!J)
3713 CBClone = CB;
3714 else
3715 // Since VMaps are only created for new clones, we index with
3716 // clone J-1 (J==0 is the original clone and does not have a VMaps
3717 // entry).
3718 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
3719 CBClone->addFnAttr(A);
3720 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
3721 << ore::NV("AllocationCall", CBClone) << " in clone "
3722 << ore::NV("Caller", CBClone->getFunction())
3723 << " marked with memprof allocation attribute "
3724 << ore::NV("Attribute", AllocTypeString));
3725 }
3726 } else if (!CallsiteContext.empty()) {
3727 // Consult the next callsite node.
3728 assert(SI != FS->callsites().end());
3729 auto &StackNode = *(SI++);
3730
3731#ifndef NDEBUG
3732 // Sanity check that the stack ids match between the summary and
3733 // instruction metadata.
3734 auto StackIdIndexIter = StackNode.StackIdIndices.begin();
3735 for (auto StackId : CallsiteContext) {
3736 assert(StackIdIndexIter != StackNode.StackIdIndices.end());
3737 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
3738 StackId);
3739 StackIdIndexIter++;
3740 }
3741#endif
3742
3743 CloneCallsite(StackNode, CB, CalledFunction);
3744 } else if (CB->isTailCall()) {
3745 // Locate the synthesized callsite info for the callee VI, if any was
3746 // created, and use that for cloning.
3747 ValueInfo CalleeVI =
3748 findValueInfoForFunc(*CalledFunction, M, ImportSummary);
3749 if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
3750 auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
3751 assert(Callsite != MapTailCallCalleeVIToCallsite.end());
3752 CloneCallsite(Callsite->second, CB, CalledFunction);
3753 }
3754 }
3755 // Memprof and callsite metadata on memory allocations no longer needed.
3756 I.setMetadata(LLVMContext::MD_memprof, nullptr);
3757 I.setMetadata(LLVMContext::MD_callsite, nullptr);
3758 }
3759 }
3760 }
3761
3762 return Changed;
3763}
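// --- Illustrative sketch, not part of this file: applyImport relies on the
// allocation and callsite summary records being in the same order as the
// annotated instructions they were created from, consuming one record per
// matching instruction (stack ids are only used as a sanity check). A
// stand-alone rendering of that parallel walk, with hypothetical stand-in
// types:
#include <cassert>
#include <cstddef>
#include <vector>

struct InstSketch { bool HasMemProfMD = false; bool HasCallsiteMD = false; };
struct AllocRecord {};    // stand-in for a summary alloc entry
struct CallsiteRecord {}; // stand-in for a summary callsite entry

void walkInOrder(const std::vector<InstSketch> &Insts,
                 const std::vector<AllocRecord> &Allocs,
                 const std::vector<CallsiteRecord> &Callsites) {
  std::size_t AI = 0, SI = 0; // mirrors the AI/SI iterators in applyImport
  for (const InstSketch &I : Insts) {
    if (I.HasMemProfMD) {
      assert(AI < Allocs.size());
      ++AI; // consume the next alloc record and apply its per-clone types
    } else if (I.HasCallsiteMD) {
      assert(SI < Callsites.size());
      ++SI; // consume the next callsite record and redirect each clone
    }
  }
}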
3764
3765template <typename DerivedCCG, typename FuncTy, typename CallTy>
3766bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
3767 if (DumpCCG) {
3768 dbgs() << "CCG before cloning:\n";
3769 dbgs() << *this;
3770 }
3771 if (ExportToDot)
3772 exportToDot("postbuild");
3773
3774 if (VerifyCCG) {
3775 check();
3776 }
3777
3778 identifyClones();
3779
3780 if (VerifyCCG) {
3781 check();
3782 }
3783
3784 if (DumpCCG) {
3785 dbgs() << "CCG after cloning:\n";
3786 dbgs() << *this;
3787 }
3788 if (ExportToDot)
3789 exportToDot("cloned");
3790
3791 bool Changed = assignFunctions();
3792
3793 if (DumpCCG) {
3794 dbgs() << "CCG after assigning function clones:\n";
3795 dbgs() << *this;
3796 }
3797 if (ExportToDot)
3798 exportToDot("clonefuncassign");
3799
3800 return Changed;
3801}
3802
3803bool MemProfContextDisambiguation::processModule(
3804 Module &M,
3805 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
3806
3807 // If we have an import summary, then the cloning decisions were made during
3808 // the thin link on the index. Apply them and return.
3809 if (ImportSummary)
3810 return applyImport(M);
3811
3812 // TODO: If/when other types of memprof cloning are enabled beyond just for
3813 // hot and cold, we will need to change this to individually control the
3814 // AllocationType passed to addStackNodesForMIB during CCG construction.
3815 // Note that we specifically check this after applying imports above, so that
3816 // the option isn't needed to be passed to distributed ThinLTO backend
3817 // clang processes, which won't necessarily have visibility into the linker
3818 // dependences. Instead the information is communicated from the LTO link to
3819 // the backends via the combined summary index.
3820 if (!SupportsHotColdNew)
3821 return false;
3822
3823 ModuleCallsiteContextGraph CCG(M, OREGetter);
3824 return CCG.process();
3825}
3826
3827 MemProfContextDisambiguation::MemProfContextDisambiguation(
3828 const ModuleSummaryIndex *Summary)
3829 : ImportSummary(Summary) {
3830 if (ImportSummary) {
3831 // The MemProfImportSummary should only be used for testing ThinLTO
3832 // distributed backend handling via opt, in which case we don't have a
3833 // summary from the pass pipeline.
3834 assert(MemProfImportSummary.empty());
3835 return;
3836 }
3837 if (MemProfImportSummary.empty())
3838 return;
3839
3840 auto ReadSummaryFile =
3841 errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
3842 if (!ReadSummaryFile) {
3843 logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
3844 "Error loading file '" + MemProfImportSummary +
3845 "': ");
3846 return;
3847 }
3848 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
3849 if (!ImportSummaryForTestingOrErr) {
3850 logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
3851 "Error parsing file '" + MemProfImportSummary +
3852 "': ");
3853 return;
3854 }
3855 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
3856 ImportSummary = ImportSummaryForTesting.get();
3857}
3858
3859 PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
3860 ModuleAnalysisManager &AM) {
3861 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
3862 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
3863 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
3864 };
3865 if (!processModule(M, OREGetter))
3866 return PreservedAnalyses::all();
3867 return PreservedAnalyses::none();
3868}
3869
3870 void MemProfContextDisambiguation::run(
3871 ModuleSummaryIndex &Index,
3872 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
3873 isPrevailing) {
3874 // TODO: If/when other types of memprof cloning are enabled beyond just for
3875 // hot and cold, we will need to change this to individually control the
3876 // AllocationType passed to addStackNodesForMIB during CCG construction.
3877 // The index was set from the option, so these should be in sync.
3878 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
3879 if (!SupportsHotColdNew)
3880 return;
3881
3882 IndexCallsiteContextGraph CCG(Index, isPrevailing);
3883 CCG.process();
3884}