1//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements support for context disambiguation of allocation
10// calls for profile guided heap optimization. Specifically, it uses Memprof
11// profiles which indicate context specific allocation behavior (currently
12// distinguishing cold vs hot memory allocations). Cloning is performed to
13// expose the cold allocation call contexts, and the allocation calls are
14// subsequently annotated with an attribute for later transformation.
15//
16// The transformations can be performed either directly on IR (regular LTO), or
17// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18// Both types of LTO operate on the same base graph representation, which
19// uses CRTP to support either IR or Index formats.
20//
21//===----------------------------------------------------------------------===//
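//
// For illustration only (a hedged sketch; the names and attribute spelling
// below are assumptions, not code from this file): if leaf() allocates
// memory that is cold when reached from one caller and not cold from
// another, cloning exposes each context separately so the allocation call
// in each clone can be annotated unambiguously:
//
//   void leaf()           { P = new char[N]; }  // kept for notcold contexts
//   void leaf.memprof.1() { P = new char[N]; }  // annotated "memprof"="cold"
//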
22
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/ADT/DenseSet.h"
26#include "llvm/ADT/MapVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
37#include "llvm/IR/Module.h"
39#include "llvm/Pass.h"
43#include "llvm/Transforms/IPO.h"
47#include <deque>
48#include <sstream>
49#include <unordered_map>
50#include <vector>
51using namespace llvm;
52using namespace llvm::memprof;
53
54#define DEBUG_TYPE "memprof-context-disambiguation"
55
56STATISTIC(FunctionClonesAnalysis,
57 "Number of function clones created during whole program analysis");
58STATISTIC(FunctionClonesThinBackend,
59 "Number of function clones created during ThinLTO backend");
60STATISTIC(FunctionsClonedThinBackend,
61 "Number of functions that had clones created during ThinLTO backend");
62STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
63 "cloned) during whole program analysis");
64STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
65 "during whole program analysis");
66STATISTIC(AllocTypeNotColdThinBackend,
67 "Number of not cold static allocations (possibly cloned) during "
68 "ThinLTO backend");
69STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
70 "(possibly cloned) during ThinLTO backend");
71STATISTIC(OrigAllocsThinBackend,
72 "Number of original (not cloned) allocations with memprof profiles "
73 "during ThinLTO backend");
74STATISTIC(
75 AllocVersionsThinBackend,
76 "Number of allocation versions (including clones) during ThinLTO backend");
77STATISTIC(MaxAllocVersionsThinBackend,
78 "Maximum number of allocation versions created for an original "
79 "allocation during ThinLTO backend");
80STATISTIC(UnclonableAllocsThinBackend,
81 "Number of unclonable ambigous allocations during ThinLTO backend");
82STATISTIC(RemovedEdgesWithMismatchedCallees,
83 "Number of edges removed due to mismatched callees (profiled vs IR)");
84STATISTIC(FoundProfiledCalleeCount,
85 "Number of profiled callees found via tail calls");
86STATISTIC(FoundProfiledCalleeDepth,
87 "Aggregate depth of profiled callees found via tail calls");
88STATISTIC(FoundProfiledCalleeMaxDepth,
89 "Maximum depth of profiled callees found via tail calls");
90STATISTIC(FoundProfiledCalleeNonUniquelyCount,
91 "Number of profiled callees found via multiple tail call chains");
92
94 "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
95 cl::value_desc("filename"),
96 cl::desc("Specify the path prefix of the MemProf dot files."));
97
98static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
100 cl::desc("Export graph to dot files."));
101
102static cl::opt<bool>
103 DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
104 cl::desc("Dump CallingContextGraph to stdout after each stage."));
105
106static cl::opt<bool>
107 VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
108 cl::desc("Perform verification checks on CallingContextGraph."));
109
110static cl::opt<bool>
111 VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
112 cl::desc("Perform frequent verification checks on nodes."));
113
115 "memprof-import-summary",
116 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
117 cl::Hidden);
118
119static cl::opt<unsigned>
120 TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
121 cl::Hidden,
122 cl::desc("Max depth to recursively search for missing "
123 "frames through tail calls."));
124
125namespace llvm {
127 "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
128 cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
129
130// Indicate we are linking with an allocator that supports hot/cold operator
131// new interfaces.
133 "supports-hot-cold-new", cl::init(false), cl::Hidden,
134 cl::desc("Linking with hot/cold operator new interfaces"));
135
137 "memprof-require-definition-for-promotion", cl::init(false), cl::Hidden,
138 cl::desc(
139 "Require target function definition when promoting indirect calls"));
140} // namespace llvm
141
142extern cl::opt<bool> MemProfReportHintedSizes;
143
144namespace {
145/// CRTP base for graphs built from either IR or ThinLTO summary index.
146///
147/// The graph represents the call contexts in all memprof metadata on allocation
148/// calls, with nodes for the allocations themselves, as well as for the calls
149/// in each context. The graph is initially built from the allocation memprof
150/// metadata (or summary) MIBs. It is then updated to match calls with callsite
151/// metadata onto the nodes, reflecting any inlining performed on those
152/// calls.
153///
154/// Each MIB (representing an allocation's call context with allocation
155/// behavior) is assigned a unique context id during the graph build. The edges
156/// and nodes in the graph are decorated with the context ids they carry. This
157/// is used to correctly update the graph when cloning is performed so that we
158/// can uniquify the context for a single (possibly cloned) allocation.
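///
/// A small hedged illustration (function names invented for this comment):
/// two profiled contexts
///   main  -> foo -> alloc   (cold,    context id 1)
///   main2 -> foo -> alloc   (notcold, context id 2)
/// produce nodes for alloc, foo, main and main2. The caller edge
/// alloc <- foo carries ids {1,2}, foo <- main carries {1}, and foo <- main2
/// carries {2}, so cloning foo can later separate the cold context.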
159template <typename DerivedCCG, typename FuncTy, typename CallTy>
160class CallsiteContextGraph {
161public:
162 CallsiteContextGraph() = default;
163 CallsiteContextGraph(const CallsiteContextGraph &) = default;
164 CallsiteContextGraph(CallsiteContextGraph &&) = default;
165
166 /// Main entry point to perform analysis and transformations on graph.
167 bool process();
168
169 /// Perform cloning on the graph necessary to uniquely identify the allocation
170 /// behavior of an allocation based on its context.
171 void identifyClones();
172
173 /// Assign callsite clones to functions, cloning functions as needed to
174 /// accommodate the combinations of their callsite clones reached by callers.
175 /// For regular LTO this clones functions and callsites in the IR, but for
176 /// ThinLTO the cloning decisions are noted in the summaries and later applied
177 /// in applyImport.
178 bool assignFunctions();
179
180 void dump() const;
181 void print(raw_ostream &OS) const;
182 void printTotalSizes(raw_ostream &OS) const;
183
184 friend raw_ostream &operator<<(raw_ostream &OS,
185 const CallsiteContextGraph &CCG) {
186 CCG.print(OS);
187 return OS;
188 }
189
190 friend struct GraphTraits<
191 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
192 friend struct DOTGraphTraits<
193 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
194
195 void exportToDot(std::string Label) const;
196
197 /// Represents a function clone via FuncTy pointer and clone number pair.
198 struct FuncInfo final
199 : public std::pair<FuncTy *, unsigned /*Clone number*/> {
200 using Base = std::pair<FuncTy *, unsigned>;
201 FuncInfo(const Base &B) : Base(B) {}
202 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
203 explicit operator bool() const { return this->first != nullptr; }
204 FuncTy *func() const { return this->first; }
205 unsigned cloneNo() const { return this->second; }
206 };
207
208 /// Represents a callsite clone via CallTy and clone number pair.
209 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
210 using Base = std::pair<CallTy, unsigned>;
211 CallInfo(const Base &B) : Base(B) {}
212 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
213 : Base(Call, CloneNo) {}
214 explicit operator bool() const { return (bool)this->first; }
215 CallTy call() const { return this->first; }
216 unsigned cloneNo() const { return this->second; }
217 void setCloneNo(unsigned N) { this->second = N; }
218 void print(raw_ostream &OS) const {
219 if (!operator bool()) {
220 assert(!cloneNo());
221 OS << "null Call";
222 return;
223 }
224 call()->print(OS);
225 OS << "\t(clone " << cloneNo() << ")";
226 }
227 void dump() const {
228 print(dbgs());
229 dbgs() << "\n";
230 }
231 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
232 Call.print(OS);
233 return OS;
234 }
235 };
236
237 struct ContextEdge;
238
239 /// Node in the Callsite Context Graph
240 struct ContextNode {
241 // Keep this for now since in the IR case, where we have an Instruction*, it
242 // is not as immediately discoverable. Used for printing richer information
243 // when dumping graph.
244 bool IsAllocation;
245
246 // Keeps track of when the Call was reset to null because there was
247 // recursion.
248 bool Recursive = false;
249
250 // This will be formed by ORing together the AllocationType enum values
251 // for contexts including this node.
252 uint8_t AllocTypes = 0;
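    // For example, assuming the usual memprof encoding (NotCold == 1,
    // Cold == 2), a node reached by both cold and not-cold contexts would
    // have AllocTypes == 3.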
253
254 // The corresponding allocation or interior call. This is the primary call
255 // for which we have created this node.
256 CallInfo Call;
257
258 // List of other calls that can be treated the same as the primary call
259 // through cloning. I.e. located in the same function and have the same
260 // (possibly pruned) stack ids. They will be updated the same way as the
261 // primary call when assigning to function clones.
262 SmallVector<CallInfo, 0> MatchingCalls;
263
264 // For alloc nodes this is a unique id assigned when constructed, and for
265 // callsite stack nodes it is the original stack id when the node is
266 // constructed from the memprof MIB metadata on the alloc nodes. Note that
267 // this is only used when matching callsite metadata onto the stack nodes
268 // created when processing the allocation memprof MIBs, and for labeling
269 // nodes in the dot graph. Therefore we don't bother to assign a value for
270 // clones.
271 uint64_t OrigStackOrAllocId = 0;
272
273 // Edges to all callees in the profiled call stacks.
274 // TODO: Should this be a map (from Callee node) for more efficient lookup?
275 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
276
277 // Edges to all callers in the profiled call stacks.
278 // TODO: Should this be a map (from Caller node) for more efficient lookup?
279 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
280
281 // Get the list of edges from which we can compute allocation information
282 // such as the context ids and allocation type of this node.
283 const std::vector<std::shared_ptr<ContextEdge>> *
284 getEdgesWithAllocInfo() const {
285 // If node has any callees, compute from those, otherwise compute from
286 // callers (i.e. if this is the leaf allocation node).
287 if (!CalleeEdges.empty())
288 return &CalleeEdges;
289 if (!CallerEdges.empty()) {
290 // A node with caller edges but no callee edges must be the allocation
291 // node.
292 assert(IsAllocation);
293 return &CallerEdges;
294 }
295 return nullptr;
296 }
297
298 // Compute the context ids for this node from the union of its edge context
299 // ids.
300 DenseSet<uint32_t> getContextIds() const {
301 DenseSet<uint32_t> ContextIds;
302 auto *Edges = getEdgesWithAllocInfo();
303 if (!Edges)
304 return {};
305 unsigned Count = 0;
306 for (auto &Edge : *Edges)
307 Count += Edge->getContextIds().size();
308 ContextIds.reserve(Count);
309 for (auto &Edge : *Edges)
310 ContextIds.insert(Edge->getContextIds().begin(),
311 Edge->getContextIds().end());
312 return ContextIds;
313 }
314
315 // Compute the allocation type for this node from the OR of its edge
316 // allocation types.
317 uint8_t computeAllocType() const {
318 auto *Edges = getEdgesWithAllocInfo();
319 if (!Edges)
320 return (uint8_t)AllocationType::None;
321 uint8_t BothTypes =
322 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
323 uint8_t AllocType = (uint8_t)AllocationType::None;
324 for (auto &Edge : *Edges) {
325 AllocType |= Edge->AllocTypes;
326 // Bail early if alloc type reached both, no further refinement.
327 if (AllocType == BothTypes)
328 return AllocType;
329 }
330 return AllocType;
331 }
332
333 // The context ids set for this node is empty if its edge context ids are
334 // also all empty.
335 bool emptyContextIds() const {
336 auto *Edges = getEdgesWithAllocInfo();
337 if (!Edges)
338 return true;
339 for (auto &Edge : *Edges) {
340 if (!Edge->getContextIds().empty())
341 return false;
342 }
343 return true;
344 }
345
346 // List of clones of this ContextNode, initially empty.
347 std::vector<ContextNode *> Clones;
348
349 // If a clone, points to the original uncloned node.
350 ContextNode *CloneOf = nullptr;
351
352 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
353
354 ContextNode(bool IsAllocation, CallInfo C)
355 : IsAllocation(IsAllocation), Call(C) {}
356
357 void addClone(ContextNode *Clone) {
358 if (CloneOf) {
359 CloneOf->Clones.push_back(Clone);
360 Clone->CloneOf = CloneOf;
361 } else {
362 Clones.push_back(Clone);
363 assert(!Clone->CloneOf);
364 Clone->CloneOf = this;
365 }
366 }
367
368 ContextNode *getOrigNode() {
369 if (!CloneOf)
370 return this;
371 return CloneOf;
372 }
373
374 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
375 unsigned int ContextId);
376
377 ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
378 ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
379 void eraseCalleeEdge(const ContextEdge *Edge);
380 void eraseCallerEdge(const ContextEdge *Edge);
381
382 void setCall(CallInfo C) { Call = C; }
383
384 bool hasCall() const { return (bool)Call.call(); }
385
386 void printCall(raw_ostream &OS) const { Call.print(OS); }
387
388 // True if this node was effectively removed from the graph, in which case
389 // it should have an allocation type of None and empty context ids.
390 bool isRemoved() const {
391 assert((AllocTypes == (uint8_t)AllocationType::None) ==
392 emptyContextIds());
393 return AllocTypes == (uint8_t)AllocationType::None;
394 }
395
396 void dump() const;
397 void print(raw_ostream &OS) const;
398
399 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
400 Node.print(OS);
401 return OS;
402 }
403 };
404
405 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
406 /// callee.
407 struct ContextEdge {
408 ContextNode *Callee;
409 ContextNode *Caller;
410
411 // This will be formed by ORing together the AllocationType enum values
412 // for contexts including this edge.
413 uint8_t AllocTypes = 0;
414
415 // The set of IDs for contexts including this edge.
416 DenseSet<uint32_t> ContextIds;
417
418 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
419 DenseSet<uint32_t> ContextIds)
420 : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
421 ContextIds(std::move(ContextIds)) {}
422
423 DenseSet<uint32_t> &getContextIds() { return ContextIds; }
424
425 // Helper to clear the fields of this edge when we are removing it from the
426 // graph.
427 inline void clear() {
428 ContextIds.clear();
429 AllocTypes = (uint8_t)AllocationType::None;
430 Caller = nullptr;
431 Callee = nullptr;
432 }
433
434 // Check if edge was removed from the graph. This is useful while iterating
435 // over a copy of edge lists when performing operations that mutate the
436 // graph in ways that might remove one of the edges.
437 inline bool isRemoved() const {
438 if (Callee || Caller)
439 return false;
440 // Any edges that have been removed from the graph but are still in a
441 // shared_ptr somewhere should have all fields null'ed out by clear()
442 // above.
443 assert(AllocTypes == (uint8_t)AllocationType::None);
444 assert(ContextIds.empty());
445 return true;
446 }
447
448 void dump() const;
449 void print(raw_ostream &OS) const;
450
451 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
452 Edge.print(OS);
453 return OS;
454 }
455 };
456
457 /// Helpers to remove edges that have allocation type None (due to not
458 /// carrying any context ids) after transformations.
459 void removeNoneTypeCalleeEdges(ContextNode *Node);
460 void removeNoneTypeCallerEdges(ContextNode *Node);
461 void
462 recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
463 DenseSet<const ContextNode *> &Visited);
464
465protected:
466 /// Get a list of nodes corresponding to the stack ids in the given callsite
467 /// context.
468 template <class NodeT, class IteratorT>
469 std::vector<uint64_t>
470 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
471
472 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
473 /// metadata (or summary).
474 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
475
476 /// Adds nodes for the given MIB stack ids.
477 template <class NodeT, class IteratorT>
478 void addStackNodesForMIB(ContextNode *AllocNode,
479 CallStack<NodeT, IteratorT> &StackContext,
480 CallStack<NodeT, IteratorT> &CallsiteContext,
481 AllocationType AllocType,
482 ArrayRef<ContextTotalSize> ContextSizeInfo);
483
484 /// Matches all callsite metadata (or summary) to the nodes created for
485 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
486 /// inlining performed on those callsite instructions.
487 void updateStackNodes();
488
489 /// Update graph to conservatively handle any callsite stack nodes that target
490 /// multiple different callee target functions.
491 void handleCallsitesWithMultipleTargets();
492
493 // Try to partition calls on the given node (already placed into the AllCalls
494 // array) by callee function, creating new copies of Node as needed to hold
495 // calls with different callees, and moving the callee edges appropriately.
496 // Returns true if partitioning was successful.
497 bool partitionCallsByCallee(
498 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
499 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode);
500
501 /// Save lists of calls with MemProf metadata in each function, for faster
502 /// iteration.
503 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
504
505 /// Map from callsite node to the enclosing caller function.
506 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
507
508private:
509 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
510
511 // Structure to keep track of information for each call as we are matching
512 // non-allocation callsites onto context nodes created from the allocation
513 // call metadata / summary contexts.
514 struct CallContextInfo {
515 // The callsite we're trying to match.
516 CallTy Call;
517 // The callsite's stack ids that have a context node in the graph.
518 std::vector<uint64_t> StackIds;
519 // The function containing this callsite.
520 const FuncTy *Func;
521 // Initially empty, if needed this will be updated to contain the context
522 // ids for use in a new context node created for this callsite.
523 DenseSet<uint32_t> ContextIds;
524 };
525
526 /// Helper to remove edge from graph, updating edge iterator if it is provided
527 /// (in which case CalleeIter indicates which edge list is being iterated).
528 /// This will also perform the necessary clearing of the ContextEdge members
529 /// to enable later checking if the edge has been removed (since we may have
530 /// other copies of the shared_ptr in existence, and in fact rely on this to
531 /// enable removal while iterating over a copy of a node's edge list).
532 void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr,
533 bool CalleeIter = true);
534
535 /// Assigns the given Node to calls at or inlined into the location with
536 /// the Node's stack id, after post order traversing and processing its
537 /// caller nodes. Uses the call information recorded in the given
538 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
539 /// as needed. Called by updateStackNodes which sets up the given
540 /// StackIdToMatchingCalls map.
541 void assignStackNodesPostOrder(
542 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
543 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
544 DenseMap<CallInfo, CallInfo> &CallToMatchingCall);
545
546 /// Duplicates the given set of context ids, updating the provided
547 /// map from each original id with the newly generated context ids,
548 /// and returning the new duplicated id set.
549 DenseSet<uint32_t> duplicateContextIds(
550 const DenseSet<uint32_t> &StackSequenceContextIds,
551 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
552
553 /// Propagates all duplicated context ids across the graph.
554 void propagateDuplicateContextIds(
555 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
556
557 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
558 /// else to its callers. Also updates OrigNode's edges to remove any context
559 /// ids moved to the newly created edge.
560 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
561 bool TowardsCallee,
562 DenseSet<uint32_t> RemainingContextIds);
563
564 /// Get the stack id corresponding to the given Id or Index (for IR this will
565 /// return itself, for a summary index this will return the id recorded in the
566 /// index for that stack id index value).
567 uint64_t getStackId(uint64_t IdOrIndex) const {
568 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
569 }
570
571 /// Returns true if the given call targets the callee of the given edge, or if
572 /// we were able to identify the call chain through intermediate tail calls.
573 /// In the latter case new context nodes are added to the graph for the
574 /// identified tail calls, and their synthesized nodes are added to
575 /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
576 /// the updated edges and to prepare it for an increment in the caller.
577 bool
578 calleesMatch(CallTy Call, EdgeIter &EI,
579 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
580
581 // Return the callee function of the given call, or nullptr if it can't be
582 // determined
583 const FuncTy *getCalleeFunc(CallTy Call) {
584 return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call);
585 }
586
587 /// Returns true if the given call targets the given function, or if we were
588 /// able to identify the call chain through intermediate tail calls (in which
589 /// case FoundCalleeChain will be populated).
590 bool calleeMatchesFunc(
591 CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
592 std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
593 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
594 Call, Func, CallerFunc, FoundCalleeChain);
595 }
596
597 /// Returns true if both call instructions have the same callee.
598 bool sameCallee(CallTy Call1, CallTy Call2) {
599 return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2);
600 }
601
602 /// Get a list of nodes corresponding to the stack ids in the given
603 /// callsite's context.
604 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
605 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
606 Call);
607 }
608
609 /// Get the last stack id in the context for callsite.
610 uint64_t getLastStackId(CallTy Call) {
611 return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
612 }
613
614 /// Update the allocation call to record type of allocated memory.
615 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
616 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
617 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
618 }
619
620 /// Update non-allocation call to invoke (possibly cloned) function
621 /// CalleeFunc.
622 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
623 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
624 }
625
626 /// Clone the given function for the given callsite, recording a mapping of
627 /// all of the function's tracked calls to their new versions in the CallMap.
628 /// Assigns new clones to clone number CloneNo.
629 FuncInfo cloneFunctionForCallsite(
630 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
631 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
632 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
633 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
634 }
635
636 /// Gets a label to use in the dot graph for the given call clone in the given
637 /// function.
638 std::string getLabel(const FuncTy *Func, const CallTy Call,
639 unsigned CloneNo) const {
640 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
641 }
642
643 // Create and return a new ContextNode.
644 ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr,
645 CallInfo C = CallInfo()) {
646 NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C));
647 auto *NewNode = NodeOwner.back().get();
648 if (F)
649 NodeToCallingFunc[NewNode] = F;
650 return NewNode;
651 }
652
653 /// Helpers to find the node corresponding to the given call or stackid.
654 ContextNode *getNodeForInst(const CallInfo &C);
655 ContextNode *getNodeForAlloc(const CallInfo &C);
656 ContextNode *getNodeForStackId(uint64_t StackId);
657
658 /// Computes the alloc type corresponding to the given context ids, by
659 /// unioning their recorded alloc types.
660 uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds);
661
662 /// Returns the allocation type of the intersection of the contexts of two
663 /// nodes (based on their provided context id sets), optimized for the case
664 /// when Node1Ids is smaller than Node2Ids.
665 uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
666 const DenseSet<uint32_t> &Node2Ids);
667
668 /// Returns the allocation type of the intersection of the contexts of two
669 /// nodes (based on their provided context id sets).
670 uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
671 const DenseSet<uint32_t> &Node2Ids);
672
673 /// Create a clone of Edge's callee and move Edge to that new callee node,
674 /// performing the necessary context id and allocation type updates.
675 /// If callee's caller edge iterator is supplied, it is updated when removing
676 /// the edge from that list. If ContextIdsToMove is non-empty, only that
677 /// subset of Edge's ids are moved to an edge to the new callee.
678 ContextNode *
679 moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
680 EdgeIter *CallerEdgeI = nullptr,
681 DenseSet<uint32_t> ContextIdsToMove = {});
682
683 /// Change the callee of Edge to existing callee clone NewCallee, performing
684 /// the necessary context id and allocation type updates.
685 /// If callee's caller edge iterator is supplied, it is updated when removing
686 /// the edge from that list. If ContextIdsToMove is non-empty, only that
687 /// subset of Edge's ids are moved to an edge to the new callee.
688 void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
689 ContextNode *NewCallee,
690 EdgeIter *CallerEdgeI = nullptr,
691 bool NewClone = false,
692 DenseSet<uint32_t> ContextIdsToMove = {});
693
694 /// Change the caller of the edge at the given callee edge iterator to be
695 /// NewCaller, performing the necessary context id and allocation type
696 /// updates. The iterator is updated as the edge is removed from the list of
697 /// callee edges in the original caller. This is similar to the above
698 /// moveEdgeToExistingCalleeClone, but a simplified version of it as we always
699 /// move the given edge and all of its context ids.
700 void moveCalleeEdgeToNewCaller(EdgeIter &CalleeEdgeI, ContextNode *NewCaller);
701
702 /// Recursively perform cloning on the graph for the given Node and its
703 /// callers, in order to uniquely identify the allocation behavior of an
704 /// allocation given its context. The context ids of the allocation being
705 /// processed are given in AllocContextIds.
706 void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
707 const DenseSet<uint32_t> &AllocContextIds);
708
709 /// Map from each context ID to the AllocationType assigned to that context.
710 DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
711
712 /// Map from each contextID to the profiled full contexts and their total
713 /// sizes (there may be more than one due to context trimming),
714 /// optionally populated when requested (via MemProfReportHintedSizes).
715 DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;
716
717 /// Identifies the context node created for a stack id when adding the MIB
718 /// contexts to the graph. This is used to locate the context nodes when
719 /// trying to assign the corresponding callsites with those stack ids to these
720 /// nodes.
721 DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
722
723 /// Maps to track the calls to their corresponding nodes in the graph.
724 MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
725 MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
726
727 /// Owner of all ContextNode unique_ptrs.
728 std::vector<std::unique_ptr<ContextNode>> NodeOwner;
729
730 /// Perform sanity checks on graph when requested.
731 void check() const;
732
733 /// Keeps track of the last unique context id assigned.
734 unsigned int LastContextId = 0;
735};
736
737template <typename DerivedCCG, typename FuncTy, typename CallTy>
738using ContextNode =
739 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
740template <typename DerivedCCG, typename FuncTy, typename CallTy>
741using ContextEdge =
742 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
743template <typename DerivedCCG, typename FuncTy, typename CallTy>
744using FuncInfo =
745 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
746template <typename DerivedCCG, typename FuncTy, typename CallTy>
747using CallInfo =
748 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
749
750/// CRTP derived class for graphs built from IR (regular LTO).
751class ModuleCallsiteContextGraph
752 : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
753 Instruction *> {
754public:
755 ModuleCallsiteContextGraph(
756 Module &M,
757 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
758
759private:
760 friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
761 Instruction *>;
762
763 uint64_t getStackId(uint64_t IdOrIndex) const;
764 const Function *getCalleeFunc(Instruction *Call);
765 bool calleeMatchesFunc(
766 Instruction *Call, const Function *Func, const Function *CallerFunc,
767 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
768 bool sameCallee(Instruction *Call1, Instruction *Call2);
769 bool findProfiledCalleeThroughTailCalls(
770 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
771 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
772 bool &FoundMultipleCalleeChains);
773 uint64_t getLastStackId(Instruction *Call);
774 std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
775 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
776 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
777 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
778 Instruction *>::FuncInfo
779 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
780 std::map<CallInfo, CallInfo> &CallMap,
781 std::vector<CallInfo> &CallsWithMetadataInFunc,
782 unsigned CloneNo);
783 std::string getLabel(const Function *Func, const Instruction *Call,
784 unsigned CloneNo) const;
785
786 const Module &Mod;
787 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
788};
789
790/// Represents a call in the summary index graph, which can either be an
791/// allocation or an interior callsite node in an allocation's context.
792/// Holds a pointer to the corresponding data structure in the index.
793struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
794 IndexCall() : PointerUnion() {}
795 IndexCall(std::nullptr_t) : IndexCall() {}
796 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
797 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
798 IndexCall(PointerUnion PT) : PointerUnion(PT) {}
799
800 IndexCall *operator->() { return this; }
801
802 PointerUnion<CallsiteInfo *, AllocInfo *> getBase() const { return *this; }
803
804 void print(raw_ostream &OS) const {
805 if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(getBase())) {
806 OS << *AI;
807 } else {
808 auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(getBase());
809 assert(CI);
810 OS << *CI;
811 }
812 }
813};
814
815/// CRTP derived class for graphs built from summary index (ThinLTO).
816class IndexCallsiteContextGraph
817 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
818 IndexCall> {
819public:
820 IndexCallsiteContextGraph(
821 ModuleSummaryIndex &Index,
822 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
823 isPrevailing);
824
825 ~IndexCallsiteContextGraph() {
826 // Now that we are done with the graph it is safe to add the new
827 // CallsiteInfo structs to the function summary vectors. The graph nodes
828 // point into locations within these vectors, so we don't want to add them
829 // any earlier.
830 for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
831 auto *FS = I.first;
832 for (auto &Callsite : I.second)
833 FS->addCallsite(*Callsite.second);
834 }
835 }
836
837private:
838 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
839 IndexCall>;
840
841 uint64_t getStackId(uint64_t IdOrIndex) const;
842 const FunctionSummary *getCalleeFunc(IndexCall &Call);
843 bool calleeMatchesFunc(
844 IndexCall &Call, const FunctionSummary *Func,
845 const FunctionSummary *CallerFunc,
846 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
847 bool sameCallee(IndexCall &Call1, IndexCall &Call2);
848 bool findProfiledCalleeThroughTailCalls(
849 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
850 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
851 bool &FoundMultipleCalleeChains);
852 uint64_t getLastStackId(IndexCall &Call);
853 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
854 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
855 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
856 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
857 IndexCall>::FuncInfo
858 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
859 std::map<CallInfo, CallInfo> &CallMap,
860 std::vector<CallInfo> &CallsWithMetadataInFunc,
861 unsigned CloneNo);
862 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
863 unsigned CloneNo) const;
864
865 // Saves a mapping from function summaries containing memprof records back
866 // to their VI, for use in checking and debugging.
867 std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
868
869 const ModuleSummaryIndex &Index;
870 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
871 isPrevailing;
872
873 // Saves/owns the callsite info structures synthesized for missing tail call
874 // frames that we discover while building the graph.
875 // It maps from the summary of the function making the tail call, to a map
876 // of callee ValueInfo to corresponding synthesized callsite info.
877 std::unordered_map<FunctionSummary *,
878 std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
879 FunctionCalleesToSynthesizedCallsiteInfos;
880};
881} // namespace
882
883namespace llvm {
884template <>
885struct DenseMapInfo<typename CallsiteContextGraph<
886 ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
887 : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
888template <>
889struct DenseMapInfo<typename CallsiteContextGraph<
890 IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
891 : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
892template <>
893struct DenseMapInfo<IndexCall>
894 : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
895} // end namespace llvm
896
897namespace {
898
899// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
900// type we should actually use on the corresponding allocation.
901// If we can't clone a node that has NotCold+Cold alloc type, we will fall
902// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
903// from NotCold.
904AllocationType allocTypeToUse(uint8_t AllocTypes) {
905 assert(AllocTypes != (uint8_t)AllocationType::None);
906 if (AllocTypes ==
907 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
908 return AllocationType::NotCold;
909 else
910 return (AllocationType)AllocTypes;
911}
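// For example (hedged): allocTypeToUse(NotCold|Cold) returns NotCold, while a
// pure Cold or NotCold value is returned unchanged, so an ambiguous node is
// treated the same as a NotCold one when deciding whether clones can share.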
912
913// Helper to check if the alloc types for all edges recorded in the
914// InAllocTypes vector match the alloc types for all edges in the Edges
915// vector.
916template <typename DerivedCCG, typename FuncTy, typename CallTy>
917bool allocTypesMatch(
918 const std::vector<uint8_t> &InAllocTypes,
919 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
920 &Edges) {
921 // This should be called only when the InAllocTypes vector was computed for
922 // this set of Edges. Make sure the sizes are the same.
923 assert(InAllocTypes.size() == Edges.size());
924 return std::equal(
925 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(),
926 [](const uint8_t &l,
927 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
928 // Can share if one of the edges is None type - don't
929 // care about the type along that edge as it doesn't
930 // exist for those context ids.
931 if (l == (uint8_t)AllocationType::None ||
932 r->AllocTypes == (uint8_t)AllocationType::None)
933 return true;
934 return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
935 });
936}
937
938// Helper to check if the alloc types for all edges recorded in the
939// InAllocTypes vector match the alloc types for callee edges in the given
940// clone. Because the InAllocTypes were computed from the original node's callee
941// edges, and other cloning could have happened after this clone was created, we
942// need to find the matching clone callee edge, which may or may not exist.
943template <typename DerivedCCG, typename FuncTy, typename CallTy>
944bool allocTypesMatchClone(
945 const std::vector<uint8_t> &InAllocTypes,
946 const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) {
947 const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf;
948 assert(Node);
949 // InAllocTypes should have been computed for the original node's callee
950 // edges.
951 assert(InAllocTypes.size() == Node->CalleeEdges.size());
952 // First create a map of the clone callee edge callees to the edge alloc type.
953 DenseMap<const ContextNode<DerivedCCG, FuncTy, CallTy> *, uint8_t>
954 EdgeCalleeMap;
955 for (const auto &E : Clone->CalleeEdges) {
956 assert(!EdgeCalleeMap.contains(E->Callee));
957 EdgeCalleeMap[E->Callee] = E->AllocTypes;
958 }
959 // Next, walk the original node's callees, and look for the corresponding
960 // clone edge to that callee.
961 for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) {
962 auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee);
963 // Not found is ok, we will simply add an edge if we use this clone.
964 if (Iter == EdgeCalleeMap.end())
965 continue;
966 // Can share if one of the edges is None type - don't
967 // care about the type along that edge as it doesn't
968 // exist for those context ids.
969 if (InAllocTypes[I] == (uint8_t)AllocationType::None ||
970 Iter->second == (uint8_t)AllocationType::None)
971 continue;
972 if (allocTypeToUse(Iter->second) != allocTypeToUse(InAllocTypes[I]))
973 return false;
974 }
975 return true;
976}
977
978} // end anonymous namespace
979
980template <typename DerivedCCG, typename FuncTy, typename CallTy>
981typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
982CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
983 const CallInfo &C) {
984 ContextNode *Node = getNodeForAlloc(C);
985 if (Node)
986 return Node;
987
988 return NonAllocationCallToContextNodeMap.lookup(C);
989}
990
991template <typename DerivedCCG, typename FuncTy, typename CallTy>
992typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
993CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
994 const CallInfo &C) {
995 return AllocationCallToContextNodeMap.lookup(C);
996}
997
998template <typename DerivedCCG, typename FuncTy, typename CallTy>
999typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1000CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
1001 uint64_t StackId) {
1002 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
1003 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
1004 return StackEntryNode->second;
1005 return nullptr;
1006}
1007
1008template <typename DerivedCCG, typename FuncTy, typename CallTy>
1009void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1010 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
1011 unsigned int ContextId) {
1012 for (auto &Edge : CallerEdges) {
1013 if (Edge->Caller == Caller) {
1014 Edge->AllocTypes |= (uint8_t)AllocType;
1015 Edge->getContextIds().insert(ContextId);
1016 return;
1017 }
1018 }
1019 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
1020 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
1021 CallerEdges.push_back(Edge);
1022 Caller->CalleeEdges.push_back(Edge);
1023}
1024
1025template <typename DerivedCCG, typename FuncTy, typename CallTy>
1026void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph(
1027 ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) {
1028 assert(!EI || (*EI)->get() == Edge);
1029 // Save the Caller and Callee pointers so we can erase Edge from their edge
1030 // lists after clearing Edge below. We do the clearing first in case it is
1031 // destructed after removing from the edge lists (if those were the last
1032 // shared_ptr references to Edge).
1033 auto *Callee = Edge->Callee;
1034 auto *Caller = Edge->Caller;
1035
1036 // Make sure the edge fields are cleared out so we can properly detect
1037 // removed edges if Edge is not destructed because there is still a shared_ptr
1038 // reference.
1039 Edge->clear();
1040
1041 if (!EI) {
1042 Callee->eraseCallerEdge(Edge);
1043 Caller->eraseCalleeEdge(Edge);
1044 } else if (CalleeIter) {
1045 Callee->eraseCallerEdge(Edge);
1046 *EI = Caller->CalleeEdges.erase(*EI);
1047 } else {
1048 Caller->eraseCalleeEdge(Edge);
1049 *EI = Callee->CallerEdges.erase(*EI);
1050 }
1051}
1052
1053template <typename DerivedCCG, typename FuncTy, typename CallTy>
1054void CallsiteContextGraph<
1055 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
1056 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1057 auto Edge = *EI;
1058 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1059 assert(Edge->ContextIds.empty());
1060 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
1061 } else
1062 ++EI;
1063 }
1064}
1065
1066template <typename DerivedCCG, typename FuncTy, typename CallTy>
1067void CallsiteContextGraph<
1068 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) {
1069 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
1070 auto Edge = *EI;
1071 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1072 assert(Edge->ContextIds.empty());
1073 Edge->Caller->eraseCalleeEdge(Edge.get());
1074 EI = Node->CallerEdges.erase(EI);
1075 } else
1076 ++EI;
1077 }
1078}
1079
1080template <typename DerivedCCG, typename FuncTy, typename CallTy>
1081typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1082CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1083 findEdgeFromCallee(const ContextNode *Callee) {
1084 for (const auto &Edge : CalleeEdges)
1085 if (Edge->Callee == Callee)
1086 return Edge.get();
1087 return nullptr;
1088}
1089
1090template <typename DerivedCCG, typename FuncTy, typename CallTy>
1091typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1092CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1093 findEdgeFromCaller(const ContextNode *Caller) {
1094 for (const auto &Edge : CallerEdges)
1095 if (Edge->Caller == Caller)
1096 return Edge.get();
1097 return nullptr;
1098}
1099
1100template <typename DerivedCCG, typename FuncTy, typename CallTy>
1101void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1102 eraseCalleeEdge(const ContextEdge *Edge) {
1103 auto EI = llvm::find_if(
1104 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
1105 return CalleeEdge.get() == Edge;
1106 });
1107 assert(EI != CalleeEdges.end());
1108 CalleeEdges.erase(EI);
1109}
1110
1111template <typename DerivedCCG, typename FuncTy, typename CallTy>
1112void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1113 eraseCallerEdge(const ContextEdge *Edge) {
1114 auto EI = llvm::find_if(
1115 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
1116 return CallerEdge.get() == Edge;
1117 });
1118 assert(EI != CallerEdges.end());
1119 CallerEdges.erase(EI);
1120}
1121
1122template <typename DerivedCCG, typename FuncTy, typename CallTy>
1123uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
1124 DenseSet<uint32_t> &ContextIds) {
1125 uint8_t BothTypes =
1126 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1127 uint8_t AllocType = (uint8_t)AllocationType::None;
1128 for (auto Id : ContextIds) {
1129 AllocType |= (uint8_t)ContextIdToAllocationType[Id];
1130 // Bail early if alloc type reached both, no further refinement.
1131 if (AllocType == BothTypes)
1132 return AllocType;
1133 }
1134 return AllocType;
1135}
1136
1137template <typename DerivedCCG, typename FuncTy, typename CallTy>
1138uint8_t
1139CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
1140 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
1141 uint8_t BothTypes =
1142 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1143 uint8_t AllocType = (uint8_t)AllocationType::None;
1144 for (auto Id : Node1Ids) {
1145 if (!Node2Ids.count(Id))
1146 continue;
1147 AllocType |= (uint8_t)ContextIdToAllocationType[Id];
1148 // Bail early if alloc type reached both, no further refinement.
1149 if (AllocType == BothTypes)
1150 return AllocType;
1151 }
1152 return AllocType;
1153}
1154
1155template <typename DerivedCCG, typename FuncTy, typename CallTy>
1156uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
1157 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
1158 if (Node1Ids.size() < Node2Ids.size())
1159 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
1160 else
1161 return intersectAllocTypesImpl(Node2Ids, Node1Ids);
1162}
1163
1164template <typename DerivedCCG, typename FuncTy, typename CallTy>
1165typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1166CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
1167 CallInfo Call, const FuncTy *F) {
1168 assert(!getNodeForAlloc(Call));
1169 ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, Call);
1170 AllocationCallToContextNodeMap[Call] = AllocNode;
1171 // Use LastContextId as a unique id for MIB allocation nodes.
1172 AllocNode->OrigStackOrAllocId = LastContextId;
1173 // Alloc type should be updated as we add in the MIBs. We should assert
1174 // afterwards that it is not still None.
1175 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1176
1177 return AllocNode;
1178}
1179
1180static std::string getAllocTypeString(uint8_t AllocTypes) {
1181 if (!AllocTypes)
1182 return "None";
1183 std::string Str;
1184 if (AllocTypes & (uint8_t)AllocationType::NotCold)
1185 Str += "NotCold";
1186 if (AllocTypes & (uint8_t)AllocationType::Cold)
1187 Str += "Cold";
1188 return Str;
1189}
1190
1191template <typename DerivedCCG, typename FuncTy, typename CallTy>
1192template <class NodeT, class IteratorT>
1193void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
1194 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
1195 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
1196 ArrayRef<ContextTotalSize> ContextSizeInfo) {
1197 // Treat the hot alloc type as NotCold before the disambiguation for "hot"
1198 // is done.
1199 if (AllocType == AllocationType::Hot)
1200 AllocType = AllocationType::NotCold;
1201
1202 ContextIdToAllocationType[++LastContextId] = AllocType;
1203
1204 if (MemProfReportHintedSizes) {
1205 assert(!ContextSizeInfo.empty());
1206 auto &Entry = ContextIdToContextSizeInfos[LastContextId];
1207 Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
1208 }
1209
1210 // Update alloc type and context ids for this MIB.
1211 AllocNode->AllocTypes |= (uint8_t)AllocType;
1212
1213 // Now add or update nodes for each stack id in alloc's context.
1214 // Later when processing the stack ids on non-alloc callsites we will adjust
1215 // for any inlining in the context.
1216 ContextNode *PrevNode = AllocNode;
1217 // Look for recursion (direct recursion should have been collapsed by
1218 // module summary analysis, here we should just be detecting mutual
1219 // recursion). Mark these nodes so we don't try to clone.
1220 SmallSet<uint64_t, 8> StackIdSet;
1221 // Skip any on the allocation call (inlining).
1222 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
1223 ContextIter != StackContext.end(); ++ContextIter) {
1224 auto StackId = getStackId(*ContextIter);
1225 ContextNode *StackNode = getNodeForStackId(StackId);
1226 if (!StackNode) {
1227 StackNode = createNewNode(/*IsAllocation=*/false);
1228 StackEntryIdToContextNodeMap[StackId] = StackNode;
1229 StackNode->OrigStackOrAllocId = StackId;
1230 }
1231 auto Ins = StackIdSet.insert(StackId);
1232 if (!Ins.second)
1233 StackNode->Recursive = true;
1234 StackNode->AllocTypes |= (uint8_t)AllocType;
1235 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1236 PrevNode = StackNode;
1237 }
1238}
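// Worked example (hedged; stack ids invented for this comment): an MIB whose
// pruned stack context is [A, B, C] (A closest to the allocation) with
// AllocType Cold assigns a fresh context id, creates or reuses stack nodes
// for A, B and C, and adds caller edges AllocNode <- A <- B <- C, each
// carrying that context id and the Cold alloc type.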
1239
1240template <typename DerivedCCG, typename FuncTy, typename CallTy>
1241DenseSet<uint32_t>
1242CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1243 const DenseSet<uint32_t> &StackSequenceContextIds,
1244 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1245 DenseSet<uint32_t> NewContextIds;
1246 for (auto OldId : StackSequenceContextIds) {
1247 NewContextIds.insert(++LastContextId);
1248 OldToNewContextIds[OldId].insert(LastContextId);
1249 assert(ContextIdToAllocationType.count(OldId));
1250 // The new context has the same allocation type as original.
1251 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1252 }
1253 return NewContextIds;
1254}
1255
1256template <typename DerivedCCG, typename FuncTy, typename CallTy>
1257void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1258 propagateDuplicateContextIds(
1259 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1260 // Build a set of duplicated context ids corresponding to the input id set.
1261 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1262 DenseSet<uint32_t> NewIds;
1263 for (auto Id : ContextIds)
1264 if (auto NewId = OldToNewContextIds.find(Id);
1265 NewId != OldToNewContextIds.end())
1266 NewIds.insert(NewId->second.begin(), NewId->second.end());
1267 return NewIds;
1268 };
1269
1270 // Recursively update context ids sets along caller edges.
1271 auto UpdateCallers = [&](ContextNode *Node,
1272 DenseSet<const ContextEdge *> &Visited,
1273 auto &&UpdateCallers) -> void {
1274 for (const auto &Edge : Node->CallerEdges) {
1275 auto Inserted = Visited.insert(Edge.get());
1276 if (!Inserted.second)
1277 continue;
1278 ContextNode *NextNode = Edge->Caller;
1279 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1280 // Only need to recursively iterate to NextNode via this caller edge if
1281 // it resulted in any added ids to NextNode.
1282 if (!NewIdsToAdd.empty()) {
1283 Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
1284 UpdateCallers(NextNode, Visited, UpdateCallers);
1285 }
1286 }
1287 };
1288
1289 DenseSet<const ContextEdge *> Visited;
1290 for (auto &Entry : AllocationCallToContextNodeMap) {
1291 auto *Node = Entry.second;
1292 UpdateCallers(Node, Visited, UpdateCallers);
1293 }
1294}
1295
1296template <typename DerivedCCG, typename FuncTy, typename CallTy>
1297void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1298 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
1299 // This must be passed by value to make a copy since it will be adjusted
1300 // as ids are moved.
1301 DenseSet<uint32_t> RemainingContextIds) {
1302 auto &OrigEdges =
1303 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1304 // Increment iterator in loop so that we can remove edges as needed.
1305 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1306 auto Edge = *EI;
1307 // Remove any matching context ids from Edge, returning the set that was
1308 // found and removed; these become the new edge's context ids. Also update
1309 // the remaining (not found) ids.
1310 DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds;
1311 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1312 NotFoundContextIds);
1313 RemainingContextIds.swap(NotFoundContextIds);
1314 // If no matching context ids for this edge, skip it.
1315 if (NewEdgeContextIds.empty()) {
1316 ++EI;
1317 continue;
1318 }
1319 if (TowardsCallee) {
1320 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1321 auto NewEdge = std::make_shared<ContextEdge>(
1322 Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
1323 NewNode->CalleeEdges.push_back(NewEdge);
1324 NewEdge->Callee->CallerEdges.push_back(NewEdge);
1325 } else {
1326 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1327 auto NewEdge = std::make_shared<ContextEdge>(
1328 NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
1329 NewNode->CallerEdges.push_back(NewEdge);
1330 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1331 }
1332 // Remove old edge if context ids empty.
1333 if (Edge->getContextIds().empty()) {
1334 removeEdgeFromGraph(Edge.get(), &EI, TowardsCallee);
1335 continue;
1336 }
1337 ++EI;
1338 }
1339}
1340
1341template <typename DerivedCCG, typename FuncTy, typename CallTy>
1342static void checkEdge(
1343 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1344 // Confirm that alloc type is not None and that we have at least one context
1345 // id.
1346 assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1347 assert(!Edge->ContextIds.empty());
1348}
1349
1350template <typename DerivedCCG, typename FuncTy, typename CallTy>
1351static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
1352 bool CheckEdges = true) {
1353 if (Node->isRemoved())
1354 return;
1355#ifndef NDEBUG
1356 // Compute node's context ids once for use in asserts.
1357 auto NodeContextIds = Node->getContextIds();
1358#endif
1359 // Node's context ids should be the union of both its callee and caller edge
1360 // context ids.
1361 if (Node->CallerEdges.size()) {
1362 DenseSet<uint32_t> CallerEdgeContextIds(
1363 Node->CallerEdges.front()->ContextIds);
1364 for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
1365 if (CheckEdges)
1366 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1367 set_union(CallerEdgeContextIds, Edge->ContextIds);
1368 }
1369 // Node can have more context ids than callers if some contexts terminate at
1370 // node and some are longer.
1371 assert(NodeContextIds == CallerEdgeContextIds ||
1372 set_is_subset(CallerEdgeContextIds, NodeContextIds));
1373 }
1374 if (Node->CalleeEdges.size()) {
1375 DenseSet<uint32_t> CalleeEdgeContextIds(
1376 Node->CalleeEdges.front()->ContextIds);
1377 for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
1378 if (CheckEdges)
1379 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1380 set_union(CalleeEdgeContextIds, Edge->getContextIds());
1381 }
1382 assert(NodeContextIds == CalleeEdgeContextIds);
1383 }
1384 // FIXME: Since this checking is only invoked under an option, we should
1385 // change the error checking from using assert to something that will trigger
1386 // an error on a release build.
1387#ifndef NDEBUG
1388 // Make sure we don't end up with duplicate edges between the same caller and
1389 // callee.
1390 DenseSet<ContextNode<DerivedCCG, FuncTy, CallTy> *> NodeSet;
1391 for (const auto &E : Node->CalleeEdges)
1392 NodeSet.insert(E->Callee);
1393 assert(NodeSet.size() == Node->CalleeEdges.size());
1394#endif
1395}
1396
1397template <typename DerivedCCG, typename FuncTy, typename CallTy>
1398void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1399 assignStackNodesPostOrder(
1400 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
1401 DenseMap<uint64_t, std::vector<CallContextInfo>>
1402 &StackIdToMatchingCalls,
1403 DenseMap<CallInfo, CallInfo> &CallToMatchingCall) {
1404 auto Inserted = Visited.insert(Node);
1405 if (!Inserted.second)
1406 return;
1407 // Post order traversal. Iterate over a copy since we may add nodes and
1408 // therefore new callers during the recursive call, invalidating any
1409 // iterator over the original edge vector. We don't need to process these
1410 // new nodes as they were already processed on creation.
1411 auto CallerEdges = Node->CallerEdges;
1412 for (auto &Edge : CallerEdges) {
1413 // Skip any that have been removed during the recursion.
1414 if (Edge->isRemoved()) {
1415 assert(!is_contained(Node->CallerEdges, Edge));
1416 continue;
1417 }
1418 assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
1419 CallToMatchingCall);
1420 }
1421
1422 // If this node's stack id is in the map, update the graph to contain new
1423 // nodes representing any inlining at interior callsites. Note we move the
1424 // associated context ids over to the new nodes.
1425
1426 // Ignore this node if it is for an allocation or we didn't record any
1427 // stack id lists ending at it.
1428 if (Node->IsAllocation ||
1429 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1430 return;
1431
1432 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1433 // Handle the simple case first. A single call with a single stack id.
1434 // In this case there is no need to create any new context nodes, simply
1435 // assign the context node for stack id to this Call.
1436 if (Calls.size() == 1) {
1437 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1438 if (Ids.size() == 1) {
1439 assert(SavedContextIds.empty());
1440 // It should be this Node
1441 assert(Node == getNodeForStackId(Ids[0]));
1442 if (Node->Recursive)
1443 return;
1444 Node->setCall(Call);
1445 NonAllocationCallToContextNodeMap[Call] = Node;
1446 NodeToCallingFunc[Node] = Func;
1447 return;
1448 }
1449 }
1450
1451#ifndef NDEBUG
1452 // Find the node for the last stack id, which should be the same
1453 // across all calls recorded for this id, and is this node's id.
1454 uint64_t LastId = Node->OrigStackOrAllocId;
1455 ContextNode *LastNode = getNodeForStackId(LastId);
1456 // We should only have kept stack ids that had nodes.
1457 assert(LastNode);
1458 assert(LastNode == Node);
1459#else
1460 ContextNode *LastNode = Node;
1461#endif
1462
1463 // Compute the last node's context ids once, as it is shared by all calls in
1464 // this entry.
1465 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1466
1467 bool PrevIterCreatedNode = false;
1468 bool CreatedNode = false;
1469 for (unsigned I = 0; I < Calls.size();
1470 I++, PrevIterCreatedNode = CreatedNode) {
1471 CreatedNode = false;
1472 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1473 // Skip any for which we didn't assign any ids, these don't get a node in
1474 // the graph.
1475 if (SavedContextIds.empty()) {
1476 // If this call has a matching call (located in the same function and
1477 // having the same stack ids), simply add it to the context node created
1478 // for its matching call earlier. These can be treated the same through
1479 // cloning and get updated at the same time.
1480 if (!CallToMatchingCall.contains(Call))
1481 continue;
1482 auto MatchingCall = CallToMatchingCall[Call];
1483 if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
1484 // This should only happen if we had a prior iteration, and it didn't
1485 // create a node because of the below recomputation of context ids
1486 // finding none remaining and continuing early.
1487 assert(I > 0 && !PrevIterCreatedNode);
1488 continue;
1489 }
1490 NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
1491 Call);
1492 continue;
1493 }
1494
1495 assert(LastId == Ids.back());
1496
1497 // Recompute the context ids for this stack id sequence (the
1498 // intersection of the context ids of the corresponding nodes).
1499 // Start with the ids we saved in the map for this call, which could be
1500 // duplicated context ids. We have to recompute as we might have overlap
1501 // between the saved context ids for different last nodes, and
1502 // removed them already during the post order traversal.
1503 set_intersect(SavedContextIds, LastNodeContextIds);
1504 ContextNode *PrevNode = LastNode;
1505 bool Skip = false;
1506 // Iterate backwards through the stack Ids, starting after the last Id
1507 // in the list, which was handled once outside for all Calls.
1508 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1509 auto Id = *IdIter;
1510 ContextNode *CurNode = getNodeForStackId(Id);
1511 // We should only have kept stack ids that had nodes and weren't
1512 // recursive.
1513 assert(CurNode);
1514 assert(!CurNode->Recursive);
1515
1516 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1517 if (!Edge) {
1518 Skip = true;
1519 break;
1520 }
1521 PrevNode = CurNode;
1522
1523 // Update the context ids, which is the intersection of the ids along
1524 // all edges in the sequence.
1525 set_intersect(SavedContextIds, Edge->getContextIds());
1526
1527 // If we now have no context ids for the clone, skip this call.
1528 if (SavedContextIds.empty()) {
1529 Skip = true;
1530 break;
1531 }
1532 }
1533 if (Skip)
1534 continue;
1535
1536 // Create new context node.
1537 ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, Func, Call);
1538 NonAllocationCallToContextNodeMap[Call] = NewNode;
1539 CreatedNode = true;
1540 NewNode->AllocTypes = computeAllocType(SavedContextIds);
1541
1542 ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1543 assert(FirstNode);
1544
1545 // Connect to callees of innermost stack frame in inlined call chain.
1546 // This updates context ids for FirstNode's callees to reflect those
1547 // moved to NewNode.
1548 connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);
1549
1550 // Connect to callers of outermost stack frame in inlined call chain.
1551 // This updates context ids for LastNode's callers to reflect those
1552 // moved to NewNode.
1553 connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
1554
1555 // Now we need to remove context ids from edges/nodes between First and
1556 // Last Node.
1557 PrevNode = nullptr;
1558 for (auto Id : Ids) {
1559 ContextNode *CurNode = getNodeForStackId(Id);
1560 // We should only have kept stack ids that had nodes.
1561 assert(CurNode);
1562
1563 // Remove the context ids moved to NewNode from CurNode, and the
1564 // edge from the prior node.
1565 if (PrevNode) {
1566 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1567 assert(PrevEdge);
1568 set_subtract(PrevEdge->getContextIds(), SavedContextIds);
1569 if (PrevEdge->getContextIds().empty())
1570 removeEdgeFromGraph(PrevEdge);
1571 }
1572 // Since we update the edges from leaf to tail, only look at the callee
1573 // edges. This isn't an alloc node, so if there are no callee edges, the
1574 // alloc type is None.
1575 CurNode->AllocTypes = CurNode->CalleeEdges.empty()
1576 ? (uint8_t)AllocationType::None
1577 : CurNode->computeAllocType();
1578 PrevNode = CurNode;
1579 }
1580 if (VerifyNodes) {
1581 checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
1582 for (auto Id : Ids) {
1583 ContextNode *CurNode = getNodeForStackId(Id);
1584 // We should only have kept stack ids that had nodes.
1585 assert(CurNode);
1586 checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
1587 }
1588 }
1589 }
1590}
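// In effect, assignStackNodesPostOrder gives each call whose callsite metadata
// spans several stack ids (because calls were inlined into it) a single new
// node covering the whole inlined sequence: it is wired to the callees of the
// innermost frame and the callers of the outermost frame, and the context ids
// it takes over are removed from the edges between the nodes of the original
// sequence.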
1591
1592template <typename DerivedCCG, typename FuncTy, typename CallTy>
1593void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
1594 // Map of stack id to all calls with that as the last (outermost caller)
1595 // callsite id that has a context node (some might not due to pruning
1596 // performed during matching of the allocation profile contexts).
1597 // The CallContextInfo contains the Call and a list of its stack ids with
1598 // ContextNodes, the function containing Call, and the set of context ids
1599 // the analysis will eventually identify for use in any new node created
1600 // for that callsite.
1601 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
1602 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1603 for (auto &Call : CallsWithMetadata) {
1604 // Ignore allocations, already handled.
1605 if (AllocationCallToContextNodeMap.count(Call))
1606 continue;
1607 auto StackIdsWithContextNodes =
1608 getStackIdsWithContextNodesForCall(Call.call());
1609 // If there were no nodes created for MIBs on allocs (maybe this was in
1610 // the unambiguous part of the MIB stack that was pruned), ignore.
1611 if (StackIdsWithContextNodes.empty())
1612 continue;
1613 // Otherwise, record this Call along with the list of ids for the last
1614 // (outermost caller) stack id with a node.
1615 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1616 {Call.call(), StackIdsWithContextNodes, Func, {}});
1617 }
1618 }
1619
1620 // First make a pass through all stack ids that correspond to a call,
1621 // as identified in the above loop. Compute the context ids corresponding to
1622 // each of these calls when they correspond to multiple stack ids due to
1623 // inlining. Perform any duplication of context ids required when
1624 // there is more than one call with the same stack ids. Their (possibly newly
1625 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1626 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1627 // Save a map from each call to any that are found to match it. I.e. located
1628 // in the same function and have the same (possibly pruned) stack ids. We use
1629 // this to avoid creating extra graph nodes as they can be treated the same.
1630 DenseMap<CallInfo, CallInfo> CallToMatchingCall;
1631 for (auto &It : StackIdToMatchingCalls) {
1632 auto &Calls = It.getSecond();
1633 // Skip single calls with a single stack id. These don't need a new node.
1634 if (Calls.size() == 1) {
1635 auto &Ids = Calls[0].StackIds;
1636 if (Ids.size() == 1)
1637 continue;
1638 }
1639 // In order to do the best and maximal matching of inlined calls to context
1640 // node sequences we will sort the vectors of stack ids in descending order
1641 // of length, and within each length, lexicographically by stack id. The
1642 // latter is so that we can specially handle calls that have identical stack
1643 // id sequences (either due to cloning or artificially because of the MIB
1644 // context pruning). Those with the same Ids are then sorted by function to
1645 // facilitate efficiently mapping them to the same context node.
1646 // Because the functions are pointers, to ensure a stable sort first assign
1647 // each function pointer to its first index in the Calls array, and then use
1648 // that to sort by.
1649 DenseMap<const FuncTy *, unsigned> FuncToIndex;
1650 for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
1651 FuncToIndex.insert({CallCtxInfo.Func, Idx});
1652 std::stable_sort(
1653 Calls.begin(), Calls.end(),
1654 [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
1655 return A.StackIds.size() > B.StackIds.size() ||
1656 (A.StackIds.size() == B.StackIds.size() &&
1657 (A.StackIds < B.StackIds ||
1658 (A.StackIds == B.StackIds &&
1659 FuncToIndex[A.Func] < FuncToIndex[B.Func])));
1660 });
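// For example, stack id vectors {1, 5, 8}, {2, 9}, {2, 9} and {7, 3} are
// ordered as {1, 5, 8}, {2, 9}, {2, 9}, {7, 3}: longest sequences first, then
// lexicographically by stack id, with identical sequences kept adjacent and
// ordered by the first index of their containing function in Calls.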
1661
1662 // Find the node for the last stack id, which should be the same
1663 // across all calls recorded for this id, and is the id for this
1664 // entry in the StackIdToMatchingCalls map.
1665 uint64_t LastId = It.getFirst();
1666 ContextNode *LastNode = getNodeForStackId(LastId);
1667 // We should only have kept stack ids that had nodes.
1668 assert(LastNode);
1669
1670 if (LastNode->Recursive)
1671 continue;
1672
1673 // Initialize the context ids with the last node's. We will subsequently
1674 // refine the context ids by computing the intersection along all edges.
1675 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1676 assert(!LastNodeContextIds.empty());
1677
1678#ifndef NDEBUG
1679 // Save the set of functions seen for a particular set of the same stack
1680 // ids. This is used to ensure that they have been correctly sorted to be
1681 // adjacent in the Calls list, since we rely on that to efficiently place
1682 // all such matching calls onto the same context node.
1683 DenseSet<const FuncTy *> MatchingIdsFuncSet;
1684#endif
1685
1686 for (unsigned I = 0; I < Calls.size(); I++) {
1687 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1688 assert(SavedContextIds.empty());
1689 assert(LastId == Ids.back());
1690
1691#ifndef NDEBUG
1692 // If this call has a different set of ids than the last one, clear the
1693 // set used to ensure they are sorted properly.
1694 if (I > 0 && Ids != Calls[I - 1].StackIds)
1695 MatchingIdsFuncSet.clear();
1696#endif
1697
1698 // First compute the context ids for this stack id sequence (the
1699 // intersection of the context ids of the corresponding nodes).
1700 // Start with the remaining saved ids for the last node.
1701 assert(!LastNodeContextIds.empty());
1702 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
1703
1704 ContextNode *PrevNode = LastNode;
1705 ContextNode *CurNode = LastNode;
1706 bool Skip = false;
1707
1708 // Iterate backwards through the stack Ids, starting after the last Id
1709 // in the list, which was handled once outside for all Calls.
1710 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1711 auto Id = *IdIter;
1712 CurNode = getNodeForStackId(Id);
1713 // We should only have kept stack ids that had nodes.
1714 assert(CurNode);
1715
1716 if (CurNode->Recursive) {
1717 Skip = true;
1718 break;
1719 }
1720
1721 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1722 // If there is no edge then the nodes belong to different MIB contexts,
1723 // and we should skip this inlined context sequence. For example, this
1724 // particular inlined context may include stack ids A->B, and we may
1725 // indeed have nodes for both A and B, but it is possible that they were
1726 // never profiled in sequence in a single MIB for any allocation (i.e.
1727 // we might have profiled an allocation that involves the callsite A,
1728 // but through a different one of its callee callsites, and we might
1729 // have profiled an allocation that involves callsite B, but reached
1730 // from a different caller callsite).
1731 if (!Edge) {
1732 Skip = true;
1733 break;
1734 }
1735 PrevNode = CurNode;
1736
1737 // Update the context ids, which is the intersection of the ids along
1738 // all edges in the sequence.
1739 set_intersect(StackSequenceContextIds, Edge->getContextIds());
1740
1741 // If we now have no context ids for the clone, skip this call.
1742 if (StackSequenceContextIds.empty()) {
1743 Skip = true;
1744 break;
1745 }
1746 }
1747 if (Skip)
1748 continue;
1749
1750 // If some of this call's stack ids did not have corresponding nodes (due
1751 // to pruning), don't include any context ids for contexts that extend
1752 // beyond these nodes. Otherwise we would be matching part of unrelated /
1753 // not fully matching stack contexts. To do this, subtract any context ids
1754 // found in caller nodes of the last node found above.
1755 if (Ids.back() != getLastStackId(Call)) {
1756 for (const auto &PE : LastNode->CallerEdges) {
1757 set_subtract(StackSequenceContextIds, PE->getContextIds());
1758 if (StackSequenceContextIds.empty())
1759 break;
1760 }
1761 // If we now have no context ids for the clone, skip this call.
1762 if (StackSequenceContextIds.empty())
1763 continue;
1764 }
1765
1766#ifndef NDEBUG
1767 // If the prior call had the same stack ids this set would not be empty.
1768 // Check if we already have a call that "matches" because it is located
1769 // in the same function. If the Calls list was sorted properly we should
1770 // not encounter this situation as all such entries should be adjacent
1771 // and processed in bulk further below.
1772 assert(!MatchingIdsFuncSet.contains(Func));
1773
1774 MatchingIdsFuncSet.insert(Func);
1775#endif
1776
1777 // Check if the next set of stack ids is the same (since the Calls vector
1778 // of tuples is sorted by the stack ids we can just look at the next one).
1779 // If so, save them in the CallToMatchingCall map so that they get
1780 // assigned to the same context node, and skip them.
1781 bool DuplicateContextIds = false;
1782 for (unsigned J = I + 1; J < Calls.size(); J++) {
1783 auto &CallCtxInfo = Calls[J];
1784 auto &NextIds = CallCtxInfo.StackIds;
1785 if (NextIds != Ids)
1786 break;
1787 auto *NextFunc = CallCtxInfo.Func;
1788 if (NextFunc != Func) {
1789 // We have another Call with the same ids but that cannot share this
1790 // node, must duplicate ids for it.
1791 DuplicateContextIds = true;
1792 break;
1793 }
1794 auto &NextCall = CallCtxInfo.Call;
1795 CallToMatchingCall[NextCall] = Call;
1796 // Update I so that it gets incremented correctly to skip this call.
1797 I = J;
1798 }
1799
1800 // If we don't have duplicate context ids, then we can assign all the
1801 // context ids computed for the original node sequence to this call.
1802 // If there are duplicate calls with the same stack ids then we synthesize
1803 // new context ids that are duplicates of the originals. These are
1804 // assigned to SavedContextIds, which is a reference into the map entry
1805 // for this call, allowing us to access these ids later on.
1806 OldToNewContextIds.reserve(OldToNewContextIds.size() +
1807 StackSequenceContextIds.size());
1808 SavedContextIds =
1809 DuplicateContextIds
1810 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
1811 : StackSequenceContextIds;
1812 assert(!SavedContextIds.empty());
1813
1814 if (!DuplicateContextIds) {
1815 // Update saved last node's context ids to remove those that are
1816 // assigned to other calls, so that it is ready for the next call at
1817 // this stack id.
1818 set_subtract(LastNodeContextIds, StackSequenceContextIds);
1819 if (LastNodeContextIds.empty())
1820 break;
1821 }
1822 }
1823 }
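// For example, if two calls in different functions share the pruned stack ids
// [A, B] whose contexts carry ids {1, 2}, the first call keeps {1, 2} while
// the second is given fresh duplicate ids (recorded in OldToNewContextIds),
// which are then propagated over the rest of the graph below.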
1824
1825 // Propagate the duplicate context ids over the graph.
1826 propagateDuplicateContextIds(OldToNewContextIds);
1827
1828 if (VerifyCCG)
1829 check();
1830
1831 // Now perform a post-order traversal over the graph, starting with the
1832 // allocation nodes, essentially processing nodes from callers to callees.
1833 // For any that contains an id in the map, update the graph to contain new
1834 // nodes representing any inlining at interior callsites. Note we move the
1835 // associated context ids over to the new nodes.
1836 DenseSet<const ContextNode *> Visited;
1837 for (auto &Entry : AllocationCallToContextNodeMap)
1838 assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
1839 CallToMatchingCall);
1840 if (VerifyCCG)
1841 check();
1842}
1843
1844uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
1845 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1846 Call->getMetadata(LLVMContext::MD_callsite));
1847 return CallsiteContext.back();
1848}
1849
1850uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
1851 assert(isa<CallsiteInfo *>(Call.getBase()));
1852 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1853 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1854 // Need to convert index into stack id.
1855 return Index.getStackIdAtIndex(CallsiteContext.back());
1856}
1857
1858static const std::string MemProfCloneSuffix = ".memprof.";
1859
1860static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
1861 // We use CloneNo == 0 to refer to the original version, which doesn't get
1862 // renamed with a suffix.
1863 if (!CloneNo)
1864 return Base.str();
1865 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
1866}
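// For example, getMemProfFuncName("foo", 2) produces "foo.memprof.2", while
// clone number 0 simply returns the original name "foo".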
1867
1868static bool isMemProfClone(const Function &F) {
1869 return F.getName().contains(MemProfCloneSuffix);
1870}
1871
1872std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
1873 const Instruction *Call,
1874 unsigned CloneNo) const {
1875 return (Twine(Call->getFunction()->getName()) + " -> " +
1876 cast<CallBase>(Call)->getCalledFunction()->getName())
1877 .str();
1878}
1879
1880std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
1881 const IndexCall &Call,
1882 unsigned CloneNo) const {
1883 auto VI = FSToVIMap.find(Func);
1884 assert(VI != FSToVIMap.end());
1885 if (isa<AllocInfo *>(Call.getBase()))
1886 return (VI->second.name() + " -> alloc").str();
1887 else {
1888 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call.getBase());
1889 return (VI->second.name() + " -> " +
1890 getMemProfFuncName(Callsite->Callee.name(),
1891 Callsite->Clones[CloneNo]))
1892 .str();
1893 }
1894}
1895
1896std::vector<uint64_t>
1897ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
1898 Instruction *Call) {
1899 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1900 Call->getMetadata(LLVMContext::MD_callsite));
1901 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
1902 CallsiteContext);
1903}
1904
1905std::vector<uint64_t>
1906IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
1907 assert(isa<CallsiteInfo *>(Call.getBase()));
1908 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1909 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1910 return getStackIdsWithContextNodes<CallsiteInfo,
1911 SmallVector<unsigned>::const_iterator>(
1912 CallsiteContext);
1913}
1914
1915template <typename DerivedCCG, typename FuncTy, typename CallTy>
1916template <class NodeT, class IteratorT>
1917std::vector<uint64_t>
1918CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
1919 CallStack<NodeT, IteratorT> &CallsiteContext) {
1920 std::vector<uint64_t> StackIds;
1921 for (auto IdOrIndex : CallsiteContext) {
1922 auto StackId = getStackId(IdOrIndex);
1923 ContextNode *Node = getNodeForStackId(StackId);
1924 if (!Node)
1925 break;
1926 StackIds.push_back(StackId);
1927 }
1928 return StackIds;
1929}
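// For example, if a call's callsite context is the stack id sequence
// [S1, S2, S3] (innermost first) and only S1 and S2 have context nodes (S3 was
// pruned from all profiled contexts), this returns [S1, S2], the longest
// leading run of ids that have nodes.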
1930
1931ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
1932 Module &M,
1933 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
1934 : Mod(M), OREGetter(OREGetter) {
1935 for (auto &F : M) {
1936 std::vector<CallInfo> CallsWithMetadata;
1937 for (auto &BB : F) {
1938 for (auto &I : BB) {
1939 if (!isa<CallBase>(I))
1940 continue;
1941 if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
1942 CallsWithMetadata.push_back(&I);
1943 auto *AllocNode = addAllocNode(&I, &F);
1944 auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
1945 assert(CallsiteMD);
1946 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
1947 // Add all of the MIBs and their stack nodes.
1948 for (auto &MDOp : MemProfMD->operands()) {
1949 auto *MIBMD = cast<const MDNode>(MDOp);
1950 std::vector<ContextTotalSize> ContextSizeInfo;
1951 // Collect the context size information if it exists.
1952 if (MIBMD->getNumOperands() > 2) {
1953 for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
1954 MDNode *ContextSizePair =
1955 dyn_cast<MDNode>(MIBMD->getOperand(I));
1956 assert(ContextSizePair->getNumOperands() == 2);
1957 uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
1958 ContextSizePair->getOperand(0))
1959 ->getZExtValue();
1960 uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>(
1961 ContextSizePair->getOperand(1))
1962 ->getZExtValue();
1963 ContextSizeInfo.push_back({FullStackId, TotalSize});
1964 }
1965 }
1966 MDNode *StackNode = getMIBStackNode(MIBMD);
1967 assert(StackNode);
1968 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
1969 addStackNodesForMIB<MDNode, MDNode::op_iterator>(
1970 AllocNode, StackContext, CallsiteContext,
1971 getMIBAllocType(MIBMD), ContextSizeInfo);
1972 }
1973 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
1974 // Memprof and callsite metadata on memory allocations no longer
1975 // needed.
1976 I.setMetadata(LLVMContext::MD_memprof, nullptr);
1977 I.setMetadata(LLVMContext::MD_callsite, nullptr);
1978 }
1979 // For callsite metadata, add to list for this function for later use.
1980 else if (I.getMetadata(LLVMContext::MD_callsite)) {
1981 CallsWithMetadata.push_back(&I);
1982 }
1983 }
1984 }
1985 if (!CallsWithMetadata.empty())
1986 FuncToCallsWithMetadata[&F] = CallsWithMetadata;
1987 }
1988
1989 if (DumpCCG) {
1990 dbgs() << "CCG before updating call stack chains:\n";
1991 dbgs() << *this;
1992 }
1993
1994 if (ExportToDot)
1995 exportToDot("prestackupdate");
1996
1997 updateStackNodes();
1998
1999 handleCallsitesWithMultipleTargets();
2000
2001 // Strip off remaining callsite metadata, no longer needed.
2002 for (auto &FuncEntry : FuncToCallsWithMetadata)
2003 for (auto &Call : FuncEntry.second)
2004 Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
2005}
2006
2007IndexCallsiteContextGraph::IndexCallsiteContextGraph(
2008 ModuleSummaryIndex &Index,
2009 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
2010 isPrevailing)
2011 : Index(Index), isPrevailing(isPrevailing) {
2012 for (auto &I : Index) {
2013 auto VI = Index.getValueInfo(I);
2014 for (auto &S : VI.getSummaryList()) {
2015 // We should only add the prevailing nodes. Otherwise we may try to clone
2016 // in a weak copy that won't be linked (and may be different than the
2017 // prevailing version).
2018 // We only keep the memprof summary on the prevailing copy now when
2019 // building the combined index, as a space optimization; however, don't
2020 // rely on this optimization. The linker doesn't resolve local linkage
2021 // values so don't check whether those are prevailing.
2022 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2023 !isPrevailing(VI.getGUID(), S.get()))
2024 continue;
2025 auto *FS = dyn_cast<FunctionSummary>(S.get());
2026 if (!FS)
2027 continue;
2028 std::vector<CallInfo> CallsWithMetadata;
2029 if (!FS->allocs().empty()) {
2030 for (auto &AN : FS->mutableAllocs()) {
2031 // This can happen because of recursion elimination handling that
2032 // currently exists in ModuleSummaryAnalysis. Skip these for now.
2033 // We still added them to the summary because we need to be able to
2034 // correlate properly in applyImport in the backends.
2035 if (AN.MIBs.empty())
2036 continue;
2037 IndexCall AllocCall(&AN);
2038 CallsWithMetadata.push_back(AllocCall);
2039 auto *AllocNode = addAllocNode(AllocCall, FS);
2040 // Pass an empty CallStack to the CallsiteContext (second)
2041 // parameter, since for ThinLTO we already collapsed out the inlined
2042 // stack ids on the allocation call during ModuleSummaryAnalysis.
2043 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2044 EmptyContext;
2045 unsigned I = 0;
2046 assert(AN.ContextSizeInfos.empty() ||
2047 AN.ContextSizeInfos.size() == AN.MIBs.size());
2048 // Now add all of the MIBs and their stack nodes.
2049 for (auto &MIB : AN.MIBs) {
2050 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2051 StackContext(&MIB);
2052 std::vector<ContextTotalSize> ContextSizeInfo;
2053 if (!AN.ContextSizeInfos.empty()) {
2054 for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
2055 ContextSizeInfo.push_back({FullStackId, TotalSize});
2056 }
2057 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
2058 AllocNode, StackContext, EmptyContext, MIB.AllocType,
2059 ContextSizeInfo);
2060 I++;
2061 }
2062 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2063 // Initialize version 0 on the summary alloc node to the current alloc
2064 // type, unless it has both types, in which case make it default, so that
2065 // in the case where we aren't able to clone, the original version always
2066 // ends up with the default allocation behavior.
2067 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
2068 }
2069 }
2070 // For callsite metadata, add to list for this function for later use.
2071 if (!FS->callsites().empty())
2072 for (auto &SN : FS->mutableCallsites()) {
2073 IndexCall StackNodeCall(&SN);
2074 CallsWithMetadata.push_back(StackNodeCall);
2075 }
2076
2077 if (!CallsWithMetadata.empty())
2078 FuncToCallsWithMetadata[FS] = CallsWithMetadata;
2079
2080 if (!FS->allocs().empty() || !FS->callsites().empty())
2081 FSToVIMap[FS] = VI;
2082 }
2083 }
2084
2085 if (DumpCCG) {
2086 dbgs() << "CCG before updating call stack chains:\n";
2087 dbgs() << *this;
2088 }
2089
2090 if (ExportToDot)
2091 exportToDot("prestackupdate");
2092
2093 updateStackNodes();
2094
2095 handleCallsitesWithMultipleTargets();
2096}
2097
2098template <typename DerivedCCG, typename FuncTy, typename CallTy>
2099void CallsiteContextGraph<DerivedCCG, FuncTy,
2100 CallTy>::handleCallsitesWithMultipleTargets() {
2101 // Look for and work around callsites that call multiple functions.
2102 // This can happen for indirect calls, which need better handling, and in
2103 // more rare cases (e.g. macro expansion).
2104 // TODO: To fix this for indirect calls we will want to perform speculative
2105 // devirtualization using either the normal PGO info with ICP, or using the
2106 // information in the profiled MemProf contexts. We can do this prior to
2107 // this transformation for regular LTO, and for ThinLTO we can simulate that
2108 // effect in the summary and perform the actual speculative devirtualization
2109 // while cloning in the ThinLTO backend.
2110
2111 // Keep track of the new nodes synthesized for discovered tail calls missing
2112 // from the profiled contexts.
2113 MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
2114
2115 std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode;
2116 for (auto &Entry : NonAllocationCallToContextNodeMap) {
2117 auto *Node = Entry.second;
2118 assert(Node->Clones.empty());
2119 // Check all node callees and see if in the same function.
2120 // We need to check all of the calls recorded in this Node, because in some
2121 // cases we may have had multiple calls with the same debug info calling
2122 // different callees. This can happen, for example, when an object is
2123 // constructed in the parameter list - the destructor call of the object has
2124 // the same debug info (line/col) as the call the object was passed to.
2125 // Here we will prune any that don't match all callee nodes.
2126 std::vector<CallInfo> AllCalls;
2127 AllCalls.reserve(Node->MatchingCalls.size() + 1);
2128 AllCalls.push_back(Node->Call);
2129 AllCalls.insert(AllCalls.end(), Node->MatchingCalls.begin(),
2130 Node->MatchingCalls.end());
2131
2132 // First see if we can partition the calls by callee function, creating new
2133 // nodes to host each set of calls calling the same callees. This is
2134 // necessary to support indirect calls with ThinLTO, for which we
2135 // synthesized CallsiteInfo records for each target. They will all have the
2136 // same callsite stack ids and would be sharing a context node at this
2137 // point. We need to perform separate cloning for each, which will be
2138 // applied along with speculative devirtualization in the ThinLTO backends
2139 // as needed. Note this does not currently support looking through tail
2140 // calls, it is unclear if we need that for indirect call targets.
2141 // First partition calls by callee func. Map indexed by func, value is
2142 // struct with list of matching calls, assigned node.
2143 if (partitionCallsByCallee(Node, AllCalls, NewCallToNode))
2144 continue;
2145
2146 auto It = AllCalls.begin();
2147 // Iterate through the calls until we find the first that matches.
2148 for (; It != AllCalls.end(); ++It) {
2149 auto ThisCall = *It;
2150 bool Match = true;
2151 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
2152 ++EI) {
2153 auto Edge = *EI;
2154 if (!Edge->Callee->hasCall())
2155 continue;
2156 assert(NodeToCallingFunc.count(Edge->Callee));
2157 // Check if the called function matches that of the callee node.
2158 if (!calleesMatch(ThisCall.call(), EI, TailCallToContextNodeMap)) {
2159 Match = false;
2160 break;
2161 }
2162 }
2163 // Found a call that matches the callee nodes, we can quit now.
2164 if (Match) {
2165 // If the first match is not the primary call on the Node, update it
2166 // now. We will update the list of matching calls further below.
2167 if (Node->Call != ThisCall) {
2168 Node->setCall(ThisCall);
2169 // We need to update the NonAllocationCallToContextNodeMap, but don't
2170 // want to do this during iteration over that map, so save the calls
2171 // that need updated entries.
2172 NewCallToNode.push_back({ThisCall, Node});
2173 }
2174 break;
2175 }
2176 }
2177 // We will update this list below (or leave it cleared if there was no
2178 // match found above).
2179 Node->MatchingCalls.clear();
2180 // If we hit the end of the AllCalls vector, no call matching the callee
2181 // nodes was found, clear the call information in the node.
2182 if (It == AllCalls.end()) {
2183 RemovedEdgesWithMismatchedCallees++;
2184 // Work around by setting Node to have a null call, so it gets
2185 // skipped during cloning. Otherwise assignFunctions will assert
2186 // because its data structures are not designed to handle this case.
2187 Node->setCall(CallInfo());
2188 continue;
2189 }
2190 // Now add back any matching calls that call the same function as the
2191 // matching primary call on Node.
2192 for (++It; It != AllCalls.end(); ++It) {
2193 auto ThisCall = *It;
2194 if (!sameCallee(Node->Call.call(), ThisCall.call()))
2195 continue;
2196 Node->MatchingCalls.push_back(ThisCall);
2197 }
2198 }
2199
2200 // Remove all mismatched nodes identified in the above loop from the node map
2201 // (checking whether they have a null call which is set above). For a
2202 // MapVector like NonAllocationCallToContextNodeMap it is much more efficient
2203 // to do the removal via remove_if than by individually erasing entries above.
2204 // Also remove any entries if we updated the node's primary call above.
2205 NonAllocationCallToContextNodeMap.remove_if([](const auto &it) {
2206 return !it.second->hasCall() || it.second->Call != it.first;
2207 });
2208
2209 // Add entries for any new primary calls recorded above.
2210 for (auto &[Call, Node] : NewCallToNode)
2211 NonAllocationCallToContextNodeMap[Call] = Node;
2212
2213 // Add the new nodes after the above loop so that the iteration is not
2214 // invalidated.
2215 for (auto &[Call, Node] : TailCallToContextNodeMap)
2216 NonAllocationCallToContextNodeMap[Call] = Node;
2217}
2218
2219template <typename DerivedCCG, typename FuncTy, typename CallTy>
2220bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee(
2221 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
2222 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) {
2223 // Struct to keep track of all the calls having the same callee function,
2224 // along with the context node we will eventually assign to this group of
2225 // calls.
2226 struct CallsWithSameCallee {
2227 std::vector<CallInfo> Calls;
2228 ContextNode *Node = nullptr;
2229 };
2230
2231 // First partition calls by callee function. Build map from each function
2232 // to the list of matching calls.
2233 DenseMap<const FuncTy *, CallsWithSameCallee> CalleeFuncToCallInfo;
2234 for (auto ThisCall : AllCalls) {
2235 auto *F = getCalleeFunc(ThisCall.call());
2236 if (F)
2237 CalleeFuncToCallInfo[F].Calls.push_back(ThisCall);
2238 }
2239
2240 // Next, walk through all callee edges. For each callee node, get its
2241 // containing function and see if it was recorded in the above map (meaning we
2242 // have at least one matching call). Build another map from each callee node
2243 // with a matching call to the structure instance created above containing all
2244 // the calls.
2245 DenseMap<ContextNode *, CallsWithSameCallee *> CalleeNodeToCallInfo;
2246 for (const auto &Edge : Node->CalleeEdges) {
2247 if (!Edge->Callee->hasCall())
2248 continue;
2249 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2250 if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc))
2251 CalleeNodeToCallInfo[Edge->Callee] =
2252 &CalleeFuncToCallInfo[ProfiledCalleeFunc];
2253 }
2254
2255 // If there are no entries in the second map, then there were no matching
2256 // calls/callees; nothing to do here. Return so we can go to the handling
2257 // that looks through tail calls.
2258 if (CalleeNodeToCallInfo.empty())
2259 return false;
2260
2261 // Walk through all callee edges again. Any and all callee edges that didn't
2262 // match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a
2263 // new caller node (UnmatchedCalleesNode) which gets a null call so that it is
2264 // ignored during cloning. If it is in the map, then we use the node recorded
2265 // in that entry (creating it if needed), and move the callee edge to it.
2266 // The first callee will use the original node instead of creating a new one.
2267 // Note that any of the original calls on this node (in AllCalls) that didn't
2268 // have a callee function automatically get dropped from the node as part of
2269 // this process.
2270 ContextNode *UnmatchedCalleesNode = nullptr;
2271 // Track whether we already assigned original node to a callee.
2272 bool UsedOrigNode = false;
2273 assert(NodeToCallingFunc[Node]);
2274 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
2275 auto Edge = *EI;
2276 if (!Edge->Callee->hasCall()) {
2277 ++EI;
2278 continue;
2279 }
2280
2281 // Will be updated below to point to whatever (caller) node this callee edge
2282 // should be moved to.
2283 ContextNode *CallerNodeToUse = nullptr;
2284
2285 // Handle the case where there were no matching calls first. Move this
2286 // callee edge to the UnmatchedCalleesNode, creating it if needed.
2287 if (!CalleeNodeToCallInfo.contains(Edge->Callee)) {
2288 if (!UnmatchedCalleesNode)
2289 UnmatchedCalleesNode =
2290 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2291 CallerNodeToUse = UnmatchedCalleesNode;
2292 } else {
2293 // Look up the information recorded for this callee node, and use the
2294 // recorded caller node (creating it if needed).
2295 auto *Info = CalleeNodeToCallInfo[Edge->Callee];
2296 if (!Info->Node) {
2297 // If we haven't assigned any callees to the original node use it.
2298 if (!UsedOrigNode) {
2299 Info->Node = Node;
2300 // Clear the set of matching calls which will be updated below.
2301 Node->MatchingCalls.clear();
2302 UsedOrigNode = true;
2303 } else
2304 Info->Node =
2305 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2306 assert(!Info->Calls.empty());
2307 // The first call becomes the primary call for this caller node, and the
2308 // rest go in the matching calls list.
2309 Info->Node->setCall(Info->Calls.front());
2310 Info->Node->MatchingCalls.insert(Info->Node->MatchingCalls.end(),
2311 Info->Calls.begin() + 1,
2312 Info->Calls.end());
2313 // Save the primary call to node correspondence so that we can update
2314 // the NonAllocationCallToContextNodeMap, which is being iterated in the
2315 // caller of this function.
2316 NewCallToNode.push_back({Info->Node->Call, Info->Node});
2317 }
2318 CallerNodeToUse = Info->Node;
2319 }
2320
2321 // Don't need to move the edge if we are using the original node.
2322 if (CallerNodeToUse == Node) {
2323 ++EI;
2324 continue;
2325 }
2326
2327 moveCalleeEdgeToNewCaller(EI, CallerNodeToUse);
2328 }
2329 // Now that we are done moving edges, clean up any caller edges that ended
2330 // up with no type or context ids. During moveCalleeEdgeToNewCaller all
2331 // caller edges from Node are replicated onto the new callers, and it
2332 // simplifies the handling to leave them until we have moved all
2333 // edges/context ids.
2334 for (auto &I : CalleeNodeToCallInfo)
2335 removeNoneTypeCallerEdges(I.second->Node);
2336 if (UnmatchedCalleesNode)
2337 removeNoneTypeCallerEdges(UnmatchedCalleesNode);
2338 removeNoneTypeCallerEdges(Node);
2339
2340 return true;
2341}
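// For example, an indirect call for which ThinLTO synthesized CallsiteInfo
// records for two possible targets initially shares one context node; after
// partitioning, the calls resolving to the first target keep the original
// node while those resolving to the second move to a new node, so each target
// can subsequently be cloned independently.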
2342
2343uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2344 // In the Module (IR) case this is already the Id.
2345 return IdOrIndex;
2346}
2347
2348uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2349 // In the Index case this is an index into the stack id list in the summary
2350 // index, convert it to an Id.
2351 return Index.getStackIdAtIndex(IdOrIndex);
2352}
2353
2354template <typename DerivedCCG, typename FuncTy, typename CallTy>
2355bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
2356 CallTy Call, EdgeIter &EI,
2357 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
2358 auto Edge = *EI;
2359 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2360 const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
2361 // Will be populated in order of callee to caller if we find a chain of tail
2362 // calls between the profiled caller and callee.
2363 std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
2364 if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
2365 FoundCalleeChain))
2366 return false;
2367
2368 // The usual case where the profiled callee matches that of the IR/summary.
2369 if (FoundCalleeChain.empty())
2370 return true;
2371
2372 auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
2373 auto *CurEdge = Callee->findEdgeFromCaller(Caller);
2374 // If there is already an edge between these nodes, simply update it and
2375 // return.
2376 if (CurEdge) {
2377 CurEdge->ContextIds.insert(Edge->ContextIds.begin(),
2378 Edge->ContextIds.end());
2379 CurEdge->AllocTypes |= Edge->AllocTypes;
2380 return;
2381 }
2382 // Otherwise, create a new edge and insert it into the caller and callee
2383 // lists.
2384 auto NewEdge = std::make_shared<ContextEdge>(
2385 Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
2386 Callee->CallerEdges.push_back(NewEdge);
2387 if (Caller == Edge->Caller) {
2388 // If we are inserting the new edge into the current edge's caller, insert
2389 // the new edge before the current iterator position, and then increment
2390 // back to the current edge.
2391 EI = Caller->CalleeEdges.insert(EI, NewEdge);
2392 ++EI;
2393 assert(*EI == Edge &&
2394 "Iterator position not restored after insert and increment");
2395 } else
2396 Caller->CalleeEdges.push_back(NewEdge);
2397 };
2398
2399 // Create new nodes for each found callee and connect in between the profiled
2400 // caller and callee.
2401 auto *CurCalleeNode = Edge->Callee;
2402 for (auto &[NewCall, Func] : FoundCalleeChain) {
2403 ContextNode *NewNode = nullptr;
2404 // First check if we have already synthesized a node for this tail call.
2405 if (TailCallToContextNodeMap.count(NewCall)) {
2406 NewNode = TailCallToContextNodeMap[NewCall];
2407 NewNode->AllocTypes |= Edge->AllocTypes;
2408 } else {
2409 FuncToCallsWithMetadata[Func].push_back({NewCall});
2410 // Create Node and record node info.
2411 NewNode = createNewNode(/*IsAllocation=*/false, Func, NewCall);
2412 TailCallToContextNodeMap[NewCall] = NewNode;
2413 NewNode->AllocTypes = Edge->AllocTypes;
2414 }
2415
2416 // Hook up node to its callee node
2417 AddEdge(NewNode, CurCalleeNode);
2418
2419 CurCalleeNode = NewNode;
2420 }
2421
2422 // Hook up edge's original caller to new callee node.
2423 AddEdge(Edge->Caller, CurCalleeNode);
2424
2425#ifndef NDEBUG
2426 // Save this because Edge's fields get cleared below when removed.
2427 auto *Caller = Edge->Caller;
2428#endif
2429
2430 // Remove old edge
2431 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
2432
2433 // To simplify the increment of EI in the caller, subtract one from EI.
2434 // In the final AddEdge call we would have either added a new callee edge,
2435 // to Edge->Caller, or found an existing one. Either way we are guaranteed
2436 // that there is at least one callee edge.
2437 assert(!Caller->CalleeEdges.empty());
2438 --EI;
2439
2440 return true;
2441}
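// For example, if the profile recorded a call from function A directly into
// profiled callee C, but in the IR the call in A invokes B which tail-calls C,
// a node is synthesized above for B's tail-call site and spliced between the
// caller node and C's node, carrying the original edge's context ids and alloc
// types, and the original mismatched edge is removed.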
2442
2443bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2444 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
2445 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
2446 bool &FoundMultipleCalleeChains) {
2447 // Stop recursive search if we have already explored the maximum specified
2448 // depth.
2449 if (Depth > TailCallSearchDepth)
2450 return false;
2451
2452 auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
2453 FoundCalleeChain.push_back({Callsite, F});
2454 };
2455
2456 auto *CalleeFunc = dyn_cast<Function>(CurCallee);
2457 if (!CalleeFunc) {
2458 auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
2459 assert(Alias);
2460 CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
2461 assert(CalleeFunc);
2462 }
2463
2464 // Look for tail calls in this function, and check if they either call the
2465 // profiled callee directly, or indirectly (via a recursive search).
2466 // Only succeed if there is a single unique tail call chain found between the
2467 // profiled caller and callee, otherwise we could perform incorrect cloning.
2468 bool FoundSingleCalleeChain = false;
2469 for (auto &BB : *CalleeFunc) {
2470 for (auto &I : BB) {
2471 auto *CB = dyn_cast<CallBase>(&I);
2472 if (!CB || !CB->isTailCall())
2473 continue;
2474 auto *CalledValue = CB->getCalledOperand();
2475 auto *CalledFunction = CB->getCalledFunction();
2476 if (CalledValue && !CalledFunction) {
2477 CalledValue = CalledValue->stripPointerCasts();
2478 // Stripping pointer casts can reveal a called function.
2479 CalledFunction = dyn_cast<Function>(CalledValue);
2480 }
2481 // Check if this is an alias to a function. If so, get the
2482 // called aliasee for the checks below.
2483 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
2484 assert(!CalledFunction &&
2485 "Expected null called function in callsite for alias");
2486 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
2487 }
2488 if (!CalledFunction)
2489 continue;
2490 if (CalledFunction == ProfiledCallee) {
2491 if (FoundSingleCalleeChain) {
2492 FoundMultipleCalleeChains = true;
2493 return false;
2494 }
2495 FoundSingleCalleeChain = true;
2496 FoundProfiledCalleeCount++;
2497 FoundProfiledCalleeDepth += Depth;
2498 if (Depth > FoundProfiledCalleeMaxDepth)
2499 FoundProfiledCalleeMaxDepth = Depth;
2500 SaveCallsiteInfo(&I, CalleeFunc);
2501 } else if (findProfiledCalleeThroughTailCalls(
2502 ProfiledCallee, CalledFunction, Depth + 1,
2503 FoundCalleeChain, FoundMultipleCalleeChains)) {
2504 // findProfiledCalleeThroughTailCalls should not have returned
2505 // true if FoundMultipleCalleeChains.
2506 assert(!FoundMultipleCalleeChains);
2507 if (FoundSingleCalleeChain) {
2508 FoundMultipleCalleeChains = true;
2509 return false;
2510 }
2511 FoundSingleCalleeChain = true;
2512 SaveCallsiteInfo(&I, CalleeFunc);
2513 } else if (FoundMultipleCalleeChains)
2514 return false;
2515 }
2516 }
2517
2518 return FoundSingleCalleeChain;
2519}
2520
2521const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) {
2522 auto *CB = dyn_cast<CallBase>(Call);
2523 if (!CB->getCalledOperand() || CB->isIndirectCall())
2524 return nullptr;
2525 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2526 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2527 if (Alias)
2528 return dyn_cast<Function>(Alias->getAliasee());
2529 return dyn_cast<Function>(CalleeVal);
2530}
2531
2532bool ModuleCallsiteContextGraph::calleeMatchesFunc(
2533 Instruction *Call, const Function *Func, const Function *CallerFunc,
2534 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
2535 auto *CB = dyn_cast<CallBase>(Call);
2536 if (!CB->getCalledOperand() || CB->isIndirectCall())
2537 return false;
2538 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2539 auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
2540 if (CalleeFunc == Func)
2541 return true;
2542 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2543 if (Alias && Alias->getAliasee() == Func)
2544 return true;
2545
2546 // Recursively search for the profiled callee through tail calls starting with
2547 // the actual Callee. The discovered tail call chain is saved in
2548 // FoundCalleeChain, and we will fixup the graph to include these callsites
2549 // after returning.
2550 // FIXME: We will currently redo the same recursive walk if we find the same
2551 // mismatched callee from another callsite. We can improve this with more
2552 // bookkeeping of the created chain of new nodes for each mismatch.
2553 unsigned Depth = 1;
2554 bool FoundMultipleCalleeChains = false;
2555 if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
2556 FoundCalleeChain,
2557 FoundMultipleCalleeChains)) {
2558 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
2559 << Func->getName() << " from " << CallerFunc->getName()
2560 << " that actually called " << CalleeVal->getName()
2561 << (FoundMultipleCalleeChains
2562 ? " (found multiple possible chains)"
2563 : "")
2564 << "\n");
2565 if (FoundMultipleCalleeChains)
2566 FoundProfiledCalleeNonUniquelyCount++;
2567 return false;
2568 }
2569
2570 return true;
2571}
2572
2573bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1,
2574 Instruction *Call2) {
2575 auto *CB1 = cast<CallBase>(Call1);
2576 if (!CB1->getCalledOperand() || CB1->isIndirectCall())
2577 return false;
2578 auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts();
2579 auto *CalleeFunc1 = dyn_cast<Function>(CalleeVal1);
2580 auto *CB2 = cast<CallBase>(Call2);
2581 if (!CB2->getCalledOperand() || CB2->isIndirectCall())
2582 return false;
2583 auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts();
2584 auto *CalleeFunc2 = dyn_cast<Function>(CalleeVal2);
2585 return CalleeFunc1 == CalleeFunc2;
2586}
2587
2588bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2589 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
2590 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
2591 bool &FoundMultipleCalleeChains) {
2592 // Stop recursive search if we have already explored the maximum specified
2593 // depth.
2594 if (Depth > TailCallSearchDepth)
2595 return false;
2596
2597 auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
2598 // Make a CallsiteInfo for each discovered callee, if one hasn't already
2599 // been synthesized.
2600 if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
2601 !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
2602 // StackIds is empty (we don't have debug info available in the index for
2603 // these callsites)
2604 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
2605 std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
2606 CallsiteInfo *NewCallsiteInfo =
2607 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
2608 FoundCalleeChain.push_back({NewCallsiteInfo, FS});
2609 };
2610
2611 // Look for tail calls in this function, and check if they either call the
2612 // profiled callee directly, or indirectly (via a recursive search).
2613 // Only succeed if there is a single unique tail call chain found between the
2614 // profiled caller and callee, otherwise we could perform incorrect cloning.
2615 bool FoundSingleCalleeChain = false;
2616 for (auto &S : CurCallee.getSummaryList()) {
2617 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2618 !isPrevailing(CurCallee.getGUID(), S.get()))
2619 continue;
2620 auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
2621 if (!FS)
2622 continue;
2623 auto FSVI = CurCallee;
2624 auto *AS = dyn_cast<AliasSummary>(S.get());
2625 if (AS)
2626 FSVI = AS->getAliaseeVI();
2627 for (auto &CallEdge : FS->calls()) {
2628 if (!CallEdge.second.hasTailCall())
2629 continue;
2630 if (CallEdge.first == ProfiledCallee) {
2631 if (FoundSingleCalleeChain) {
2632 FoundMultipleCalleeChains = true;
2633 return false;
2634 }
2635 FoundSingleCalleeChain = true;
2636 FoundProfiledCalleeCount++;
2637 FoundProfiledCalleeDepth += Depth;
2638 if (Depth > FoundProfiledCalleeMaxDepth)
2639 FoundProfiledCalleeMaxDepth = Depth;
2640 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2641 // Add FS to FSToVIMap in case it isn't already there.
2642 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2643 FSToVIMap[FS] = FSVI;
2644 } else if (findProfiledCalleeThroughTailCalls(
2645 ProfiledCallee, CallEdge.first, Depth + 1,
2646 FoundCalleeChain, FoundMultipleCalleeChains)) {
2647 // findProfiledCalleeThroughTailCalls should not have returned
2648 // true if FoundMultipleCalleeChains.
2649 assert(!FoundMultipleCalleeChains);
2650 if (FoundSingleCalleeChain) {
2651 FoundMultipleCalleeChains = true;
2652 return false;
2653 }
2654 FoundSingleCalleeChain = true;
2655 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2656 // Add FS to FSToVIMap in case it isn't already there.
2657 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2658 FSToVIMap[FS] = FSVI;
2659 } else if (FoundMultipleCalleeChains)
2660 return false;
2661 }
2662 }
2663
2664 return FoundSingleCalleeChain;
2665}
2666
2667const FunctionSummary *
2668IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) {
2669 ValueInfo Callee =
2670 dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee;
2671 if (Callee.getSummaryList().empty())
2672 return nullptr;
2673 return dyn_cast<FunctionSummary>(Callee.getSummaryList()[0]->getBaseObject());
2674}
2675
2676bool IndexCallsiteContextGraph::calleeMatchesFunc(
2677 IndexCall &Call, const FunctionSummary *Func,
2678 const FunctionSummary *CallerFunc,
2679 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
2680 ValueInfo Callee =
2681 dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee;
2682 // If there is no summary list then this is a call to an externally defined
2683 // symbol.
2684 AliasSummary *Alias =
2685 Callee.getSummaryList().empty()
2686 ? nullptr
2687 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
2688 assert(FSToVIMap.count(Func));
2689 auto FuncVI = FSToVIMap[Func];
2690 if (Callee == FuncVI ||
2691 // If callee is an alias, check the aliasee, since only function
2692 // summary base objects will contain the stack node summaries and thus
2693 // get a context node.
2694 (Alias && Alias->getAliaseeVI() == FuncVI))
2695 return true;
2696
2697 // Recursively search for the profiled callee through tail calls starting with
2698 // the actual Callee. The discovered tail call chain is saved in
2699 // FoundCalleeChain, and we will fixup the graph to include these callsites
2700 // after returning.
2701 // FIXME: We will currently redo the same recursive walk if we find the same
2702 // mismatched callee from another callsite. We can improve this with more
2703 // bookkeeping of the created chain of new nodes for each mismatch.
2704 unsigned Depth = 1;
2705 bool FoundMultipleCalleeChains = false;
2706 if (!findProfiledCalleeThroughTailCalls(
2707 FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
2708 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
2709 << " from " << FSToVIMap[CallerFunc]
2710 << " that actually called " << Callee
2711 << (FoundMultipleCalleeChains
2712 ? " (found multiple possible chains)"
2713 : "")
2714 << "\n");
2715 if (FoundMultipleCalleeChains)
2716 FoundProfiledCalleeNonUniquelyCount++;
2717 return false;
2718 }
2719
2720 return true;
2721}
2722
2723bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) {
2724 ValueInfo Callee1 =
2725 dyn_cast_if_present<CallsiteInfo *>(Call1.getBase())->Callee;
2726 ValueInfo Callee2 =
2727 dyn_cast_if_present<CallsiteInfo *>(Call2.getBase())->Callee;
2728 return Callee1 == Callee2;
2729}
2730
2731template <typename DerivedCCG, typename FuncTy, typename CallTy>
2732void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
2733 const {
2734 print(dbgs());
2735 dbgs() << "\n";
2736}
2737
2738template <typename DerivedCCG, typename FuncTy, typename CallTy>
2739void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
2740 raw_ostream &OS) const {
2741 OS << "Node " << this << "\n";
2742 OS << "\t";
2743 printCall(OS);
2744 if (Recursive)
2745 OS << " (recursive)";
2746 OS << "\n";
2747 if (!MatchingCalls.empty()) {
2748 OS << "\tMatchingCalls:\n";
2749 for (auto &MatchingCall : MatchingCalls) {
2750 OS << "\t";
2751 MatchingCall.print(OS);
2752 OS << "\n";
2753 }
2754 }
2755 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
2756 OS << "\tContextIds:";
2757 // Make a copy of the computed context ids that we can sort for stability.
2758 auto ContextIds = getContextIds();
2759 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2760 std::sort(SortedIds.begin(), SortedIds.end());
2761 for (auto Id : SortedIds)
2762 OS << " " << Id;
2763 OS << "\n";
2764 OS << "\tCalleeEdges:\n";
2765 for (auto &Edge : CalleeEdges)
2766 OS << "\t\t" << *Edge << "\n";
2767 OS << "\tCallerEdges:\n";
2768 for (auto &Edge : CallerEdges)
2769 OS << "\t\t" << *Edge << "\n";
2770 if (!Clones.empty()) {
2771 OS << "\tClones: ";
2772 ListSeparator LS;
2773 for (auto *Clone : Clones)
2774 OS << LS << Clone;
2775 OS << "\n";
2776 } else if (CloneOf) {
2777 OS << "\tClone of " << CloneOf << "\n";
2778 }
2779}
2780
2781template <typename DerivedCCG, typename FuncTy, typename CallTy>
2782void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
2783 const {
2784 print(dbgs());
2785 dbgs() << "\n";
2786}
2787
2788template <typename DerivedCCG, typename FuncTy, typename CallTy>
2789void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
2790 raw_ostream &OS) const {
2791 OS << "Edge from Callee " << Callee << " to Caller: " << Caller
2792 << " AllocTypes: " << getAllocTypeString(AllocTypes);
2793 OS << " ContextIds:";
2794 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2795 std::sort(SortedIds.begin(), SortedIds.end());
2796 for (auto Id : SortedIds)
2797 OS << " " << Id;
2798}
2799
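// For illustration, the dump format produced by the node printer above (and
// the edge printer below) looks roughly as follows; the pointers, ids and the
// call line are hypothetical, and the AllocTypes strings come from
// getAllocTypeString:
//
//   Node 0x55d1c0a12340
//           <call printed by printCall>
//           AllocTypes: Cold
//           ContextIds: 1 3
//           CalleeEdges:
//                   Edge from Callee 0x55d1c0a15670 to Caller: 0x55d1c0a12340 AllocTypes: Cold ContextIds: 1 3
//           CallerEdges:
//                   Edge from Callee 0x55d1c0a12340 to Caller: 0x55d1c0a19ab0 AllocTypes: Cold ContextIds: 1 3
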
2800template <typename DerivedCCG, typename FuncTy, typename CallTy>
2801void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
2802 print(dbgs());
2803}
2804
2805template <typename DerivedCCG, typename FuncTy, typename CallTy>
2806void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
2807 raw_ostream &OS) const {
2808 OS << "Callsite Context Graph:\n";
2809 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2810 for (const auto Node : nodes<GraphType>(this)) {
2811 if (Node->isRemoved())
2812 continue;
2813 Node->print(OS);
2814 OS << "\n";
2815 }
2816}
2817
2818template <typename DerivedCCG, typename FuncTy, typename CallTy>
2819void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
2820 raw_ostream &OS) const {
2821 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2822 for (const auto Node : nodes<GraphType>(this)) {
2823 if (Node->isRemoved())
2824 continue;
2825 if (!Node->IsAllocation)
2826 continue;
2827 DenseSet<uint32_t> ContextIds = Node->getContextIds();
2828 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2829 std::sort(SortedIds.begin(), SortedIds.end());
2830 for (auto Id : SortedIds) {
2831 auto TypeI = ContextIdToAllocationType.find(Id);
2832 assert(TypeI != ContextIdToAllocationType.end());
2833 auto CSI = ContextIdToContextSizeInfos.find(Id);
2834 if (CSI != ContextIdToContextSizeInfos.end()) {
2835 for (auto &Info : CSI->second) {
2836 OS << "MemProf hinting: "
2837 << getAllocTypeString((uint8_t)TypeI->second)
2838 << " full allocation context " << Info.FullStackId
2839 << " with total size " << Info.TotalSize << " is "
2840 << getAllocTypeString(Node->AllocTypes) << " after cloning\n";
2841 }
2842 }
2843 }
2844 }
2845}
2846
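// For illustration, each line emitted by printTotalSizes has the form
// (values hypothetical):
//   MemProf hinting: <profiled alloc type> full allocation context <stack id>
//   with total size <bytes> is <node alloc type after cloning> after cloning
// where the stack id identifies the full calling context of the allocation.
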
2847template <typename DerivedCCG, typename FuncTy, typename CallTy>
2848void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
2849 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2850 for (const auto Node : nodes<GraphType>(this)) {
2851 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2852 for (auto &Edge : Node->CallerEdges)
2853 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
2854 }
2855}
2856
2857template <typename DerivedCCG, typename FuncTy, typename CallTy>
2858struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
2859 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2860 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
2861
2862 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
2863 static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
2864
2865   using nodes_iterator =
2866       mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
2867                       decltype(&getNode)>;
2868
2869   static nodes_iterator nodes_begin(GraphType G) {
2870     return nodes_iterator(G->NodeOwner.begin(), &getNode);
2871   }
2872
2873   static nodes_iterator nodes_end(GraphType G) {
2874     return nodes_iterator(G->NodeOwner.end(), &getNode);
2875   }
2876
2877   static NodeRef getEntryNode(GraphType G) {
2878     return G->NodeOwner.begin()->get();
2879   }
2880
2881   using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
2882   static const ContextNode<DerivedCCG, FuncTy, CallTy> *
2883   GetCallee(const EdgePtrTy &P) {
2884     return P->Callee;
2885   }
2886
2887   using ChildIteratorType =
2888       mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
2889                           DerivedCCG, FuncTy, CallTy>>>::const_iterator,
2890                       decltype(&GetCallee)>;
2891
2892   static ChildIteratorType child_begin(NodeRef N) {
2893     return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
2894   }
2895
2896   static ChildIteratorType child_end(NodeRef N) {
2897     return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
2898 }
2899};
2900
2901template <typename DerivedCCG, typename FuncTy, typename CallTy>
2902struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
2903 : public DefaultDOTGraphTraits {
2904 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
2905
2906 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2907   using GTraits = GraphTraits<GraphType>;
2908   using NodeRef = typename GTraits::NodeRef;
2909 using ChildIteratorType = typename GTraits::ChildIteratorType;
2910
2911 static std::string getNodeLabel(NodeRef Node, GraphType G) {
2912 std::string LabelString =
2913 (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
2914 Twine(Node->OrigStackOrAllocId))
2915 .str();
2916 LabelString += "\n";
2917 if (Node->hasCall()) {
2918 auto Func = G->NodeToCallingFunc.find(Node);
2919 assert(Func != G->NodeToCallingFunc.end());
2920 LabelString +=
2921 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
2922 } else {
2923 LabelString += "null call";
2924 if (Node->Recursive)
2925 LabelString += " (recursive)";
2926 else
2927 LabelString += " (external)";
2928 }
2929 return LabelString;
2930 }
2931
2932 static std::string getNodeAttributes(NodeRef Node, GraphType) {
2933 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
2934 getContextIds(Node->getContextIds()) + "\"")
2935 .str();
2936 AttributeString +=
2937 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str();
2938 AttributeString += ",style=\"filled\"";
2939 if (Node->CloneOf) {
2940 AttributeString += ",color=\"blue\"";
2941 AttributeString += ",style=\"filled,bold,dashed\"";
2942 } else
2943 AttributeString += ",style=\"filled\"";
2944 return AttributeString;
2945 }
2946
2947 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
2948 GraphType) {
2949 auto &Edge = *(ChildIter.getCurrent());
2950 return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
2951 Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"")
2952 .str();
2953 }
2954
2955 // Since the NodeOwners list includes nodes that are no longer connected to
2956 // the graph, skip them here.
2957 static bool isNodeHidden(NodeRef Node, GraphType) {
2958 return Node->isRemoved();
2959 }
2960
2961private:
2962 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
2963 std::string IdString = "ContextIds:";
2964 if (ContextIds.size() < 100) {
2965 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2966 std::sort(SortedIds.begin(), SortedIds.end());
2967 for (auto Id : SortedIds)
2968 IdString += (" " + Twine(Id)).str();
2969 } else {
2970 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
2971 }
2972 return IdString;
2973 }
2974
2975 static std::string getColor(uint8_t AllocTypes) {
2976 if (AllocTypes == (uint8_t)AllocationType::NotCold)
2977 // Color "brown1" actually looks like a lighter red.
2978 return "brown1";
2979 if (AllocTypes == (uint8_t)AllocationType::Cold)
2980 return "cyan";
2981     if (AllocTypes ==
2982         ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
2983       // Lighter purple.
2984 return "mediumorchid1";
2985 return "gray";
2986 }
2987
2988 static std::string getNodeId(NodeRef Node) {
2989 std::stringstream SStream;
2990 SStream << std::hex << "N0x" << (unsigned long long)Node;
2991 std::string Result = SStream.str();
2992 return Result;
2993 }
2994};
2995
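// For illustration: with the DOT traits above, a node whose remaining contexts
// are all cold is filled "cyan", an all-notcold node "brown1", a node still
// carrying both kinds of contexts "mediumorchid1", and anything else "gray";
// clones additionally get a blue, dashed-bold outline. A hypothetical cold
// allocation node with context ids {1, 2} would therefore carry attributes
// similar to:
//   tooltip="N0x55d1c0a12340 ContextIds: 1 2",fillcolor="cyan",style="filled"
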
2996template <typename DerivedCCG, typename FuncTy, typename CallTy>
2997void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
2998 std::string Label) const {
2999 WriteGraph(this, "", false, Label,
3000 DotFilePathPrefix + "ccg." + Label + ".dot");
3001}
3002
3003template <typename DerivedCCG, typename FuncTy, typename CallTy>
3004typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
3005CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
3006 const std::shared_ptr<ContextEdge> &Edge, EdgeIter *CallerEdgeI,
3007 DenseSet<uint32_t> ContextIdsToMove) {
3008 ContextNode *Node = Edge->Callee;
3009 assert(NodeToCallingFunc.count(Node));
3010 ContextNode *Clone =
3011 createNewNode(Node->IsAllocation, NodeToCallingFunc[Node], Node->Call);
3012 Node->addClone(Clone);
3013 Clone->MatchingCalls = Node->MatchingCalls;
3014 moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true,
3015 ContextIdsToMove);
3016 return Clone;
3017}
3018
3019template <typename DerivedCCG, typename FuncTy, typename CallTy>
3020void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3021 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
3022 ContextNode *NewCallee, EdgeIter *CallerEdgeI,
3023 bool NewClone,
3024 DenseSet<uint32_t> ContextIdsToMove) {
3025 // NewCallee and Edge's current callee must be clones of the same original
3026 // node (Edge's current callee may be the original node too).
3027 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
3028
3029 ContextNode *OldCallee = Edge->Callee;
3030
3031 // We might already have an edge to the new callee from earlier cloning for a
3032 // different allocation. If one exists we will reuse it.
3033 auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
3034
3035   // Callers will pass an empty ContextIdsToMove set when they want to move the
3036   // entire edge. Copy in Edge's ids for simplicity.
3037 if (ContextIdsToMove.empty())
3038 ContextIdsToMove = Edge->getContextIds();
3039
3040 // If we are moving all of Edge's ids, then just move the whole Edge.
3041 // Otherwise only move the specified subset, to a new edge if needed.
3042 if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
3043 // First, update the alloc types on New Callee from Edge.
3044 // Do this before we potentially clear Edge's fields below!
3045 NewCallee->AllocTypes |= Edge->AllocTypes;
3046 // Moving the whole Edge.
3047 if (ExistingEdgeToNewCallee) {
3048 // Since we already have an edge to NewCallee, simply move the ids
3049 // onto it, and remove the existing Edge.
3050 ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),
3051 ContextIdsToMove.end());
3052 ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
3053 assert(Edge->ContextIds == ContextIdsToMove);
3054 removeEdgeFromGraph(Edge.get(), CallerEdgeI, /*CalleeIter=*/false);
3055 } else {
3056 // Otherwise just reconnect Edge to NewCallee.
3057 Edge->Callee = NewCallee;
3058 NewCallee->CallerEdges.push_back(Edge);
3059 // Remove it from callee where it was previously connected.
3060 if (CallerEdgeI)
3061 *CallerEdgeI = OldCallee->CallerEdges.erase(*CallerEdgeI);
3062 else
3063 OldCallee->eraseCallerEdge(Edge.get());
3064 // Don't need to update Edge's context ids since we are simply
3065 // reconnecting it.
3066 }
3067 } else {
3068 // Only moving a subset of Edge's ids.
3069 if (CallerEdgeI)
3070 ++CallerEdgeI;
3071 // Compute the alloc type of the subset of ids being moved.
3072 auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
3073 if (ExistingEdgeToNewCallee) {
3074 // Since we already have an edge to NewCallee, simply move the ids
3075 // onto it.
3076 ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),
3077 ContextIdsToMove.end());
3078 ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
3079 } else {
3080 // Otherwise, create a new edge to NewCallee for the ids being moved.
3081 auto NewEdge = std::make_shared<ContextEdge>(
3082 NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
3083 Edge->Caller->CalleeEdges.push_back(NewEdge);
3084 NewCallee->CallerEdges.push_back(NewEdge);
3085 }
3086 // In either case, need to update the alloc types on NewCallee, and remove
3087 // those ids and update the alloc type on the original Edge.
3088 NewCallee->AllocTypes |= CallerEdgeAllocType;
3089 set_subtract(Edge->ContextIds, ContextIdsToMove);
3090 Edge->AllocTypes = computeAllocType(Edge->ContextIds);
3091 }
3092 // Now walk the old callee node's callee edges and move Edge's context ids
3093 // over to the corresponding edge into the clone (which is created here if
3094 // this is a newly created clone).
3095 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
3096 // The context ids moving to the new callee are the subset of this edge's
3097 // context ids and the context ids on the caller edge being moved.
3098 DenseSet<uint32_t> EdgeContextIdsToMove =
3099 set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
3100 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
3101 OldCalleeEdge->AllocTypes =
3102 computeAllocType(OldCalleeEdge->getContextIds());
3103 if (!NewClone) {
3104 // Update context ids / alloc type on corresponding edge to NewCallee.
3105 // There is a chance this may not exist if we are reusing an existing
3106 // clone, specifically during function assignment, where we would have
3107 // removed none type edges after creating the clone. If we can't find
3108 // a corresponding edge there, fall through to the cloning below.
3109 if (auto *NewCalleeEdge =
3110 NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) {
3111 NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
3112 EdgeContextIdsToMove.end());
3113 NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
3114 continue;
3115 }
3116 }
3117 auto NewEdge = std::make_shared<ContextEdge>(
3118 OldCalleeEdge->Callee, NewCallee,
3119 computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
3120 NewCallee->CalleeEdges.push_back(NewEdge);
3121 NewEdge->Callee->CallerEdges.push_back(NewEdge);
3122 }
3123 // Recompute the node alloc type now that its callee edges have been
3124 // updated (since we will compute from those edges).
3125 OldCallee->AllocTypes = OldCallee->computeAllocType();
3126 // OldCallee alloc type should be None iff its context id set is now empty.
3127 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
3128 OldCallee->emptyContextIds());
3129 if (VerifyCCG) {
3130 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
3131 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
3132 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
3133 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
3134 /*CheckEdges=*/false);
3135 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
3136 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
3137 /*CheckEdges=*/false);
3138 }
3139}
3140
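// A minimal sketch of the context id bookkeeping used above (values are
// hypothetical): if Edge carries ContextIds = {1, 2, 3} and the caller asked
// to move only {2, 3}, then
//   set_intersection(OldCalleeEdge->getContextIds(), {2, 3}) // ids that follow
//   set_subtract(Edge->ContextIds, {2, 3});                  // leaves {1}
// and the alloc types on the affected nodes and edges are recomputed from the
// ids that remain, so an edge becomes None exactly when its id set is empty.
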
3141template <typename DerivedCCG, typename FuncTy, typename CallTy>
3142void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3143 moveCalleeEdgeToNewCaller(EdgeIter &CalleeEdgeI, ContextNode *NewCaller) {
3144 auto Edge = *CalleeEdgeI;
3145
3146 ContextNode *OldCaller = Edge->Caller;
3147
3148 // We might already have an edge to the new caller. If one exists we will
3149 // reuse it.
3150 auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(Edge->Callee);
3151
3152 CalleeEdgeI = OldCaller->CalleeEdges.erase(CalleeEdgeI);
3153 if (ExistingEdgeToNewCaller) {
3154 // Since we already have an edge to NewCaller, simply move the ids
3155 // onto it, and remove the existing Edge.
3156 ExistingEdgeToNewCaller->getContextIds().insert(
3157 Edge->getContextIds().begin(), Edge->getContextIds().end());
3158 ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes;
3159 Edge->ContextIds.clear();
3160 Edge->AllocTypes = (uint8_t)AllocationType::None;
3161 Edge->Callee->eraseCallerEdge(Edge.get());
3162 } else {
3163 // Otherwise just reconnect Edge to NewCaller.
3164 Edge->Caller = NewCaller;
3165 NewCaller->CalleeEdges.push_back(Edge);
3166 // Don't need to update Edge's context ids since we are simply
3167 // reconnecting it.
3168 }
3169 // In either case, need to update the alloc types on New Caller.
3170 NewCaller->AllocTypes |= Edge->AllocTypes;
3171
3172 // Now walk the old caller node's caller edges and move Edge's context ids
3173 // over to the corresponding edge into the node (which is created here if
3174 // this is a newly created node). We can tell whether this is a newly created
3175 // node by seeing if it has any caller edges yet.
3176#ifndef NDEBUG
3177 bool IsNewNode = NewCaller->CallerEdges.empty();
3178#endif
3179 for (auto &OldCallerEdge : OldCaller->CallerEdges) {
3180 // The context ids moving to the new caller are the subset of this edge's
3181 // context ids and the context ids on the callee edge being moved.
3182 DenseSet<uint32_t> EdgeContextIdsToMove =
3183 set_intersection(OldCallerEdge->getContextIds(), Edge->getContextIds());
3184 set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
3185 OldCallerEdge->AllocTypes =
3186 computeAllocType(OldCallerEdge->getContextIds());
3187 // In this function we expect that any pre-existing node already has edges
3188 // from the same callers as the old node. That should be true in the current
3189 // use case, where we will remove None-type edges after copying over all
3190 // caller edges from the callee.
3191 auto *ExistingCallerEdge =
3192 NewCaller->findEdgeFromCaller(OldCallerEdge->Caller);
3193 assert(IsNewNode || ExistingCallerEdge);
3194 if (ExistingCallerEdge) {
3195 ExistingCallerEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
3196 EdgeContextIdsToMove.end());
3197 ExistingCallerEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
3198 continue;
3199 }
3200 auto NewEdge = std::make_shared<ContextEdge>(
3201 NewCaller, OldCallerEdge->Caller,
3202 computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
3203 NewCaller->CallerEdges.push_back(NewEdge);
3204 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
3205 }
3206 // Recompute the node alloc type now that its caller edges have been
3207 // updated (since we will compute from those edges).
3208 OldCaller->AllocTypes = OldCaller->computeAllocType();
3209 // OldCaller alloc type should be None iff its context id set is now empty.
3210 assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) ==
3211 OldCaller->emptyContextIds());
3212 if (VerifyCCG) {
3213 checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false);
3214 checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false);
3215 for (const auto &OldCallerEdge : OldCaller->CallerEdges)
3216 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller,
3217 /*CheckEdges=*/false);
3218 for (const auto &NewCallerEdge : NewCaller->CallerEdges)
3219 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller,
3220 /*CheckEdges=*/false);
3221 }
3222}
3223
3224template <typename DerivedCCG, typename FuncTy, typename CallTy>
3225void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3226 recursivelyRemoveNoneTypeCalleeEdges(
3227 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
3228 auto Inserted = Visited.insert(Node);
3229 if (!Inserted.second)
3230 return;
3231
3232 removeNoneTypeCalleeEdges(Node);
3233
3234 for (auto *Clone : Node->Clones)
3235 recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
3236
3237 // The recursive call may remove some of this Node's caller edges.
3238 // Iterate over a copy and skip any that were removed.
3239 auto CallerEdges = Node->CallerEdges;
3240 for (auto &Edge : CallerEdges) {
3241 // Skip any that have been removed by an earlier recursive call.
3242 if (Edge->isRemoved()) {
3243 assert(!is_contained(Node->CallerEdges, Edge));
3244 continue;
3245 }
3246 recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
3247 }
3248}
3249
3250template <typename DerivedCCG, typename FuncTy, typename CallTy>
3251void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
3252   DenseSet<const ContextNode *> Visited;
3253   for (auto &Entry : AllocationCallToContextNodeMap) {
3254 Visited.clear();
3255 identifyClones(Entry.second, Visited, Entry.second->getContextIds());
3256 }
3257 Visited.clear();
3258 for (auto &Entry : AllocationCallToContextNodeMap)
3259 recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
3260 if (VerifyCCG)
3261 check();
3262}
3263
3264 // Helper function to check if an AllocType is cold or notcold or both.
3265 bool checkColdOrNotCold(uint8_t AllocType) {
3266   return (AllocType == (uint8_t)AllocationType::Cold) ||
3267          (AllocType == (uint8_t)AllocationType::NotCold) ||
3268          (AllocType ==
3269           ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
3270 }
3271
3272template <typename DerivedCCG, typename FuncTy, typename CallTy>
3273void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
3274 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3275 const DenseSet<uint32_t> &AllocContextIds) {
3276 if (VerifyNodes)
3277 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3278 assert(!Node->CloneOf);
3279
3280   // If Node has a null call, then either it wasn't found in the module (regular
3281   // LTO) or summary index (ThinLTO), or there were other conditions blocking
3282   // cloning (e.g. recursion, calls to multiple targets, etc.).
3283 // Do this here so that we don't try to recursively clone callers below, which
3284 // isn't useful at least for this node.
3285 if (!Node->hasCall())
3286 return;
3287
3288#ifndef NDEBUG
3289 auto Insert =
3290#endif
3291 Visited.insert(Node);
3292 // We should not have visited this node yet.
3293 assert(Insert.second);
3294 // The recursive call to identifyClones may delete the current edge from the
3295 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
3296 // in an iterator and having recursive call erase from it. Other edges may
3297 // also get removed during the recursion, which will have null Callee and
3298 // Caller pointers (and are deleted later), so we skip those below.
3299 {
3300 auto CallerEdges = Node->CallerEdges;
3301 for (auto &Edge : CallerEdges) {
3302 // Skip any that have been removed by an earlier recursive call.
3303 if (Edge->isRemoved()) {
3304 assert(!is_contained(Node->CallerEdges, Edge));
3305 continue;
3306 }
3307 // Ignore any caller we previously visited via another edge.
3308 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
3309 identifyClones(Edge->Caller, Visited, AllocContextIds);
3310 }
3311 }
3312 }
3313
3314   // Check if we reached an unambiguous call or have only a single caller.
3315 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3316 return;
3317
3318 // We need to clone.
3319
3320 // Try to keep the original version as alloc type NotCold. This will make
3321 // cases with indirect calls or any other situation with an unknown call to
3322 // the original function get the default behavior. We do this by sorting the
3323 // CallerEdges of the Node we will clone by alloc type.
3324 //
3325 // Give NotCold edge the lowest sort priority so those edges are at the end of
3326 // the caller edges vector, and stay on the original version (since the below
3327 // code clones greedily until it finds all remaining edges have the same type
3328 // and leaves the remaining ones on the original Node).
3329 //
3330 // We shouldn't actually have any None type edges, so the sorting priority for
3331 // that is arbitrary, and we assert in that case below.
3332 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
3333 /*Cold*/ 1,
3334 /*NotColdCold*/ 2};
3335 std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(),
3336 [&](const std::shared_ptr<ContextEdge> &A,
3337 const std::shared_ptr<ContextEdge> &B) {
3338 // Nodes with non-empty context ids should be sorted before
3339 // those with empty context ids.
3340 if (A->ContextIds.empty())
3341 // Either B ContextIds are non-empty (in which case we
3342 // should return false because B < A), or B ContextIds
3343 // are empty, in which case they are equal, and we should
3344 // maintain the original relative ordering.
3345 return false;
3346 if (B->ContextIds.empty())
3347 return true;
3348
3349 if (A->AllocTypes == B->AllocTypes)
3350 // Use the first context id for each edge as a
3351 // tie-breaker.
3352 return *A->ContextIds.begin() < *B->ContextIds.begin();
3353 return AllocTypeCloningPriority[A->AllocTypes] <
3354 AllocTypeCloningPriority[B->AllocTypes];
3355 });
3356
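  // For illustration (hypothetical edges): with the priorities above, caller
  // edges whose alloc types are NotCold, Cold and NotCold|Cold sort as
  //   Cold (priority 1) < NotCold|Cold (2) < NotCold (4),
  // so cold-only callers are peeled off into clones first and the NotCold
  // callers tend to remain on the original node.
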
3357 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3358
3359 // Iterate until we find no more opportunities for disambiguating the alloc
3360 // types via cloning. In most cases this loop will terminate once the Node
3361 // has a single allocation type, in which case no more cloning is needed.
3362 // We need to be able to remove Edge from CallerEdges, so need to adjust
3363 // iterator inside the loop.
3364 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
3365 auto CallerEdge = *EI;
3366
3367 // See if cloning the prior caller edge left this node with a single alloc
3368 // type or a single caller. In that case no more cloning of Node is needed.
3369 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3370 break;
3371
3372 // If the caller was not successfully matched to a call in the IR/summary,
3373 // there is no point in trying to clone for it as we can't update that call.
3374 if (!CallerEdge->Caller->hasCall()) {
3375 ++EI;
3376 continue;
3377 }
3378
3379 // Only need to process the ids along this edge pertaining to the given
3380 // allocation.
3381 auto CallerEdgeContextsForAlloc =
3382 set_intersection(CallerEdge->getContextIds(), AllocContextIds);
3383 if (CallerEdgeContextsForAlloc.empty()) {
3384 ++EI;
3385 continue;
3386 }
3387 auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
3388
3389 // Compute the node callee edge alloc types corresponding to the context ids
3390 // for this caller edge.
3391 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
3392 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
3393 for (auto &CalleeEdge : Node->CalleeEdges)
3394 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
3395 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
3396
3397 // Don't clone if doing so will not disambiguate any alloc types amongst
3398 // caller edges (including the callee edges that would be cloned).
3399 // Otherwise we will simply move all edges to the clone.
3400 //
3401 // First check if by cloning we will disambiguate the caller allocation
3402 // type from node's allocation type. Query allocTypeToUse so that we don't
3403 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
3404 // neither of these should be None type.
3405 //
3406 // Then check if by cloning node at least one of the callee edges will be
3407 // disambiguated by splitting out different context ids.
3408 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
3409 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3410 if (allocTypeToUse(CallerAllocTypeForAlloc) ==
3411 allocTypeToUse(Node->AllocTypes) &&
3412 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
3413 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
3414 ++EI;
3415 continue;
3416 }
3417
3418 // First see if we can use an existing clone. Check each clone and its
3419 // callee edges for matching alloc types.
3420 ContextNode *Clone = nullptr;
3421 for (auto *CurClone : Node->Clones) {
3422 if (allocTypeToUse(CurClone->AllocTypes) !=
3423 allocTypeToUse(CallerAllocTypeForAlloc))
3424 continue;
3425
3426 bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) &&
3427 hasSingleAllocType(CallerAllocTypeForAlloc);
3428 // The above check should mean that if both have single alloc types that
3429 // they should be equal.
3430 assert(!BothSingleAlloc ||
3431 CurClone->AllocTypes == CallerAllocTypeForAlloc);
3432
3433 // If either both have a single alloc type (which are the same), or if the
3434 // clone's callee edges have the same alloc types as those for the current
3435 // allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge),
3436 // then we can reuse this clone.
3437 if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>(
3438 CalleeEdgeAllocTypesForCallerEdge, CurClone)) {
3439 Clone = CurClone;
3440 break;
3441 }
3442 }
3443
3444 // The edge iterator is adjusted when we move the CallerEdge to the clone.
3445 if (Clone)
3446 moveEdgeToExistingCalleeClone(CallerEdge, Clone, &EI, /*NewClone=*/false,
3447 CallerEdgeContextsForAlloc);
3448 else
3449 Clone =
3450 moveEdgeToNewCalleeClone(CallerEdge, &EI, CallerEdgeContextsForAlloc);
3451
3452 assert(EI == Node->CallerEdges.end() ||
3453 Node->AllocTypes != (uint8_t)AllocationType::None);
3454 // Sanity check that no alloc types on clone or its edges are None.
3455 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
3456 }
3457
3458 // We should still have some context ids on the original Node.
3459 assert(!Node->emptyContextIds());
3460
3461 // Sanity check that no alloc types on node or edges are None.
3462 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3463
3464 if (VerifyNodes)
3465 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3466}
3467
3468void ModuleCallsiteContextGraph::updateAllocationCall(
3469     CallInfo &Call, AllocationType AllocType) {
3470   std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
3471 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
3472 "memprof", AllocTypeString);
3473 cast<CallBase>(Call.call())->addFnAttr(A);
3474 OREGetter(Call.call()->getFunction())
3475 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
3476 << ore::NV("AllocationCall", Call.call()) << " in clone "
3477 << ore::NV("Caller", Call.call()->getFunction())
3478 << " marked with memprof allocation attribute "
3479 << ore::NV("Attribute", AllocTypeString));
3480}
3481
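// For illustration, after the regular-LTO updateAllocationCall above, a
// (hypothetical) annotated allocation call site in textual IR looks like:
//
//   %call = call ptr @_Znwm(i64 8) #2
//   ...
//   attributes #2 = { "memprof"="cold" }
//
// The ThinLTO variant below records the same decision by writing the alloc
// type into the clone's slot of the summary AllocInfo::Versions array instead.
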
3482void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
3483                                                      AllocationType AllocType) {
3484   auto *AI = Call.call().dyn_cast<AllocInfo *>();
3485 assert(AI);
3486 assert(AI->Versions.size() > Call.cloneNo());
3487 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
3488}
3489
3490void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
3491 FuncInfo CalleeFunc) {
3492 if (CalleeFunc.cloneNo() > 0)
3493 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
3494 OREGetter(CallerCall.call()->getFunction())
3495 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
3496 << ore::NV("Call", CallerCall.call()) << " in clone "
3497 << ore::NV("Caller", CallerCall.call()->getFunction())
3498 << " assigned to call function clone "
3499 << ore::NV("Callee", CalleeFunc.func()));
3500}
3501
3502void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
3503 FuncInfo CalleeFunc) {
3504 auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>();
3505 assert(CI &&
3506 "Caller cannot be an allocation which should not have profiled calls");
3507 assert(CI->Clones.size() > CallerCall.cloneNo());
3508 CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
3509}
3510
3511CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
3512 Instruction *>::FuncInfo
3513ModuleCallsiteContextGraph::cloneFunctionForCallsite(
3514 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
3515 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
3516 // Use existing LLVM facilities for cloning and obtaining Call in clone
3517 ValueToValueMapTy VMap;
3518 auto *NewFunc = CloneFunction(Func.func(), VMap);
3519 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
3520 assert(!Func.func()->getParent()->getFunction(Name));
3521 NewFunc->setName(Name);
3522 for (auto &Inst : CallsWithMetadataInFunc) {
3523 // This map always has the initial version in it.
3524 assert(Inst.cloneNo() == 0);
3525 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
3526 }
3527 OREGetter(Func.func())
3528 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
3529 << "created clone " << ore::NV("NewFunction", NewFunc));
3530 return {NewFunc, CloneNo};
3531}
3532
3533CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
3534 IndexCall>::FuncInfo
3535IndexCallsiteContextGraph::cloneFunctionForCallsite(
3536 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
3537 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
3538 // Check how many clones we have of Call (and therefore function).
3539 // The next clone number is the current size of versions array.
3540 // Confirm this matches the CloneNo provided by the caller, which is based on
3541 // the number of function clones we have.
3542 assert(CloneNo ==
3543 (Call.call().is<AllocInfo *>()
3544 ? Call.call().dyn_cast<AllocInfo *>()->Versions.size()
3545 : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size()));
3546 // Walk all the instructions in this function. Create a new version for
3547 // each (by adding an entry to the Versions/Clones summary array), and copy
3548 // over the version being called for the function clone being cloned here.
3549 // Additionally, add an entry to the CallMap for the new function clone,
3550 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
3551 // to the new call clone.
3552 for (auto &Inst : CallsWithMetadataInFunc) {
3553 // This map always has the initial version in it.
3554 assert(Inst.cloneNo() == 0);
3555 if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) {
3556 assert(AI->Versions.size() == CloneNo);
3557 // We assign the allocation type later (in updateAllocationCall), just add
3558 // an entry for it here.
3559 AI->Versions.push_back(0);
3560 } else {
3561 auto *CI = Inst.call().dyn_cast<CallsiteInfo *>();
3562 assert(CI && CI->Clones.size() == CloneNo);
3563 // We assign the clone number later (in updateCall), just add an entry for
3564 // it here.
3565 CI->Clones.push_back(0);
3566 }
3567 CallMap[Inst] = {Inst.call(), CloneNo};
3568 }
3569 return {Func.func(), CloneNo};
3570}
3571
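// For illustration: cloning a function summary for CloneNo == 1 simply grows
// the per-call summary arrays, e.g. a (hypothetical) AllocInfo with
// Versions == [notcold] becomes [notcold, 0] and a CallsiteInfo with
// Clones == [0] becomes [0, 0]; the newly added slots are filled in later by
// updateAllocationCall and updateCall respectively.
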
3572// This method assigns cloned callsites to functions, cloning the functions as
3573// needed. The assignment is greedy and proceeds roughly as follows:
3574//
3575// For each function Func:
3576// For each call with graph Node having clones:
3577// Initialize ClonesWorklist to Node and its clones
3578// Initialize NodeCloneCount to 0
3579// While ClonesWorklist is not empty:
3580// Clone = pop front ClonesWorklist
3581// NodeCloneCount++
3582// If Func has been cloned less than NodeCloneCount times:
3583// If NodeCloneCount is 1:
3584// Assign Clone to original Func
3585// Continue
3586// Create a new function clone
3587// If other callers not assigned to call a function clone yet:
3588// Assign them to call new function clone
3589// Continue
3590// Assign any other caller calling the cloned version to new clone
3591//
3592// For each caller of Clone:
3593// If caller is assigned to call a specific function clone:
3594// If we cannot assign Clone to that function clone:
3595// Create new callsite Clone NewClone
3596// Add NewClone to ClonesWorklist
3597// Continue
3598// Assign Clone to existing caller's called function clone
3599// Else:
3600// If Clone not already assigned to a function clone:
3601// Assign to first function clone without assignment
3602// Assign caller to selected function clone
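//
// For illustration (hypothetical names): suppose foo() contains one callsite
// node C that ended up with two clones (C and C') because its contexts reach
// both a cold and a not-cold allocation. The first worklist entry assigns C to
// the original foo(); when C' is processed only one copy of foo() exists, so a
// new function clone is created via cloneFunctionForCallsite (named through
// getMemProfFuncName, e.g. foo.memprof.1) and C' is assigned to it. Callers
// reaching C' are recorded in CallsiteToCalleeFuncCloneMap as calling that
// clone, which may in turn force additional callsite cloning in their own
// functions on later iterations.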
3603template <typename DerivedCCG, typename FuncTy, typename CallTy>
3604bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
3605 bool Changed = false;
3606
3607 // Keep track of the assignment of nodes (callsites) to function clones they
3608 // call.
3609 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
3610
3611 // Update caller node to call function version CalleeFunc, by recording the
3612 // assignment in CallsiteToCalleeFuncCloneMap.
3613 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
3614 const FuncInfo &CalleeFunc) {
3615 assert(Caller->hasCall());
3616 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
3617 };
3618
3619 // Walk all functions for which we saw calls with memprof metadata, and handle
3620 // cloning for each of its calls.
3621 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
3622 FuncInfo OrigFunc(Func);
3623 // Map from each clone of OrigFunc to a map of remappings of each call of
3624 // interest (from original uncloned call to the corresponding cloned call in
3625 // that function clone).
3626 std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
3627 for (auto &Call : CallsWithMetadata) {
3628 ContextNode *Node = getNodeForInst(Call);
3629 // Skip call if we do not have a node for it (all uses of its stack ids
3630 // were either on inlined chains or pruned from the MIBs), or if we did
3631 // not create any clones for it.
3632 if (!Node || Node->Clones.empty())
3633 continue;
3634 assert(Node->hasCall() &&
3635 "Not having a call should have prevented cloning");
3636
3637 // Track the assignment of function clones to clones of the current
3638 // callsite Node being handled.
3639 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
3640
3641 // Assign callsite version CallsiteClone to function version FuncClone,
3642 // and also assign (possibly cloned) Call to CallsiteClone.
3643 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
3644 CallInfo &Call,
3645 ContextNode *CallsiteClone,
3646 bool IsAlloc) {
3647 // Record the clone of callsite node assigned to this function clone.
3648 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
3649
3650 assert(FuncClonesToCallMap.count(FuncClone));
3651 std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
3652 CallInfo CallClone(Call);
3653 if (CallMap.count(Call))
3654 CallClone = CallMap[Call];
3655 CallsiteClone->setCall(CallClone);
3656 // Need to do the same for all matching calls.
3657 for (auto &MatchingCall : Node->MatchingCalls) {
3658 CallInfo CallClone(MatchingCall);
3659 if (CallMap.count(MatchingCall))
3660 CallClone = CallMap[MatchingCall];
3661 // Updates the call in the list.
3662 MatchingCall = CallClone;
3663 }
3664 };
3665
3666 // Keep track of the clones of callsite Node that need to be assigned to
3667 // function clones. This list may be expanded in the loop body below if we
3668 // find additional cloning is required.
3669 std::deque<ContextNode *> ClonesWorklist;
3670 // Ignore original Node if we moved all of its contexts to clones.
3671 if (!Node->emptyContextIds())
3672 ClonesWorklist.push_back(Node);
3673 ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(),
3674 Node->Clones.end());
3675
3676 // Now walk through all of the clones of this callsite Node that we need,
3677 // and determine the assignment to a corresponding clone of the current
3678 // function (creating new function clones as needed).
3679 unsigned NodeCloneCount = 0;
3680 while (!ClonesWorklist.empty()) {
3681 ContextNode *Clone = ClonesWorklist.front();
3682 ClonesWorklist.pop_front();
3683 NodeCloneCount++;
3684 if (VerifyNodes)
3685 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
3686
3687 // Need to create a new function clone if we have more callsite clones
3688 // than existing function clones, which would have been assigned to an
3689 // earlier clone in the list (we assign callsite clones to function
3690 // clones greedily).
3691 if (FuncClonesToCallMap.size() < NodeCloneCount) {
3692 // If this is the first callsite copy, assign to original function.
3693 if (NodeCloneCount == 1) {
3694 // Since FuncClonesToCallMap is empty in this case, no clones have
3695 // been created for this function yet, and no callers should have
3696 // been assigned a function clone for this callee node yet.
3697             assert(llvm::none_of(
3698                 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
3699 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
3700 }));
3701 // Initialize with empty call map, assign Clone to original function
3702 // and its callers, and skip to the next clone.
3703 FuncClonesToCallMap[OrigFunc] = {};
3704 AssignCallsiteCloneToFuncClone(
3705 OrigFunc, Call, Clone,
3706 AllocationCallToContextNodeMap.count(Call));
3707 for (auto &CE : Clone->CallerEdges) {
3708 // Ignore any caller that does not have a recorded callsite Call.
3709 if (!CE->Caller->hasCall())
3710 continue;
3711 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
3712 }
3713 continue;
3714 }
3715
3716 // First locate which copy of OrigFunc to clone again. If a caller
3717 // of this callsite clone was already assigned to call a particular
3718 // function clone, we need to redirect all of those callers to the
3719 // new function clone, and update their other callees within this
3720 // function.
3721 FuncInfo PreviousAssignedFuncClone;
3722 auto EI = llvm::find_if(
3723 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
3724 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
3725 });
3726 bool CallerAssignedToCloneOfFunc = false;
3727 if (EI != Clone->CallerEdges.end()) {
3728 const std::shared_ptr<ContextEdge> &Edge = *EI;
3729 PreviousAssignedFuncClone =
3730 CallsiteToCalleeFuncCloneMap[Edge->Caller];
3731 CallerAssignedToCloneOfFunc = true;
3732 }
3733
3734 // Clone function and save it along with the CallInfo map created
3735 // during cloning in the FuncClonesToCallMap.
3736 std::map<CallInfo, CallInfo> NewCallMap;
3737 unsigned CloneNo = FuncClonesToCallMap.size();
3738 assert(CloneNo > 0 && "Clone 0 is the original function, which "
3739 "should already exist in the map");
3740 FuncInfo NewFuncClone = cloneFunctionForCallsite(
3741 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
3742 FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
3743 FunctionClonesAnalysis++;
3744 Changed = true;
3745
3746 // If no caller callsites were already assigned to a clone of this
3747 // function, we can simply assign this clone to the new func clone
3748 // and update all callers to it, then skip to the next clone.
3749 if (!CallerAssignedToCloneOfFunc) {
3750 AssignCallsiteCloneToFuncClone(
3751 NewFuncClone, Call, Clone,
3752 AllocationCallToContextNodeMap.count(Call));
3753 for (auto &CE : Clone->CallerEdges) {
3754 // Ignore any caller that does not have a recorded callsite Call.
3755 if (!CE->Caller->hasCall())
3756 continue;
3757 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
3758 }
3759 continue;
3760 }
3761
3762 // We may need to do additional node cloning in this case.
3763 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
3764 // that were previously assigned to call PreviousAssignedFuncClone,
3765 // to record that they now call NewFuncClone.
3766 // The none type edge removal may remove some of this Clone's caller
3767 // edges, if it is reached via another of its caller's callees.
3768 // Iterate over a copy and skip any that were removed.
3769 auto CallerEdges = Clone->CallerEdges;
3770 for (auto CE : CallerEdges) {
3771 // Skip any that have been removed on an earlier iteration.
3772 if (CE->isRemoved()) {
3773 assert(!is_contained(Clone->CallerEdges, CE));
3774 continue;
3775 }
3776 assert(CE);
3777 // Ignore any caller that does not have a recorded callsite Call.
3778 if (!CE->Caller->hasCall())
3779 continue;
3780
3781 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
3782 // We subsequently fall through to later handling that
3783 // will perform any additional cloning required for
3784 // callers that were calling other function clones.
3785 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
3786 PreviousAssignedFuncClone)
3787 continue;
3788
3789 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
3790
3791 // If we are cloning a function that was already assigned to some
3792 // callers, then essentially we are creating new callsite clones
3793 // of the other callsites in that function that are reached by those
3794 // callers. Clone the other callees of the current callsite's caller
3795 // that were already assigned to PreviousAssignedFuncClone
3796 // accordingly. This is important since we subsequently update the
3797 // calls from the nodes in the graph and their assignments to callee
3798 // functions recorded in CallsiteToCalleeFuncCloneMap.
3799 // The none type edge removal may remove some of this caller's
3800 // callee edges, if it is reached via another of its callees.
3801 // Iterate over a copy and skip any that were removed.
3802 auto CalleeEdges = CE->Caller->CalleeEdges;
3803 for (auto CalleeEdge : CalleeEdges) {
3804 // Skip any that have been removed on an earlier iteration when
3805 // cleaning up newly None type callee edges.
3806 if (CalleeEdge->isRemoved()) {
3807 assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge));
3808 continue;
3809 }
3810 assert(CalleeEdge);
3811 ContextNode *Callee = CalleeEdge->Callee;
3812 // Skip the current callsite, we are looking for other
3813 // callsites Caller calls, as well as any that does not have a
3814 // recorded callsite Call.
3815 if (Callee == Clone || !Callee->hasCall())
3816 continue;
3817 ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
3818 removeNoneTypeCalleeEdges(NewClone);
3819 // Moving the edge may have resulted in some none type
3820 // callee edges on the original Callee.
3821 removeNoneTypeCalleeEdges(Callee);
3822 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
3823 // If the Callee node was already assigned to call a specific
3824 // function version, make sure its new clone is assigned to call
3825 // that same function clone.
3826 if (CallsiteToCalleeFuncCloneMap.count(Callee))
3827 RecordCalleeFuncOfCallsite(
3828 NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
3829 // Update NewClone with the new Call clone of this callsite's Call
3830 // created for the new function clone created earlier.
3831 // Recall that we have already ensured when building the graph
3832 // that each caller can only call callsites within the same
3833 // function, so we are guaranteed that Callee Call is in the
3834 // current OrigFunc.
3835 // CallMap is set up as indexed by original Call at clone 0.
3836 CallInfo OrigCall(Callee->getOrigNode()->Call);
3837 OrigCall.setCloneNo(0);
3838 std::map<CallInfo, CallInfo> &CallMap =
3839 FuncClonesToCallMap[NewFuncClone];
3840 assert(CallMap.count(OrigCall));
3841 CallInfo NewCall(CallMap[OrigCall]);
3842 assert(NewCall);
3843 NewClone->setCall(NewCall);
3844 // Need to do the same for all matching calls.
3845 for (auto &MatchingCall : NewClone->MatchingCalls) {
3846 CallInfo OrigMatchingCall(MatchingCall);
3847 OrigMatchingCall.setCloneNo(0);
3848 assert(CallMap.count(OrigMatchingCall));
3849 CallInfo NewCall(CallMap[OrigMatchingCall]);
3850 assert(NewCall);
3851 // Updates the call in the list.
3852 MatchingCall = NewCall;
3853 }
3854 }
3855 }
3856 // Fall through to handling below to perform the recording of the
3857 // function for this callsite clone. This enables handling of cases
3858 // where the callers were assigned to different clones of a function.
3859 }
3860
3861 // See if we can use existing function clone. Walk through
3862 // all caller edges to see if any have already been assigned to
3863 // a clone of this callsite's function. If we can use it, do so. If not,
3864 // because that function clone is already assigned to a different clone
3865 // of this callsite, then we need to clone again.
3866 // Basically, this checking is needed to handle the case where different
3867 // caller functions/callsites may need versions of this function
3868 // containing different mixes of callsite clones across the different
3869 // callsites within the function. If that happens, we need to create
3870 // additional function clones to handle the various combinations.
3871 //
3872 // Keep track of any new clones of this callsite created by the
3873 // following loop, as well as any existing clone that we decided to
3874 // assign this clone to.
3875 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
3876 FuncInfo FuncCloneAssignedToCurCallsiteClone;
3877 // We need to be able to remove Edge from CallerEdges, so need to adjust
3878 // iterator in the loop.
3879 for (auto EI = Clone->CallerEdges.begin();
3880 EI != Clone->CallerEdges.end();) {
3881 auto Edge = *EI;
3882 // Ignore any caller that does not have a recorded callsite Call.
3883 if (!Edge->Caller->hasCall()) {
3884 EI++;
3885 continue;
3886 }
3887 // If this caller already assigned to call a version of OrigFunc, need
3888 // to ensure we can assign this callsite clone to that function clone.
3889 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
3890 FuncInfo FuncCloneCalledByCaller =
3891 CallsiteToCalleeFuncCloneMap[Edge->Caller];
3892 // First we need to confirm that this function clone is available
3893 // for use by this callsite node clone.
3894 //
3895 // While FuncCloneToCurNodeCloneMap is built only for this Node and
3896 // its callsite clones, one of those callsite clones X could have
3897 // been assigned to the same function clone called by Edge's caller
3898 // - if Edge's caller calls another callsite within Node's original
3899 // function, and that callsite has another caller reaching clone X.
3900 // We need to clone Node again in this case.
3901 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
3902 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
3903 Clone) ||
3904 // Detect when we have multiple callers of this callsite that
3905 // have already been assigned to specific, and different, clones
3906 // of OrigFunc (due to other unrelated callsites in Func they
3907 // reach via call contexts). Is this Clone of callsite Node
3908 // assigned to a different clone of OrigFunc? If so, clone Node
3909 // again.
3910 (FuncCloneAssignedToCurCallsiteClone &&
3911 FuncCloneAssignedToCurCallsiteClone !=
3912 FuncCloneCalledByCaller)) {
3913 // We need to use a different newly created callsite clone, in
3914 // order to assign it to another new function clone on a
3915 // subsequent iteration over the Clones array (adjusted below).
3916 // Note we specifically do not reset the
3917 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
3918 // when this new clone is processed later we know which version of
3919 // the function to copy (so that other callsite clones we have
3920 // assigned to that function clone are properly cloned over). See
3921 // comments in the function cloning handling earlier.
3922
3923 // Check if we already have cloned this callsite again while
3924 // walking through caller edges, for a caller calling the same
3925 // function clone. If so, we can move this edge to that new clone
3926 // rather than creating yet another new clone.
3927 if (FuncCloneToNewCallsiteCloneMap.count(
3928 FuncCloneCalledByCaller)) {
3929 ContextNode *NewClone =
3930 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
3931 moveEdgeToExistingCalleeClone(Edge, NewClone, &EI);
3932 // Cleanup any none type edges cloned over.
3933 removeNoneTypeCalleeEdges(NewClone);
3934 } else {
3935 // Create a new callsite clone.
3936 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI);
3937 removeNoneTypeCalleeEdges(NewClone);
3938 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
3939 NewClone;
3940 // Add to list of clones and process later.
3941 ClonesWorklist.push_back(NewClone);
3942 assert(EI == Clone->CallerEdges.end() ||
3943 Clone->AllocTypes != (uint8_t)AllocationType::None);
3944 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
3945 }
3946 // Moving the caller edge may have resulted in some none type
3947 // callee edges.
3948 removeNoneTypeCalleeEdges(Clone);
3949 // We will handle the newly created callsite clone in a subsequent
3950 // iteration over this Node's Clones. Continue here since we
3951 // already adjusted iterator EI while moving the edge.
3952 continue;
3953 }
3954
3955 // Otherwise, we can use the function clone already assigned to this
3956 // caller.
3957 if (!FuncCloneAssignedToCurCallsiteClone) {
3958 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
3959 // Assign Clone to FuncCloneCalledByCaller
3960 AssignCallsiteCloneToFuncClone(
3961 FuncCloneCalledByCaller, Call, Clone,
3962 AllocationCallToContextNodeMap.count(Call));
3963 } else
3964 // Don't need to do anything - callsite is already calling this
3965 // function clone.
3966 assert(FuncCloneAssignedToCurCallsiteClone ==
3967 FuncCloneCalledByCaller);
3968
3969 } else {
3970 // We have not already assigned this caller to a version of
3971 // OrigFunc. Do the assignment now.
3972
3973 // First check if we have already assigned this callsite clone to a
3974 // clone of OrigFunc for another caller during this iteration over
3975 // its caller edges.
3976 if (!FuncCloneAssignedToCurCallsiteClone) {
3977 // Find first function in FuncClonesToCallMap without an assigned
3978 // clone of this callsite Node. We should always have one
3979 // available at this point due to the earlier cloning when the
3980 // FuncClonesToCallMap size was smaller than the clone number.
3981 for (auto &CF : FuncClonesToCallMap) {
3982 if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
3983 FuncCloneAssignedToCurCallsiteClone = CF.first;
3984 break;
3985 }
3986 }
3987 assert(FuncCloneAssignedToCurCallsiteClone);
3988 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
3989 AssignCallsiteCloneToFuncClone(
3990 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
3991 AllocationCallToContextNodeMap.count(Call));
3992 } else
3993 assert(FuncCloneToCurNodeCloneMap
3994 [FuncCloneAssignedToCurCallsiteClone] == Clone);
3995 // Update callers to record function version called.
3996 RecordCalleeFuncOfCallsite(Edge->Caller,
3997 FuncCloneAssignedToCurCallsiteClone);
3998 }
3999
4000 EI++;
4001 }
4002 }
4003 if (VerifyCCG) {
4004 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
4005 for (const auto &PE : Node->CalleeEdges)
4006 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
4007 for (const auto &CE : Node->CallerEdges)
4008 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
4009 for (auto *Clone : Node->Clones) {
4010 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
4011 for (const auto &PE : Clone->CalleeEdges)
4012 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
4013 for (const auto &CE : Clone->CallerEdges)
4014 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
4015 }
4016 }
4017 }
4018 }
4019
4020 auto UpdateCalls = [&](ContextNode *Node,
4021                          DenseSet<const ContextNode *> &Visited,
4022                          auto &&UpdateCalls) {
4023 auto Inserted = Visited.insert(Node);
4024 if (!Inserted.second)
4025 return;
4026
4027 for (auto *Clone : Node->Clones)
4028 UpdateCalls(Clone, Visited, UpdateCalls);
4029
4030 for (auto &Edge : Node->CallerEdges)
4031 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
4032
4033 // Skip if either no call to update, or if we ended up with no context ids
4034 // (we moved all edges onto other clones).
4035 if (!Node->hasCall() || Node->emptyContextIds())
4036 return;
4037
4038 if (Node->IsAllocation) {
4039 updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
4040 assert(Node->MatchingCalls.empty());
4041 return;
4042 }
4043
4044 if (!CallsiteToCalleeFuncCloneMap.count(Node))
4045 return;
4046
4047 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
4048 updateCall(Node->Call, CalleeFunc);
4049 // Update all the matching calls as well.
4050 for (auto &Call : Node->MatchingCalls)
4051 updateCall(Call, CalleeFunc);
4052 };
4053
4054 // Performs DFS traversal starting from allocation nodes to update calls to
4055 // reflect cloning decisions recorded earlier. For regular LTO this will
4056 // update the actual calls in the IR to call the appropriate function clone
4057 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
4058 // are recorded in the summary entries.
4059   DenseSet<const ContextNode *> Visited;
4060   for (auto &Entry : AllocationCallToContextNodeMap)
4061 UpdateCalls(Entry.second, Visited, UpdateCalls);
4062
4063 return Changed;
4064}
4065
4066 static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
4067     Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
4068     std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
4069         &FuncToAliasMap) {
4070 // The first "clone" is the original copy, we should only call this if we
4071 // needed to create new clones.
4072 assert(NumClones > 1);
4073   SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
4074   VMaps.reserve(NumClones - 1);
4075 FunctionsClonedThinBackend++;
4076 for (unsigned I = 1; I < NumClones; I++) {
4077 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
4078 auto *NewF = CloneFunction(&F, *VMaps.back());
4079 FunctionClonesThinBackend++;
4080 // Strip memprof and callsite metadata from clone as they are no longer
4081 // needed.
4082 for (auto &BB : *NewF) {
4083 for (auto &Inst : BB) {
4084 Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
4085 Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
4086 }
4087 }
4088 std::string Name = getMemProfFuncName(F.getName(), I);
4089 auto *PrevF = M.getFunction(Name);
4090 if (PrevF) {
4091       // We might have created this when adjusting a callsite in another
4092 // function. It should be a declaration.
4093 assert(PrevF->isDeclaration());
4094 NewF->takeName(PrevF);
4095 PrevF->replaceAllUsesWith(NewF);
4096 PrevF->eraseFromParent();
4097 } else
4098 NewF->setName(Name);
4099 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
4100 << "created clone " << ore::NV("NewFunction", NewF));
4101
4102 // Now handle aliases to this function, and clone those as well.
4103 if (!FuncToAliasMap.count(&F))
4104 continue;
4105 for (auto *A : FuncToAliasMap[&F]) {
4106 std::string Name = getMemProfFuncName(A->getName(), I);
4107 auto *PrevA = M.getNamedAlias(Name);
4108 auto *NewA = GlobalAlias::create(A->getValueType(),
4109 A->getType()->getPointerAddressSpace(),
4110 A->getLinkage(), Name, NewF);
4111 NewA->copyAttributesFrom(A);
4112 if (PrevA) {
4113 // We might have created this when adjusting a callsite in another
4114 // function. It should be a declaration.
4115 assert(PrevA->isDeclaration());
4116 NewA->takeName(PrevA);
4117 PrevA->replaceAllUsesWith(NewA);
4118 PrevA->eraseFromParent();
4119 }
4120 }
4121 }
4122 return VMaps;
4123}
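getMemProfFuncName builds clone names from the original symbol name, the MemProfCloneSuffix, and the clone number, with version 0 keeping the original name. A minimal sketch of the resulting naming scheme, assuming the ".memprof." suffix value:

#include <iostream>
#include <string>

int main() {
  std::string Orig = "foo";
  unsigned NumClones = 3;
  // Version 0 keeps the original name; versions 1..NumClones-1 get a suffix.
  for (unsigned I = 1; I < NumClones; I++)
    std::cout << Orig + ".memprof." + std::to_string(I) << "\n";
  // Expected output: foo.memprof.1
  //                  foo.memprof.2
}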
4124
4125// Locate the summary for F. This is complicated by the fact that it might
4126// have been internalized or promoted.
4127 static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
4128 const ModuleSummaryIndex *ImportSummary) {
4129 // FIXME: Ideally we would retain the original GUID in some fashion on the
4130 // function (e.g. as metadata), but for now do our best to locate the
4131 // summary without that information.
4132 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
4133 if (!TheFnVI)
4134 // See if the function was internalized, by checking the index directly with
4135 // original name (this avoids the name adjustment done by getGUID() for
4136 // internal symbols).
4137 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName()));
4138 if (TheFnVI)
4139 return TheFnVI;
4140 // Now query with the original name before any promotion was performed.
4141 StringRef OrigName =
4142 ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
4143 std::string OrigId = GlobalValue::getGlobalIdentifier(
4144 OrigName, GlobalValue::InternalLinkage, M.getSourceFileName());
4145 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
4146 if (TheFnVI)
4147 return TheFnVI;
4148 // Could be a promoted local imported from another module. We need to pass
4149 // down more info here to find the original module id. For now, try with
4150 // the OrigName which might have been stored in the OidGuidMap in the
4151 // index. This would not work if there were same-named locals in multiple
4152 // modules, however.
4153 auto OrigGUID =
4154 ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName));
4155 if (OrigGUID)
4156 TheFnVI = ImportSummary->getValueInfo(OrigGUID);
4157 return TheFnVI;
4158}
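findValueInfoForFunc tries several GUID derivations in order (the current name, the unadjusted name for internalized symbols, then the pre-promotion original name via the OidGuidMap). A rough sketch of the same try-in-order lookup pattern, with a plain map and hypothetical key strings standing in for the summary index and GUIDs:

#include <iostream>
#include <map>
#include <optional>
#include <string>
#include <vector>

// Illustrative only: try a sequence of derived keys until one resolves.
static std::optional<int> lookupFirst(const std::map<std::string, int> &Index,
                                      const std::vector<std::string> &Keys) {
  for (const auto &Key : Keys)
    if (auto It = Index.find(Key); It != Index.end())
      return It->second;
  return std::nullopt;
}

int main() {
  // Hypothetical keys; not LLVM's actual identifier format.
  std::map<std::string, int> Index = {{"internal$foo", 42}};
  auto VI = lookupFirst(Index, {"foo", "internal$foo"});
  std::cout << (VI ? *VI : -1) << "\n"; // prints 42
}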
4159
4160bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo(
4161 Module &M) {
4162 ICallAnalysis = std::make_unique<ICallPromotionAnalysis>();
4163 Symtab = std::make_unique<InstrProfSymtab>();
4164 // Don't add canonical names, to avoid adding multiple functions to the
4165 // symtab when they share the same root name after "." suffixes are stripped.
4166 // If we pick the wrong one then this could lead to incorrect ICP and calling
4167 // a memprof clone that we don't actually create (resulting in linker unsats).
4168 // What this means is that the GUID of the function (or its PGOFuncName
4169 // metadata) *must* match that in the VP metadata to allow promotion.
4170 // In practice this should not be a limitation, since local functions should
4171 // have PGOFuncName metadata and global function names shouldn't need any
4172 // special handling (they should not get the ".llvm.*" suffix that the
4173 // canonicalization handling is attempting to strip).
4174 if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) {
4175 std::string SymtabFailure = toString(std::move(E));
4176 M.getContext().emitError("Failed to create symtab: " + SymtabFailure);
4177 return false;
4178 }
4179 return true;
4180}
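The comment above is about canonical-name collisions: two distinct symbols that differ only by a "."-suffix would collapse to the same canonical root name, and picking the wrong one could direct ICP at a memprof clone that was never created. A small illustration of that collapse, assuming a simple truncate-at-first-dot rule purely for demonstration:

#include <cassert>
#include <string>

// Illustrative only: reduce a symbol to its root name by dropping "." suffixes.
static std::string rootName(const std::string &Name) {
  return Name.substr(0, Name.find('.'));
}

int main() {
  // "foo" and "foo.llvm.123456" collapse to the same root name, which is why
  // canonical names are not added to the symtab here.
  assert(rootName("foo.llvm.123456") == rootName("foo"));
}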
4181
4182bool MemProfContextDisambiguation::applyImport(Module &M) {
4183 assert(ImportSummary);
4184 bool Changed = false;
4185
4186 // We also need to clone any aliases that reference cloned functions, because
4187 // the modified callsites may invoke via the alias. Keep track of the aliases
4188 // for each function.
4189 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
4190 FuncToAliasMap;
4191 for (auto &A : M.aliases()) {
4192 auto *Aliasee = A.getAliaseeObject();
4193 if (auto *F = dyn_cast<Function>(Aliasee))
4194 FuncToAliasMap[F].insert(&A);
4195 }
4196
4197 if (!initializeIndirectCallPromotionInfo(M))
4198 return false;
4199
4200 for (auto &F : M) {
4201 if (F.isDeclaration() || isMemProfClone(F))
4202 continue;
4203
4204 OptimizationRemarkEmitter ORE(&F);
4205
4206 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
4207 bool ClonesCreated = false;
4208 unsigned NumClonesCreated = 0;
4209 auto CloneFuncIfNeeded = [&](unsigned NumClones) {
4210 // We should at least have version 0 which is the original copy.
4211 assert(NumClones > 0);
4212 // If only one copy needed use original.
4213 if (NumClones == 1)
4214 return;
4215 // If we already performed cloning of this function, confirm that the
4216 // requested number of clones matches (the thin link should ensure the
4217 // number of clones for each constituent callsite is consistent within
4218 // each function), before returning.
4219 if (ClonesCreated) {
4220 assert(NumClonesCreated == NumClones);
4221 return;
4222 }
4223 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
4224 // The first "clone" is the original copy, which doesn't have a VMap.
4225 assert(VMaps.size() == NumClones - 1);
4226 Changed = true;
4227 ClonesCreated = true;
4228 NumClonesCreated = NumClones;
4229 };
4230
4231 auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
4232 Function *CalledFunction) {
4233 // Perform cloning if not yet done.
4234 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
4235
4236 assert(!isMemProfClone(*CalledFunction));
4237
4238 // Update the calls per the summary info.
4239 // Save orig name since it gets updated in the first iteration
4240 // below.
4241 auto CalleeOrigName = CalledFunction->getName();
4242 for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
4243 // Do nothing if this version calls the original version of its
4244 // callee.
4245 if (!StackNode.Clones[J])
4246 continue;
4247 auto NewF = M.getOrInsertFunction(
4248 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
4249 CalledFunction->getFunctionType());
4250 CallBase *CBClone;
4251 // Copy 0 is the original function.
4252 if (!J)
4253 CBClone = CB;
4254 else
4255 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
4256 CBClone->setCalledFunction(NewF);
4257 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
4258 << ore::NV("Call", CBClone) << " in clone "
4259 << ore::NV("Caller", CBClone->getFunction())
4260 << " assigned to call function clone "
4261 << ore::NV("Callee", NewF.getCallee()));
4262 }
4263 };
4264
4265 // Locate the summary for F.
4266 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
4267 // If not found, this could be an imported local (see comment in
4268 // findValueInfoForFunc). Skip for now as it will be cloned in its original
4269 // module (where it would have been promoted to global scope so should
4270 // satisfy any reference in this module).
4271 if (!TheFnVI)
4272 continue;
4273
4274 auto *GVSummary =
4275 ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
4276 if (!GVSummary) {
4277 // Must have been imported, use the summary which matches the definition
4278 // (might be multiple if this was a linkonce_odr).
4279 auto SrcModuleMD = F.getMetadata("thinlto_src_module");
4280 assert(SrcModuleMD &&
4281 "enable-import-metadata is needed to emit thinlto_src_module");
4282 StringRef SrcModule =
4283 dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
4284 for (auto &GVS : TheFnVI.getSummaryList()) {
4285 if (GVS->modulePath() == SrcModule) {
4286 GVSummary = GVS.get();
4287 break;
4288 }
4289 }
4290 assert(GVSummary && GVSummary->modulePath() == SrcModule);
4291 }
4292
4293 // If this was an imported alias skip it as we won't have the function
4294 // summary, and it should be cloned in the original module.
4295 if (isa<AliasSummary>(GVSummary))
4296 continue;
4297
4298 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
4299
4300 if (FS->allocs().empty() && FS->callsites().empty())
4301 continue;
4302
4303 auto SI = FS->callsites().begin();
4304 auto AI = FS->allocs().begin();
4305
4306 // To handle callsite infos synthesized for tail calls which have missing
4307 // frames in the profiled context, map callee VI to the synthesized callsite
4308 // info.
4309 DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
4310 // Iterate the callsites for this function in reverse, since we place all
4311 // those synthesized for tail calls at the end.
4312 for (auto CallsiteIt = FS->callsites().rbegin();
4313 CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
4314 auto &Callsite = *CallsiteIt;
4315 // Stop as soon as we see a non-synthesized callsite info (see comment
4316 // above loop). All the entries added for discovered tail calls have empty
4317 // stack ids.
4318 if (!Callsite.StackIdIndices.empty())
4319 break;
4320 MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
4321 }
4322
4323 // Keeps track of needed ICP for the function.
4324 SmallVector<ICallAnalysisData> ICallAnalysisInfo;
4325
4326 // Assume for now that the instructions are in the exact same order
4327 // as when the summary was created, but confirm this is correct by
4328 // matching the stack ids.
4329 for (auto &BB : F) {
4330 for (auto &I : BB) {
4331 auto *CB = dyn_cast<CallBase>(&I);
4332 // Same handling as when creating module summary.
4333 if (!mayHaveMemprofSummary(CB))
4334 continue;
4335
4336 auto *CalledValue = CB->getCalledOperand();
4337 auto *CalledFunction = CB->getCalledFunction();
4338 if (CalledValue && !CalledFunction) {
4339 CalledValue = CalledValue->stripPointerCasts();
4340 // Stripping pointer casts can reveal a called function.
4341 CalledFunction = dyn_cast<Function>(CalledValue);
4342 }
4343 // Check if this is an alias to a function. If so, get the
4344 // called aliasee for the checks below.
4345 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
4346 assert(!CalledFunction &&
4347 "Expected null called function in callsite for alias");
4348 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
4349 }
4350
4351 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
4352 I.getMetadata(LLVMContext::MD_callsite));
4353 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
4354
4355 // Include allocs that were already assigned a memprof function
4356 // attribute in the statistics.
4357 if (CB->getAttributes().hasFnAttr("memprof")) {
4358 assert(!MemProfMD);
4359 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
4360 ? AllocTypeColdThinBackend++
4361 : AllocTypeNotColdThinBackend++;
4362 OrigAllocsThinBackend++;
4363 AllocVersionsThinBackend++;
4364 if (!MaxAllocVersionsThinBackend)
4365 MaxAllocVersionsThinBackend = 1;
4366 continue;
4367 }
4368
4369 if (MemProfMD) {
4370 // Consult the next alloc node.
4371 assert(AI != FS->allocs().end());
4372 auto &AllocNode = *(AI++);
4373
4374 // Sanity check that the MIB stack ids match between the summary and
4375 // instruction metadata.
4376 auto MIBIter = AllocNode.MIBs.begin();
4377 for (auto &MDOp : MemProfMD->operands()) {
4378 assert(MIBIter != AllocNode.MIBs.end());
4379 LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter =
4380 MIBIter->StackIdIndices.begin();
4381 auto *MIBMD = cast<const MDNode>(MDOp);
4382 MDNode *StackMDNode = getMIBStackNode(MIBMD);
4383 assert(StackMDNode);
4384 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
4385 auto ContextIterBegin =
4386 StackContext.beginAfterSharedPrefix(CallsiteContext);
4387 // Skip the checking on the first iteration.
4388 uint64_t LastStackContextId =
4389 (ContextIterBegin != StackContext.end() &&
4390 *ContextIterBegin == 0)
4391 ? 1
4392 : 0;
4393 for (auto ContextIter = ContextIterBegin;
4394 ContextIter != StackContext.end(); ++ContextIter) {
4395 // If this is a direct recursion, simply skip the duplicate
4396 // entries, to be consistent with how the summary ids were
4397 // generated during ModuleSummaryAnalysis.
4398 if (LastStackContextId == *ContextIter)
4399 continue;
4400 LastStackContextId = *ContextIter;
4401 assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
4402 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
4403 *ContextIter);
4404 StackIdIndexIter++;
4405 }
4406 MIBIter++;
4407 }
4408
4409 // Perform cloning if not yet done.
4410 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());
4411
4412 OrigAllocsThinBackend++;
4413 AllocVersionsThinBackend += AllocNode.Versions.size();
4414 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
4415 MaxAllocVersionsThinBackend = AllocNode.Versions.size();
4416
4417 // If there is only one version, that means we didn't end up
4418 // considering this function for cloning, and in that case the alloc
4419 // will still have type None or should have gotten the default NotCold.
4420 // Skip it, but only after calling the clone helper above, since that
4421 // performs sanity checks confirming we haven't yet decided to clone.
4422 if (AllocNode.Versions.size() == 1) {
4423 assert((AllocationType)AllocNode.Versions[0] ==
4424 AllocationType::NotCold ||
4425 (AllocationType)AllocNode.Versions[0] ==
4426 AllocationType::None);
4427 UnclonableAllocsThinBackend++;
4428 continue;
4429 }
4430
4431 // All versions should have a singular allocation type.
4432 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
4433 return Type == ((uint8_t)AllocationType::NotCold |
4434 (uint8_t)AllocationType::Cold);
4435 }));
4436
4437 // Update the allocation types per the summary info.
4438 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
4439 // Ignore any that didn't get an assigned allocation type.
4440 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
4441 continue;
4442 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
4443 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
4444 : AllocTypeNotColdThinBackend++;
4445 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
4446 auto A = llvm::Attribute::get(F.getContext(), "memprof",
4447 AllocTypeString);
4448 CallBase *CBClone;
4449 // Copy 0 is the original function.
4450 if (!J)
4451 CBClone = CB;
4452 else
4453 // Since VMaps are only created for new clones, we index with
4454 // clone J-1 (J==0 is the original clone and does not have a VMaps
4455 // entry).
4456 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
4457 CBClone->addFnAttr(A);
4458 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
4459 << ore::NV("AllocationCall", CBClone) << " in clone "
4460 << ore::NV("Caller", CBClone->getFunction())
4461 << " marked with memprof allocation attribute "
4462 << ore::NV("Attribute", AllocTypeString));
4463 }
4464 } else if (!CallsiteContext.empty()) {
4465 if (!CalledFunction) {
4466#ifndef NDEBUG
4467 // We should have skipped inline assembly calls.
4468 auto *CI = dyn_cast<CallInst>(CB);
4469 assert(!CI || !CI->isInlineAsm());
4470#endif
4471 // We should have skipped direct calls via a Constant.
4472 assert(CalledValue && !isa<Constant>(CalledValue));
4473
4474 // This is an indirect call, see if we have profile information and
4475 // whether any clones were recorded for the profiled targets (that
4476 // we synthesized CallsiteInfo summary records for when building the
4477 // index).
4478 auto NumClones =
4479 recordICPInfo(CB, FS->callsites(), SI, ICallAnalysisInfo);
4480
4481 // Perform cloning if not yet done. This is done here in case
4482 // we don't need to do ICP, but might need to clone this
4483 // function as it is the target of other cloned calls.
4484 if (NumClones)
4485 CloneFuncIfNeeded(NumClones);
4486 }
4487
4488 else {
4489 // Consult the next callsite node.
4490 assert(SI != FS->callsites().end());
4491 auto &StackNode = *(SI++);
4492
4493#ifndef NDEBUG
4494 // Sanity check that the stack ids match between the summary and
4495 // instruction metadata.
4496 auto StackIdIndexIter = StackNode.StackIdIndices.begin();
4497 for (auto StackId : CallsiteContext) {
4498 assert(StackIdIndexIter != StackNode.StackIdIndices.end());
4499 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
4500 StackId);
4501 StackIdIndexIter++;
4502 }
4503#endif
4504
4505 CloneCallsite(StackNode, CB, CalledFunction);
4506 }
4507 } else if (CB->isTailCall() && CalledFunction) {
4508 // Locate the synthesized callsite info for the callee VI, if any was
4509 // created, and use that for cloning.
4510 ValueInfo CalleeVI =
4511 findValueInfoForFunc(*CalledFunction, M, ImportSummary);
4512 if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
4513 auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
4514 assert(Callsite != MapTailCallCalleeVIToCallsite.end());
4515 CloneCallsite(Callsite->second, CB, CalledFunction);
4516 }
4517 }
4518 }
4519 }
4520
4521 // Now do any promotion required for cloning.
4522 performICP(M, FS->callsites(), VMaps, ICallAnalysisInfo, ORE);
4523 }
4524
4525 // We skip some of the functions and instructions above, so remove all the
4526 // metadata in a single sweep here.
4527 for (auto &F : M) {
4528 // We can skip memprof clones because createFunctionClones already strips
4529 // the metadata from the newly created clones.
4530 if (F.isDeclaration() || isMemProfClone(F))
4531 continue;
4532 for (auto &BB : F) {
4533 for (auto &I : BB) {
4534 if (!isa<CallBase>(I))
4535 continue;
4536 I.setMetadata(LLVMContext::MD_memprof, nullptr);
4537 I.setMetadata(LLVMContext::MD_callsite, nullptr);
4538 }
4539 }
4540 }
4541
4542 return Changed;
4543}
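Several loops in applyImport index the value maps with J - 1 because clone 0 is the original function and VMaps only holds entries for newly created clones. A minimal standalone sketch of that off-by-one indexing, with strings standing in for ValueToValueMapTy entries:

#include <cassert>
#include <string>
#include <vector>

int main() {
  unsigned NumClones = 3;         // versions 0..2; version 0 is the original
  std::vector<std::string> VMaps; // one entry per *new* clone only
  for (unsigned I = 1; I < NumClones; I++)
    VMaps.push_back("map for clone " + std::to_string(I));
  assert(VMaps.size() == NumClones - 1);
  for (unsigned J = 0; J < NumClones; J++) {
    if (!J)
      continue; // clone 0: use the original call directly
    assert(VMaps[J - 1] == "map for clone " + std::to_string(J));
  }
}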
4544
4545unsigned MemProfContextDisambiguation::recordICPInfo(
4546 CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
4547 ArrayRef<CallsiteInfo>::iterator &SI,
4548 SmallVector<ICallAnalysisData> &ICallAnalysisInfo) {
4549 // First see if we have profile information for this indirect call.
4550 uint32_t NumCandidates;
4551 uint64_t TotalCount;
4552 auto CandidateProfileData =
4553 ICallAnalysis->getPromotionCandidatesForInstruction(CB, TotalCount,
4554 NumCandidates);
4555 if (CandidateProfileData.empty())
4556 return 0;
4557
4558 // Iterate through all of the candidate profiled targets along with the
4559 // CallsiteInfo summary records synthesized for them when building the index,
4560 // and see if any are cloned and/or refer to clones.
4561 bool ICPNeeded = false;
4562 unsigned NumClones = 0;
4563 size_t CallsiteInfoStartIndex = std::distance(AllCallsites.begin(), SI);
4564 for (const auto &Candidate : CandidateProfileData) {
4565#ifndef NDEBUG
4566 auto CalleeValueInfo =
4567#endif
4568 ImportSummary->getValueInfo(Candidate.Value);
4569 // We might not have a ValueInfo if this is a distributed
4570 // ThinLTO backend and we decided not to import that function.
4571 assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
4572 assert(SI != AllCallsites.end());
4573 auto &StackNode = *(SI++);
4574 // See if any of the clones of the indirect callsite for this
4575 // profiled target should call a cloned version of the profiled
4576 // target. We only need to do the ICP here if so.
4577 ICPNeeded |= llvm::any_of(StackNode.Clones,
4578 [](unsigned CloneNo) { return CloneNo != 0; });
4579 // Every callsite in the same function should have been cloned the same
4580 // number of times.
4581 assert(!NumClones || NumClones == StackNode.Clones.size());
4582 NumClones = StackNode.Clones.size();
4583 }
4584 if (!ICPNeeded)
4585 return NumClones;
4586 // Save information for ICP, which is performed later to avoid messing up the
4587 // current function traversal.
4588 ICallAnalysisInfo.push_back({CB, CandidateProfileData.vec(), NumCandidates,
4589 TotalCount, CallsiteInfoStartIndex});
4590 return NumClones;
4591}
4592
4593void MemProfContextDisambiguation::performICP(
4594 Module &M, ArrayRef<CallsiteInfo> AllCallsites,
4595 ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
4596 ArrayRef<ICallAnalysisData> ICallAnalysisInfo,
4597 OptimizationRemarkEmitter &ORE) {
4598 // Now do any promotion required for cloning. Specifically, for each
4599 // recorded ICP candidate (which was only recorded because one clone of that
4600 // candidate should call a cloned target), we perform ICP (speculative
4601 // devirtualization) for each clone of the callsite, and update its callee
4602 // to the appropriate clone. Note that the ICP compares against the original
4603 // version of the target, which is what is in the vtable.
4604 for (auto &Info : ICallAnalysisInfo) {
4605 auto *CB = Info.CB;
4606 auto CallsiteIndex = Info.CallsiteInfoStartIndex;
4607 auto TotalCount = Info.TotalCount;
4608 unsigned NumPromoted = 0;
4609 unsigned NumClones = 0;
4610
4611 for (auto &Candidate : Info.CandidateProfileData) {
4612 auto &StackNode = AllCallsites[CallsiteIndex++];
4613
4614 // All calls in the same function must have the same number of clones.
4615 assert(!NumClones || NumClones == StackNode.Clones.size());
4616 NumClones = StackNode.Clones.size();
4617
4618 // See if the target is in the module. If it wasn't imported, it is
4619 // possible that this profile could have been collected on a different
4620 // target (or version of the code), and we need to be conservative
4621 // (similar to what is done in the ICP pass).
4622 Function *TargetFunction = Symtab->getFunction(Candidate.Value);
4623 if (TargetFunction == nullptr ||
4624 // Any ThinLTO global dead symbol removal should have already
4625 // occurred, so it should be safe to promote when the target is a
4626 // declaration.
4627 // TODO: Remove internal option once more fully tested.
4628 (MemProfRequireDefinitionForPromotion &&
4629 TargetFunction->isDeclaration())) {
4630 ORE.emit([&]() {
4631 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB)
4632 << "Memprof cannot promote indirect call: target with md5sum "
4633 << ore::NV("target md5sum", Candidate.Value) << " not found";
4634 });
4635 // FIXME: See if we can use the new declaration importing support to
4636 // at least get the declarations imported for this case. Hot indirect
4637 // targets should have been imported normally, however.
4638 continue;
4639 }
4640
4641 // Check if legal to promote
4642 const char *Reason = nullptr;
4643 if (!isLegalToPromote(*CB, TargetFunction, &Reason)) {
4644 ORE.emit([&]() {
4645 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
4646 << "Memprof cannot promote indirect call to "
4647 << ore::NV("TargetFunction", TargetFunction)
4648 << " with count of " << ore::NV("TotalCount", TotalCount)
4649 << ": " << Reason;
4650 });
4651 continue;
4652 }
4653
4654 assert(!isMemProfClone(*TargetFunction));
4655
4656 // Handle each call clone, applying ICP so that each clone directly
4657 // calls the specified callee clone, guarded by the appropriate ICP
4658 // check.
4659 CallBase *CBClone = CB;
4660 for (unsigned J = 0; J < NumClones; J++) {
4661 // Copy 0 is the original function.
4662 if (J > 0)
4663 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
4664 // We do the promotion using the original name, so that the comparison
4665 // is against the name in the vtable. Then just below, change the new
4666 // direct call to call the cloned function.
4667 auto &DirectCall =
4668 pgo::promoteIndirectCall(*CBClone, TargetFunction, Candidate.Count,
4669 TotalCount, isSamplePGO, &ORE);
4670 auto *TargetToUse = TargetFunction;
4671 // Keep calling the original target if this version calls the original
4672 // version of its callee; otherwise redirect it to the recorded clone.
4673 if (StackNode.Clones[J]) {
4674 TargetToUse =
4675 cast<Function>(M.getOrInsertFunction(
4676 getMemProfFuncName(TargetFunction->getName(),
4677 StackNode.Clones[J]),
4678 TargetFunction->getFunctionType())
4679 .getCallee());
4680 }
4681 DirectCall.setCalledFunction(TargetToUse);
4682 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
4683 << ore::NV("Call", CBClone) << " in clone "
4684 << ore::NV("Caller", CBClone->getFunction())
4685 << " promoted and assigned to call function clone "
4686 << ore::NV("Callee", TargetToUse));
4687 }
4688
4689 // Update TotalCount (all clones should get same count above)
4690 TotalCount -= Candidate.Count;
4691 NumPromoted++;
4692 }
4693 // Adjust the MD.prof metadata for all clones, now that we have the new
4694 // TotalCount and the number promoted.
4695 CallBase *CBClone = CB;
4696 for (unsigned J = 0; J < NumClones; J++) {
4697 // Copy 0 is the original function.
4698 if (J > 0)
4699 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
4700 // First delete the old one.
4701 CBClone->setMetadata(LLVMContext::MD_prof, nullptr);
4702 // If all promoted, we don't need the MD.prof metadata.
4703 // Otherwise we need to update it with the remaining un-promoted records.
4704 if (TotalCount != 0)
4705 annotateValueSite(
4706 M, *CBClone, ArrayRef(Info.CandidateProfileData).slice(NumPromoted),
4707 TotalCount, IPVK_IndirectCallTarget, Info.NumCandidates);
4708 }
4709 }
4710}
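promoteIndirectCall guards a new direct call on an identity comparison against the original target (the address that appears in the vtable), and only that promoted direct call is then redirected to the memprof clone. A schematic of the resulting control flow, using hypothetical free functions in place of the real targets:

#include <cstdio>

using FnPtr = void (*)();

void callee() { std::puts("original"); }            // target as found in the vtable
void callee_memprof_clone() { std::puts("clone"); } // clone chosen for this callsite

// Schematic of the guarded promotion: compare against the original target,
// call the clone directly on a match, otherwise keep the indirect call.
void caller(FnPtr Indirect) {
  if (Indirect == &callee)
    callee_memprof_clone();
  else
    Indirect();
}

int main() { caller(&callee); }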
4711
4712template <typename DerivedCCG, typename FuncTy, typename CallTy>
4713bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
4714 if (DumpCCG) {
4715 dbgs() << "CCG before cloning:\n";
4716 dbgs() << *this;
4717 }
4718 if (ExportToDot)
4719 exportToDot("postbuild");
4720
4721 if (VerifyCCG) {
4722 check();
4723 }
4724
4725 identifyClones();
4726
4727 if (VerifyCCG) {
4728 check();
4729 }
4730
4731 if (DumpCCG) {
4732 dbgs() << "CCG after cloning:\n";
4733 dbgs() << *this;
4734 }
4735 if (ExportToDot)
4736 exportToDot("cloned");
4737
4738 bool Changed = assignFunctions();
4739
4740 if (DumpCCG) {
4741 dbgs() << "CCG after assigning function clones:\n";
4742 dbgs() << *this;
4743 }
4744 if (ExportToDot)
4745 exportToDot("clonefuncassign");
4746
4747 if (MemProfReportHintedSizes)
4748 printTotalSizes(errs());
4749
4750 return Changed;
4751}
4752
4753bool MemProfContextDisambiguation::processModule(
4754 Module &M,
4755 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
4756
4757 // If we have an import summary, then the cloning decisions were made during
4758 // the thin link on the index. Apply them and return.
4759 if (ImportSummary)
4760 return applyImport(M);
4761
4762 // TODO: If/when other types of memprof cloning are enabled beyond just for
4763 // hot and cold, we will need to change this to individually control the
4764 // AllocationType passed to addStackNodesForMIB during CCG construction.
4765 // Note that we specifically check this after applying imports above, so that
4766 // the option isn't needed to be passed to distributed ThinLTO backend
4767 // clang processes, which won't necessarily have visibility into the linker
4768 // dependences. Instead the information is communicated from the LTO link to
4769 // the backends via the combined summary index.
4770 if (!SupportsHotColdNew)
4771 return false;
4772
4773 ModuleCallsiteContextGraph CCG(M, OREGetter);
4774 return CCG.process();
4775}
4776
4777 MemProfContextDisambiguation::MemProfContextDisambiguation(
4778 const ModuleSummaryIndex *Summary, bool isSamplePGO)
4779 : ImportSummary(Summary), isSamplePGO(isSamplePGO) {
4780 if (ImportSummary) {
4781 // The MemProfImportSummary should only be used for testing ThinLTO
4782 // distributed backend handling via opt, in which case we don't have a
4783 // summary from the pass pipeline.
4784 assert(MemProfImportSummary.empty());
4785 return;
4786 }
4787 if (MemProfImportSummary.empty())
4788 return;
4789
4790 auto ReadSummaryFile =
4791 errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
4792 if (!ReadSummaryFile) {
4793 logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
4794 "Error loading file '" + MemProfImportSummary +
4795 "': ");
4796 return;
4797 }
4798 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
4799 if (!ImportSummaryForTestingOrErr) {
4800 logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
4801 "Error parsing file '" + MemProfImportSummary +
4802 "': ");
4803 return;
4804 }
4805 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
4806 ImportSummary = ImportSummaryForTesting.get();
4807}
4808
4809 PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
4810 ModuleAnalysisManager &AM) {
4811 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
4812 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
4813 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
4814 };
4815 if (!processModule(M, OREGetter))
4816 return PreservedAnalyses::all();
4817 return PreservedAnalyses::none();
4818}
4819
4820 void MemProfContextDisambiguation::run(
4821 ModuleSummaryIndex &Index,
4822 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
4823 isPrevailing) {
4824 // TODO: If/when other types of memprof cloning are enabled beyond just for
4825 // hot and cold, we will need to change this to individually control the
4826 // AllocationType passed to addStackNodesForMIB during CCG construction.
4827 // The index was set from the option, so these should be in sync.
4828 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
4829 if (!SupportsHotColdNew)
4830 return;
4831
4832 IndexCallsiteContextGraph CCG(Index, isPrevailing);
4833 CCG.process();
4834}
#define LLVM_ATTRIBUTE_UNUSED
Definition: Compiler.h:282
#define LLVM_DEBUG(...)
Definition: Debug.h:106
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
uint32_t Index
#define DEBUG_TYPE
Module.h This file contains the declarations for the Module class.
This file implements a map that provides insertion order iteration.
static SmallVector< std::unique_ptr< ValueToValueMapTy >, 4 > createFunctionClones(Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE, std::map< const Function *, SmallPtrSet< const GlobalAlias *, 1 > > &FuncToAliasMap)
static cl::opt< unsigned > TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5), cl::Hidden, cl::desc("Max depth to recursively search for missing " "frames through tail calls."))
static bool isMemProfClone(const Function &F)
static cl::opt< bool > ExportToDot("memprof-export-to-dot", cl::init(false), cl::Hidden, cl::desc("Export graph to dot files."))
static void checkEdge(const std::shared_ptr< ContextEdge< DerivedCCG, FuncTy, CallTy > > &Edge)
bool checkColdOrNotCold(uint8_t AllocType)
static ValueInfo findValueInfoForFunc(const Function &F, const Module &M, const ModuleSummaryIndex *ImportSummary)
static std::string getAllocTypeString(uint8_t AllocTypes)
cl::opt< bool > MemProfReportHintedSizes
static cl::opt< bool > VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden, cl::desc("Perform verification checks on CallingContextGraph."))
static void checkNode(const ContextNode< DerivedCCG, FuncTy, CallTy > *Node, bool CheckEdges=true)
static std::string getMemProfFuncName(Twine Base, unsigned CloneNo)
static cl::opt< std::string > MemProfImportSummary("memprof-import-summary", cl::desc("Import summary to use for testing the ThinLTO backend via opt"), cl::Hidden)
#define DEBUG_TYPE
static const std::string MemProfCloneSuffix
static cl::opt< std::string > DotFilePathPrefix("memprof-dot-file-path-prefix", cl::init(""), cl::Hidden, cl::value_desc("filename"), cl::desc("Specify the path prefix of the MemProf dot files."))
static cl::opt< bool > VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden, cl::desc("Perform frequent verification checks on nodes."))
static cl::opt< bool > DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden, cl::desc("Dump CallingContextGraph to stdout after each stage."))
AllocType
cl::opt< bool > MemProfReportHintedSizes("memprof-report-hinted-sizes", cl::init(false), cl::Hidden, cl::desc("Report total allocation sizes of hinted allocations"))
This is the interface to build a ModuleSummaryIndex for a module.
ModuleSummaryIndex.h This file contains the declarations the classes that hold the module index and s...
This file defines generic set operations that may be used on set's of different types,...
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
Alias summary information.
ValueInfo getAliaseeVI() const
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:410
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
iterator end() const
Definition: ArrayRef.h:157
iterator begin() const
Definition: ArrayRef.h:156
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:198
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:95
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1120
void addFnAttr(Attribute::AttrKind Kind)
Adds the attribute to the function.
Definition: InstrTypes.h:1482
void setCalledFunction(Function *Fn)
Sets the function called, including updating the function type.
Definition: InstrTypes.h:1388
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
unsigned size() const
Definition: DenseMap.h:99
bool empty() const
Definition: DenseMap.h:98
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:152
iterator end()
Definition: DenseMap.h:84
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:147
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:211
void reserve(size_type NumEntries)
Grow the densemap so that it can contain at least NumEntries items before resizing again.
Definition: DenseMap.h:103
Implements a dense probed hash-table based set.
Definition: DenseSet.h:278
Lightweight error class with error context and mandatory checking.
Definition: Error.h:160
Function summary information to aid decisions and implementation of importing.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:216
static GlobalAlias * create(Type *Ty, unsigned AddressSpace, LinkageTypes Linkage, const Twine &Name, Constant *Aliasee, Module *Parent)
If a parent module is specified, the alias is automatically inserted into the end of the specified mo...
Definition: Globals.cpp:557
Function and variable summary information to aid decisions and implementation of importing.
static bool isLocalLinkage(LinkageTypes Linkage)
Definition: GlobalValue.h:409
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:296
GUID getGUID() const
Return a 64-bit global unique ID constructed from global value name (i.e.
Definition: GlobalValue.h:595
std::string getGlobalIdentifier() const
Return the modified name for this global value suitable to be used as the key for a global lookup (e....
Definition: Globals.cpp:184
@ InternalLinkage
Rename collisions when linking (static functions).
Definition: GlobalValue.h:59
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:567
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:70
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1679
Metadata node.
Definition: Metadata.h:1069
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1430
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1436
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
size_type count(const KeyT &Key) const
Definition: MapVector.h:165
MemProfContextDisambiguation(const ModuleSummaryIndex *Summary=nullptr, bool isSamplePGO=false)
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
static ErrorOr< std::unique_ptr< MemoryBuffer > > getFile(const Twine &Filename, bool IsText=false, bool RequiresNullTerminator=true, bool IsVolatile=false, std::optional< Align > Alignment=std::nullopt)
Open the specified file as a MemoryBuffer, returning a new MemoryBuffer if successful,...
Class to hold module path string table and global value map, and encapsulate methods for operating on...
static StringRef getOriginalNameBeforePromote(StringRef Name)
Helper to obtain the unpromoted name for a global value (or the original name if not promoted).
ValueInfo getValueInfo(const GlobalValueSummaryMapTy::value_type &R) const
Return a ValueInfo for the index value_type (convenient when iterating index).
uint64_t getStackIdAtIndex(unsigned Index) const
GlobalValueSummary * findSummaryInModule(ValueInfo VI, StringRef ModuleId) const
Find the summary for ValueInfo VI in module ModuleId, or nullptr if not found.
GlobalValue::GUID getGUIDFromOriginalID(GlobalValue::GUID OriginalID) const
Return the GUID for OriginalId in the OidGuidMap.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition: PointerUnion.h:118
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition: Analysis.h:114
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:132
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
void reserve(size_type N)
Definition: SmallVector.h:663
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
LLVM Value Representation.
Definition: Value.h:74
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
void reserve(size_t Size)
Grow the DenseSet so that it can contain at least NumEntries items before resizing again.
Definition: DenseSet.h:90
size_type size() const
Definition: DenseSet.h:81
void swap(DenseSetImpl &RHS)
Definition: DenseSet.h:99
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:193
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:95
An efficient, type-erasing, non-owning reference to a callable.
Helper class to iterate through stack ids in both metadata (memprof MIB and callsite) and the corresp...
CallStackIterator end() const
CallStackIterator beginAfterSharedPrefix(CallStack &Other)
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
AllocationType getMIBAllocType(const MDNode *MIB)
Returns the allocation type from an MIB metadata node.
bool hasSingleAllocType(uint8_t AllocTypes)
True if the AllocTypes bitmask contains just a single type.
std::string getAllocTypeAttributeString(AllocationType Type)
Returns the string to use in attributes with the given type.
MDNode * getMIBStackNode(const MDNode *MIB)
Returns the stack node from an MIB metadata node.
DiagnosticInfoOptimizationBase::Argument NV
CallBase & promoteIndirectCall(CallBase &CB, Function *F, uint64_t Count, uint64_t TotalCount, bool AttachProfToDirectCall, OptimizationRemarkEmitter *ORE)
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
void logAllUnhandledErrors(Error E, raw_ostream &OS, Twine ErrorBanner={})
Log all errors (if any) in E to OS.
Definition: Error.cpp:65
bool isLegalToPromote(const CallBase &CB, Function *Callee, const char **FailureReason=nullptr)
Return true if the given indirect call site can be made to call Callee.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
Definition: SetOperations.h:58
cl::opt< bool > MemProfRequireDefinitionForPromotion("memprof-require-definition-for-promotion", cl::init(false), cl::Hidden, cl::desc("Require target function definition when promoting indirect calls"))
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void set_subtract(S1Ty &S1, const S2Ty &S2)
set_subtract(A, B) - Compute A := A - B
raw_ostream & WriteGraph(raw_ostream &O, const GraphType &G, bool ShortNames=false, const Twine &Title="")
Definition: GraphWriter.h:359
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr)
Expected< std::unique_ptr< ModuleSummaryIndex > > getModuleSummaryIndex(MemoryBufferRef Buffer)
Parse the specified bitcode buffer, returning the module summary index.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
void annotateValueSite(Module &M, Instruction &Inst, const InstrProfRecord &InstrProfR, InstrProfValueKind ValueKind, uint32_t SiteIndx, uint32_t MaxMDCount=3)
Get the value profile data for value site SiteIdx from InstrProfR and annotate the instruction Inst w...
Definition: InstrProf.cpp:1301
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
bool set_union(S1Ty &S1, const S2Ty &S2)
set_union(A, B) - Compute A := A u B, return whether A changed.
Definition: SetOperations.h:43
cl::opt< bool > SupportsHotColdNew
Indicate we are linking with an allocator that supports hot/cold operator new interfaces.
S1Ty set_intersection(const S1Ty &S1, const S2Ty &S2)
set_intersection(A, B) - Return A ^ B
Definition: SetOperations.h:83
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
cl::opt< bool > EnableMemProfContextDisambiguation
Enable MemProf context disambiguation for thin link.
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
Definition: APFixedPoint.h:303
Expected< T > errorOrToExpected(ErrorOr< T > &&EO)
Convert an ErrorOr<T> to an Expected<T>.
Definition: Error.h:1231
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1873
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1766
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
const char * toString(DWARFSectionKind Kind)
Function * CloneFunction(Function *F, ValueToValueMapTy &VMap, ClonedCodeInfo *CodeInfo=nullptr)
Return a copy of the specified function and add it to that function's module.
bool mayHaveMemprofSummary(const CallBase *CB)
Returns true if the instruction could have memprof metadata, used to ensure consistency between summa...
static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter, GraphType)
static const ContextNode< DerivedCCG, FuncTy, CallTy > * GetCallee(const EdgePtrTy &P)
std::shared_ptr< ContextEdge< DerivedCCG, FuncTy, CallTy > > EdgePtrTy
std::unique_ptr< ContextNode< DerivedCCG, FuncTy, CallTy > > NodePtrTy
Summary of memprof metadata on allocations.
SmallVector< uint8_t > Versions
Summary of memprof callsite metadata.
SmallVector< unsigned > Clones
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
An information struct used to provide DenseMap with the various necessary components for a given valu...
Definition: DenseMapInfo.h:52
Used in the streaming interface as the general argument type.
typename GraphType::UnknownGraphTypeError NodeRef
Definition: GraphTraits.h:95
Struct that holds a reference to a particular GUID in a global value summary.
ArrayRef< std::unique_ptr< GlobalValueSummary > > getSummaryList() const
GlobalValue::GUID getGUID() const